# Item-item collaborative filtering

In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy 

from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
from pyspark.ml.recommendation import ALS
from pyspark.sql import functions as F

In [17]:
data = pd.read_csv("/home/romain/Documents/PhD/logisticDPP/BelgiumRetail/data",
                   header=None,names=['itemSet'])

sets = list(map(lambda x: x.split(' ')[:-1],data['itemSet']))
sets = list(map(lambda x: list(map(lambda y: int(y),x)),sets))
size = list(map(len,sets))
nUsers = len(sets)
users = np.repeat(range(nUsers),size)

flatsets = [int(item) for sublist in sets for item in sublist]
df = pd.DataFrame({'user':users,'item':flatsets,'rating':1.0})
#df.index = df['user']
nItem = len(set(df['item']))

In [4]:
def random_mask(x):
    result = np.zeros_like(x)
    if len(x)>1:
        result[np.random.choice(len(x))] = 1
    return result

threshold   = 0.7

In [5]:
def recommendForBasket(basket,V):
    sim = np.zeros(len(V))
    for item in basket:
        tmp = V[item,:].dot(V.T)
        sim += tmp/len(basket)
    sim[basket] = np.min(sim)-10
    return sim

In [21]:
sc.setCheckpointDir("checkpoints")
for numTraits in [5,10,20,30,50]:
    print("Start numTraits=",numTraits)
    
    nRuns       = 3
    MPR         = []
    Ks          = [5,10,20]
    P           = dict.fromkeys(Ks)
    for K in Ks:
        P[K] = []

    for run in range(nRuns):
        print("run number",run+1,"-",numTraits)
        np.random.seed(123*run)

        testUsers = list(np.random.choice(range(nUsers),size=int((1-threshold)*nUsers),replace=False))
        trainingUsers = list(set(range(nUsers))-set(testUsers))
        trainingData = df.loc[trainingUsers]
        trainingData.index = range(len(trainingData))

        testData = df.loc[testUsers]
        testData.index = range(len(testData))

        mask = testData.groupby(['user'])['user'].transform(random_mask).astype(bool)
        not_mask = list(map(lambda x: not(x),mask))
        
        testUsersBasket = testData.loc[not_mask]
        #trainingData = pd.concat([trainingData,testData.loc[not_mask]])
        testData = testData.loc[mask]

        sparkInput = sqlCtx.createDataFrame(trainingData)
        als = ALS(rank=numTraits,regParam=0.1,userCol='user',itemCol='item',ratingCol='rating',implicitPrefs=True)
        mod = als.fit(sparkInput) 

        # get item latent factors
        V = mod.itemFactors.orderBy("id")
        V_index = V.select('id').toPandas()
        V = V.select('features')

        for k in range(numTraits):
            V = V.withColumn('factor'+str(k),V.features[k])

        V = V.drop('features')
        V = V.toPandas()
        V.index = V_index['id']
        unknowns = list(set(range(nItem))-set(V_index['id']))
        for unknown in unknowns:
            V.loc[unknown] = 0
        V = V.sort_index()
        V = np.array(V)

        percentileRank = []
        precisionAt5 = 0
        precisionAt10 = 0
        precisionAt20 = 0

        for user in testUsers:
            if len(testData.loc[testData['user']==user,'item'])>0:
                basket = list(testUsersBasket.loc[testUsersBasket['user']==user,'item'])
                true_target = list(testData.loc[testData['user']==user,'item'])[0]
                subY = recommendForBasket(basket,V)
                y0 = subY[true_target]
                rank = np.sum(subY>y0)
                percentileRank.append(1-rank/nItem)
                top5Target = np.argsort(subY)[-5:]
                top10Target = np.argsort(subY)[-10:]
                top20Target = np.argsort(subY)[-20:]
                if true_target in top5Target:
                    precisionAt5 += 1
                if true_target in top10Target:
                    precisionAt10 += 1
                if true_target in top20Target:
                    precisionAt20 += 1

        MPR.append(100*np.mean(percentileRank))
        P[5].append(100*precisionAt5/len(percentileRank))
        P[10].append(100*precisionAt10/len(percentileRank))
        P[20].append(100*precisionAt20/len(percentileRank))
    
    print("\n")
    print("*"*20)
    print("num latent factors:",numTraits)
    print("MPR=",np.mean(MPR))
    for K in Ks:
        print("Precision @"+str(K)+"=",np.mean(P[K]))
    print("*"*20)

Start numTraits= 5
run number 1 - 5
run number 2 - 5
run number 3 - 5


********************
num latent factors: 5
MPR= 90.02628341184864
Precision @5= 16.804923929214684
Precision @10= 19.732975244276034
Precision @20= 23.62961628300785
********************
Start numTraits= 10
run number 1 - 10
run number 2 - 10
run number 3 - 10


********************
num latent factors: 10
MPR= 89.69686520590899
Precision @5= 16.017428649634137
Precision @10= 18.885731055540997
Precision @20= 22.945780337170998
********************
Start numTraits= 20
run number 1 - 20
run number 2 - 20
run number 3 - 20


********************
num latent factors: 20
MPR= 88.29554948968574
Precision @5= 16.21708287411146
Precision @10= 18.762730589254414
Precision @20= 22.23227625132314
********************
Start numTraits= 30
run number 1 - 30
run number 2 - 30
run number 3 - 30


********************
num latent factors: 30
MPR= 86.76722565458
Precision @5= 15.941925094794144
Precision @10= 18.685821184430246
Precis