Model typu filtrowanie kolaboracyjne (collaborative filtering).

Wykorzystujemy w tym celu macierz interakcji, którą następnie dekomponujemy na macierze mniejszej wymiarowości z użyciem 
TruncatedSVD - sklearn. 

In [75]:
import pandas as pd
import numpy as np

Przygotowanie macierzy interakcji

In [76]:
sessionsDataPath = '../notebooks/data/v2/sessions.jsonl'
productsDataPath = '../notebooks/data/v2/products.jsonl'
sessionsDF = pd.read_json(sessionsDataPath, lines=True)
productsDF = pd.read_json(productsDataPath, lines=True)

df = sessionsDF.drop(columns=["session_id", "timestamp", "event_type", "offered_discount", "purchase_id"])
df["count"] = 1
interactionMatrixDF = pd.pivot_table(df, index="user_id", columns="product_id", values="count", aggfunc=np.sum, fill_value=0)

In [77]:
idNameDict = pd.Series(productsDF["product_name"].values, index=productsDF["product_id"]).to_dict()

Przygotowanie zbirów danych: treningowy i testowy

In [78]:
from sklearn.model_selection import train_test_split

trainUser, testUser = train_test_split(interactionMatrixDF, test_size=0.2)

#transpose because items are columns
# trainItem, testItem = train_test_split(interactionMatrixDF.transpose(), test_size=0.2)
trainItem = interactionMatrixDF.transpose() #for testing only

Dekompozycja utworzonej macierzy na podmacierze ze względu na użytkowników i produkty.

In [79]:
from sklearn.decomposition import TruncatedSVD

#initial hiperparameters
epsilon = 1e-9
latentFactors = 100

# #generate item latent features
# itemSVD = TruncatedSVD(n_components=latentFactors)
# itemFeatures = itemSVD.fit_transform(interactionMatrixDF.transpose()) + epsilon #transpose because items are columns

# #generate user latent features
# itemSVD = TruncatedSVD(n_components=latentFactors)
# itemFeatures = itemSVD.fit_transform(interactionMatrixDF) + epsilon

#generate item latent features
itemSVD = TruncatedSVD(n_components=latentFactors)
itemFeatures = itemSVD.fit_transform(trainItem) + epsilon #transpose because items are columns

#generate user latent features
userSVD = TruncatedSVD(n_components=latentFactors)
userFeatures = userSVD.fit_transform(trainUser) + epsilon

pd.DataFrame(itemFeatures)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,14.704323,1.597092,-2.165458,0.047965,2.275149,-2.797011,0.138227,0.500123,-0.352298,-1.282725,...,-0.074418,1.098574,-0.199556,0.238595,-0.165700,0.321518,0.019216,-0.852955,1.155397,0.501658
1,16.788978,-0.275855,1.383865,1.826558,-1.832014,1.404619,1.818640,-2.064323,-1.060061,2.170347,...,-0.511254,-0.967984,-1.394140,0.564089,-0.299133,0.952718,1.435348,1.609668,-0.253237,0.241973
2,66.223727,-0.556897,4.184756,5.314697,-12.430235,10.987846,-1.421603,-4.418558,1.859620,4.595787,...,-1.133063,0.398340,-0.419478,0.288090,-0.823101,-0.670668,-1.159607,-0.517723,-0.232200,-0.132287
3,128.537144,-10.740741,-3.462499,21.309936,-19.987400,-19.398645,40.536204,-3.041035,2.777062,-3.151862,...,0.129221,0.384702,0.474152,0.269469,0.205782,0.359057,0.535370,-0.345929,-0.388885,0.395149
4,63.634981,1.502392,11.307903,1.774280,-8.034810,-0.799897,1.253779,-8.517540,7.457859,4.976982,...,0.787371,0.465223,-0.378295,2.264147,-0.272924,-0.050256,-0.408180,0.960744,0.048251,-0.330870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,14.020710,1.664459,2.984921,0.304660,-1.173022,2.242046,0.607685,0.652815,0.086635,0.221830,...,-0.071134,-0.255041,-0.738858,-0.941469,0.339710,-0.530099,-0.445373,1.493473,-0.058171,-0.483724
315,111.686745,3.768304,26.874828,18.574362,-12.978281,18.501830,-17.091939,-7.823099,34.528056,12.904946,...,-0.113683,-0.192874,0.998975,0.020707,0.795301,-0.232759,0.082097,-0.658010,-0.060240,-0.009476
316,25.998551,0.765972,3.612044,3.521811,-2.863661,4.616728,-4.418926,-2.120802,3.489250,-2.094882,...,-0.461607,0.431877,-0.203478,-0.818006,-0.540384,-0.228351,0.268724,-0.104782,0.272668,1.365027
317,16.738593,2.004886,0.009711,0.159582,2.095635,2.024905,0.077352,-2.708347,0.000116,-3.521644,...,-0.240795,-0.421205,-0.531487,-0.464032,-0.070667,0.085670,0.559168,1.135477,0.612189,0.834161


In [80]:
def top_k(item_id, top_k, corr_mat, map_name):
    topItems = corr_mat[item_id,:].argsort()[-top_k:][::-1]
    # topItems = [map_name[e] for e in topItems]
    return topItems

In [81]:
from sklearn.metrics.pairwise import cosine_similarity

itemCorrMat = cosine_similarity(itemFeatures)

#before test_split it is mandatory to create dict mapping indexes of products in productsDF to labels or ids
#because truncatedSVD has rows coresponding to rows in productsDF, but after split there not the same
recommendations = top_k(7, 10, itemCorrMat, idNameDict)
display(productsDF.iloc[recommendations,:])


Unnamed: 0,product_id,product_name,category_path,price,user_rating
7,1008,Tom Clancy&#39;s Rainbow Six Vegas (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,49.99,3.715598
11,1012,Fallout New Vegas (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,69.0,2.386605
12,1013,LA Noire (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,129.99,0.16666
5,1006,Call of Duty 4 Modern Warfare (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,59.9,3.05427
52,1053,Anno 2070 (PC),Gry i konsole;Gry komputerowe,42.9,4.378637
53,1054,Call of Duty 2 (PC),Gry i konsole;Gry komputerowe,32.99,4.628316
47,1048,Gra o tron (PC),Gry i konsole;Gry komputerowe,63.49,2.346389
10,1011,BioShock Infinite (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,139.99,3.251818
49,1050,Bioshock 2 (PC),Gry i konsole;Gry komputerowe,37.9,4.959925
48,1049,Max Payne 3 (PC),Gry i konsole;Gry komputerowe,17.9,1.495826
