Model typu filtrowanie kolaboracyjne (collaborative filtering).

Wykorzystujemy w tym celu macierz interakcji, którą następnie dekomponujemy na macierze mniejszej wymiarowości z użyciem 
TruncatedSVD - sklearn. 

In [1]:
import pandas as pd
import numpy as np

Przygotowanie macierzy interakcji

In [2]:
sessionsDataPath = '../notebooks/data/v2/sessions.jsonl'
sessionsDF = pd.read_json(sessionsDataPath, lines=True)

df = sessionsDF.drop(columns=["session_id", "timestamp", "event_type", "offered_discount", "purchase_id"])
df["count"] = 1
interactionMatrixDF = pd.pivot_table(df, index="user_id", columns="product_id", values="count", aggfunc=np.sum, fill_value=0)

Przygotowanie zbirów danych: treningowy i testowy

In [3]:
from sklearn.model_selection import train_test_split

trainUser, testUser = train_test_split(interactionMatrixDF, test_size=0.2)

#transpose because items are columns
trainItem, testItem = train_test_split(interactionMatrixDF.transpose(), test_size=0.2)

Dekompozycja utworzonej macierzy na podmacierze ze względu na użytkowników i produkty.

In [31]:
from sklearn.decomposition import TruncatedSVD

#initial hiperparameters
epsilon = 1e-9
latentFactors = 10

# #generate item latent features
# itemSVD = TruncatedSVD(n_components=latentFactors)
# itemFeatures = itemSVD.fit_transform(interactionMatrixDF.transpose()) + epsilon #transpose because items are columns

# #generate user latent features
# itemSVD = TruncatedSVD(n_components=latentFactors)
# itemFeatures = itemSVD.fit_transform(interactionMatrixDF) + epsilon

#generate item latent features
itemSVD = TruncatedSVD(n_components=latentFactors)
itemFeatures = itemSVD.fit_transform(trainItem) + epsilon #transpose because items are columns

#generate user latent features
userSVD = TruncatedSVD(n_components=latentFactors)
userFeatures = userSVD.fit_transform(trainUser) + epsilon


In [33]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine similarity to determine k_closest

IndexError: index 1003 is out of bounds for axis 0 with size 255

In [30]:
ratings = sessionsDF.drop(columns=["session_id", "timestamp", "event_type", "offered_discount", "purchase_id"])
ratings = ratings.groupby(['user_id', 'product_id']).size().reset_index().rename(columns={0:''})

ratings["ufsID"] = ratings.index
ratings["mfsID"] = ratings.index

ratings

Unnamed: 0,user_id,product_id,Unnamed: 3,ufsID,mfsID
0,102,1003,1,0,0
1,102,1011,1,1,1
2,102,1016,1,2,2
3,102,1017,3,3,3
4,102,1025,1,4,4
...,...,...,...,...,...
28611,301,1311,2,28611,28611
28612,301,1313,1,28612,28612
28613,301,1316,6,28613,28613
28614,301,1317,1,28614,28614


In [28]:
columns = ["uf{0}".format(i+1) for i in range(10)]
ufs = pd.DataFrame(userFeatures, columns = columns)
ufs["ufsID"] = ufs.index
print ("len(ufs) = {0}".format(len(ufs)))

columns = ["mf{0}".format(i+1) for i in range(10)]
mfs = pd.DataFrame(itemFeatures, columns = columns)
mfs["mfsID"] = mfs.index
print ("len(ufs) = {0}".format(len(ufs)))

train_data = ratings.merge(ufs, on="ufsID") \
    .merge(mfs, on="mfsID") \
    .drop(["user_id", "product_id", "ufsID", "mfsID"], axis = 1)

print ("len(train_data) = {0}".format(len(train_data)))

len(ufs) = 160
len(ufs) = 160
len(train_data) = 160


Unnamed: 0,Unnamed: 1,uf1,uf2,uf3,uf4,uf5,uf6,uf7,uf8,uf9,...,mf1,mf2,mf3,mf4,mf5,mf6,mf7,mf8,mf9,mf10
0,1,199.543257,-23.330560,-21.233795,3.330132,-1.028866,16.754852,3.433408,2.871770,1.663539,...,5.692295,-0.408956,0.155072,-0.067089,0.328767,0.205415,0.131296,1.510729,-0.561904,0.554478
1,1,155.534252,10.764370,6.114581,-2.699965,-1.340198,15.795605,4.358042,-5.457087,0.208094,...,4.849667,-1.093860,-0.157617,1.166444,0.856959,-0.974188,-0.384319,-0.340718,0.414821,0.825122
2,1,74.742303,6.374299,4.068879,-6.891983,1.814685,-1.895047,6.438130,-3.139426,6.662955,...,8.188460,0.483631,-0.621060,0.208704,1.918879,2.298942,1.013611,-1.466455,-0.108627,0.049375
3,3,67.651871,-3.798767,5.615599,0.007930,-5.227762,-9.331811,-0.476812,-3.206318,-7.805843,...,35.927682,-4.183628,-5.272029,-4.404437,0.035544,-1.191962,1.661562,1.420179,3.248011,-0.261175
4,1,164.099044,3.475583,-4.228687,-9.124813,-9.062320,3.605570,-5.855755,-8.673315,2.442282,...,10.483318,0.357274,-0.946620,-0.183671,-0.360407,-0.651632,-1.241364,0.477459,1.306849,-0.896322
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,1,84.011089,-5.855143,-0.400111,-2.647781,4.518670,-1.748874,0.991506,-0.578633,-6.388405,...,5.886776,-0.687738,0.978473,-0.644110,0.577532,1.414506,-0.930597,1.766562,-0.513144,-1.121461
156,1,12.979260,-8.549300,-2.051509,-7.666712,6.476556,3.412191,-3.379527,0.923488,-0.226172,...,14.115572,-0.476526,-1.245069,-2.214343,-0.058170,-2.031694,0.756545,0.750173,0.640934,-1.108579
157,2,13.251521,6.100340,-0.863988,-0.242412,-2.334225,1.927711,0.692275,-0.249530,2.443958,...,18.025891,-0.742360,-0.455072,-2.694743,-0.166004,-2.362558,-4.068447,2.450823,1.626727,-0.186661
158,1,106.961181,12.228391,-10.454493,-11.066105,-15.576116,-1.061740,-9.595613,-8.476215,-4.500403,...,4.286804,-0.134194,-0.260442,0.268095,0.291654,0.163613,-0.045034,0.104334,-0.320035,0.007654


In [None]:
import numpy as np

targets = np.array(train_data.Rating)
data = np.array(train_data.drop("Rating", axis = 1))

print ("targets.shape = {0}".format(targets.shape))
print ("data.shape = {0}".format(data.shape))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error 
import math

regressor = GradientBoostingRegressor(learning_rate=0.1, n_estimators=200, verbose=1)
regressor.fit(data, targets)

print (math.sqrt(mean_squared_error( regressor.predict(data), targets )))