In [24]:
from surprise import SVD, NMF , NormalPredictor
from surprise import Dataset,Reader
from surprise.model_selection import cross_validate,train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from evaluation_metrics import precision_recall_at_k
import pandas as pd
import pickle

Matrix Factorization - user - item interactions
*   Loading the user_item_rating dataset that I have created in the last stage.
*   Using Random predictor as a baseline algorithem for recommendation of items to user.
*   Applying two algorithms of MF - SVD
*   MF algorithms aim to estimate/optimize the matrix U and V that will get minimum loss of the know rating of the user-items
*   Given U and V we can predict the unknown rating of all users and items.
*   The results of the algorithem will be validated by using RMSE score.


In [60]:
user_item_rating = pd.read_csv('../data/user_item_rating.csv')

Using MF algorithems and prform cross validation on the dataset.

In [11]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(user_item_rating[['user_id', 'item_id', 'rating']], reader)
svd_algo = SVD()
nmf_algo = NMF()
algos= [('SVD',svd_algo),('NMF',nmf_algo),('Random',NormalPredictor())]
for algo in algos :
    print(algo[0])
    cross_validate(algo[1], data, measures=['RMSE'], cv=3, verbose=True)

SVD
Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.0672  1.0677  1.0723  1.0690  0.0023  
Fit time          3.34    3.51    3.60    3.48    0.11    
Test time         0.24    0.26    0.19    0.23    0.03    
NMF
Evaluating RMSE of algorithm NMF on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.2997  1.2935  1.2954  1.2962  0.0026  
Fit time          5.05    5.05    5.14    5.08    0.04    
Test time         0.22    0.22    0.22    0.22    0.00    
Random
Evaluating RMSE of algorithm NormalPredictor on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.4115  1.4226  1.4143  1.4161  0.0047  
Fit time          0.07    0.09    0.09    0.08    0.01    
Test time         0.25    0.25    0.25    0.25    0.00    


According to the cross-validation results
*   SVD rmse is 1.06 which is better than KNN basic algos that I have used for CF.
*   NMF dont peform well as SVD with 1.29 , less than both of knn baisc.
*   Both are better than random

Now, I will focus on tunning the parameters of SVD ,by changing lr and epochs

In [14]:
print("Searching for best parameters...")
param_grid = {'n_epochs': [20, 30,50], 'lr_all': [0.001,0.005, 0.010]}
gs = GridSearchCV(SVD, param_grid, measures=['RMSE'], cv=3)
gs.fit(data)

Searching for best parameters...


In [17]:
# best RMSE score
print("Best RMSE score attained: ", gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

Best RMSE score attained:  1.0660841145073692
{'n_epochs': 30, 'lr_all': 0.005}


Optimized SVD model
* Given the best parameters I have found , train a model and test the performance

In [23]:
params = gs.best_params['rmse']
svd_tunned_algo = SVD(n_epochs = params['n_epochs'], lr_all = params['lr_all'])
trainset, testset = train_test_split(data, test_size=0.25)
svd_tunned_algo.fit(trainset)
svd_tunned_predictions = svd_tunned_algo.test(testset)
# Calculate and print RMSE
accuracy.rmse(svd_tunned_predictions)

RMSE: 1.0606


1.0606126112715855

I will use prcision@K and recall@K , samilar to knn algos.
*   The results are not as high as KNN-user based , but are better the random recommendation

In [27]:
predictions = svd_tunned_algo.test(testset)
precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

precision_at_k = sum(prec for prec in precisions.values()) / len(precisions)
recall_at_k = sum(rec for rec in recalls.values()) / len(recalls)
# Precision and recall can then be averaged over all users
print(f'Preision@K for algo SVD is  {precision_at_k}')
print(f'Recall@K for algo SVD is  {precision_at_k}')

Preision@K for algo SVD is  0.7885869871903406
Recall@K for algo SVD is  0.7885869871903406


Fit SVD on the all dataset for application usage

In [61]:
# First train an SVD algorithm on the movielens dataset.
full_trainset = data.build_full_trainset()
svd_tunned_algo.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2b3e7375f98>

Getting user-item ids for evaluating specific user

In [36]:
import json
with open('../outputs/IDs/ids_items_dict.json', 'r') as file:
    ids_items_dict = json.load(file)
with open('../outputs/IDs/usernames_ids_dict.json', 'r') as file:
    usernames_ids_dict = json.load(file)
with open('../outputs/IDs/ids_usernames_dict.json', 'r') as file:
    ids_usernames_dict = json.load(file)

For better evaluation of SVD i will user top_n recommendation of specific user.

In [62]:
def get_top_n(user_id,dataset,algo, n=10):
    # Get a list of all items and all users in the dataset
    items = dataset.all_items()
    # Convert raw ids to inner ids
    raw_user_id = user_id  # replace with the user ID you are interested in
    inner_user_id = dataset.to_inner_uid(raw_user_id)
    # Find items that the user has not rated yet
    test_items = [dataset.to_raw_iid(item) for item in items if not dataset.ur[inner_user_id].__contains__(item)]
    # Predict ratings for all the items not rated by the user
    predictions = [algo.predict(raw_user_id, item) for item in test_items]
    # Get the top N recommendations
    top_n_items = sorted(predictions, key=lambda x: x.est, reverse=True)[:n]
    return top_n_items

For example I have take use named : "kristina" with historical items of 'Pet Supplies'. 

In [79]:
df = pd.read_csv('../data/preprocessed.csv',index_col=[0])
userName='kristina'
#Get the items of specific userName#
df[df['userName']==userName]

Unnamed: 0,userName,itemName,brand,category,price,rating,vote,user_id,item_id
6875,kristina,Pet ID Tags 8 Lines Engraving Available Size S...,Providence Engraving,Pet Supplies,$2.99,5.0,0,15735,6297
7322,kristina,Chuckit Max Glow Ball,Chuckit,Pet Supplies,$5.95,5.0,0,15735,1785
54520,kristina,Blue Buffalo Wilderness High Protein Grain Fre...,Blue Buffalo,Pet Supplies,$33.99,5.0,0,15735,1139


When getting top_10 recommended items for kristina , we can see that not all items are related to pets , the top ones are more office products.
*  MF are are more exploratory , that's why not all items are excatly the same as the user's history.

In [77]:
user_id = usernames_ids_dict[userName]
#Get top n recommendation for specific user_id
top_n_predictions = get_top_n(user_id,full_trainset,svd_tunned_algo,10)
#Get top items ids
top_items_ids = [obj.iid for obj in top_n_predictions]
#Get top items rating
top_k_ratings = [obj.est for obj in top_n_predictions]
#Get items names according to item_ids
top_k_items_names = (ids_items_dict[str(it_id)] for it_id in top_items_ids)
for item in top_k_items_names:
    print(item)

Prismacolor 3598T Premier Colored Pencils Soft Core 48 Pack
Ethical Products SPOT Sponge Soccer Balls Cat Toy
PaperPro Executive Stapler 3 1 Stapler One Finger Effort Spring Powered Stapler 1110
Chuckit Max Glow Ball
Sherpa Travel Original Deluxe Airline Approved Pet Carrier
AmazonBasics Stainless Steel Dog Bowl
Anthony Arrowroot Powder 5lb Batch Tested
Temptations Mixup Treats Cats 16 Ounces
UM 153 Signo Broad Point Gel Pen White Pack 3
AmazonBasics Single Door amp Double Door Folding Metal Dog Crate


Save the trainset and the fitted algo 

In [80]:
# Save the trained model to a file
with open('../outputs/MF/svd_tunned_algo.sav', 'wb') as file:
    pickle.dump(svd_tunned_algo, file)
with open('../outputs/MF/trainset.sav', 'wb') as file:
    pickle.dump(full_trainset, file)