In [10]:
from surprise import SVD, NMF , NormalPredictor
from surprise import Dataset,Reader
from surprise.model_selection import cross_validate,train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV
import pandas as pd
import pickle

Matrix Factorization - user - item interactions
*   Loading the user_item_rating dataset that I have created in the last stage.
*   Using Random predictor as a baseline algorithem for recommendation of items to user.
*   Applying two algorithms of MF - SVD
*   MF algorithms aim to estimate/optimize the matrix U and V that will get minimum loss of the know rating of the user-items
*   Given U and V we can predict the unknown rating of all users and items.
*   The results of the algorithem will be validated by using RMSE score.


In [3]:
user_item_rating = pd.read_csv('../data/user_item_rating.csv')

Using MF algorithems and prform cross validation on the dataset.

In [9]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(user_item_rating[['user_id', 'item_id', 'rating']], reader)
svd_algo = SVD()
nmf_algo = NMF()
algos= [('SVD',svd_algo),('NMF',nmf_algo),('Random',NormalPredictor())]
for algo in algos :
    print(algo[0])
    cross_validate(algo[1], data, measures=['RMSE'], cv=3, verbose=True)

SVD
Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.0669  1.0737  1.0668  1.0691  0.0032  
Fit time          3.33    3.37    3.34    3.35    0.02    
Test time         0.24    0.24    0.18    0.22    0.03    
NMF
Evaluating RMSE of algorithm NMF on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.2950  1.2876  1.2960  1.2929  0.0038  
Fit time          5.05    4.98    5.04    5.02    0.03    
Test time         0.23    0.22    0.22    0.22    0.01    
Random
Evaluating RMSE of algorithm NormalPredictor on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.4122  1.4156  1.4102  1.4127  0.0022  
Fit time          0.07    0.09    0.09    0.08    0.01    
Test time         0.26    0.24    0.25    0.25    0.01    


According to the cross-validation results
*   SVD rmse is 1.06 which is better than KNN basic algos that I have used for CF.
*   NMF dont peform well as SVD with 1.29 , less than both of knn baisc.
*   Both are better than random

Now, I will focus on tunning the parameters of SVD ,by changing lr and epochs

In [None]:
print("Searching for best parameters...")
param_grid = {'n_epochs': [20, 30], 'lr_all': [0.005, 0.010],
              'n_factors': [50, 100]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

In [27]:



trainset, testset = train_test_split(data, test_size=0.25,shuffle=False)
#Train the model on the entire dataset
user_knn_algo.fit(trainset)
item_knn_algo.fit(trainset)
random_algo.fit(trainset)

#Evaluate on the same dataset
user_knn_predictions = user_knn_algo.test(testset)
item_knn_predictions = item_knn_algo.test(testset)
random_knn_predictions = random_algo.test(testset)

# Calculate and print RMSE
accuracy.rmse(user_knn_predictions)
# Calculate and print RMSE
accuracy.rmse(item_knn_predictions)
# Calculate and print RMSE
accuracy.rmse(random_knn_predictions)

RMSE: 1.1172


1.1171726781509062

The rmse is rasing to 1.11 if we split the dataset without shuffling which shows that the dataset is time-dependent.
I will save the results that have been calculated for training on the whole dataset , for using in the app.

In [37]:
def get_top_n(user_id,trainset,algo, n=10):
    # Get a list of all items and all users in the dataset
    items = trainset.all_items()
    users = trainset.all_users()
    # Convert raw ids to inner ids
    raw_user_id = user_id  # replace with the user ID you are interested in
    inner_user_id = trainset.to_inner_uid(raw_user_id)
    # Find items that the user has not rated yet
    test_items = [trainset.to_raw_iid(item) for item in items if not trainset.ur[inner_user_id].__contains__(item)]
    # Predict ratings for all the items not rated by the user
    predictions = [algo.predict(raw_user_id, item) for item in test_items]
    # Get the top N recommendations
    top_n_items = sorted(predictions, key=lambda x: x.est, reverse=True)[:n]
    return top_n_items

Fit SVD on the full dataset

In [29]:
# First train an SVD algorithm on the movielens dataset.
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2833b32a470>

Save the trainset and the fitted algo 

In [36]:
# Save the trained model to a file
with open('../outputs/CF/algo_svd.sav', 'wb') as file:
    pickle.dump(algo, file)
with open('../outputs/CF/trainset.sav', 'wb') as file:
    pickle.dump(trainset, file)