# Introduction 
* For this assignment, you need to benchmark the following models using RMSE, MAE and fit-time / prediction-time:

* * User-based CF with cosine similarity.
* * User-based CF with pearson correlation similarity.
* * Item-based CF with cosine similarity.
* * Item-based CF with pearson correlation similarity.
* * SVD
* * NMF
* Note :
** You should visualize your benchmarking results with a bar chart and to interepret them
** You shouldn't implement the models from scratch, you need to use their surprise implementation.

* Notebook :

* Your notebook should be leasable, well organized and commented. It should contain 3 separate parts :

* [Data loading]
* [Utils]
* [Model benchmarking]

In [1]:
import sys
import os
import surprise
from surprise import Dataset
from pathlib import Path
from surprise import Reader
from surprise.model_selection import train_test_split, GridSearchCV
from surprise.prediction_algorithms import SVD
from surprise import accuracy
from surprise.prediction_algorithms import knns
from surprise.prediction_algorithms.matrix_factorization import NMF
from surprise.similarities import cosine, msd, pearson
import time
import pandas as pd 
from tabulate import tabulate
import plotly.graph_objects as go
import matplotlib.pyplot as plt

C:\Users\npram\anaconda3\envs\tenpy\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
C:\Users\npram\anaconda3\envs\tenpy\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
  stacklevel=1)


# Data loading

* This is data loading section: The function (load_data) will download the ml-100k file which contains dataset otherwise if we have data we can import by path file

In [2]:
def load_data(load_from_surprise: bool) -> pd.DataFrame:
    if load_from_surprise == True:
        try:
            
            dataframe = Dataset.load_builtin('ml-100k')
            
        except EOFError as e:
            print(e)   
    elif load_from_surprise == False:
        user_input = input("Enter the path of the dataset: ")
        assert os.path.exists(user_input), "I did not find the file at, "+str(user_input)
        filepath = user_input
        reader = Reader(line_format='user item rating timestamp', sep=",", skip_lines= 1)
        dataframe = Dataset.load_from_file(filepath, reader)

    return dataframe

In [3]:
dataframe = load_data(False)

# Utils

In [26]:
def util(algorithm: str,model_kwargs: dict) -> float:
    #Split data
    trainset, testset = train_test_split(dataframe, 0.2, random_state=42)
    # print(len(testset))

    #train model
    if algorithm == 'KNN':
        model = knns.KNNBaseline(sim_options = model_kwargs['sim_options'])
    # elif algorithm == 'grid_KNN':
    #     param_grid = {'K' : model_kwargs['K'], 'sim_options' : model_kwargs['sim_options']}
    #     grid_model = GridSearchCV(knns.KNNBaseline, param_grid, measures=['rmse', 'mae'], cv=3)
    elif algorithm == 'SVD':
        model = SVD(n_factors=model_kwargs['n_factors'], n_epochs=model_kwargs['n_epochs'], lr_all=model_kwargs['lr_all'], reg_all= model_kwargs['reg_all'])
    elif algorithm == 'NMF':
        model = NMF(n_factors=model_kwargs['n_factors'], n_epochs=model_kwargs['n_epochs'])
    else:
        print('Please upload the model with parameters....!')
    

    # Model fit and running time
    start_time = time.time()
    # if algorithm == 'Grid_SVD':
    #     grid_model.fit(dataframe)
    # else:
    model.fit(trainset)
    fit_time = ('model running Fit time is %s seconds' % (round(time.time() - start_time,4)))
    #Model Prediction and running time
    start_time = time.time()
    prediction = model.test(testset)
    pred_time = ('model running Prediction time is %s seconds' % (round(time.time() - start_time,4)))

    # Model Score and running time
    start_time = time.time()
    # if algorithm == 'Grid_SVD':
    #     RMSE_result = grid_model.best_score['rmse']
    #     MAE_result = grid_model.best_score['mae']
    # else:
    RMSE = accuracy.rmse(prediction)
    MAE = accuracy.mse(prediction)
    RMSE_result = ('RMSE %s ' % RMSE)
    MAE_result = ('MAE: %s ' % MAE)
    pred_time = ('model running Score time is %s seconds' % (round(time.time() - start_time,4)))

    return model,fit_time, pred_time, RMSE_result, MAE_result

    
        

# Bench Mark

In [27]:
def benchmark(model_parameters: dict) -> (pd.DataFrame, dict):    
    result_list = []
    for i in model_parameters.keys():        
        items = model_parameters[i]
        model,fit_time, pred_time, RMSE_result, MAE_result = util(algorithm=items['algorithm'],model_kwargs=items['model_kwargs'])
        column = [i, re.findall("\d+\.\d+",fit_time)[0], re.findall("\d+\.\d+",pred_time)[0],re.findall("\d+\.\d+",RMSE_result)[0],re.findall("\d+\.\d+",MAE_result)[0]]
        tabulate([column], tablefmt="pipe")
        result_list.append(column)
    header = ['Name','Model_fit_time','Model prediction time','RMSE','MAE']
    dataframe = pd.DataFrame(result_list)
    dataframe.columns = header        
    return dataframe

In [28]:
plot_benchmark = benchmark(model_parameters= {'KNN (Cosine User based)':
                                               {'algorithm': 'KNN', 'model_kwargs': {'sim_options': {'user_based': True, 'name': 'cosine'}}},
                                               'KNN (Cosine Item based)':
                                               {'algorithm': 'KNN', 'model_kwargs': {'sim_options': {'user_based': False, 'name': 'cosine'}}},
                                               'KNN (Pearson User based)':
                                               {'algorithm': 'KNN', 'model_kwargs': {'sim_options': {'user_based': True, 'name': 'pearson'}}},
                                               'KNN (Pearson Item based)':
                                               {'algorithm': 'KNN', 'model_kwargs': {'sim_options': {'user_based': False, 'name': 'pearson'}}},
                                               'SVD':
                                               {'algorithm': 'SVD', 'model_kwargs':{'n_factors':100, 'n_epochs':10, 'lr_all':0.005,'reg_all':0.4}},
                                               'NMF':
                                               {'algorithm': 'NMF', 'model_kwargs':{'n_factors':15, 'n_epochs': 50}}
                                               })

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9317
MSE: 0.8681
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9383
MSE: 0.8804
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9302
MSE: 0.8652
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9378
MSE: 0.8795
RMSE: 0.9634
MSE: 0.9280
RMSE: 0.9662
MSE: 0.9336


In [29]:
plot_benchmark

Unnamed: 0,Name,Model fit time,Model prediction time,RMSE,MAE
0,KNN (Cosine User based),2.8355,0.05,0.9317096171273572,0.8680828106476068
1,KNN (Cosine Item based),4.3613,0.047,0.9383018447759258,0.8804103519099057
2,KNN (Pearson User based),3.6925,0.043,0.930153095002855,0.8651847801433902
3,KNN (Pearson Item based),5.2674,0.053,0.9378091278978098,0.8794859603684506
4,SVD,3.2907,0.054,0.9633511045935124,0.9280453507215408
5,NMF,6.7583,0.047,0.9662303970239026,0.9336011801329684


In [62]:
param_grid = {'K': [40,60,70], 'sim_options': {'user_based': [True, False], 'name': ['cosine', 'pearson']}}
gs_svd = GridSearchCV(knns.KNNBaseline, param_grid, measures=['rmse', 'mae'], cv=3)
gs_svd.fit(dataframe)
# svd = gs_svd.best_estimator['rmse']
RMSE_grid = gs_svd.best_score['rmse']
MSE_grid = gs_svd.best_score['mae']
best_params = gs_svd.best_params

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Co

In [63]:
RMSE_grid

0.9314607527615134

In [65]:
MSE_grid

0.732154278932733

In [82]:
best_params

{'rmse': {'K': 40, 'sim_options': {'user_based': True, 'name': 'pearson'}},
 'mae': {'K': 40, 'sim_options': {'user_based': True, 'name': 'pearson'}}}

* The fit time is more than any other model it takes more than 5-10 minutes 

# Ploting the graphs

In [81]:
import plotly.express as px
fig = px.bar(plot_benchmark, x='Name', y='Model fit time', title="Model fit time compare")
fig.show()

In [56]:
import plotly.express as px
fig = px.bar(plot_benchmark, x='Name', y='Model prediction time', range_y=[0,5], title="Model prediction time compare")
fig.show()

In [84]:
fig = px.bar(plot_benchmark, x='Name', y='RMSE', title="Comparing the RMSE score")
fig.show()

In [91]:
fig = px.bar(plot_benchmark, x='Name', y='MAE', title="Comparing the MAE score")
fig.show()