## Load built in dataset

In [None]:
# from surprise import Dataset

# Load the movielens-100k dataset (downloading it first if needed).
# data = Dataset.load_builtin('ml-100k')

## Create Custom data

In [10]:
import pandas as pd

# Surprise expects every row in the order: user ; item ; rating ; [timestamp]
columns = ['userID', 'itemID', 'rating']

# Toy ratings, deliberately keyed in a different order than Surprise wants.
ratings_dict = {
    'itemID': [1, 1, 1, 2, 2],
    'userID': [9, 32, 2, 45, 9],
    'rating': [3, 2, 4, 3, 1],
}

# Build the frame and reorder its columns to (user, item, rating) in one step.
df = pd.DataFrame(ratings_dict)[columns]

print(df, "\n")

   userID  itemID  rating
0       9       1       3
1      32       1       2
2       2       1       4
3      45       2       3
4       9       2       1 



### Read data into Surprise package format

In [17]:
# Dataset has to be imported here: the only other import of it in this
# notebook is commented out, so on a fresh kernel Dataset.load_from_df
# below would raise NameError.
from surprise import Dataset, Reader

# Derive the rating scale from the observed ratings instead of hard-coding it.
lower_rating = df['rating'].min()
higher_rating = df['rating'].max()

print("In the dataset, Lowest rating is {0} and Highest rating is {1}".format(lower_rating, higher_rating))

print("\nSetting rating scale accordingly")

# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(lower_rating, higher_rating))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

In the dataset, Lowest rating is 1 and Highest rating is 4

Setting rating scale accordingly


## Build Model

List of all available models [here](https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html), including [ALS and SGD baselines](https://surprise.readthedocs.io/en/stable/prediction_algorithms.html)

In [24]:
import numpy as np
from surprise import SVD, SVDpp
from surprise.model_selection import cross_validate

# Train on every available rating (no hold-out split at this stage).
full_trainset = data.build_full_trainset()

# SVDpp() can be swapped in here as an alternative:
# algo = SVDpp()
algo = SVD()

algo.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f4efc3b8fd0>

## Test Model

### For a specific user_id and item_id

In [79]:
# Ask the fitted model for the estimate of one (user, item) pair.
pred = algo.predict(uid=45, iid=1)
score = pred.est  # the numeric estimate carried by the prediction object

print(pred)
print("Estimated rating:", score)

user: 45         item: 1          r_ui = None   est = 2.86   {'was_impossible': False}
Estimated rating: 2.864370356352802


### For a specific user_id and top 'n' unrated item_ids

In [78]:
# Get a list of all item IDs
iids = df['itemID'].unique()

# Items that this particular user has already rated
uid = 46
iids_rated = df.loc[df['userID'] == uid, 'itemID']

# Items that this user has not rated yet (the candidates to recommend)
iids_to_pred = np.setdiff1d(iids, iids_rated)

print("User_id",uid,"hasn't rated Item_ids",iids_to_pred)

# algo.test() is fed (user, item, rating) triples; the rating is only a
# placeholder here, so any value will do.
random_rating = 4
test_set = [[uid, iid, random_rating] for iid in iids_to_pred]

# Get model predictions as a list of prediction objects
predictions = algo.test(test_set)

# Keep only the estimated rating of each prediction
predicted_ratings = np.array([pred.est for pred in predictions], dtype=float)

# Sort by estimated rating, highest first (ties: larger item id first).
# np.lexsort uses the LAST key as the primary one, and unlike unpacking
# zip(*sorted(...)) it also works when there is nothing left to predict
# (a user who has already rated every item), which used to raise ValueError.
order = np.lexsort((iids_to_pred, predicted_ratings))[::-1]
predicted_ratings = predicted_ratings[order]
iids_to_pred = iids_to_pred[order]

# Display predictions
num_top = 3

# if there is a dict of indices to item names that can be used here
top_rows = [
    [uid, item_id, rating]
    for item_id, rating in zip(iids_to_pred[:num_top], predicted_ratings[:num_top])
]
df_pred = pd.DataFrame(top_rows, columns=columns)

df_pred

User_id 46 hasn't rated Item_ids [1 2]


Unnamed: 0,userID,itemID,rating
0,46,1,2.698707
1,46,2,2.514947


### Using Grid Search to tune model parameters

* Get a full list of tunable SVD parameters [here](https://surprise.readthedocs.io/en/stable/matrix_factorization.html#matrix-factorization-based-algorithms)
* Get possible model evaluation metrics [here](https://surprise.readthedocs.io/en/stable/accuracy.html)

In [88]:
# Tune SVD with a cross-validated grid search and report the best setting
from surprise import SVD
from surprise.model_selection import GridSearchCV

# Candidate values for the learning rate and regularisation term
param_grid = {
    'lr_all': [0.001, 0.01],
    'reg_all': [0.1, 0.5],
}

# possible measures - 'rmse','mae','fcp'
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse'],
    cv=3,
)
gs.fit(data)

print(gs.best_params['rmse'])

{'reg_all': 0.1, 'lr_all': 0.001}


In [89]:
# Take the estimator the grid search stored under the 'rmse' key.
# NOTE(review): presumably this instance is not yet fitted on the full data - confirm before calling predict() on it.
final_algo = gs.best_estimator['rmse']

In [92]:
# Evaluate the tuned estimator with 5-fold cross-validation and print
# the per-fold RMSE / MAE table.
result = cross_validate(
    final_algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.5328  1.9609  0.5228  0.7755  1.7363  1.1057  0.6174  
MAE (testset)     0.5328  1.9609  0.5228  0.7755  1.7363  1.1057  0.6174  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


In [None]:
# Ask Karthick about the general procedure for selecting and building a model:
# How is cross-validation applied?
# More specifically, how are the training and test data separated?
# How is the final model chosen, and how are the scores reported?

In [None]:
# Use grid search CV on the entire dataset

# Get the best hyperparameters

# Run cross-validation using the best model