# Recommendation System for a new movie streaming service

## Busines Case

#### a) The new service would like an improved recommendation engine for it's current customers to stop the subscription churn
#### b) They would like some informed guidance on the type of movie they should be investing in
#### c) They would like an interface for new users who can see the kind of movies they might be interested in on the service

In [10]:
import pandas as pd
from surprise import Reader, Dataset

In [11]:
!ls

Collaborative Rec Sys Mod4 Project.ipynb
Content Based Movie Recomender.ipynb
Sigmoid recommender alt data set.ipynb
Surpris Lab mk11 UNBIASED.ipynb
[34mml-latest-small[m[m
[34mtmdb-movie-metadata[m[m


In [12]:
df = pd.read_csv('./ml-latest-small/ratings.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [13]:
df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [14]:
df.shape

(100836, 4)

In [15]:
new_df = df.drop(columns=['timestamp'])
new_df.head(3)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0


In [16]:
reader = Reader()
movies = Dataset.load_from_df(new_df,reader)
movies

<surprise.dataset.DatasetAutoFolds at 0x11ab271d0>

In [17]:
#Extract raw ratings from the dataset
raw_ratings = movies.raw_ratings

In [18]:
import random

In [19]:
random.shuffle(raw_ratings)

In [20]:
raw_ratings[:5]

[(449, 50, 4.5, None),
 (274, 8360, 4.0, None),
 (170, 350, 3.0, None),
 (414, 308, 4.0, None),
 (599, 26782, 2.0, None)]

In [21]:
#We want to split our ratings data into with 80/20 ratio
threshold = int(.8*len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]

In [22]:
movies.raw_ratings = A_raw_ratings #The data is now the A set

In [23]:
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
import numpy as np

## Base Model using SVD and GridSearchCV

In [34]:
#Now we'll investigate some optimization with GridSearch CV
param_grid = {'n_factors':[5,20,100], 'n_epochs':[5,10], 'lr_all':[0.002, 0.005], 'reg_all':[0.02, 0.05, 0.5]}
gs_model = GridSearchCV(SVD, param_grid=param_grid, n_jobs=-1, joblib_verbose=3)
gs_model.fit(movies)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  3.3min finished


In [30]:
print(gs_model.best_score)
print(gs_model.best_params)

{'rmse': 0.8817626184585396, 'mae': 0.6800445780532369}
{'rmse': {'n_factors': 5, 'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.02}, 'mae': {'n_factors': 5, 'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.02}}


## Cross Validations

### We'll start withh the K-Nearest Neighbors Basic algorithm

In [37]:
#Cross validate with KNN Basic
knn_basic = KNNBasic(sim_options={'name':'pearson', 'user_based':True})
cv_knn_basic = cross_validate(knn_basic, movies, n_jobs=-1)

In [38]:
for i in cv_knn_basic.items():
    print(i)
print('------------------')
print(np.mean(cv_knn_basic['test_rmse']))

('test_rmse', array([0.98011587, 0.99693261, 0.98258366, 0.98413341, 0.98990697]))
('test_mae', array([0.7583604 , 0.7684409 , 0.75739477, 0.76228023, 0.76370132]))
('fit_time', (0.27557873725891113, 0.2766141891479492, 0.27100324630737305, 0.2721688747406006, 0.27719664573669434))
('test_time', (0.7502367496490479, 0.7317829132080078, 0.7414989471435547, 0.7154338359832764, 0.7233130931854248))
------------------
0.98673450360702


In [39]:
knn_basic_msd = KNNBasic(sim_options = {'name':'msd', 'user-based':True})
cv_knn_basic_msd = cross_validate(knn_basic_msd, movies, n_jobs=-1)                                   

In [40]:
for i in cv_knn_basic_msd.items():
    print(i)
print('------------------')
print(np.mean(cv_knn_basic_msd['test_rmse']))

('test_rmse', array([0.96104763, 0.96531222, 0.9572099 , 0.95525152, 0.95782755]))
('test_mae', array([0.73794661, 0.73633533, 0.73485218, 0.73270163, 0.73478438]))
('fit_time', (0.04732489585876465, 0.06029200553894043, 0.04921293258666992, 0.04758191108703613, 0.04714512825012207))
('test_time', (0.7474460601806641, 0.7862210273742676, 0.7575018405914307, 0.7589201927185059, 0.7693719863891602))
------------------
0.9593297632460018


### Next we'll try the KNN Baseline algorithm

In [41]:
#Cross validate with KNN Baseline (pearson)
knn_baseline = KNNBaseline(sim_options={'name': 'pearson', 'user_based':True})
cv_knn_baseline = cross_validate(knn_baseline, movies, n_jobs=-1)

In [42]:
for i in cv_knn_baseline.items():
    print(i)
print('-----------------')
print(np.mean(cv_knn_baseline['test_rmse']))

('test_rmse', array([0.89281547, 0.89508754, 0.88409029, 0.89291285, 0.8839083 ]))
('test_mae', array([0.68227875, 0.6820536 , 0.67690286, 0.68230846, 0.67520419]))
('fit_time', (0.3129270076751709, 0.3136558532714844, 0.30928683280944824, 0.32314085960388184, 0.306243896484375))
('test_time', (0.9831299781799316, 1.0026578903198242, 1.025925874710083, 0.9911098480224609, 0.9960618019104004))
-----------------
0.8897628923766211


In [43]:
#Cross validate with KNN Baseline (mean square difference)
knn_baseline_msd = KNNBaseline(sim_options={'name':'msd', 'user_based':True})
cv_knn_baseline_msd = cross_validate(knn_baseline_msd, movies, n_jobs=-1)

In [49]:
for i in cv_knn_baseline_msd.items():
    print(i)
print('-----------------')
print(np.mean(cv_knn_baseline_msd['test_rmse']))

('test_rmse', array([0.88223368, 0.89084087, 0.88731034, 0.88899057, 0.88278093]))
('test_mae', array([0.67735611, 0.67624797, 0.67867266, 0.67612274, 0.67520431]))
('fit_time', (0.08727693557739258, 0.09384489059448242, 0.09236598014831543, 0.09471821784973145, 0.08886194229125977))
('test_time', (1.0900030136108398, 1.0773091316223145, 1.0877001285552979, 1.0861477851867676, 1.0049102306365967))
-----------------
0.886431277440167


In [51]:
#Cross validate with KNN Baseline (pearson_baseline)
knn_pearson_baseline = KNNBaseline(sim_options={'name':'pearson_baseline', 'user_based':True})
cv_knn_pearson_baseline = cross_validate(knn_pearson_baseline, movies, n_jobs=-1)

In [52]:
for i in cv_knn_pearson_baseline.items():
    print(i)
print('-----------------')
print(np.mean(cv_knn_pearson_baseline['test_rmse']))

('test_rmse', array([0.89342926, 0.88956674, 0.89123134, 0.90144818, 0.88875208]))
('test_mae', array([0.677976  , 0.67781395, 0.67923366, 0.68740546, 0.67666962]))
('fit_time', (0.23274493217468262, 0.23860907554626465, 0.23508787155151367, 0.24143314361572266, 0.22362399101257324))
('test_time', (0.9968037605285645, 0.9429259300231934, 0.9261612892150879, 0.9253566265106201, 0.9288880825042725))
-----------------
0.8928855201446251


### And finally we'll try KNN With Means algorithm

In [45]:
#Cross validate with KNN With Meanas (Pearson)
knn_means = KNNWithMeans(sim_options={'name':'pearson', 'user_based':True})
cv_knn_means = cross_validate(knn_means, movies, n_jobs=-1)

In [46]:
for i in cv_knn_means.items():
    print(i)
print('-----------------')
print(np.mean(cv_knn_means['test_rmse']))

('test_rmse', array([0.92210312, 0.89967619, 0.90652067, 0.91647664, 0.91363248]))
('test_mae', array([0.7005456 , 0.68589484, 0.69125293, 0.69749073, 0.69313747]))
('fit_time', (0.28046298027038574, 0.30387187004089355, 0.28382229804992676, 0.2925753593444824, 0.2912709712982178))
('test_time', (0.848956823348999, 0.8308649063110352, 0.8434996604919434, 0.8377389907836914, 0.8089640140533447))
-----------------
0.9116818198811696


In [47]:
knn_means_msd = KNNWithMeans(sim_options={'name':'msd', 'user_based':True})
cv_knn_means_msd = cross_validate(knn_means_msd, movies, n_jobs=-1)

In [48]:
for i in cv_knn_means_msd.items():
    print(i)
print('-----------------')
print(np.mean(cv_knn_means_msd['test_rmse']))

('test_rmse', array([0.90034021, 0.91509941, 0.9159398 , 0.90839119, 0.90073476]))
('test_mae', array([0.68564663, 0.69902919, 0.69775548, 0.69065748, 0.69109206]))
('fit_time', (0.05832219123840332, 0.06077003479003906, 0.07342219352722168, 0.05553388595581055, 0.061615943908691406))
('test_time', (0.8798611164093018, 0.8598320484161377, 0.8623948097229004, 0.8658139705657959, 0.8569252490997314))
-----------------
0.9081010734566288


## Tune Best Algorithm with GridSearchCV

Our best model above was KNN Baseline with Mean Squared Distance

In [54]:
param_grid = {'n_factors':[5,20,100], 'n_epochs':[5,10], 'lr_all':[0.002, 0.005], 'reg_all':[0.02, 0.05, 0.5]}
knn_gs = GridSearchCV(KNNBaseline, param_grid=param_grid, n_jobs=-1, joblib_verbose=3)
knn_gs.fit(movies)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  3.2min finished


In [55]:
print(knn_gs.best_score)
print(knn_gs.best_params)

{'rmse': 0.8838203956764012, 'mae': 0.6749265261385646}
{'rmse': {'n_factors': 5, 'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.02}, 'mae': {'n_factors': 5, 'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.02}}


The inintial SVD model stayed at the top of the performance list so we will use that to fit test set. 

In [35]:
#create our high performance algorithm
our_algo = gs_model.best_estimator['rmse']

In [36]:
#Retrain on the whole set A
trainset = movies.build_full_trainset()
our_algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11dcc5b70>

In [39]:
from surprise import accuracy

In [44]:
#This is the BIASED Accuracy on A
predictions = our_algo.test(trainset.build_testset())
print('Biased accuract on A', end='  ')
accuracy.rmse(predictions)

Biased accuract on A  RMSE: 0.8416


0.8415655196781596

In [45]:
#This is the UNBIASED Accuract on B
testset = movies.construct_testset(B_raw_ratings) #testset is now the set B
predictions = our_algo.test(testset)
print('Unbiased accuracy on B,', end=' ')
accuracy.rmse(predictions)

Unbiased accuracy on B, RMSE: 0.8834


0.8834117006142889