<a href="https://colab.research.google.com/github/VienneseWaltz/Movie_Recommendation_Analysis/blob/main/movie_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load the packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt



In [None]:
!pip3 install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m768.0/772.0 kB[0m [31m23.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3096310 sha256=91190401be70bf999c691cbb5663e7259fb4cb2f34daecf9b5a4da1ee260f4dc
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise


In [None]:
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

In [None]:
from plotly.offline import init_notebook_mode, plot
import plotly.graph_objs as go

In [None]:
# Load the ratings table from 'ratings.csv' in the current working directory.
ratings_df = pd.read_csv(filepath_or_buffer='ratings.csv')

In [None]:
# Peek at the first five rows (positional slice, same rows as .head()).
ratings_df.iloc[:5]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Peek at the last five rows (positional slice, same rows as .tail()).
ratings_df.iloc[-5:]

Unnamed: 0,userId,movieId,rating,timestamp
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352
100835,610,170875,3.0,1493846415


In [None]:
# (number of rows, number of columns) of the ratings DataFrame
(len(ratings_df.index), len(ratings_df.columns))

(100836, 4)

In [None]:
# The DataFrame has 100836 rows and 4 columns; list the column names.
ratings_df_cols = list(ratings_df.columns)
ratings_df_cols

['userId', 'movieId', 'rating', 'timestamp']

In [None]:
# Drop the 'timestamp' column; later cells only use userId, movieId and rating.
# Reassigning (instead of inplace=True) together with errors='ignore' makes this
# cell safe to re-run: a second in-place drop would raise KeyError because the
# column is already gone.
ratings_df = ratings_df.drop(columns=['timestamp'], errors='ignore')
ratings_df_cols = ratings_df.columns.tolist()
ratings_df_cols

['userId', 'movieId', 'rating']

In [None]:
# Count the missing entries in each column.
ratings_df.isnull().sum()

userId     0
movieId    0
rating     0
dtype: int64

In [None]:
# No missing values were found. Show the dtype and non-null count of the
# three columns 'userId', 'movieId' and 'rating'.
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100836 non-null  int64  
 1   movieId  100836 non-null  int64  
 2   rating   100836 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.3 MB


In [None]:
# Report the shape, then show every 20000th row as a small sample of the data.
every_20000th_row = ratings_df.iloc[::20000]
print(
    'Dataset shape: {}'.format(ratings_df.shape),
    '******** Dataset examples ********',
    every_20000th_row,
    sep='\n',
)

Dataset shape: (100836, 3)
******** Dataset examples ********
        userId  movieId  rating
0            1        1     4.0
20000      132     1079     3.5
40000      274     5621     2.0
60000      387     6748     3.0
80000      501       11     3.0
100000     610     6978     4.0


**Exploratory Data Analysis (EDA)**

In [None]:
# Ratings distribution: count how often each rating value occurs and order the
# values from highest (5.0) to lowest (0.5).
# Plotly colour names: https://community.plotly.com/t/plotly-colours-list/11730/2
data = ratings_df['rating'].value_counts().sort_index(ascending=False)

# Label every bar with its share of all ratings, in percent.
total_ratings = ratings_df.shape[0]
share_labels = ['{:.1f} %'.format(pct) for pct in (data.values / total_ratings * 100)]

trace = go.Bar(
    x=data.index,
    y=data.values,
    text=share_labels,
    textposition='auto',
    textfont={'color': 'aliceblue', 'size': 14, 'family': 'Times New Roman'},
)

# Fixed-size 900x900 canvas with a title and labelled axes.
layout = {
    'autosize': False,
    'width': 900,
    'height': 900,
    'title': 'Distribution of {} ratings'.format(total_ratings),
    'xaxis': {'title': 'Rating'},
    'yaxis': {'title': 'Count'},
}

# Build and render the figure.
fig = go.Figure(data=[trace], layout=layout)
fig.show()



**Ratings Distribution by MovieId**

In [None]:
# Number of ratings each movie received, with the count capped at 50 so the
# long tail of heavily rated movies collapses into the top value.
data = (
    ratings_df
    .groupby('movieId')['rating']
    .count()
    .clip(upper=50)
)

In [None]:
# Show the per-movie rating counts (capped at 50) held in `data`.
data

movieId
1         50
2         50
3         50
4          7
5         49
          ..
193581     1
193583     1
193585     1
193587     1
193609     1
Name: rating, Length: 9724, dtype: int64

In [None]:
# Create histogram trace
trace = go.Histogram( x = data.values,
                      name = 'Ratings',
                      xbins = dict(start = 0,
                                   end = 50,
                                   size = 2))

# Create layout
layout = go.Layout(autosize = False,
                   width = 1000,
                   height = 1000,
                   title = 'Distribution of Number of Ratings Per Item (Clipped at 50)',
                   xaxis = dict(title = 'Number of Ratings Per movieId'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [None]:
# Show the ratings DataFrame (columns: userId, movieId, rating).
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [None]:
# Count the ratings per movieId, turn the result back into a two-column frame
# (fresh 0-based index), order by count from most to least rated and keep the
# first 15 rows.
(
    ratings_df
    .groupby('movieId')['rating']
    .count()
    .reset_index()
    .sort_values('rating', ascending=False)
    .iloc[:15]
)

Unnamed: 0,movieId,rating
314,356,329
277,318,317
257,296,307
510,593,279
1938,2571,278
224,260,251
418,480,238
97,110,237
507,589,224
461,527,220


**Ratings Distribution by Users**

In [None]:
# Number of ratings each user submitted, with the count capped at 50.
data = (
    ratings_df
    .groupby('userId')['rating']
    .count()
    .clip(upper=50)
)

In [None]:
# Show the per-user rating counts (capped at 50) held in `data`.
data

userId
1      50
2      29
3      39
4      50
5      44
       ..
606    50
607    50
608    50
609    37
610    50
Name: rating, Length: 610, dtype: int64

In [None]:
# Create histogram trace from the per-user rating counts held in `data`.
# The trace has to be assigned here: without the assignment the figure below
# silently reuses the `trace` left over from the per-movie histogram cell and
# plots the wrong distribution under the per-user title.
trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))

# Create layout
layout = go.Layout(autosize = False,
                   width = 1000,
                   height = 1000,
                   title = 'Distribution of Number of Ratings Per User (Clipped at 50)',
                   xaxis = dict(title = 'Ratings Per User'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
fig.show()


In [None]:
# The ten users who submitted the most ratings, most active first.
(
    ratings_df
    .groupby('userId')['rating']
    .count()
    .reset_index()
    .sort_values('rating', ascending=False)
    .iloc[:10]
)

Unnamed: 0,userId,rating
413,414,2698
598,599,2478
473,474,2108
447,448,1864
273,274,1346
609,610,1302
67,68,1260
379,380,1218
605,606,1115
287,288,1055


**Dimensionality Reduction**

In [None]:
# Shrink the dataset by discarding rarely rated movies (and, in the next cell,
# users who rarely rate).
min_ratings = 5

# Ratings received per movieId.
movie_rating_counts = ratings_df['movieId'].value_counts()

# Keep the ids of movies rated strictly more than `min_ratings` times, as a list.
filter_movies = movie_rating_counts[movie_rating_counts > min_ratings].index.tolist()

In [None]:
# Keep the ids of users who submitted strictly more than `min_user_ratings` ratings.
min_user_ratings = 5
user_rating_counts = ratings_df['userId'].value_counts()
filter_users = user_rating_counts[user_rating_counts > min_user_ratings].index.tolist()

In [None]:
# Keep only the rows whose movie AND user both passed the minimum-count filters,
# then compare the shapes before and after filtering.
movie_is_kept = ratings_df['movieId'].isin(filter_movies)
user_is_kept = ratings_df['userId'].isin(filter_users)
ratings_df_new = ratings_df[movie_is_kept & user_is_kept]

for message, frame in (
    ('The original DataFrame shape is: \t{}', ratings_df),
    ('The new ratings Dataframe shape is: \t{}', ratings_df_new),
):
    print(message.format(frame.shape))


The original DataFrame shape is: 	(100836, 3)
The new ratings Dataframe shape is: 	(88364, 3)


**Surprise**

To load a dataset from a Pandas DataFrame, we need a Reader object. We will use the load_from_df() method and the rating_scale parameter must be specified. The columns must correspond to 'userId', 'movieId' and 'rating' (in that order). Refer to
https://surprise.readthedocs.io/en/stable/getting_started.html#load-from-df-example

In [None]:
# Take the rating scale from the data instead of hard-coding (1, 5): the ratings
# in this dataset go down to 0.5, and Surprise uses the scale's bounds to clip
# predictions, so a lower bound of 1 would make low ratings unpredictable.
rating_values = ratings_df_new['rating']
reader = Reader(rating_scale = (rating_values.min(), rating_values.max()))

# load_from_df expects the columns in the order: user id, item id, rating.
data = Dataset.load_from_df(ratings_df_new[['userId', 'movieId', 'rating']], reader)

We can now use this ratings_df_new dataset and benchmark with the following algorithms. We will use rmse as our accuracy metric for the predictions.

**Basic Algorithms**

With the Surprise library, we will benchmark the following algorithms.

**NormalPredictor**

Algorithm predicting a random rating based on the distribution of the training set, which is assumed to be normal.

**BaselineOnly**

Algorithm predicting the baseline estimate for given user and item.

**kNN Algorithms**

**KNNBasic**

A basic collaborative filtering algorithm.

**KNNWithMeans**

A basic collaborative filtering algorithm, taking into account
the mean ratings of each user.

**KNNWithZScore**

A basic collaborative filtering algorithm, taking into account the z-score normalization of each user.

**KNNBaseline**

A basic collaborative filtering algorithm taking into account a baseline rating.

**Matrix Factorization-based Algorithms**

**SVD**

The famous SVD algorithm, as popularized by Simon Funk during the Netflix Prize. This algorithm is equivalent to the Probabilistic Matrix Factorization: https://proceedings.neurips.cc/paper_files/paper/2007/file/d7322ed717dedf1eb4e6e52a37ea7bcd-Paper.pdf

**SVDpp**

The SVDpp algorithm, an extension of SVD taking into account implicit ratings.

**NMF**

A collaborative filtering algorithm based on Non-negative Matrix Factorization.

**Slope One**

A simple yet accurate collaborative filtering algorithm. SlopeOne Algorithm: https://arxiv.org/abs/cs/0702144



**Co-clustering**

A collaborative filtering algorithm based on co-clustering.

In [None]:
# Benchmark every algorithm listed above with 3-fold cross-validation on RMSE.
# Referred to https://gist.github.com/susanli2016/e0cdcf1bca69a2b144fd8c04f30b522f for inspiration
benchmark = []

algorithms = [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]

print ("Attempting: ", str(algorithms), '\n\n\n')

for algorithm in algorithms:
  print("Starting: ", str(algorithm))
  # Perform cross-validation
  results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

  # Average each metric (test_rmse, fit_time, test_time) over the folds.
  temp = pd.DataFrame.from_dict(results).mean(axis=0)

  # Tag the row with the algorithm's class name (e.g. 'SVD').
  # pd.concat replaces Series.append, which is deprecated and removed in pandas 2.0.
  algorithm_name = pd.Series([type(algorithm).__name__], index=['Algorithm'])
  temp = pd.concat([temp, algorithm_name])
  benchmark.append(temp)
  print("Done:", str(algorithm), "\n\n")

print('\n\tDONE Iterating through all algorithms\n')




Attempting:  [<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7ff7faa7f8e0>, <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x7ff7faa7d600>, <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x7ff7faa7d4e0>, <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x7ff7faa7e290>, <surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x7ff7faa7e230>, <surprise.prediction_algorithms.knns.KNNBaseline object at 0x7ff7faa7e2c0>, <surprise.prediction_algorithms.knns.KNNBasic object at 0x7ff7faa7e830>, <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x7ff7faa7e7a0>, <surprise.prediction_algorithms.knns.KNNWithZScore object at 0x7ff7faa7fc10>, <surprise.prediction_algorithms.baseline_only.BaselineOnly object at 0x7ff7faa7fc70>, <surprise.prediction_algorithms.co_clustering.CoClustering object at 0x7ff7faa7fcd0>] 



Starting:  <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7ff7faa


The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Done: <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7ff7faa7f8e0> 


Starting:  <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x7ff7faa7d600>



The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Done: <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x7ff7faa7d600> 


Starting:  <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x7ff7faa7d4e0>



The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Done: <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x7ff7faa7d4e0> 


Starting:  <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x7ff7faa7e290>



The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Done: <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x7ff7faa7e290> 


Starting:  <surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x7ff7faa7e230>



The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Done: <surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x7ff7faa7e230> 


Starting:  <surprise.prediction_algorithms.knns.KNNBaseline object at 0x7ff7faa7e2c0>
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.



The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Done: <surprise.prediction_algorithms.knns.KNNBaseline object at 0x7ff7faa7e2c0> 


Starting:  <surprise.prediction_algorithms.knns.KNNBasic object at 0x7ff7faa7e830>
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.



The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Done: <surprise.prediction_algorithms.knns.KNNBasic object at 0x7ff7faa7e830> 


Starting:  <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x7ff7faa7e7a0>
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.



The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Done: <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x7ff7faa7e7a0> 


Starting:  <surprise.prediction_algorithms.knns.KNNWithZScore object at 0x7ff7faa7fc10>
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.



The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Done: <surprise.prediction_algorithms.knns.KNNWithZScore object at 0x7ff7faa7fc10> 


Starting:  <surprise.prediction_algorithms.baseline_only.BaselineOnly object at 0x7ff7faa7fc70>
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...



The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Done: <surprise.prediction_algorithms.baseline_only.BaselineOnly object at 0x7ff7faa7fc70> 


Starting:  <surprise.prediction_algorithms.co_clustering.CoClustering object at 0x7ff7faa7fcd0>
Done: <surprise.prediction_algorithms.co_clustering.CoClustering object at 0x7ff7faa7fcd0> 



	DONE Iterating through all algorithms




The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [None]:
# One row per algorithm, indexed by its name, best (lowest) test RMSE first.
surprise_results = (
    pd.DataFrame(benchmark)
    .set_index('Algorithm')
    .sort_values(by='test_rmse')
)
surprise_results

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.848335,38.790003,13.210362
KNNBaseline,0.856768,0.422788,3.101048
BaselineOnly,0.861641,0.326457,0.22099
SVD,0.864085,1.328274,0.361926
KNNWithZScore,0.868274,0.225492,2.810952
KNNWithMeans,0.868336,0.138553,2.581426
SlopeOne,0.872205,1.411451,9.477628
NMF,0.898137,2.200644,0.396599
CoClustering,0.921517,2.286504,0.269639
KNNBasic,0.922087,0.098928,2.179287


SVDpp achieves the lowest RMSE, but its fitting and testing times are also by far the longest. We will therefore apply GridSearchCV to SVD instead.

In [None]:
# Deliberately small hyper-parameter grid for SVD, used for testing.
param_grid = dict(
    n_epochs=[10, 20],
    lr_all=[0.002, 0.005],
    reg_all=[0.02],
)

# 5-fold grid search over the grid above, scored on both RMSE and MAE.
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5, refit=True)
gs.fit(data)

# The parameter combination with the best RMSE is reused for the final model.
best_scores = gs.best_score
training_parameters = gs.best_params["rmse"]

print("BEST RMSE: \t", best_scores["rmse"])
print("BEST MAE: \t", best_scores["mae"])
print("BEST params: \t", training_parameters)

BEST RMSE: 	 0.8552387535253126
BEST MAE: 	 0.6559434975966589
BEST params: 	 {'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}


In [None]:
from datetime import datetime
print(training_parameters)

# Take the rating scale from the data instead of hard-coding (1, 5): the ratings
# go down to 0.5, and the scale's bounds are used to clip the model's predictions.
rating_values = ratings_df_new['rating']
reader = Reader(rating_scale=(rating_values.min(), rating_values.max()))

print("\n\n\t\t STARTING\n\n")
start = datetime.now()

print("> Loading data...")
data = Dataset.load_from_df(ratings_df_new[['userId', 'movieId', 'rating']], reader)
print("> OK DONE Loading data")

# Train on every available rating (no hold-out split) for the final model.
print("> Creating training set...")
trainset = data.build_full_trainset()
print("> OK DONE Creating training set")

# Timestamp taken just before training starts
startTraining = datetime.now()
print("> Training...")

# SVD configured with the best parameters found by the grid search.
algo = SVD(n_epochs = training_parameters['n_epochs'], lr_all = training_parameters['lr_all'], reg_all = training_parameters['reg_all'])

algo.fit(trainset)

endTraining = datetime.now()
# total_seconds() covers the whole duration; timedelta.seconds alone drops the days part.
print("> OK \t\t It took: ", int((endTraining - startTraining).total_seconds()), "seconds to train")

end = datetime.now()
print("> DONE \t\t This whole operation from loading data to creating training set to training took: ", int((end - start).total_seconds()), "seconds")

{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}


		 STARTING


> Loading data...
> OK DONE Loading data
> Creating training set...
> OK DONE Creating training set
> Training...
> OK 		 It took:  3 seconds to train
> DONE 		 This whole operation from loading data to creating training set to training took:  3 seconds


In [None]:
# Persist the trained model to disk so it can be reloaded later.
import os
from surprise import dump

model_filename = "./model.pickle"
# Expand a leading '~' (if any) to the user's home directory.
file_name = os.path.expanduser(model_filename)

print(">> Starting dump...")
# Only the fitted algorithm is written; no predictions are passed along.
dump.dump(file_name, algo=algo)
print(">> Dump done.")
print(model_filename)


>> Starting dump...
>> Dump done.
./model.pickle


In [None]:
# Load saved model
def load_model(model_filename):
    """Load a model previously written with ``surprise.dump.dump``.

    Args:
        model_filename: Path of the dump file; a leading ``~`` is expanded.

    Returns:
        The second element of the tuple returned by ``dump.load`` (the
        algorithm); the first element is discarded.
    """
    print(">> Loading dump")
    dump_path = os.path.expanduser(model_filename)
    loaded = dump.load(dump_path)
    loaded_model = loaded[1]
    print(">> Loaded dump")
    return loaded_model

In [None]:
# Prediction
from pprint import pprint as pp
model_filename = "./model.pickle"


def _as_raw_id(value):
    """Return ``value`` as an int when it can be converted, otherwise unchanged."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return value


def itemRating(user, item):
    """Predict the rating ``user`` would give ``item`` using the saved model.

    The model was trained from a DataFrame whose userId/movieId columns are
    integers, so the raw ids known to the model are integers. Ids passed as
    strings (e.g. "610") are therefore coerced to int before predicting;
    otherwise they are treated as unknown ids and the estimate is not
    personalised to the user or the item.

    Args:
        user: User id, as an int or a numeric string.
        item: Movie id, as an int or a numeric string.

    Returns:
        dict with the ids as passed in ('user', 'item'), the estimated
        'rating', the prediction 'details', the raw ids actually used
        ('uid', 'iid') and the 'true' rating (r_ui) reported by the prediction.
    """
    uid = _as_raw_id(user)
    iid = _as_raw_id(item)
    # NOTE: the model is reloaded from disk on every call.
    loaded_model = load_model(model_filename)
    prediction = loaded_model.predict(uid, iid, verbose=True)
    ret = {
        'user': user,
        'item': item,
        'rating': prediction.est,
        'details': prediction.details,
        'uid': prediction.uid,
        'iid': prediction.iid,
        'true': prediction.r_ui
        }
    pp (ret)
    print ('\n\n')
    return ret
print(itemRating(user = "610", item = "10"))


>> Loading dump
>> Loaded dump
user: 610        item: 10         r_ui = None   est = 3.54   {'was_impossible': False}
{'details': {'was_impossible': False},
 'iid': '10',
 'item': '10',
 'rating': 3.543813091304151,
 'true': None,
 'uid': '610',
 'user': '610'}



{'user': '610', 'item': '10', 'rating': 3.543813091304151, 'details': {'was_impossible': False}, 'uid': '610', 'iid': '10', 'true': None}
