# Recommendation System

Building a scalable recommendation system using scikit-surprise (the Surprise library)

## Installing required libraries

In [None]:
!pip3 install scikit-surprise

## Importing required libraries

In [None]:
import pandas as pd
import numpy as np

from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

In [None]:
from datetime import datetime
globalstart = datetime.now()

## 1. Loading data into dataframe

In [None]:
raw_data_path = "./data/raw"
processed_data_path = "./data/processed"
models_path = "./models"

In [None]:
# Locations of the pre-sliced ratings and movie-title CSV files
data_file_path = "/".join([processed_data_path, 'data_sliced.csv'])
movie_titles_path = "/".join([processed_data_path, 'movies_sliced.csv'])

In [None]:

# Read the sliced ratings CSV, parse its date column and order the rows chronologically
start = datetime.now()
print("creating the dataframe from {} file..".format(data_file_path))

df = pd.read_csv(data_file_path)

# Convert the raw date values to pandas datetimes
df['date'] = pd.to_datetime(df['date'])
print('Done.\n')

print(datetime.now() - start)

# Arrange the ratings according to time (the original row labels are kept)
print('Sorting the dataframe by date..')
df = df.sort_values(by='date')
print('Done..')

print("Time taken:", datetime.now() - start)

creating the dataframe from ./data/processed/data_sliced.csv file..
Done.

0:00:00.634011
Sorting the dataframe by date..
Done..
Time taken: 0:00:00.717011


In [None]:

# Load the movie metadata from the path defined next to data_file_path,
# instead of rebuilding the same path by hand.
movie_titles = pd.read_csv(movie_titles_path)
movie_titles.head(2)

Unnamed: 0,movieId,year_of_release,title
0,30,2003.0,Something's Gotta Give
1,175,1992.0,Reservoir Dogs


In [None]:
df.head()

Unnamed: 0,movieId,userId,rating,date
0,16242,2248080,3,1999-12-30
10,9189,2248080,3,1999-12-30
9,175,2248080,2,1999-12-30
8,14725,2248080,4,1999-12-30
7,10451,2248080,4,1999-12-30


In [None]:
df.head()

Unnamed: 0,movieId,userId,rating,date
0,16242,2248080,3,1999-12-30
10,9189,2248080,3,1999-12-30
9,175,2248080,2,1999-12-30
8,14725,2248080,4,1999-12-30
7,10451,2248080,4,1999-12-30


In [None]:
df.shape

(1438653, 4)

In [None]:
# Checking for NaN values: count every missing cell in the dataframe
# (isnull().any() would only count the number of *columns* that contain a NaN).
print("No of Nan values in our dataframe : ", df.isnull().sum().sum())

No of Nan values in our dataframe :  0


In [None]:
start = datetime.now()

# Count (nothing is removed here) the rows that repeat an earlier
# (movieId, userId, rating) combination.
# NOTE(review): a user rating the same movie twice with different ratings is not
# flagged by this subset - confirm whether ['movieId', 'userId'] was intended.
dup_bool = df.duplicated(['movieId','userId','rating'])
# Vectorised sum; the builtin sum() iterates the Series element by element
dups = dup_bool.sum()
print("There are {} duplicate rating entries in the data..".format(dups))

print(datetime.now() - start)

There are 0 duplicate rating entries in the data..
0:00:00.229023


In [None]:
# Headline counts for the full ratings dataframe
n_ratings = df.shape[0]
n_users = len(df['userId'].unique())
n_movies = len(df['movieId'].unique())

print("Total data ")
print("-"*50)
print("\nTotal no of ratings :", n_ratings)
print("Total No of Users   :", n_users)
print("Total No of movies  :", n_movies)

Total data 
--------------------------------------------------

Total no of ratings : 1438653
Total No of Users   : 3679
Total No of movies  : 501


In [None]:
df.shape 

(1438653, 4)

In [None]:
movie_ids = df['movieId'].unique()
user_ids = df['userId'].unique()
unique_ratings = df['rating'].unique()

In [None]:
movie_ids.shape,user_ids.shape, list(unique_ratings)

((501,), (3679,), [3, 2, 4, 5, 1])

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1438653 entries, 0 to 1438652
Data columns (total 4 columns):
 #   Column   Non-Null Count    Dtype         
---  ------   --------------    -----         
 0   movieId  1438653 non-null  int64         
 1   userId   1438653 non-null  int64         
 2   rating   1438653 non-null  int64         
 3   date     1438653 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(3)
memory usage: 54.9 MB


In [None]:
# Print the frame dimensions followed by every 20000th row as a quick sample
every_20000th_row = df.iloc[::20000]

print('Dataset shape: {}'.format(df.shape))
print('-Dataset examples-')
print(every_20000th_row)

Dataset shape: (1438653, 4)
-Dataset examples-
         movieId   userId  rating       date
0          16242  2248080       3 1999-12-30
20017       5169   769235       4 2000-09-07
39948       7617  2204336       3 2001-04-11
59925       6274  2446344       3 2001-09-26
79945       5939  1130863       1 2002-01-28
...          ...      ...     ...        ...
1339669     7767    40563       5 2005-08-13
1359092     2452  1002943       5 2005-09-01
1381161     6386   642036       3 2005-09-21
1400353    17405  1877143       3 2005-10-12
1420472     6994  1944765       4 2005-11-08

[72 rows x 4 columns]


## 2. Exploratory Data Analysis

### 2.1 Ratings Distribution

In [None]:
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go

init_notebook_mode(connected=True)

# Number of ratings for each rating value, highest rating value first
rating_counts = df['rating'].value_counts().sort_index(ascending=False)
total_ratings = df.shape[0]

# Share of each rating value, shown as the bar label
percent_labels = ['{:.1f} %'.format(share) for share in (rating_counts.values / total_ratings * 100)]

rating_bars = go.Bar(
    x=rating_counts.index,
    y=rating_counts.values,
    text=percent_labels,
    textposition='auto',
    textfont=dict(color='#000000'),
)

rating_layout = dict(
    title='Distribution Of {} ratings'.format(total_ratings),
    xaxis=dict(title='Rating'),
    yaxis=dict(title='Count'),
)

rating_fig = go.Figure(data=[rating_bars], layout=rating_layout)
rating_fig.show()

### 2.2 Ratings Distribution By Item

In [None]:
# Number of ratings per movie; counts above 50 are clipped down to 50
ratings_per_movie = df.groupby('movieId')['rating'].count().clip(upper=50)

# Histogram of the clipped counts in bins of width 2
item_histogram = go.Histogram(
    x=ratings_per_movie.values,
    name='Ratings',
    xbins=dict(start=0, end=50, size=2),
)

item_layout = go.Layout(
    title='Distribution Of Number of Ratings Per Item (Clipped at 50)',
    xaxis=dict(title='Number of Ratings Per Item'),
    yaxis=dict(title='Count'),
    bargap=0.2,
)

item_fig = go.Figure(data=[item_histogram], layout=item_layout)
item_fig.show()

In [None]:
# The ten movies with the most ratings
rating_count_by_movie = df.groupby('movieId')['rating'].count().reset_index()
rating_count_by_movie.sort_values('rating', ascending=False).head(10)

Unnamed: 0,movieId,rating
107,4306,3568
410,14691,3551
70,2862,3550
312,11283,3548
396,14410,3547
391,14312,3546
355,12918,3538
44,1905,3533
379,13728,3516
427,15124,3508


### 2.3 Ratings Distribution By User

In [None]:
# Number of ratings given by each user; counts above 50 are clipped down to 50
ratings_per_user = df.groupby('userId')['rating'].count().clip(upper=50)

# Histogram of the clipped counts in bins of width 2
user_histogram = go.Histogram(
    x=ratings_per_user.values,
    name='Ratings',
    xbins=dict(start=0, end=50, size=2),
)

user_layout = go.Layout(
    title='Distribution Of Number of Ratings Per User (Clipped at 50)',
    xaxis=dict(title='Ratings Per User'),
    yaxis=dict(title='Count'),
    bargap=0.2,
)

user_fig = go.Figure(data=[user_histogram], layout=user_layout)
user_fig.show()

In [None]:
# The ten users who submitted the most ratings
rating_count_by_user = df.groupby('userId')['rating'].count().reset_index()
rating_count_by_user.sort_values('rating', ascending=False).head(10)

Unnamed: 0,userId,rating
3623,2606799,501
526,387418,501
987,716173,501
3383,2439493,500
1851,1314869,499
2953,2118461,498
2867,2056022,498
2322,1664010,496
3103,2238060,493
2590,1852040,493


In [None]:
### Dimensionality

# NOTE: this whole cell is disabled (commented out); nothing below is executed.
# The column names are written as 'movieId' / 'userId' to match the columns of df.

# To reduce the dimensionality of the dataset, we will filter out rarely rated movies and rarely rating users

# start = datetime.now()

# min_ratings = 5

# # min_ratings = 100000
# filter_items = df['movieId'].value_counts() > min_ratings
# filter_items = filter_items[filter_items].index.tolist()

# min_user_ratings = 5
# # min_user_ratings = 1500
# filter_users = df['userId'].value_counts() > min_user_ratings
# filter_users = filter_users[filter_users].index.tolist()

# df_new = df[(df['movieId'].isin(filter_items)) & (df['userId'].isin(filter_users))]
# print('The original data frame shape:\t{}'.format(df.shape))
# print('The new data frame shape:\t{}'.format(df_new.shape))

# print(datetime.now() - start)

In [None]:
# df_new.head()

In [None]:
# train_df_new.drop(['date'], axis=1, inplace=True)

## 3. Surprise  - Machine Learning Models

To load a dataset from a pandas dataframe, we will use the load_from_df() method, we will also need a Reader object, and the rating_scale parameter must be specified. The dataframe must have three columns, corresponding to the user ids, the item ids, and the ratings in this order. Each row thus corresponds to a given rating.

In [None]:
start = datetime.now()

# Surprise needs the rating scale up front and the columns in
# (user id, item id, rating) order.
reader = Reader(rating_scale=(1, 5))
ratings_frame = df[['userId', 'movieId', 'rating']]
data = Dataset.load_from_df(ratings_frame, reader)

print(datetime.now() - start)

0:00:01.157965


### Basic algorithms

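The Surprise library provides the algorithms described below. The benchmark that follows runs only a subset of them: the k-NN variants and SVDpp were left out because they took too long to run.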



#### NormalPredictor

* NormalPredictor algorithm predicts a random rating based on the distribution of the training set, which is assumed to be normal. This is one of the most basic algorithms that do not do much work.

#### BaselineOnly

* BaselineOnly algorithm predicts the baseline estimate for a given user and item.

### k-NN algorithms

#### KNNBasic

* KNNBasic is a basic collaborative filtering algorithm.

#### KNNWithMeans

* KNNWithMeans is a basic collaborative filtering algorithm, taking into account the mean ratings of each user.

#### KNNWithZScore

* KNNWithZScore is a basic collaborative filtering algorithm, taking into account the z-score normalization of each user.

#### KNNBaseline

* KNNBaseline is a basic collaborative filtering algorithm taking into account a baseline rating.

### Matrix Factorization-based algorithms

#### SVD

* SVD algorithm is equivalent to Probabilistic Matrix Factorization (http://papers.nips.cc/paper/3208-probabilistic-matrix-factorization.pdf)

#### SVDpp

* The SVDpp algorithm is an extension of SVD that takes into account implicit ratings.

#### NMF

* NMF is a collaborative filtering algorithm based on Non-negative Matrix Factorization. It is very similar to SVD.

### Slope One

* Slope One is a straightforward implementation of the SlopeOne algorithm. (https://arxiv.org/abs/cs/0702144)

### Co-clustering

* Co-clustering is a collaborative filtering algorithm based on co-clustering (http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.113.6458&rep=rep1&type=pdf)


We use **rmse** as our accuracy metric for the predictions.

In [None]:

start = datetime.now()

# One dict per algorithm: mean test_rmse / fit_time / test_time plus the algorithm name
benchmark = []

algorithms = [SVD(), SlopeOne(), NMF(), NormalPredictor(), BaselineOnly(), CoClustering()]

print ("Attempting: ", str(algorithms), '\n\n\n')

for algorithm in algorithms:
    print("Starting: " ,str(algorithm))

    # 2-fold cross validation scored with RMSE
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=2, verbose=False, n_jobs=-1)

    # Average every metric over the folds and label the row with the class name.
    # The row is a plain dict because Series.append was removed in pandas 2.0;
    # pd.DataFrame(benchmark) accepts a list of dicts just as well.
    row = pd.DataFrame.from_dict(results).mean(axis=0).to_dict()
    row['Algorithm'] = type(algorithm).__name__
    benchmark.append(row)
    print("Done: " ,str(algorithm), "\n\n")

print ('\n\tDONE\n')
print(datetime.now() - start)
# Note: all KNNs took lot of time, so can be removed for next training


# 
# Algorithm	test_rmse	fit_time	test_time
# SVD	0.825432	32.134573	4.175492

# Note the below KNN has taken lot of time, almost 9 mins each
# KNNWithZScore	0.863700	35.633523	548.962475
# KNNBaseline	0.865228	37.737368	492.150465
# KNNWithMeans	0.866616	35.509988	518.196755
# KNNBasic	0.875935	35.211515	508.918990

# CoClustering	0.887190	9.768482	3.339476
# NMF	0.902960	31.495054	3.513999
# SlopeOne	0.922336	3.597537	93.619539
# BaselineOnly	0.923342	0.331995	3.001491
# NormalPredictor	1.467154	0.665035	4.208984

# Total time taken: 0:06:23.374215

Attempting:  [<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x00000171CE1342B0>, <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x00000171CE134358>, <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x00000171CE134470>, <surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x00000171CE1344A8>, <surprise.prediction_algorithms.baseline_only.BaselineOnly object at 0x00000171CE1344E0>, <surprise.prediction_algorithms.co_clustering.CoClustering object at 0x00000171CE134518>] 



Starting:  <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x00000171CE1342B0>
Done:  <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x00000171CE1342B0> 


Starting:  <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x00000171CE134358>
Done:  <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x00000171CE134358> 


Starting:  <surprise.prediction_algorithms.matrix_factorization.NMF object 

In [None]:
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

In [None]:
surprise_results

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,0.826025,37.819979,4.662026
CoClustering,0.887227,9.237493,3.744503
NMF,0.895393,149.24963,4.016003
SlopeOne,0.922523,3.858996,95.521995
BaselineOnly,0.923101,0.314999,3.314993
NormalPredictor,1.468345,0.770229,4.552021


We see that SVD is the best-performing algorithm in this benchmark (lowest RMSE). Let's keep SVD and tune its hyper-parameters with GridSearchCV.

Note: SVDpp was removed from the benchmark because it took a lot of time to train.

In [None]:
start = datetime.now()

# Candidate SVD hyper-parameters to search over
param_grid = dict(
    n_epochs=[10, 20],
    lr_all=[0.002, 0.005],
    reg_all=[0.02],
)

# 5-fold grid search scored on RMSE and MAE
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], refit=True, cv=5, n_jobs=-1)
gs.fit(data)

# Keep the combination with the best RMSE for the final training run
best_scores = gs.best_score
training_parameters = gs.best_params["rmse"]

print("BEST RMSE: \t", best_scores["rmse"])
print("BEST MAE: \t", best_scores["mae"])
print("BEST params: \t", gs.best_params["rmse"])

print(datetime.now() - start)


BEST RMSE: 	 0.7971916580994455
BEST MAE: 	 0.6169208265074673
BEST params: 	 {'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}
0:11:20.118924


In [None]:
from datetime import datetime
print(training_parameters)
reader = Reader(rating_scale=(1, 5))

print("\n\n\t\t STARTING\n\n")
start = datetime.now()

print("> Loading data...")
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

print("> OK")

# Train on every available rating (no hold-out split) for the final model
print("> Creating trainset...")
trainset = data.build_full_trainset()
print("> OK")


startTraining = datetime.now()
print("> Training...")

# Pass the whole best-parameter dict through, so that any hyper-parameter later
# added to param_grid is applied here as well instead of being silently dropped.
algo = SVD(**training_parameters)

algo.fit(trainset)

endTraining = datetime.now()
# total_seconds() covers the whole duration; timedelta.seconds drops the days part
print("> OK \t\t It Took: ", int((endTraining-startTraining).total_seconds()), "seconds")

end = datetime.now()
print (">> DONE \t\t It Took", int((end-start).total_seconds()), "seconds" )


{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}


		 STARTING


> Loading data...
> OK
> Creating trainset...
> OK
> Training...
> OK 		 It Took:  90 seconds
>> DONE 		 It Took 92 seconds


In [None]:
## Saving trained data
from surprise import dump
import os
model_filename = models_path + "/" + "Surprise_Model_SVD.pickle"
print (">> Starting dump")


# Dump the trained algorithm to disk (it is loaded again by the inference section).
file_name = os.path.expanduser(model_filename)
# Create the target directory if needed, so the dump does not fail on a fresh checkout
os.makedirs(os.path.dirname(file_name), exist_ok=True)
dump.dump(file_name, algo=algo)
print (">> Dump done")
print(model_filename)

>> Starting dump
>> Dump done
./models/Surprise_Model_SVD.pickle


## 4. Inference

### 4.1 Load Trained Model Utility Function

In [None]:
model_filename = models_path + "/" + "Surprise_Model_SVD.pickle"

In [None]:
## Load saved data
def load_model(model_filename):
    """Load a trained algorithm that was saved with ``surprise.dump.dump``.

    Parameters
    ----------
    model_filename : str
        Path of the dump file; a leading ``~`` is expanded to the home directory.

    Returns
    -------
    The second element of the pair returned by ``dump.load`` (the algorithm).

    NOTE(review): the dump file appears to be a pickle (see the ``.pickle``
    file name used in this notebook) - only load files from a trusted source.
    """
    print (">> Loading dump")
    # Local imports keep the helper usable without running the training cells
    import os
    from surprise import dump
    dump_path = os.path.expanduser(model_filename)
    # dump.load returns a pair; the first element is not needed here
    _unused, algorithm = dump.load(dump_path)
    print (">> Loaded dump")
    return algorithm

### 4.2 Recommend Movies for a User Id by Movie Rating Prediction Method

**Note: With the machine learning models, we are not finding similar movies (which was the approach taken for the cosine similarity methods).** The approach taken here is the movie rating prediction method.

Using this rating prediction utility, I defined the following utility functions below for generating movie recommendations for a user.

In [None]:
movie_ids = df['movieId'].unique()
user_ids = df['userId'].unique()
unique_ratings = df['rating'].unique()

In [None]:
movie_ids.shape, user_ids.shape, unique_ratings

((501,), (3679,), array([3, 2, 4, 5, 1], dtype=int64))

In [None]:
# recommend movies for a user
# Predict Movie Rating for all the movies in the dataset for the user, if the prediction is more than a threshold, recommend the movie

def recommed_movies_by_rating_pred (user_id, movie_ids, movie_titles, model_filename, thresh = 4.0):
    """Recommend every movie whose predicted rating for ``user_id`` reaches ``thresh``.

    Parameters
    ----------
    user_id
        Raw user id handed to ``model.predict``. It should have the same type as
        the user ids the model was trained on (integers in this notebook).
    movie_ids : iterable
        Candidate movie ids; a rating is predicted for each of them.
    movie_titles : pandas.DataFrame
        Movie metadata with the columns ``movieId``, ``year_of_release`` and ``title``.
    model_filename : str
        Path of the dumped model, loaded through ``load_model``.
    thresh : float, default 4.0
        Minimum predicted rating for a movie to be recommended.

    Returns
    -------
    (list, list of str)
        The recommended movie ids, and one ``'<id> : <title> (<year>)'`` string per id
        in the same order. A movie without a metadata row is reported as
        ``'Unknown title'`` and a missing release year as ``'n/a'`` instead of raising.
    """
    model = load_model(model_filename)

    # Keep the candidates whose estimated rating reaches the threshold
    recommended_movie_ids = [
        movie_id
        for movie_id in movie_ids
        if model.predict(uid=user_id, iid=movie_id).est >= thresh
    ]

    # Build the id -> title / year lookups once instead of filtering the whole
    # dataframe twice for every recommended movie. Only the first row per movieId
    # is kept.
    first_rows = movie_titles.drop_duplicates(subset='movieId').set_index('movieId')
    title_by_id = first_rows['title'].to_dict()
    year_by_id = first_rows['year_of_release'].to_dict()

    recommended_movie_titles = []
    for mv_id in recommended_movie_ids:
        movie_title = title_by_id.get(mv_id, 'Unknown title')
        movie_year = year_by_id.get(mv_id)
        # The year column is a float column and may hold NaN, which int() cannot convert
        if movie_year is None or pd.isna(movie_year):
            year_label = 'n/a'
        else:
            year_label = int(movie_year)

        recommended_movie_titles.append('{} : {} ({})'.format(mv_id, movie_title, year_label))

    return recommended_movie_ids, recommended_movie_titles

In [None]:
# The model was trained on the integer userId column and Surprise matches raw ids
# exactly. A string id such as '2248080' is treated as an unknown user, so the
# predictions would not be personalised for this user.
user_id = 2248080
recommended_movie_ids, recommended_movie_titles = recommed_movies_by_rating_pred(user_id, movie_ids, movie_titles, model_filename)

>> Loading dump
>> Loaded dump


In [None]:
recommended_movie_titles

['14691 : The Matrix (1999)',
 '8596 : Seven (1995)',
 '7233 : Stand by Me (1986)',
 '9960 : Die Hard (1988)',
 '6428 : To Kill a Mockingbird (1962)',
 '5732 : GoodFellas: Special Edition (1990)',
 '6797 : The Breakfast Club (1985)',
 "8327 : One Flew Over the Cuckoo's Nest (1975)",
 '6974 : The Usual Suspects (1995)',
 "3605 : The Wizard of Oz: Collector's Edition (1939)",
 '2862 : The Silence of the Lambs (1991)',
 '17157 : Saving Private Ryan (1998)',
 '4306 : The Sixth Sense (1999)',
 '16377 : The Green Mile (1999)',
 '7193 : The Princess Bride (1987)',
 '13728 : Gladiator (2000)',
 '798 : Jaws (1975)',
 '2782 : Braveheart (1995)',
 '10042 : Raiders of the Lost Ark (1981)',
 '9628 : Star Wars: Episode VI: Return of the Jedi (1983)',
 '16265 : Star Wars: Episode IV: A New Hope (1977)',
 '5582 : Star Wars: Episode V: The Empire Strikes Back (1980)',
 '3290 : The Godfather (1974)',
 '12293 : The Godfather (1972)',
 '10820 : Back to the Future (1985)',
 '16954 : Indiana Jones and the L

In [None]:
print("Total Time taken :", datetime.now()-globalstart)


Total Time taken : 0:22:50.807649
