In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt

from sklearn.preprocessing import scale

## Load Data

In [2]:
df_movies = pd.read_csv('./movies_metadata_clean.csv', encoding='ISO-8859-1')
df_users = pd.read_csv('./movies_ratings_small.csv', encoding='ISO-8859-1')

## Build recommendations

1. content based filtering

In [3]:
df_movies.shape

# Remove duplicates
df_movies.drop_duplicates(subset='title',keep='first',inplace=True)
df_movies.shape

(45463, 24)

(42277, 24)

In [4]:
# Prepare description column

df_movies['tagline'] = df_movies['tagline'].fillna('')
df_movies['description'] = df_movies['overview'] + df_movies['tagline']
df_movies['description'] = df_movies['description'].fillna('')
df_movies['description']

0        Led by Woody, Andy's toys live happily in his ...
1        When siblings Judy and Peter discover an encha...
2        A family wedding reignites the ancient feud be...
3        Cheated on, mistreated and stepped on, the wom...
4        Just when George Banks has recovered from his ...
                               ...                        
45456    It's the year 3000 AD. The world's most danger...
45458    Rising and falling between a man and woman.Ris...
45459    An artist struggles to finish his work while a...
45461    In a small town live two brothers, one a minis...
45462    50 years after decriminalisation of homosexual...
Name: description, Length: 42277, dtype: object

## Build model

In [5]:
## Generate a matrix of common terms that show up in each movie

from sklearn.feature_extraction.text import TfidfVectorizer

model = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0, stop_words='english')

tfidf_matrix = model.fit_transform(df_movies['description'])
tfidf_matrix.shape

(42277, 1047434)

In [6]:
# Calculate the cosine similarity between each pair of movies

from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)
cosine_sim.shape

(42277, 42277)

## Generate Recommendation

In [7]:
titles = df_movies['title']
indicies = pd.Series(df_movies.index, index=df_movies['title'])

def get_recommendations(title):
    idx = indicies[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indicies = [i[0] for i in sim_scores]
    return titles.iloc[movie_indicies]

In [8]:
get_recommendations('The Godfather')

44027    The Godfather Trilogy: 1972-1990
1178               The Godfather: Part II
31971                    Honor Thy Father
23125                          Blood Ties
38027            A Mother Should Be Loved
18322                     The Outside Man
11297                    Household Saints
4324                                 Made
5433                   Johnny Dangerously
18224                           Miss Bala
Name: title, dtype: object

In [9]:
get_recommendations('Jumanji')

21632                  Table No. 21
9503                      Word Wars
43124                       The Bar
8801                        Quintet
17223                The Dark Angel
37444            The Ouija Exorcism
15512               Le Pont du Nord
34771    Doctor Who: Last Christmas
44373             Liar Game: Reborn
35507                      The Mend
Name: title, dtype: object

2. Collaborative Filtering

In [10]:
df_users.tail(10)

Unnamed: 0,userId,movieId,rating,timestamp
99994,671,5952,5.0,1063502716
99995,671,5989,4.0,1064890625
99996,671,5991,4.5,1064245387
99997,671,5995,4.0,1066793014
99998,671,6212,2.5,1065149436
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663
100003,671,6565,3.5,1074784724


In [11]:
# Prepare data into Surprise Library format

from surprise import Dataset
from surprise import Reader

# reader = Reader(rating_scale=(1,5))
reader = Reader()
x = Dataset.load_from_df(df_users[['userId','movieId', 'rating']], reader)


In [12]:
from surprise.model_selection import train_test_split

x_train, x_test = train_test_split(x, test_size=.25)

In [13]:
# Define SVD model

from surprise import SVD
model_svd_rating = SVD()

In [14]:
# Fit SVD model

model_svd_rating.fit(x_train)
tests_pred = model_svd_rating.test(x_test)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7efb90ea0160>

In [15]:
# Evaluate SVD accuracy

from surprise import accuracy

accuracy.rmse(tests_pred)

RMSE: 0.9020


0.9020499585828412

In [16]:
## Cross-validate

from surprise.model_selection import cross_validate

cross_validate(model_svd_rating, x, measures=['RMSE','MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8967  0.8971  0.8921  0.8993  0.8942  0.8959  0.0025  
MAE (testset)     0.6924  0.6901  0.6853  0.6938  0.6888  0.6901  0.0030  
Fit time          3.35    3.36    3.35    3.38    3.34    3.36    0.01    
Test time         0.10    0.10    0.17    0.10    0.17    0.13    0.04    


{'test_rmse': array([0.89671871, 0.8971485 , 0.8920674 , 0.89929983, 0.89418423]),
 'test_mae': array([0.6924476 , 0.69010523, 0.68534739, 0.69382816, 0.68880495]),
 'fit_time': (3.352205514907837,
  3.360137701034546,
  3.3495917320251465,
  3.3806724548339844,
  3.3363964557647705),
 'test_time': (0.09629416465759277,
  0.0961756706237793,
  0.16896891593933105,
  0.09579730033874512,
  0.16830205917358398)}

In [17]:
# Predict rating for new movie

df_users[df_users['userId']==1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [18]:
# Predict what user 1 would have rated movie 302

model_svd_rating.predict(1, 302)

Prediction(uid=1, iid=302, r_ui=None, est=2.6219036809851892, details={'was_impossible': False})