We are building a Recommender system for Movies from the 100KMovieLens dataset.

#  Obtain

In [None]:
import pandas as pd
ratings_df = pd.read_csv('./data/ratings.csv')
links_df = pd.read_csv('./data/links.csv')
movies_df = pd.read_csv('./data/movies.csv')
tags_df = pd.read_csv('./data/tags.csv')
ratings_df.info()
links_df.info()
movies_df.info()
tags_df.info()


In [None]:
# Drop unnecessary columns
ratings_df = ratings_df.drop(columns='timestamp')
# Drop unnecessary columns
tags_df = tags_df.drop(columns='timestamp')

#  Scrub

#  Explore

#  Model

Surprise is a scikit package for building and analysing recommender systems maintained by Nicolas Hug.

In [None]:
from surprise import Reader, Dataset
reader = Reader()
ratings_sp = Dataset.load_from_df(ratings_df,reader)

In [None]:
from surprise.model_selection import cross_validate, GridSearchCV
from surprise.prediction_algorithms import SVD, NMF, SlopeOne, CoClustering 
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline, BaselineOnly
import numpy as np

### Cross Validate

In [None]:
cross_validate(SVD(), ratings_sp, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
cross_validate(NMF(), ratings_sp, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
cross_validate(SlopeOne(), ratings_sp, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
cross_validate(CoClustering(), ratings_sp, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
cross_validate(BaselineOnly(), ratings_sp, measures=['RMSE', 'MAE'], cv=5, verbose=True)

### Gridsearch SVD

#### Gridsearch on SVD

In [38]:
params = {'n_factors': [10,25,50,75,100],
         'reg_all': [0.02, 0.04, 0.06, 0.08, 0.1]}
gs_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
gs_svd.fit(ratings_sp)

In [39]:
print(gs_svd.best_score)
print(gs_svd.best_params)

{'rmse': 0.8688002520258602, 'mae': 0.6680934755735739}
{'rmse': {'n_factors': 75, 'reg_all': 0.06}, 'mae': {'n_factors': 50, 'reg_all': 0.04}}


Based off these outputs, it seems like the best performing model is the SVD model with n_factors = 75 and a regularization rate of 0.06.

#### Gridsearch on BaselineOnly ALS

In [34]:
params = {'bsl_options': {'method': ['als'],
                              'reg_i': [8, 9, 10, 11, 12], # lambda 2
                              'reg_u': [3, 4, 5, 6, 7],# lambda 3
                             }
         }
gs_bo = GridSearchCV(BaselineOnly,param_grid=params,n_jobs=-1)
gs_bo.fit(ratings_sp)

In [35]:
print(gs_bo.best_score)
print(gs_bo.best_params)

{'rmse': 0.8699094486127269, 'mae': 0.6693305255443935}
{'rmse': {'bsl_options': {'method': 'als', 'reg_i': 8, 'reg_u': 4}}, 'mae': {'bsl_options': {'method': 'als', 'reg_i': 8, 'reg_u': 3}}}


### Recommendations

In [41]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [42]:
dataset = ratings_sp.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  610 

Number of items:  9724


In [45]:
svd = SVD(n_factors= 75, reg_all=0.06)
svd.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x25ad273e3d0>

In [60]:
#predict for first user and first movie
#last user first movie
svd.predict(1, 1)


Prediction(uid=1, iid=1, r_ui=None, est=4.598180853475434, details={'was_impossible': False})

In [59]:
### last user first movie
svd.predict(610, 1)


Prediction(uid=610, iid=1, r_ui=None, est=4.057444096106535, details={'was_impossible': False})

In [53]:
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [102]:
def new_interview(userID, num):
    fav_genres = []
    while num > 0 :
        question1 = input('Brains or Braun :\n')
        q2 = input('Hero or Braun :\n')
        if questions == 'q':
            continue
        else:
            fav_genres.append(questions)
            num -= 1
    return questions


In [103]:
new_interview(1000, 3)


Brains or Braun :
Brains
Brains or Braun :
Brains
Brains or Braun :
Brains


'Brains'

In [61]:
def movie_rater(movie_df,num, genre=None):
    userID = 1000
    rating_list = []
    while num > 0:
        if genre:
            movie = movie_df[movie_df['genres'].str.contains(genre)].sample(1)
        else:
            movie = movie_df.sample(1)
        print(movie)
        rating = input('How do you rate this movie on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_movie = {'userId':userID,'movieId':movie['movieId'].values[0],'rating':rating}
            rating_list.append(rating_one_movie) 
            num -= 1
    return rating_list    

In [84]:
user_rating = movie_rater(movies_df, 4)

      movieId                             title       genres
9625   178129  Adventures in Plymptoons! (2011)  Documentary
How do you rate this movie on a scale of 1-5, press n if you have not seen :
n
      movieId                               title     genres
4590     6818  Come and See (Idi i smotri) (1985)  Drama|War
How do you rate this movie on a scale of 1-5, press n if you have not seen :
n
      movieId                  title                genres
8282   105755  Counselor, The (2013)  Crime|Drama|Thriller
How do you rate this movie on a scale of 1-5, press n if you have not seen :
n
      movieId            title         genres
4347     6349  Breakin' (1984)  Drama|Musical
How do you rate this movie on a scale of 1-5, press n if you have not seen :
n
      movieId          title                  genres
8121   101088  Stoker (2013)  Drama|Mystery|Thriller
How do you rate this movie on a scale of 1-5, press n if you have not seen :
n
      movieId                title          g

In [85]:
## add the new ratings to the original ratings DataFrame
delta_ratings_df = ratings_df.append(user_rating,ignore_index=True)
delta_rating_sp = Dataset.load_from_df(delta_ratings_df,reader)

In [86]:
# train a model using the new combined DataFrame
svd2 = SVD(n_factors= 75, reg_all=0.06)
svd2.fit(delta_rating_sp.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x25ad1319f10>

In [87]:
# make predictions for the user
#in the format (movie_id, predicted_score)
list_of_movies = []
for m_id in delta_ratings_df['movieId'].unique():
    list_of_movies.append( (m_id, svd2.predict(1000, m_id)[3]))

In [88]:
# order the predictions from highest to lowest rated
ranked_movies = sorted(list_of_movies, key=lambda x:x[1], reverse=True)

In [65]:
# return the top n recommendations using the 
def recommended_movies(user_ratings,movie_title_df,n):
        for idx, rec in enumerate(user_ratings):
            title = movie_title_df.loc[movie_title_df['movieId'] == int(rec[0])]['title']
            print('Recommendation # ', idx+1, ': ', title, '\n')
            n-= 1
            if n == 0:
                break
            


In [82]:
#User id 1000
recommended_movies(ranked_movies,movies_df,5)

Recommendation #  1 :  277    Shawshank Redemption, The (1994)
Name: title, dtype: object 

Recommendation #  2 :  906    Lawrence of Arabia (1962)
Name: title, dtype: object 

Recommendation #  3 :  602    Dr. Strangelove or: How I Learned to Stop Worr...
Name: title, dtype: object 

Recommendation #  4 :  596    Ghost in the Shell (Kôkaku kidôtai) (1995)
Name: title, dtype: object 

Recommendation #  5 :  1730    Life Is Beautiful (La Vita è bella) (1997)
Name: title, dtype: object 



In [89]:
recommended_movies(ranked_movies,movies_df,5)

Recommendation #  1 :  686    Rear Window (1954)
Name: title, dtype: object 

Recommendation #  2 :  841    Streetcar Named Desire, A (1951)
Name: title, dtype: object 

Recommendation #  3 :  602    Dr. Strangelove or: How I Learned to Stop Worr...
Name: title, dtype: object 

Recommendation #  4 :  585    Wallace & Gromit: The Best of Aardman Animatio...
Name: title, dtype: object 

Recommendation #  5 :  277    Shawshank Redemption, The (1994)
Name: title, dtype: object 



In [None]:
Wizard for cold start
What are you in the mood for?
Brains or Braun?



In [90]:
delta_ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4
1,1,3,4
2,1,6,4
3,1,47,5
4,1,50,5
...,...,...,...
100835,610,170875,3
100836,1000,4207,3
100837,1000,60766,5
100838,1000,151557,2


# Interpret