We are building a Recommender system for Movies from the 100KMovieLens dataset.

#  Obtain

In [None]:
import pandas as pd
ratings_df = pd.read_csv('./data/ratings.csv')
links_df = pd.read_csv('./data/links.csv')
movies_df = pd.read_csv('./data/movies.csv')
tags_df = pd.read_csv('./data/tags.csv')
ratings_df.info()
links_df.info()
movies_df.info()
tags_df.info()


In [None]:
# Drop unnecessary columns
ratings_df = ratings_df.drop(columns='timestamp')
# Drop unnecessary columns
tags_df = tags_df.drop(columns='timestamp')

#  Scrub

#  Explore

#  Model

Surprise is a scikit package for building and analysing recommender systems maintained by Nicolas Hug.

In [None]:
from surprise import Reader, Dataset
reader = Reader()
ratings_sp = Dataset.load_from_df(ratings_df,reader)

In [None]:
from surprise.model_selection import cross_validate, GridSearchCV
from surprise.prediction_algorithms import SVD, NMF, SlopeOne, CoClustering 
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline, BaselineOnly
import numpy as np

### Cross Validate

In [None]:
cross_validate(SVD(), ratings_sp, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
cross_validate(NMF(), ratings_sp, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
cross_validate(SlopeOne(), ratings_sp, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
cross_validate(CoClustering(), ratings_sp, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
cross_validate(BaselineOnly(), ratings_sp, measures=['RMSE', 'MAE'], cv=5, verbose=True)

### Gridsearch SVD

#### Gridsearch on SVD

In [38]:
params = {'n_factors': [10,25,50,75,100],
         'reg_all': [0.02, 0.04, 0.06, 0.08, 0.1]}
gs_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
gs_svd.fit(ratings_sp)

In [39]:
print(gs_svd.best_score)
print(gs_svd.best_params)

{'rmse': 0.8688002520258602, 'mae': 0.6680934755735739}
{'rmse': {'n_factors': 75, 'reg_all': 0.06}, 'mae': {'n_factors': 50, 'reg_all': 0.04}}


Based off these outputs, it seems like the best performing model is the SVD model with n_factors = 75 and a regularization rate of 0.06.

#### Gridsearch on BaselineOnly ALS

In [34]:
params = {'bsl_options': {'method': ['als'],
                              'reg_i': [8, 9, 10, 11, 12], # lambda 2
                              'reg_u': [3, 4, 5, 6, 7],# lambda 3
                             }
         }
gs_bo = GridSearchCV(BaselineOnly,param_grid=params,n_jobs=-1)
gs_bo.fit(ratings_sp)

In [35]:
print(gs_bo.best_score)
print(gs_bo.best_params)

{'rmse': 0.8699094486127269, 'mae': 0.6693305255443935}
{'rmse': {'bsl_options': {'method': 'als', 'reg_i': 8, 'reg_u': 4}}, 'mae': {'bsl_options': {'method': 'als', 'reg_i': 8, 'reg_u': 3}}}


### Recommendations

In [201]:
movies_df.head(50)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [42]:
dataset = ratings_sp.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  610 

Number of items:  9724


In [45]:
svd = SVD(n_factors= 75, reg_all=0.06)
svd.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x25ad273e3d0>

In [60]:
#predict for first user and first movie
#last user first movie
svd.predict(1, 1)


Prediction(uid=1, iid=1, r_ui=None, est=4.598180853475434, details={'was_impossible': False})

In [59]:
### last user first movie
svd.predict(610, 1)


Prediction(uid=610, iid=1, r_ui=None, est=4.057444096106535, details={'was_impossible': False})

In [53]:
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [204]:
def new_interview(userID):
    genIds = []
    genres = { '1' :'Action' , '2' :'Adventure' , '3' :'Drama', '4' :'Comedy' , '5' :'Horror' , '6' :'Sci-Fi'}
    top3 = []
    num = 1
    print(genres)
    while num > 0 :
        
        g1, g2, g3 = input('Please enter your top 3 genres from below.  IE 2 3 5 :\n').split()


        genIds.append(g1)
        genIds.append(g2)
        genIds.append(g3)

        num -= 1

    for gid in genIds:
        top3.append(genres.get(gid))
        
    return top3


In [217]:
def rate_driver(userID):
    genres = { 1 :'Action' , 2 :'Adventure' ,3 :'Drama', 4 :'Comedy' , 5:'Horror' , 6:'Sci-Fi'}
    topgenres = new_interview(userID)
    init_user_rates = []
    
    for genre in topgenres:
        #rate 3 movies for the genre
        init_user_rates.append(movie_rater(userID, 3, genre))
        
    return init_user_rates
            

In [218]:
def movie_rater(movie_df, num, genre=None):
    userID = 1000
    rating_list = []
    print(genre)
    while num > 0:
        print('##################################################################')
        if genre:
            movie = movies_df[movies_df['genres'].str.contains(genre)].sample(1)
        else:
            movie = movies_df.sample(1)
        print(movie)
        rating = input('How do you rate this movie on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_movie = {'userId':userID,'movieId':movie['movieId'].values[0],'rating':rating}
            rating_list.append(rating_one_movie) 
            num -= 1
    return rating_list    

In [214]:
user_rating = rate_driver(1000)

{'1': 'Action', '2': 'Adventure', '3': 'Drama', '4': 'Comedy', '5': 'Horror', '6': 'Sci-Fi'}
Please enter your top 3 genres from below.  IE 2 3 5 :
1 3 5
Action
###########################################
      movieId                                     title            genres
9154   147662  Return of the One-Armed Swordsman (1969)  Action|Adventure
How do you rate this movie on a scale of 1-5, press n if you have not seen :
2
###########################################
      movieId                              title                  genres
6102    42559  Samurai Assassin (Samurai) (1965)  Action|Adventure|Drama
How do you rate this movie on a scale of 1-5, press n if you have not seen :
2
###########################################
      movieId                           title  \
6879    62956  Futurama: Bender's Game (2008)   

                                                genres  
6879  Action|Adventure|Animation|Comedy|Fantasy|Sci-Fi  
How do you rate this movie on a scale of 1

In [None]:
puser_rating = movie_rater(movies_df, 3, 'Action')

Action
##################################################################
      movieId                       title                          genres
7051    69275  Dead Snow (Død snø) (2009)  Action|Adventure|Comedy|Horror
How do you rate this movie on a scale of 1-5, press n if you have not seen :
2
##################################################################
      movieId                           title  \
4639     6934  Matrix Revolutions, The (2003)   

                                     genres  
4639  Action|Adventure|Sci-Fi|Thriller|IMAX  
How do you rate this movie on a scale of 1-5, press n if you have not seen :
3
##################################################################
      movieId                                              title  \
8683   122886  Star Wars: Episode VII - The Force Awakens (2015)   

                                    genres  
8683  Action|Adventure|Fantasy|Sci-Fi|IMAX  


In [220]:
## add the new ratings to the original ratings DataFrame
delta_ratings_df = ratings_df.append(user_rating,ignore_index=True)
delta_rating_sp = Dataset.load_from_df(delta_ratings_df,reader)

ValueError: too many values to unpack (expected 3)

In [207]:
# train a model using the new combined DataFrame
svd2 = SVD(n_factors= 75, reg_all=0.06)
svd2.fit(delta_rating_sp.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x25ad60689d0>

In [208]:
# make predictions for the user
#in the format (movie_id, predicted_score)
list_of_movies = []
for m_id in delta_ratings_df['movieId'].unique():
    list_of_movies.append( (m_id, svd2.predict(1000, m_id)[3]))

In [209]:
# order the predictions from highest to lowest rated
ranked_movies = sorted(list_of_movies, key=lambda x:x[1], reverse=True)

In [65]:
# return the top n recommendations using the 
def recommended_movies(user_ratings,movie_title_df,n):
        for idx, rec in enumerate(user_ratings):
            title = movie_title_df.loc[movie_title_df['movieId'] == int(rec[0])]['title']
            print('Recommendation # ', idx+1, ': ', title, '\n')
            n-= 1
            if n == 0:
                break
            


In [210]:
#User id 1000
recommended_movies(ranked_movies,movies_df,5)

Recommendation #  1 :  277    Shawshank Redemption, The (1994)
Name: title, dtype: object 

Recommendation #  2 :  686    Rear Window (1954)
Name: title, dtype: object 

Recommendation #  3 :  4909    Eternal Sunshine of the Spotless Mind (2004)
Name: title, dtype: object 

Recommendation #  4 :  906    Lawrence of Arabia (1962)
Name: title, dtype: object 

Recommendation #  5 :  596    Ghost in the Shell (Kôkaku kidôtai) (1995)
Name: title, dtype: object 



In [212]:
recommended_movies(ranked_movies,movies_df,5)

Recommendation #  1 :  277    Shawshank Redemption, The (1994)
Name: title, dtype: object 

Recommendation #  2 :  686    Rear Window (1954)
Name: title, dtype: object 

Recommendation #  3 :  4909    Eternal Sunshine of the Spotless Mind (2004)
Name: title, dtype: object 

Recommendation #  4 :  906    Lawrence of Arabia (1962)
Name: title, dtype: object 

Recommendation #  5 :  596    Ghost in the Shell (Kôkaku kidôtai) (1995)
Name: title, dtype: object 



In [None]:
Wizard for cold start
What are you in the mood for?
Brains or Braun?



In [213]:
delta_ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4
1,1,3,4
2,1,6,4
3,1,47,5
4,1,50,5
...,...,...,...
100834,610,168252,5
100835,610,170875,3
100836,1000,157865,3
100837,1000,4749,1


# Interpret