# **Team 16 - EDSA movie recommendation wilderness**

In [None]:
from IPython.display import Image
from IPython.core.display import HTML
Image(url= "https://storage.googleapis.com/kaggle-competitions/kaggle/33594/logos/header.png?t=2022-01-04-10-31-44")

#### Installations

In [None]:
"""!pip install scikit-plot
!pip install comet_ml"""

#### Start Comet Experiment

In [None]:
"""#Uncomment the below to create/start a new experiment on Comet

# Start Comet Experiment
from comet_ml import Experiment

# Create an experiment with your api key
experiment = Experiment(
    api_key="0PXFIXjRITl64S9L8bH64cfYP",
    project_name="unsupervised-learning-predict",
    workspace="henriedwards",
)"""

<a id="cont"></a>

### Table of Contents

<a href=#one>1. Problem Statement</a>

<a href=#two>2. Importing Libraries</a>

<a href=#three>3. Loading Data</a>

<a href=#four>4. Data Preprocessing</a>

<a href=#five>5. Exploratory Data Analysis</a>

<a href=#six>6. Content Based Filtering</a>

<a href=#seven>7. Collaborative Filtering</a>

<a href=#eight>8. Model Evaluation</a>

<a href=#nine>9. Hyperparameter Tuning</a>

<a href=#ten>10. Conclusion</a>

<a href=#eleven>11. Submission</a>

<a id="one"></a>
## 1. Problem Statement

Construct a recommendation algorithm based on content or collaborative filtering, capable of accurately predicting how a user will rate a movie they have not yet viewed based on their historical preferences. To achieve this, we will construct a recommendation system.

Recommendation Systems we will be exploring:

 - Collaborative Based Filtering
 - Content Based Filtering

<a id="two"></a>
## 2. Importing Libraries

In [None]:
# Warnings
import warnings
warnings.filterwarnings("ignore")

# Kaggle requirements
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Data analysis libraries
import pandas as pd
import numpy as np        
        
# ML Pre processing
import re
import time
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from numpy.random import RandomState
from sklearn.feature_extraction.text import TfidfVectorizer

# Visualisation libraries
from matplotlib import pyplot as plt
import seaborn as sns
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
%matplotlib inline
import wordcloud
from wordcloud import WordCloud, STOPWORDS
%matplotlib inline
sns.set()

# Content Filtering Models
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

# Suprise
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import train_test_split

# Collaborative Filtering
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering

# Evaluation
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise.accuracy import rmse
from surprise import accuracy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Hyperparameter tuning
from surprise.model_selection import GridSearchCV

<a id="three"></a>
## 3. Loading Data

In [None]:
movies = pd.read_csv("/kaggle/input/edsa-movie-recommendation-wilderness/movies.csv")
movies_og = pd.read_csv("/kaggle/input/edsa-movie-recommendation-wilderness/movies.csv")
imdb_data = pd.read_csv("/kaggle/input/edsa-movie-recommendation-wilderness/imdb_data.csv")
genome_tags = pd.read_csv("/kaggle/input/edsa-movie-recommendation-wilderness/genome_tags.csv")
genome_scores = pd.read_csv("/kaggle/input/edsa-movie-recommendation-wilderness/genome_scores.csv")
tags = pd.read_csv("/kaggle/input/edsa-movie-recommendation-wilderness/tags.csv")
links = pd.read_csv("/kaggle/input/edsa-movie-recommendation-wilderness/links.csv")
train_data = pd.read_csv("/kaggle/input/edsa-movie-recommendation-wilderness/train.csv")
test_data = pd.read_csv("/kaggle/input/edsa-movie-recommendation-wilderness/test.csv")

<a id="four"></a>
## 4. Data Preprocessing

**Data preview**

In [None]:
movies.head(3)

In [None]:
imdb_data.head(3)

In [None]:
genome_tags.head(3)

In [None]:
genome_scores.head(3)

In [None]:
tags.head(3)

In [None]:
links.head(3)

In [None]:
train_data.head(3)

In [None]:
# check duplicates
dup_bool = train_data.duplicated(['userId', 'movieId', 'rating', 'timestamp'])

# display duplicates
print("Number of duplicate records:", sum(dup_bool))

In [None]:
# fill NaN values with usable nulls

imdb_data['director'] = imdb_data['director'].fillna('')
imdb_data['runtime'] = imdb_data['runtime'].fillna('')
imdb_data['budget'] = imdb_data['budget'].fillna('')
imdb_data['title_cast'] = imdb_data['title_cast'].fillna('')
imdb_data['plot_keywords'] = imdb_data['plot_keywords'].fillna('')

In [None]:
# remove string seperators and add to a list

imdb_data['plot_keywords'] = imdb_data['plot_keywords'].str.split('|')
imdb_data['title_cast'] = imdb_data['title_cast'].str.split('|')
movies['genres'] = movies['genres'].str.split('|')

In [None]:
# Extract the release year from the trailing "(YYYY)" of the title into its own feature.
# Anchoring on the parenthesised suffix avoids picking up digits that belong to the
# title itself, e.g. "2001: A Space Odyssey (1968)" -> "1968" rather than "2001".
# Titles without a trailing year get NaN.
movies['year'] = movies['title'].str.extract(r'\((\d{4})(?:[-–]\d{4})?\)\s*$', expand=False)

In [None]:
def year_remover(text):
    
    """Remove the trailing release year in parentheses from a movie title.

    Only a final "(YYYY)" (or "(YYYY-YYYY)") group is removed, together with the
    whitespace around it. Other parenthesised text, such as an alternate title,
    is kept.

    Parameters
    ----------
    text : str
        Movie title, e.g. "Toy Story (1995)".

    Returns
    -------
    str
        The title without its trailing year, e.g. "Toy Story". Titles without a
        trailing year are returned unchanged.
    """
    
    return re.sub(r'\s*\(\d{4}(?:[-–]\d{4})?\)\s*$', '', text)

movies['title'] = movies['title'].map(year_remover)

In [None]:
movies.head(3)

<a id="five"></a>
## 5. Exploratory Data Analysis

> ### Movies

In [None]:
plt.figure(figsize=(6,10))
sns.set(style="white")
ax = sns.countplot(y="year", data=movies, palette="Set2", order=movies['year'].value_counts().index[0:10])

> ### Genres

In [None]:
# Plot Distribution of Genres

plt.subplots(figsize=(12,10))
list1 = []
sns.set(style='whitegrid', palette='flare',
        rc={'figure.figsize': (6,10)})

for i in movies['genres']:
    list1.extend(i)
ax = pd.Series(list1).value_counts()[:10].sort_values(ascending=True).plot.barh(width=0.9,color=sns.color_palette('flare',10))
for i, v in enumerate(pd.Series(list1).value_counts()[:10].sort_values(ascending=True).values): 
    ax.text(250, i-.10, v,fontsize=12,color='white',weight='bold')
plt.title('Top Genres')
plt.show()

In [None]:
# Word cloud for genre.
# Flatten the per-movie genre lists into one space-separated string; joining the
# string form of each list would feed brackets and quotes into the word cloud.
genre_text = " ".join(str(genre) for genre_list in movies['genres'] for genre in genre_list)
# Use a dedicated name so the imported `wordcloud` module is not shadowed
genre_cloud = WordCloud(width=1000, height=500, random_state=21, max_font_size=200, max_words=100).generate(genre_text)
plt.figure(figsize=(10, 7))
plt.imshow(genre_cloud, interpolation="bilinear")
plt.axis('off')
plt.title('GENRES')
plt.show()

> ### Actors

In [None]:
# Plot Distribution of Actors

plt.subplots(figsize=(6,10))
list1=[]
for i in imdb_data['title_cast']:
    list1.extend(i)
ax=pd.Series(list1).value_counts()[1:11].sort_values(ascending=True).plot.barh(width=0.9,color=sns.color_palette('flare_r',10))
for i, v in enumerate(pd.Series(list1).value_counts()[1:11].sort_values(ascending=True).values): 
    ax.text(.8, i, v,fontsize=12,color='white',weight='bold')
plt.title('Actors with highest appearance')
plt.show()

###    Cast

In [None]:
# Word cloud of the cast members that appear most often.
# WordCloud and plt come from the import cell at the top of the notebook; no
# imports are repeated here, so IPython's `Image` is not replaced by PIL's.
# Flatten the per-movie cast lists into one space-separated string.
cast_text = ' '.join(str(member) for cast_list in imdb_data['title_cast'] for member in cast_list)
# Use a dedicated name so the imported `wordcloud` module is not shadowed
cast_cloud = WordCloud(width=1000, height=500, random_state=25, max_font_size=250, max_words=100).generate(cast_text)
plt.figure(figsize=(10, 7))
plt.imshow(cast_cloud, interpolation="bilinear")
plt.axis('off')
plt.title('popular cast')
plt.show()

> ### Directors

In [None]:
# Plot Distribution of Directors

def xstr(s):
    """Return ``s`` converted to a string, with ``None`` mapped to ''."""
    return '' if s is None else str(s)
imdb_data['director'] = imdb_data['director'].apply(xstr)
plt.subplots(figsize=(6,10))
ax = imdb_data[imdb_data['director']!=''].director.value_counts()[2:12].sort_values(ascending=True).plot.barh(width=0.9,color=sns.color_palette('flare',10))
for i, v in enumerate(imdb_data[imdb_data['director']!=''].director.value_counts()[2:12].sort_values(ascending=True).values): 
    ax.text(.5, i, v,fontsize=12,color='white',weight='bold')
plt.title('Directors with highest movies')
plt.show()

In [None]:

# Plot the most frequent budget values.
# `xstr` is defined in the "Directors" cell and is reused here rather than redefined.
imdb_data['budget'] = imdb_data['budget'].apply(xstr)
plt.subplots(figsize=(6,10))
# Count the non-empty budgets once and reuse the result for the bars and their labels
budget_counts = imdb_data[imdb_data['budget']!=''].budget.value_counts()[2:12].sort_values(ascending=True)
ax = budget_counts.plot.barh(width=0.9,color=sns.color_palette('gist_earth',10))
for i, v in enumerate(budget_counts.values): 
    ax.text(.5, i, v,fontsize=12,color='white',weight='bold')
plt.title('Movie budget')
plt.show()

> ### Plot keywords

In [None]:
# Create wordcloud of plot_keywords

# Flatten the per-movie keyword lists into one space-separated string
# (a single join instead of repeated string concatenation)
cloud = ' '.join(str(e) for i in imdb_data['plot_keywords'] for e in i)

# Render the cloud that was built above
keyword_cloud = WordCloud(width=1000, height=500, random_state=21, max_font_size=200, max_words=100).generate(cloud)
plt.figure(figsize=(10, 7))
plt.imshow(keyword_cloud, interpolation="bilinear")
plt.axis('off')
plt.title('PLOT KEYWORDS')
plt.show()

> ### Ratings

In [None]:
print (f'Number of ratings in dataset: {train_data.shape[0]}')

In [None]:
# Plot Distribution of Ratings

with sns.axes_style('white'):
    # `catplot` is the current name of the former `factorplot`; the plotted
    # variable is passed by keyword, as recent seaborn versions require
    g = sns.catplot(x='rating', data=train_data, aspect=2.0, kind='count', palette="rocket_r")
    g.set_ylabels('Total number of ratings')
    plt.title("User rating distribution")
print (f'Average rating in dataset: {np.mean(train_data["rating"])}')

In [None]:
#create a dataframe to count how many times each user has rated a movie
user_id = pd.DataFrame(train_data.groupby('userId')['rating'].mean())
user_id['total number of ratings'] = pd.DataFrame(train_data.groupby('userId')['rating'].count())
user_id.head(10)

In [None]:
#Create new dataframe
user_id = pd.DataFrame(train_data.groupby('userId')['rating'].mean())
user_id['total number of ratings'] = pd.DataFrame(train_data.groupby('userId')['rating'].count())
# sort dataframe by total number of ratings
user_id.sort_values(by=['total number of ratings'], inplace=True, ascending=False)
# reset the index
user_id.reset_index(inplace=True)

In [None]:
#total number of ratings per user id
sns.barplot(y="total number of ratings", x="userId", data=user_id.head(10), 
 order = user_id.head(10).sort_values('total number of ratings', ascending=False).userId, palette="tab20")
plt.xticks(rotation=90)

### **Joining dataFrames**

The data was provided in multiple csv files, the analysis of the data will be simpler and more efficiently carried out if the multiple files are joined to form one dataframe.

In [None]:
joint_df = train_data.merge(movies,on='movieId').merge(imdb_data,on='movieId')
joint_df.head(3)

In [None]:
#Create new dataframe
user_id = pd.DataFrame(joint_df.groupby('userId')['rating'].mean())
user_id['total number of ratings'] = pd.DataFrame(joint_df.groupby('userId')['rating'].count())
# sort dataframe by total number of ratings
user_id.sort_values(by=['total number of ratings'], inplace=True, ascending=False)
# reset the index
user_id.reset_index(inplace=True)

In [None]:
# Clean the budget feature:
#  - remove the thousands separators (commas)
#  - keep the first run of digits, which drops currency markers like "$" and "GBP"
budget_digits = (
    joint_df['budget']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.extract(r'(\d+)', expand=False)
)

# Convert to a number, use 0 where no budget is available, and store as an integer
joint_df['budget'] = pd.to_numeric(budget_digits).fillna(0).astype('int64')

In [None]:
#Top budgets  
plt.figure(figsize = (20,5))
budget=joint_df['budget'].explode()
ax=sns.countplot(x=budget, order = budget.value_counts().index[1:20], palette="tab20b")
ax.set_title('Top budget',fontsize=30)
plt.xticks(rotation=90)
plt.show()

In [None]:
def correlation_matrix( df ):
    """Draw an annotated heatmap of the pairwise correlations between numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to correlate. Non-numeric columns are ignored, because a
        correlation cannot be computed for them.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the heatmap was drawn on, so callers can style it further.
    """
    corr = df.select_dtypes(include='number').corr()
    figure, ax = plt.subplots(figsize=(12, 10))
    cmap = sns.diverging_palette(220, 20, as_cmap=True)
    sns.heatmap(corr, cmap=cmap, square=True, cbar_kws={'shrink': .9}, ax=ax, annot=True, annot_kws={'fontsize': 12})
    return ax

In [None]:
# Select a number of features from the dataframe to make the correlation map
joint1_df=joint_df
# `runtime` is still stored as text here (missing values were filled with ''),
# so convert it on a copy of the selected features before correlating
corr_features = joint_df[['userId','movieId','rating', 'timestamp', 'budget','runtime']].copy()
corr_features['runtime'] = pd.to_numeric(corr_features['runtime'], errors='coerce')
correlation_matrix(corr_features)
plt.title("Correlation matrix")

In [None]:
joint1_df['runtime'].describe()

In [None]:
#convert runtime to numeric to be able to plot the graph
# errors='coerce' turns the '' placeholders (and any other non-numeric text) into NaN
joint1_df['runtime'] = pd.to_numeric(joint_df['runtime'], errors='coerce')
plt.figure(figsize=(12,6))
# keep plausible runtimes only: more than 0 and less than 300
valid_runtime = joint1_df[(joint1_df['runtime'] < 300) & (joint1_df['runtime'] > 0)]['runtime']
# histplot with a density scale and KDE curve replaces the deprecated distplot
sns.histplot(valid_runtime, kde=True, stat="density")
plt.title("Runtime distribution graph")


In [None]:
#create a new dataframe
new1 = pd.DataFrame(joint_df.groupby('title')['rating'].mean())
new1['total number of ratings'] = pd.DataFrame(joint_df.groupby('title')['rating'].count())
# sort dataframe by total number of ratings
new1.sort_values(by=['total number of ratings'], inplace=True, ascending=False)
# reset the index
new1.reset_index(inplace=True)

In [None]:
fig, ax = plt.subplots(figsize=(6, 10))

sns.barplot(x="total number of ratings", y="title", data=new1.head(10), palette='crest_r')

In [None]:
joint_df.info()

In [None]:
joint_df.isnull().sum()

In [None]:
joint_df['genres'] = [','.join(map(str, l)) for l in joint_df['genres']]
joint_df['title_cast'] = [','.join(map(str, l)) for l in joint_df['title_cast']]
joint_df['plot_keywords'] = [','.join(map(str, l)) for l in joint_df['plot_keywords']]
joint_df.head()

In [None]:
# replace each plot_keywords observation, with only the first 2 words in the plot_keywords column
joint_df['plot_keywords'] = joint_df['plot_keywords'].str.extract('(^\w+\W\w+)')
# Then replace NaN values again with nulls a computer understands
joint_df['plot_keywords'] = joint_df['plot_keywords'].fillna('')

In [None]:
# replace each cast observation, with only the first 2 cast members
joint_df['title_cast'] = joint_df['title_cast'].str.extract('(^\w+\W\w+\W\w+\W\w+)')
# Then replace NaN values again with nulls a computer understands
joint_df['title_cast'] = joint_df['title_cast'].fillna('')

In [None]:
joint_df.head(3)

<a id="six"></a>
## 6. Content Based Filtering
The content-based approach uses additional information about users and/or items. This filtering method uses item features to recommend other items similar to what the user likes and also based on their previous actions or explicit feedback.

We will use a content-based approach to make movie recommendations based on a movie title input:

- Cosine Similarity
- n-Recommendations

In [None]:
# Concatenate columns into single column for vectorization.
# A missing year is replaced by '' so that it cannot turn the whole combined value
# into NaN, and the three fields are comma-separated like the terms inside them.
joint_df['combined'] = joint_df['year'].fillna('') + ',' + joint_df['genres'] + ',' + joint_df['plot_keywords']

In [None]:
content_df = joint_df.merge(movies_og,on='movieId')
joint_df.head(3)

In [None]:
content_df = content_df.sample(n=35000)

content_df.set_index('movieId', inplace = True)

content_df.drop(['userId','rating','timestamp', 'genres_x','year', 'title_cast','director','runtime','budget','plot_keywords', 'genres_y', 'title_x'],axis=1,inplace=True)

content_df.head(3)

In [None]:
def combined_cleaner(df, columns):
    """Normalise comma-separated text columns into space-separated lower-case tokens.

    Each term is lower-cased and the spaces inside it are removed, so a
    multi-word term becomes one token. The commas between terms are then turned
    into spaces so that a vectorizer sees one token per term instead of one
    giant token per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to clean. It is modified in place.
    columns : iterable of str
        Names of the string columns to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.
    """
    for i in columns:
        df[i] = (
            df[i]
            .str.lower()                        # lower case
            .str.replace(' ', '', regex=False)  # collapse multi-word terms
            .str.replace(',', ' ', regex=False) # commas become token separators
            .str.strip()                        # drop separators left at the edges
        )
    return df

combined_cleaner(content_df, ['combined'])
content_df.head(3)

In [None]:
titles = content_df['title_y']
indices = pd.Series(content_df.index, index=content_df['title_y'])

In [None]:
titles

In [None]:
# Vectorize the new column containing genres, year, plot keywords
cv = CountVectorizer()
cv_matrix = cv.fit_transform(content_df['combined'])

In [None]:
# Cosine Similarity Matrix
cos_sim = cosine_similarity(cv_matrix, cv_matrix)

In [None]:
def content_generate_top_N_recommendations(movie_title, N=10):
    """Recommend the N movies most similar to the given title(s).

    Uses the notebook-level ``titles`` Series and ``cos_sim`` matrix. Row ``k``
    of ``cos_sim`` is taken to describe the movie at position ``k`` of
    ``titles``, since both are built from ``content_df``.

    Parameters
    ----------
    movie_title : str or list of str
        One title, or a list of titles, exactly as they appear in ``titles``.
        With several titles the similarity scores are averaged.
    N : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to N distinct titles, most similar first, excluding the input titles.

    Raises
    ------
    KeyError
        If a requested title is not present in ``titles``.
    """
    if isinstance(movie_title, str):
        movie_title = [movie_title]

    title_values = titles.to_numpy()

    # Locate each requested title by position; the same movie can occur on many
    # rows, and they all share one feature string, so the first match is enough
    query_positions = []
    for name in movie_title:
        matches = np.flatnonzero(title_values == name)
        if matches.size == 0:
            raise KeyError(f"Movie title not found: {name!r}")
        query_positions.append(matches[0])

    if N <= 0 or not query_positions:
        return titles.iloc[[]]

    # Average the similarity rows of all requested titles and rank best-first
    scores = np.asarray(cos_sim[query_positions]).mean(axis=0)
    ranked_positions = np.argsort(-scores, kind='stable')

    # Walk down the ranking, skipping the requested titles and repeated titles
    requested = set(movie_title)
    already_picked = set()
    picks = []
    for position in ranked_positions:
        candidate = title_values[position]
        if candidate in requested or candidate in already_picked:
            continue
        already_picked.add(candidate)
        picks.append(position)
        if len(picks) == N:
            break

    # Convert the positions back into titles
    return titles.iloc[picks]

In [None]:
# Generate 10 recommendations - Change value of N to change recommendations
content_generate_top_N_recommendations(['2001: A Space Odyssey (1968)'], N=10)

<a id="seven"></a>
## 7. Collaborative Filtering
The Collaborative filtering method for recommender systems is a method that is solely based on the past interactions that have been recorded between users and items, in order to produce new recommendations. In essence, Collaborative Filtering looks at similarities between users.
- Matrix Factorization-based algorithms
    - Singular Value Decomposition (SVD)
    - Non-negative matrix factorization (NMF)
    - Slope One
    - Co-clustering
- k-Nearest Neighbors (KNN)
    - KNNBasic
    - KNNWithMeans
   
#### Data Used in Collaborative Filtering:
 - train data (for model training and evaluation)
 - test data (for submission)
 
We will be training a sample of our data on numerous models and comparing training time and RMSE score. We will then choose the best model and move on to hyperparameter tuning. Hyperparameter tuning will be used in collaboration with Comet, where we can view past results and the hyperparameters used, and then choose the best hyperparameters.

In [None]:
# Loading as Surprise dataframe
df_train = train_data.copy()

# Keep only the movies that were rated more than 100 times
rating_counts = df_train['movieId'].value_counts()
filter_movies = rating_counts[rating_counts > 100].index.tolist()
df_new = df_train[df_train['movieId'].isin(filter_movies)]

# Sample with a fixed seed so the model comparison is reproducible between runs.
# Remove the sampling to run on full data - Not advised, rather set n values higher. Max 5 000 000
df_train = df_new.sample(n=5000, random_state=42)

reader = Reader()
# Data selected for model training
data = Dataset.load_from_df(df_train[["userId", "movieId", "rating"]], reader)

Due to computation limitations, we will be training all models on only a sample dataset, and the best performing model will be used in hyperparameter tuning & will then be trained on the full dataset.

In [None]:
trainset, testset = train_test_split(data, test_size=0.01, random_state = 42)

#### Singular Value Decomposition (SVD)
SVD is the most common method used in collaborative filtering recommender systems, it is a linear algebra factorization technique that decomposes a matrix into three matrices. SVD shrinks space dimensions from n-dimension to k-dimension (where k<n), thus reducing the number of features.

In [None]:
#Save model traning time
modeltime = time.time()

#Selecting and fitting the SVD model using the train_split set
svd = SVD()
svd.fit(trainset)

#Making predictions using the fitted model
svd_pred = svd.test(testset) 

#Checking the root mean squared error to get an idea of how well the model performs
svd_rmse = rmse(svd_pred)

#View model training time & rmse
print("Model Runtime: %0.2f seconds"%((time.time() - modeltime)))
svd_time = round(((time.time() - modeltime)),2)

#### Non-negative matrix factorization (NMF)

In [None]:
#Save model traning time
modeltime = time.time()

#Selecting and fitting the NMF model using the train_split set
nmf = NMF()
nmf.fit(trainset)

#Making predictions using the fitted model
nmf_pred = nmf.test(testset)

#Checking the root mean squared error to get an idea of how well the model performs
nmf_rmse = rmse(nmf_pred)

#View model training time & rmse
print("Model Runtime: %0.2f seconds"%((time.time() - modeltime)))
nmf_time = round(((time.time() - modeltime)),2)

#### Slope One

In [None]:
#Save model traning time
modeltime = time.time()

#Selecting and fitting the SlopeOne model using the train_split set
slope = SlopeOne()
slope.fit(trainset)

#Making predictions using the fitted model
slope_pred = slope.test(testset)

#Checking the root mean squared error to get an idea of how well the model performs
slope_rmse = rmse(slope_pred)

#View model training time & rmse
print("Model Runtime: %0.2f seconds"%((time.time() - modeltime)))
slope_time = round(((time.time() - modeltime)),2)

#### Co-clustering

In [None]:
#Save model traning time
modeltime = time.time()

#Selecting and fitting the CoClustering model using the train_split set
cc = CoClustering()
cc.fit(trainset)

#Making predictions using the fitted model
cc_pred = cc.test(testset)

#Checking the root mean squared error to get an idea of how well the model performs
cc_rmse = rmse(cc_pred)

#View model training time & rmse
print("Model Runtime: %0.2f seconds"%((time.time() - modeltime)))
cc_time = round(((time.time() - modeltime)),2)

#### KNNBasic

In [None]:
#Save model traning time
modeltime = time.time()

#Selecting and fitting the KNNBasic model using the train_split set
knnb = KNNBasic()
knnb.fit(trainset)

#Making predictions using the fitted model
knnb_pred = knnb.test(testset) 

#Checking the root mean squared error to get an idea of how well the model performs
knnb_rmse = rmse(knnb_pred)

#View model training time & rmse
print("Model Runtime: %0.2f seconds"%((time.time() - modeltime)))
knnb_time = round(((time.time() - modeltime)),2)

#### KNNWithMeans

In [None]:
#Save model traning time
modeltime = time.time()

#Selecting and fitting the KNNWithMeans model using the train_split set
knnm = KNNWithMeans()
knnm.fit(trainset)

#Making predictions using the fitted model
knnm_pred = knnm.test(testset) 

#Checking the root mean squared error to get an idea of how well the model performs
knnm_rmse = rmse(knnm_pred)

#View model training time & rmse
print("Model Runtime: %0.2f seconds"%((time.time() - modeltime)))
knnm_time = round(((time.time() - modeltime)),2)

<a id="eight"></a>
## 8. Evaluating Model Performances
 - Select best model for Hyperparameter Tuning and Submission

In [None]:
# Compare RMSE Between Models

fig,axis = plt.subplots(figsize=(11, 2))
rmse_y = ['SVD','NMF','Slope One', 'Co-Clustering', 'KNNBasic', 'KNNWithMeans']
rmse_x = [svd_rmse,nmf_rmse,slope_rmse, cc_rmse, knnb_rmse, knnm_rmse]
ax = sns.barplot(y=rmse_y, x=rmse_x,palette='flare')
plt.title('Model RMSE Compared',fontsize=13)
plt.xlabel('RMSE', size=13)
plt.yticks(size=12)
plt.xticks(size=12)
for i in ax.patches:
    ax.text(i.get_x() + i.get_height()*1.5, i.get_y() + i.get_height(), round(i.get_width(),2), fontsize=12, ha="left", va='bottom')
plt.show()

In [None]:
# Compare Training Times Between Models

fig,axis = plt.subplots(figsize=(11, 3))
rmse_y = ['SVD','NMF','Slope One', 'Co-Clustering', 'KNNBasic', 'KNNWithMeans']
rmse_x = [svd_time,nmf_time,slope_time, cc_time, knnb_time, knnm_time]
ax = sns.barplot(y=rmse_y, x=rmse_x,palette='flare')
plt.title('Model Training Time Compared',fontsize=15)
plt.xlabel('Training Time', size=13)
plt.yticks(size=12)
plt.xticks(size=12)
for i in ax.patches:
    ax.text(i.get_x() + i.get_height()*2.45, i.get_y() + i.get_height()-0.2, round(i.get_width(),2), fontsize=12, ha="left", va='bottom')
plt.show()

<a id="nine"></a>
## 9. Hyperparameter Tuning
 - Hyperparameter Tuning of Best performing Model - SVD
 - In collaboration with Comet

In [None]:
df_train = train_data.copy()

# Remove the line of code below to hyperparameter tune on all data.
# Use full data for final training and test data
# (fixed seed so that tuning runs are comparable)
df_train = df_train.sample(n=20000, random_state=42)

# The Surprise dataset gets its own name: rebinding `train_data` would replace the
# ratings DataFrame and make this cell fail when it is run a second time
tuning_data = Dataset.load_from_df(df_train[["userId", "movieId", "rating"]], reader = Reader())
trainset = tuning_data.build_full_trainset()

# Select hyperparameters to be included in the GridSearch to find optimal.
param_grid = {'n_epochs': [30, 35, 40, 45, 50, 55, 60, 65],
'n_factors': [50, 150, 300, 400],
'lr_all': [0.001, 0.003, 0.004, 0.005],
'reg_all': [0.005, 0.01, 0.02, 0.4, 0.5, 0.8]}

# Apply GridSearchCV & fit to chosen data.
# refit=True retrains the best configuration on all of `tuning_data`, which is
# what makes `svd.predict(...)` usable after the search.
svd = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, n_jobs = -1, refit=True)
svd.fit(tuning_data)

# Prints best hyperparameters
print("Best Parameters", svd.best_params['rmse'])

In [None]:
# Log best parameters & RMSE score to Comet
params = svd.best_params['rmse']
# Stored as `best_rmse` so that the `rmse` function imported from surprise.accuracy
# (used by the model comparison cells) is not overwritten
best_rmse = svd.best_score['rmse']


# Uncomment the below to save results to comet
"""experiment.log_parameters(params)
experiment.log_metric('rmse', best_rmse)

# End the experiment
experiment.end()"""

<a id="ten"></a>
## 10. Conclusion
- Recommender System
- Performances of all Models
- Best Performing Model


After numerous hyperparameter tuning experiments, we concluded the following as the best parameters.
Best Parameters {'n_epochs': 55, 'n_factors': 150, 'lr_all': 0.0065, 'reg_all': 0.01}


<a id="eleven"></a>
## 11. Submission

In [None]:
# Train a final SVD with the best hyperparameters found by the grid search, then
# predict a rating for every (userId, movieId) pair of the test set.
# `trainset` is the full trainset built in the hyperparameter tuning cell.
final_svd = SVD(**svd.best_params['rmse'])
final_svd.fit(trainset)
ratings_predictions = [final_svd.predict(uid, iid) for uid, iid in zip(test_data['userId'], test_data['movieId'])]

In [None]:
df_pred=pd.DataFrame(ratings_predictions)
df_pred=df_pred.rename(columns={'uid':'userId', 'iid':'movieId','est':'rating'})
df_pred.drop(['r_ui','details'],axis=1,inplace=True)

In [None]:
# Build the submission key "<userId>_<movieId>" column by column. Working on the two
# integer columns directly keeps the ids as integers; a row-wise apply would mix them
# with the float `rating` column.
df_pred['Id'] = df_pred['userId'].astype(int).astype(str) + '_' + df_pred['movieId'].astype(int).astype(str)

In [None]:
df_pred.head()

In [None]:
df_pred.drop(['userId', 'movieId'], inplace=True, axis= 1)
df_pred = df_pred[['Id', 'rating']]

In [None]:
df_pred.to_csv("submission.csv", index=False)