# Netflix Challenge

### Imports

In [18]:
import pandas as pd
from surprise import SVD, SVDpp, Reader, Dataset
from surprise.model_selection import GridSearchCV

### Read data from files

In [13]:
# Every CSV under resources/ is semicolon-separated.
# movies and users are indexed by their "ID" column; movies additionally
# declares a backslash as its escape character.
movies = pd.read_csv(
    "resources/movies.csv",
    sep=";",
    escapechar="\\",
    index_col="ID",
)
users = pd.read_csv("resources/users.csv", sep=";", index_col="ID")

# ratings and predictions are loaded with every column as a string, so the
# UserID / MovieID values have the same type in both frames.
ratings = pd.read_csv("resources/ratings.csv", sep=";", dtype=str)
predictions = pd.read_csv("resources/predictions.csv", sep=";", dtype=str)

### Transform train data to have correct format

In [14]:
# Ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# Surprise expects the user, item and rating columns, in that order.
rating_columns = ['UserID', "MovieID", "Rating"]
data = Dataset.load_from_df(ratings[rating_columns], reader)

# Training set built from all of `data` (no hold-out split); used for the
# final model fit.
trainset = data.build_full_trainset()

### Using GridSearch to improve hyper parameter `n_factors`

In [52]:
# Coarse sweep over n_factors (10, 20, ..., 120), scored by RMSE with cv=3.
# n_epochs and lr_all are each pinned to a single value (the ones reported
# by the later n_epochs / lr_all searches in this notebook).
param_grid = dict(
    n_factors=range(10, 130, 10),
    n_epochs=[24],
    lr_all=[0.0044],
)
gridSearch = GridSearchCV(SVD, param_grid, measures=['RMSE'], cv=3)
gridSearch.fit(data)

The above gave n_factors: 30. Since the steps were quite large, we will do another tuning with smaller steps

In [None]:
# Fine sweep over n_factors (20, 21, ..., 39) around the coarse optimum of 30,
# scored by RMSE with cv=3. n_epochs and lr_all stay pinned.
param_grid = dict(
    n_factors=range(20, 40),
    n_epochs=[24],
    lr_all=[0.0044],
)
gridSearch = GridSearchCV(SVD, param_grid, measures=['RMSE'], cv=3)
gridSearch.fit(data)

The above gave n_factors: 35

### Using GridSearch to improve hyper parameter `n_epochs`

In [44]:
# Coarse sweep over n_epochs (5, 10, 15, 20) with n_factors fixed at 35,
# scored by RMSE with cv=3. lr_all is not listed, so the algorithm's default
# is used.
param_grid = dict(
    n_epochs=range(5, 25, 5),
    n_factors=[35],
)
gridSearch = GridSearchCV(SVD, param_grid, measures=['RMSE'], cv=3)
gridSearch.fit(data)

The above gave n_epochs: 20. Since the steps were quite large, we will do another tuning with smaller steps

In [None]:
# Fine sweep over n_epochs (18, 19, ..., 24) with n_factors fixed at 35,
# scored by RMSE with cv=3.
param_grid = dict(
    n_epochs=range(18, 25),
    n_factors=[35],
)
gridSearch = GridSearchCV(SVD, param_grid, measures=['RMSE'], cv=3)
gridSearch.fit(data)

The above gave n_epochs: 24

### Using GridSearch to improve hyper parameter `lr_all`

In [None]:
# Coarse sweep over lr_all (0.001, 0.002, ..., 0.009) with n_factors and
# n_epochs pinned to the values found above, scored by RMSE with cv=3.
coarse_learning_rates = [step / 1000 for step in range(1, 10)]
param_grid = dict(
    lr_all=coarse_learning_rates,
    n_factors=[35],
    n_epochs=[24],
)
gridSearch = GridSearchCV(SVD, param_grid, measures=['RMSE'], cv=3)
gridSearch.fit(data)

The above gave lr_all: 0.005. Since the steps were quite large, we will do another tuning with smaller steps

In [50]:
# Fine sweep over lr_all around the coarse optimum of 0.005:
# 0.0040, 0.0041, ..., 0.0054 in steps of 0.0001. This is the range recorded
# in the "Best parameters" recap (0.0040 up to, but excluding, 0.0055).
# The previous upper bound of 45 stopped at 0.0044, which excluded the coarse
# optimum itself and left the reported winner on the edge of the grid.
# n_factors and n_epochs stay pinned; scoring is RMSE with cv=3.
params = {'lr_all': [x / 10000 for x in range(40, 55, 1)], 'n_factors': [35], 'n_epochs': [24] }
gridSearch = GridSearchCV(SVD, params, measures=['RMSE'], cv=3)
gridSearch.fit(data)

The above gave lr_all: 0.0044

### Get best parameters

In [53]:
# Best parameter combination, keyed by measure, of whichever grid search was
# fitted most recently (`gridSearch` is reassigned by every search cell).
best_params = gridSearch.best_params
print(best_params)

{'rmse': {'n_factors': 35, 'n_epochs': 24, 'lr_all': 0.0044}}


### Best parameters
To recap, we had the following best results from the ranges:

- `n_factors`: 30 from `range(10, 130, 10)`, then 35 from `range(20, 40, 1)`
- `n_epochs`: 20 from `range(5, 25, 5)`, then 24 from `range(18, 25, 1)`
- `lr_all`: 0.005 from `range(0.001, 0.01, 0.001)`, then 0.0044 from `range(0.0040, 0.0055, 0.0001)`

So, the final parameters used are:

- `n_factors`: 35, `n_epochs`: 24, `lr_all`: 0.0044

### Create an instance of SVDpp with the hyperparameters that were derived above (tuned using SVD) and predict

In [54]:
# Final model: SVD++ configured with the values reported by the grid searches.
# NOTE(review): those searches were run with SVD, not SVDpp — confirm the
# tuned values are meant to carry over.
svd = SVDpp(n_factors=35, n_epochs=24, lr_all=0.0044)
svd.fit(trainset)

# One [Id, estimated rating] row per requested (user, movie) pair. Ids are
# 1-based row positions, which equals index + 1 because `predictions` keeps
# the default 0..n-1 index it got from read_csv.
requested_pairs = zip(predictions['UserID'], predictions['MovieID'])
recommendations = [
    [row_id, svd.predict(user_id, movie_id).est]
    for row_id, (user_id, movie_id) in enumerate(requested_pairs, start=1)
]

### Create CSV file with solution

In [55]:
# Turn the [Id, Rating] rows into a frame and write it to recommendations.csv
# without the DataFrame index column.
submission_columns = ['Id', 'Rating']
predict_df = pd.DataFrame(data=recommendations, columns=submission_columns)
predict_df.to_csv('recommendations.csv', index=False)
print('Done')

Done
