# Data Loading

### Import necessary tools

In [1]:
import pandas as pd
import numpy as np
import copy

from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import AlgoBase
from surprise import accuracy
from surprise import BaselineOnly

from surprise.model_selection import cross_validate
from surprise.model_selection import RandomizedSearchCV
from surprise.model_selection import train_test_split

In [2]:
# dataset locations
movie_path = "dataset/movies.csv"
rating_path = "dataset/ratings.csv"


### Import datasets

In [3]:
#import movie database
movie_data = pd.read_csv(movie_path,
                         index_col = "movieId",
                         delimiter=',')

In [4]:
movie_data.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [5]:
#import movie ratings
rating_data = pd.read_csv(rating_path,
                         delimiter=',')

In [6]:
rating_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# Preprocessing

In [7]:
rating_data.duplicated((['userId', 'movieId'])).sum()

0

In [8]:
rating_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
rating_data.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [10]:
rating_data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [11]:
rating_data.drop('timestamp', axis = 1, inplace = True)

In [12]:
rating_data.columns=['user_id','item_id','rating']


In [13]:
rating_data.head()

Unnamed: 0,user_id,item_id,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [14]:
reader = Reader(rating_scale=(0,5))
utility_data = Dataset.load_from_df(rating_data, reader)

In [15]:
type(utility_data)

surprise.dataset.DatasetAutoFolds

In [16]:
trainset, testset = train_test_split(utility_data, test_size = 0.2, random_state = 123)

In [17]:
type(trainset)

surprise.trainset.Trainset

In [18]:
type(testset)

list

In [19]:
print("complete utility data = ", len(utility_data.df))
print("train set utility data = ", trainset.n_ratings)
print("complete utility data = ", len(testset))

complete utility data =  100836
train set utility data =  80668
complete utility data =  20168


# Model Training

## Baseline (using BaselineOnly)

### Create class

### Create baseline model and calculate baseline error

In [20]:
#create model
baseline_model = BaselineOnly()
baseline_model.fit(trainset)

baseline_trained = baseline_model.test(testset)
baseline_rmse = accuracy.rmse(baseline_trained)

Estimating biases using als...
RMSE: 0.8725


We want to makesure, our model has a better rmse

## Training SVD

### Hyperparamater Tuning Using Cross-Validation

In [21]:
SVD_params_cv = {
    'n_epochs': [10, 20, 40, 50, 75, 100],
    'n_factors': [10, 25, 50, 100, 125, 150, 200],
    'lr_all': [0.01, 0.005, 0.002, 0.001],
    'reg_all': [0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.05],
}

In [22]:
SVD_cv = RandomizedSearchCV(algo_class=SVD,
                            param_distributions = SVD_params_cv,
                            cv=5,
                            random_state=123
                   )

In [23]:
SVD_cv.fit(data = utility_data)

In [24]:
best_params_svd = SVD_cv.best_params['rmse']
best_params_svd

{'n_epochs': 100, 'n_factors': 50, 'lr_all': 0.002, 'reg_all': 0.05}

### Training SVD with the best hyper-parameters

In [25]:
SVD_model = SVD(**best_params_svd)

In [26]:
SVD_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x25c59ad6590>

### Evaluate Model Performance using RMSE

In [27]:
test_predicted = SVD_model.test(testset)

In [28]:
test_rmse = accuracy.rmse(test_predicted)
test_rmse

RMSE: 0.8576


0.8576225145105821

Conclusion: The SVD is better than the baseline

# Prediction

## Create Prediction Function

#### Look for movie the user has not watch => unrated item

In [29]:
#function for unrated item

def get_unrated_item(userid, rating_data):
    
    unique_item_id = set(rating_data['item_id'])
    rated_item_id = set(rating_data.loc[rating_data['user_id']==userid, 'item_id'])

    unrated_item_id = unique_item_id.difference(rated_item_id)

    return unrated_item_id

In [30]:
def get_pred_unrated_item(userid, estimator, unrated_item_id):
    
    pred_dict = {
        'user_id': userid,
        'item_id': [],
        'predicted_rating': []
    }

   
    for id in unrated_item_id:
        pred_id = estimator.predict(uid = pred_dict['user_id'],
                                    iid = id)

        pred_dict['item_id'].append(id)
        pred_dict['predicted_rating'].append(pred_id.est)

    # Create a dataframe
    pred_data = pd.DataFrame(pred_dict).sort_values('predicted_rating',
                                                     ascending = False)

    return pred_data

In [31]:
def get_top_highest_unrated(estimator, k, userid, rating_data, metadata):
    
    unrated_item_id = get_unrated_item(userid=userid, rating_data=rating_data)
    
    predicted_unrated_item = get_pred_unrated_item(userid = userid,
                                                   estimator = estimator,
                                                   unrated_item_id = unrated_item_id)

    # Sort & add metadata
    top_item_pred = predicted_unrated_item.head(k).copy()
    top_item_pred_detail = metadata.loc[top_item_pred['item_id'], :]
    

    return top_item_pred_detail


In [32]:
# Generate 10 recommendation for user 500
get_top_highest_unrated(estimator=SVD_model,
                        k=10,
                        userid=23,
                        rating_data=utility_data.df,
                        metadata=movie_data)

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
475,In the Name of the Father (1993),Drama
1204,Lawrence of Arabia (1962),Adventure|Drama|War
2360,"Celebration, The (Festen) (1998)",Drama
933,To Catch a Thief (1955),Crime|Mystery|Romance|Thriller
951,His Girl Friday (1940),Comedy|Romance
5690,Grave of the Fireflies (Hotaru no haka) (1988),Animation|Drama|War
246,Hoop Dreams (1994),Documentary
720,Wallace & Gromit: The Best of Aardman Animatio...,Adventure|Animation|Comedy
2239,Swept Away (Travolti da un insolito destino ne...,Comedy|Drama
27773,Old Boy (2003),Mystery|Thriller
