# Data Loading

### Import necessary tools

In [1]:
import pandas as pd
import numpy as np
import copy

from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import AlgoBase
from surprise import accuracy
from surprise import BaselineOnly

from surprise.model_selection import cross_validate
from surprise.model_selection import RandomizedSearchCV
from surprise.model_selection import train_test_split

In [2]:
# dataset locations
movie_path = "dataset/movies.csv"
rating_path = "dataset/ratings.csv"


### Import datasets

In [3]:
#import anime database
movie_data = pd.read_csv(movie_path,
                         index_col = "movieId",
                         delimiter=',')

In [4]:
movie_data.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [5]:
#import anime ratings
rating_data = pd.read_csv(rating_path,
                         delimiter=',')

In [6]:
rating_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# Preprocessing

In [7]:
rating_data.replace(to_replace = -1, value = 1, inplace = True)

In [8]:
rating_data.duplicated((['userId', 'movieId'])).sum()

0

In [9]:
rating_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [10]:
rating_data.drop('timestamp', axis = 1, inplace = True)

In [11]:
rating_data.columns=['user_id','item_id','rating']


In [12]:
rating_data.head()

Unnamed: 0,user_id,item_id,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [13]:
reader = Reader(rating_scale=(1,5))
utility_data = Dataset.load_from_df(rating_data, reader)

In [14]:
type(utility_data)

surprise.dataset.DatasetAutoFolds

In [15]:
trainset, testset = train_test_split(utility_data, test_size = 0.2, random_state = 123)

In [16]:
type(trainset)

surprise.trainset.Trainset

In [17]:
type(testset)

list

In [18]:
len(utility_data.df), trainset.n_ratings, len(testset)

(100836, 80668, 20168)

# Model Training

## Baseline (using BaselineOnly)

### Create class

### Create baseline model and calculate baseline error

In [19]:
#create model
baseline_model = BaselineOnly()
baseline_model.fit(trainset)

baseline_trained = baseline_model.test(testset)
baseline_rmse = accuracy.rmse(baseline_trained)

Estimating biases using als...
RMSE: 0.8725


We want to makesure, our model has a better rmse

## Training SVD

### Hyperparamater Tuning Using Cross-Validation

In [20]:
SVD_params_cv = {
    'n_epochs': [10, 20, 40, 50, 75, 100],
    'n_factors': [10, 25, 50, 100, 125, 150, 200],
    'lr_all': [0.01, 0.005, 0.002, 0.001],
    'reg_all': [0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.05],
}

In [21]:
SVD_cv = RandomizedSearchCV(algo_class=SVD,
                            param_distributions = SVD_params_cv,
                            cv=5,
                            random_state=123
                   )

In [22]:
SVD_cv.fit(data = utility_data)

In [23]:
best_params_svd = SVD_cv.best_params['rmse']
best_params_svd

{'n_epochs': 20, 'n_factors': 50, 'lr_all': 0.01, 'reg_all': 0.05}

### Training SVD with the best hyper-parameters

In [24]:
SVD_model = SVD(**best_params_svd)

In [25]:
SVD_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2383877a4d0>

### Evaluate Model Performance using RMSE

In [26]:
test_predicted = SVD_model.test(testset)

In [27]:
test_rmse = accuracy.rmse(test_predicted)
test_rmse

RMSE: 0.8592


0.8592017513848623

Conclusion: The SVD is better than the baseline

# Prediction

## Create Prediction Function

#### Look for movie the user has not watch => unrated item

In [37]:
#function for unrated item

def get_unrated_item(userid, rating_data):
    
    unique_item_id = set(rating_data['item_id'])
    rated_item_id = set(rating_data.loc[rating_data['user_id']==userid, 'item_id'])

    unrated_item_id = unique_item_id.difference(rated_item_id)

    return unrated_item_id

In [41]:
def get_pred_unrated_item(userid, estimator, unrated_item_id):
    
    pred_dict = {
        'user_id': userid,
        'item_id': [],
        'predicted_rating': []
    }

   
    for id in unrated_item_id:
        pred_id = estimator.predict(uid = pred_dict['user_id'],
                                    iid = id)

        pred_dict['item_id'].append(id)
        pred_dict['predicted_rating'].append(pred_id.est)

    # Create a dataframe
    pred_data = pd.DataFrame(pred_dict).sort_values('predicted_rating',
                                                     ascending = False)

    return pred_data

In [42]:
def get_top_highest_unrated(estimator, k, userid, rating_data, metadata):
    
    unrated_item_id = get_unrated_item(userid=userid, rating_data=rating_data)
    
    predicted_unrated_item = get_pred_unrated_item(userid = userid,
                                                   estimator = estimator,
                                                   unrated_item_id = unrated_item_id)

    # Sort & add metadata
    top_item_pred = predicted_unrated_item.head(k).copy()
    print(top_item_pred)
    top_item_pred_detail = metadata.loc[top_item_pred['item_id'], :]
    

    return top_item_pred_detail


In [43]:
# Generate 10 recommendation for user 500
get_top_highest_unrated(estimator=SVD_model,
                        k=10,
                        userid=23,
                        rating_data=utility_data.df,
                        metadata=movie_data)

      user_id  item_id  predicted_rating
991        23     1204          4.263675
2848       23     3451          4.199439
993        23     1207          4.145748
4763       23     5992          4.136501
759        23      898          4.101772
760        23      899          4.094217
1003       23     1225          4.090675
6940       23   177593          4.069408
972        23     1178          4.066964
227        23      246          4.066529


Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1204,Lawrence of Arabia (1962),Adventure|Drama|War
3451,Guess Who's Coming to Dinner (1967),Drama
1207,To Kill a Mockingbird (1962),Drama
5992,"Hours, The (2002)",Drama|Romance
898,"Philadelphia Story, The (1940)",Comedy|Drama|Romance
899,Singin' in the Rain (1952),Comedy|Musical|Romance
1225,Amadeus (1984),Drama
177593,"Three Billboards Outside Ebbing, Missouri (2017)",Crime|Drama
1178,Paths of Glory (1957),Drama|War
246,Hoop Dreams (1994),Documentary
