In [5]:
# Standard library imports
import os # allows access to OS-dependent functionalities
import re #  regular expression matching operations similar to those found in Perl
import sys # to manipulate different parts of the Python runtime environment
import warnings # is used to display the message Warning
import pickle # serializing and deserializing a Python object structure.

# Third party libraries
from fastparquet import write # parquet format, aiming integrate into python-based big data work-flows
from fuzzywuzzy import fuzz # used for string matching

import numpy as np # functions for working in domain of linear algebra, fourier transform, matrices and arrays
import pandas as pd # data analysis and manipulation tool
import joblib # set of tools to provide lightweight pipelining in Python

# deal with sparse data libraries
from scipy.sparse import csr_matrix # Returns a copy of column i of the matrix, as a (m x 1) CSR matrix (column vector).

# visualization
#import seaborn as sns # data visualization library based on matplotlib.
import matplotlib.pyplot as plt # collection of functions that make matplotlib work like MATLAB.

## scikit Preprocessing data libraries
from sklearn.preprocessing import MinMaxScaler # Transform features by scaling each feature to a given range.

## scikit Feature Extraction libraries
from sklearn.feature_extraction.text import TfidfVectorizer # Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts.

## scikit Pairwise metrics libraries
#implements utilities to evaluate pairwise distances or affinity of sets of samples.
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.metrics.pairwise import linear_kernel 

## scikit Cross validation iterators libraries
from sklearn.model_selection import GridSearchCV

# Unsupervised learner for implementing neighbor searches.
from sklearn.neighbors import NearestNeighbors

# setting display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Utils libraries
from utils import cleaning
from utils import recommend
from utils import testing
from utils import training

#Preparing folder variables
os.chdir(os.path.dirname(sys.path[0])) # This command makes the notebook the main path and can work in cascade.
main_folder = sys.path[0]
data_folder = (main_folder + "/" + "data")
saved_models_folder = (data_folder + "/" + "saved_models")
raw_data = (data_folder + "/" + "_raw")
processed_data = (data_folder + "/" + "processed")
content_based_supervised_data = (data_folder + "/" + "processed" + "/" + "content_based_supervised")



In [26]:
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import KNNBasic,  KNNWithMeans, KNNBaseline
from surprise.model_selection import KFold
from surprise import Reader
from surprise import NormalPredictor
from surprise.model_selection import cross_validate
import matplotlib.pyplot as plt
from surprise.model_selection import GridSearchCV

# Load the ratings data
df_rating = pd.read_csv(raw_data + "/" + "rating.csv.zip")

size = 1000
sample = df_rating.groupby("rating", group_keys=False).apply(lambda x: x.sample(int(np.rint(size*len(x)/len(df_rating))))).sample(frac=1).reset_index(drop=True)

reader = Reader(rating_scale=(1, 5))
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(sample[['user_id', 'anime_id', 'rating']], reader)

In [48]:
sim_options = {
    'name': 'MSD',
    'user_based': 'True'
}

cv_results = []
for k in range(1, 11):
  clf = KNNBasic(k= k*10, sim_options = sim_options)
  cv_results.append(cross_validate(clf, data, measures=['MAE'], cv=5, verbose=4))

for i in range(10):
  print("Average MAE for k = {} ".format((i+1)*10), cv_results[i]["test_mae"].mean())

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     3.7250  3.4100  3.4150  3.6050  3.3166  3.4943  0.1486  
Fit time          0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Test time         0.00    0.01    0.00    0.00    0.00    0.00    0.00    
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarit

In [27]:
kf = KFold(n_splits=3)
algo = KNNBasic()
best_algo = None
best_rmse = 1000.0
best_pred = None
for trainset, testset in kf.split(data):
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute and print Root Mean Squared Error
    rmse = accuracy.rmse(predictions, verbose=True)
    if rmse < best_rmse:
        best_algo = algo
        best_pred = predictions

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 3.8152
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 3.9357
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 3.9463


In [29]:
kf = KFold(n_splits=5)
sim_options = {'name':'cosine'}
algo = KNNWithMeans(sim_options = sim_options)
best_algo = None
best_rmse = 1000.0
best_pred = None
for trainset, testset in kf.split(data):
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute and print Root Mean Squared Error
    rmse = accuracy.rmse(predictions, verbose=True)
    if rmse < best_rmse:
        best_algo = algo
        best_rmse= rmse
        best_pred = predictions
print(best_rmse)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 3.6681
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 3.8814
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 3.9825
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 4.0311
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 3.9194
3.668105778191245


In [30]:
kf = KFold(n_splits=3)
algo = KNNBaseline(k=3)
best_algo = None
best_rmse = 1000.0
best_pred = None
for trainset, testset in kf.split(data):
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute and print Root Mean Squared Error
    rmse = accuracy.rmse(predictions, verbose=True)
    if rmse < best_rmse:
        best_rmse = rmse
        best_algo = algo
        best_pred = predictions

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 3.8602
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 3.8973
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 3.9406


In [None]:
sim_options = {'name': ['cosine', 'pearson', 'msd', 'pearson_baseline'],
                              'user_based': [True, False],
                              'min_support': [1, 5, 10],
                              'shrinkage': [0, 10, 20]}

In [46]:
sim_options = { 'name': 'pearson_baseline' ,'user_based':  True,'min_support':5,'shrinkage':10}

kf = KFold(n_splits=5)
algo = KNNWithMeans(k =3 , sim_options = sim_options)
best_algo = None
best_rmse = 1000.0
best_pred = None
for trainset, testset in kf.split(data):
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute and print Root Mean Squared Error
    rmse = accuracy.rmse(predictions, verbose=True)
    if rmse < best_rmse:
        best_rmse= rmse
        best_algo = algo
        best_pred = predictions

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 3.9115
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 3.7350
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 4.0491
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 3.9217
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 3.8684


In [12]:
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import KNNBasic,  KNNWithMeans, KNNBaseline
from surprise.model_selection import KFold
from surprise import Reader
from surprise import NormalPredictor
from surprise.model_selection import cross_validate
import matplotlib.pyplot as plt
from surprise.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Load the ratings data
df_rating = pd.read_csv(raw_data + "/" + "rating.csv.zip")

reader = Reader(rating_scale=(1, 5))
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df_rating[['user_id', 'anime_id', 'rating']], reader)

trainset, testset = train_test_split(data, test_size = .2)

In [None]:
# Use StandardScaler for normalization
scaler = StandardScaler()
parameters = {'n_factors': [20, 50, 80, 100],
              'reg_all': [0.04, 0.06, 0.08],
              'n_epochs': [10, 20, 30],
              'lr_all': [0.002, 0.005, 0.01, 0.02],
              'biased': [True, False],
              'init_mean': [0, 0.1, 0.2],
              'init_std_dev': [0.01, 0.05, 0.1],
              'random_state': [42],
              'verbose': [True, False],
              'reg_pu': [0.1, 0.2, 0.3],
              'reg_qi': [0.1, 0.2, 0.3],
              'lr_bu': [0.001, 0.005, 0.01],
              'lr_bi': [0.001, 0.005, 0.01],
              'lr_pu': [0.001, 0.005, 0.01],
              'lr_qi': [0.001, 0.005, 0.01],
              'lr_bu': [0.001, 0.005, 0.01]
             }
gridsvd = GridSearchCV(SVD, param_grid=parameters, cv=5, n_jobs=-1)


In [17]:
parameters = {'n_factors':[20,50,80,100],
               'reg_all':[0.04,0.06],
               'n_epochs':[10,20,30],
               'lr_all':[.002,.005,.01],
               'verbose':[4]
            }
gridsvd = GridSearchCV(SVD,param_grid=parameters, cv=5,n_jobs=-1)

In [18]:
gridsvd.fit(data)
# Save model with best parameters
joblib.dump(gridsvd,saved_models_folder + "/" + "SVD_new_model.pkl")

In [69]:
print(gridsvd.best_score)
print(gridsvd.best_params)

{'rmse': 3.722748077384465, 'mae': 3.3560999378872913}
{'rmse': {'n_factors': 20, 'reg_all': 0.06, 'n_epochs': 30, 'lr_all': 0.01, 'verbose': 4}, 'mae': {'n_factors': 20, 'reg_all': 0.06, 'n_epochs': 30, 'lr_all': 0.01, 'verbose': 4}}
