In [7]:
# Standard library imports
import os # allows access to OS-dependent functionalities
import sys # to manipulate different parts of the Python runtime environment
import pandas as pd 
import numpy as np

import joblib # set of tools to provide lightweight pipelining in Python

# Surprise libraries
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering

# Locate the project's ``utils`` directory, which sits next to the notebook's
# working directory (one level up, then into ``utils``).
cwd = os.getcwd()
utils_path = os.path.abspath(os.path.join(cwd, '..', 'utils'))

# Make the helper modules importable. The membership check keeps sys.path free
# of duplicate entries, including when this cell is executed more than once.
if utils_path not in sys.path:
    sys.path.append(utils_path)

# Utils libraries
from cleaning import *
from recommend import *
from testing import *
from training import *

# Folder layout used by the notebook. Everything is rooted at the parent of the
# current working directory and joined with forward slashes.
main_folder = os.path.abspath(os.pardir)

data_folder = "/".join((main_folder, "data"))
raw_data = "/".join((data_folder, "_raw"))
processed_data = "/".join((data_folder, "processed"))

saved_models_folder = "/".join((data_folder, "saved_models"))
baseline_data = "/".join((saved_models_folder, "baseline"))
test_models = "/".join((saved_models_folder, "test_models"))

In [8]:
def prepare_for_different_models(size=1000000, random_state=None):
    '''
    Build the Surprise dataset used to compare the different models.

    Reads ``rating.csv.zip`` from the raw-data folder, keeps only the rows whose
    rating is greater than 0, and draws a sample stratified by rating value so
    that the sample keeps roughly the same proportion of each score as the
    filtered data. The sample is wrapped in a Surprise ``Dataset`` (rating
    scale 1-10) and pickled to ``data_reader_for_different_models.pkl`` in the
    processed-data folder.

    Args:
        size: target number of rows in the sample (default 1,000,000). It is
            capped at the number of available rows, because the sampling is
            done without replacement.
        random_state: optional seed forwarded to the pandas sampling calls.
            ``None`` (the default) keeps the sampling non-deterministic.

    Returns:
        The Surprise ``Dataset`` built from the ``user_id``, ``anime_id`` and
        ``rating`` columns of the sample.
    '''
    # Load the ratings and keep only rows with a rating above 0, renumbering the index
    rating = pd.read_csv(raw_data + "/" + "rating.csv.zip")
    ratingdf = rating[rating.rating > 0].reset_index(drop=True)

    # Sampling without replacement cannot return more rows than exist
    size = min(size, len(ratingdf))

    # Stratified sample: each rating value contributes rows in proportion to its
    # share of the filtered data; the result is then shuffled.
    ratingdf_sample = (
        ratingdf.groupby("rating", group_keys=False)
        .apply(
            lambda x: x.sample(
                int(np.rint(size * len(x) / len(ratingdf))),
                random_state=random_state,
            )
        )
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    # Wrap the sample in a Surprise Dataset with ratings on a 1-10 scale
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(ratingdf_sample[['user_id', 'anime_id', 'rating']], reader)

    # Persist the dataset so it can be reloaded without repeating the sampling
    joblib.dump(data, processed_data + "/" + "data_reader_for_different_models.pkl")

    return data
data = prepare_for_different_models()

### SlopeOne

SlopeOne is a fast and simple algorithm for predicting user ratings in collaborative filtering. It computes the average difference in rating (the slope) between pairs of items, and then uses these slopes to predict the rating of a target item for a given user based on their previous ratings for other items. The algorithm is well-suited to large-scale tasks and has been shown to be effective in practice.

SlopeOne does not have any hyperparameters to tune in the Surprise library. Therefore, we will perform the training directly and collect the measures.

In [None]:
def mape(predictions):
    """
    Mean Absolute Percentage Error (MAPE) of a collection of predictions.

    Args:
        predictions: list of Prediction objects returned by the test method of
            an algorithm; each one exposes the true rating as ``r_ui`` and the
            estimated rating as ``est``

    Returns:
        The MAPE score, expressed as a percentage
    """
    truth = np.array([p.r_ui for p in predictions])
    estimate = np.array([p.est for p in predictions])

    # Absolute error of every prediction relative to its true rating
    relative_error = np.abs(truth - estimate) / truth
    return relative_error.mean() * 100

def r2(predictions):
    """
    R-squared (coefficient of determination) of a collection of predictions.

    Args:
        predictions: list of Prediction objects returned by the test method of
            an algorithm; each one exposes the true rating as ``r_ui`` and the
            estimated rating as ``est``

    Returns:
        The R2 score, i.e. one minus the ratio of the residual sum of squares
        to the total sum of squares around the mean true rating
    """
    truth = np.array([p.r_ui for p in predictions])
    estimate = np.array([p.est for p in predictions])

    # Spread of the true ratings around their mean
    total_ss = np.square(truth - truth.mean()).sum()
    # Squared error left unexplained by the predictions
    residual_ss = np.square(truth - estimate).sum()

    return 1 - (residual_ss / total_ss)

In [None]:
# Split the data into training and testing sets with an 80:20 ratio
trainset, testset = train_test_split(data, test_size=0.2)

# Create a SlopeOne model and fit it to the training set
model = SlopeOne()
model.fit(trainset)

# Use the model to make predictions on the test set
predictions = model.test(testset)

# Compute RMSE, MAE, MSE, MAPE and R2 for the predictions.
# Surprise's accuracy helpers print their result by default; the values are
# printed explicitly below, so that built-in printing is switched off to avoid
# reporting each metric twice.
rmse = accuracy.rmse(predictions, verbose=False)
mae = accuracy.mae(predictions, verbose=False)
mse = accuracy.mse(predictions, verbose=False)
mape_score = mape(predictions)
r2_score = r2(predictions)

print("RMSE:", rmse)
print("MSE:", mse)
print("MAE:", mae)
print("mape_score:", mape_score)
print("r2_score:", r2_score)

# Save the trained model as a pickle file using joblib
joblib.dump(model,saved_models_folder + "/" + "SlopeOne_model.pkl")

RMSE: 1.2746
MAE:  0.9617
MSE: 1.6245
RMSE: 1.2745619937093307
MSE: 1.624508275808304
MAE: 0.9617043084174556
mape_score: 15.125942719482898
r2_score: 0.3454769208138584


['d:\\Github\\Anime_recommendation_system\\src/data/saved_models/SlopeOne_model.pkl']

## Evaluating Selected models

We have selected the 4 models with the best results in the baseline. Now let's evaluate them by running GridSearchCV on each one and training them to get the results.

- svd = SVD()
- svdp = SVDpp()
- baseonly = BaselineOnly()
- coclus = CoClustering()

In [1]:
# Standard library imports
import os # allows access to OS-dependent functionalities
import sys # to manipulate different parts of the Python runtime environment
import pandas as pd 


# Locate the project's ``utils`` directory, which sits next to the notebook's
# working directory (one level up, then into ``utils``).
cwd = os.getcwd()
utils_path = os.path.abspath(os.path.join(cwd, '..', 'utils'))

# Make the helper modules importable. The membership check keeps sys.path free
# of duplicate entries, including when this cell is executed more than once.
if utils_path not in sys.path:
    sys.path.append(utils_path)

# Utils libraries
from cleaning import *
from recommend import *
from testing import *
from training import *

# Folder layout used by the notebook. Everything is rooted at the parent of the
# current working directory and joined with forward slashes.
main_folder = os.path.abspath(os.pardir)

data_folder = "/".join((main_folder, "data"))
raw_data = "/".join((data_folder, "_raw"))
processed_data = "/".join((data_folder, "processed"))

saved_models_folder = "/".join((data_folder, "saved_models"))
baseline_data = "/".join((saved_models_folder, "baseline"))
test_models = "/".join((saved_models_folder, "test_models"))



Preparing the data and reducing it to a sample of 1 million rows.

In [2]:
def prepare_for_different_models(size=1000000, random_state=None):
    '''
    Build the Surprise dataset used to evaluate the selected models.

    Reads ``rating.csv.zip`` from the raw-data folder, keeps only the rows whose
    rating is greater than 0, and draws a sample stratified by rating value so
    that the sample keeps roughly the same proportion of each score as the
    filtered data. The sample is wrapped in a Surprise ``Dataset`` (rating
    scale 1-10). The dataset is only returned; it is not written to disk.

    Args:
        size: target number of rows in the sample (default 1,000,000). It is
            capped at the number of available rows, because the sampling is
            done without replacement.
        random_state: optional seed forwarded to the pandas sampling calls.
            ``None`` (the default) keeps the sampling non-deterministic.

    Returns:
        The Surprise ``Dataset`` built from the ``user_id``, ``anime_id`` and
        ``rating`` columns of the sample.
    '''
    # Imported here so the function does not rely on these names happening to be
    # provided by the star imports of the setup cell.
    import numpy as np
    from surprise import Dataset, Reader

    # Load the ratings and keep only rows with a rating above 0, renumbering the index
    rating = pd.read_csv(raw_data + "/" + "rating.csv.zip")
    ratingdf = rating[rating.rating > 0].reset_index(drop=True)

    # Sampling without replacement cannot return more rows than exist
    size = min(size, len(ratingdf))

    # Stratified sample: each rating value contributes rows in proportion to its
    # share of the filtered data; the result is then shuffled.
    ratingdf_sample = (
        ratingdf.groupby("rating", group_keys=False)
        .apply(
            lambda x: x.sample(
                int(np.rint(size * len(x) / len(ratingdf))),
                random_state=random_state,
            )
        )
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    # Wrap the sample in a Surprise Dataset with ratings on a 1-10 scale
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(ratingdf_sample[['user_id', 'anime_id', 'rating']], reader)

    return data
data = prepare_for_different_models()

### SVD

Singular Value Decomposition (SVD) is a matrix factorization technique used in recommendation systems to reduce the dimensionality of a user-item matrix and identify latent factors that drive user-item interactions. In essence, SVD represents the original matrix as the product of three matrices: a user matrix, a singular value matrix, and an item matrix. The resulting factors can be used to make personalized recommendations by predicting a user's preference for an item based on their past behavior and the behavior of other similar users.

Here are some of the important hyperparameters for the SVD algorithm in the Surprise library:

- **n_factors:** The number of factors to use in the matrix factorization. This controls the number of latent features to be learned from the data.
- **n_epochs:** The number of epochs (iterations) to run the matrix factorization algorithm.
- **biased:** A boolean indicating whether or not to use biases in the model. Biases represent the average rating for each user and item, and can help improve the accuracy of the predictions.
- **lr_all:** The learning rate for all parameters in the model. This controls the step size for each iteration of the optimization algorithm.
- **reg_all:** The regularization strength for all parameters in the model. This helps prevent overfitting by adding a penalty term to the optimization objective that encourages the model to have smaller parameter values.
- **init_mean:** The mean of the Gaussian distribution used to initialize the factor matrices. By default, this is set to 0.
- **init_std_dev:** The standard deviation of the Gaussian distribution used to initialize the factor matrices. By default, this is set to 0.1.

Steps of find_best_svd:
- Define parameter grid for grid search
- Create GridSearchCV object with SVD algorithm
- Fit GridSearchCV object to data
- Print best RMSE and MAE scores, as well as corresponding parameters
- Save model with best parameters
- Save best parameters

In [None]:
def find_best_svd(data):
    '''
    Grid-search SVD hyperparameters on ``data`` with 5-fold cross-validation.

    A GridSearchCV object is created for the SVD algorithm over a grid of
    ``n_factors``, ``n_epochs``, ``lr_all`` and ``reg_all`` values and fitted
    to the data, tracking RMSE, MAE and MSE. The best score and the
    corresponding parameters for each measure are printed. The fitted search
    object and its ``best_params`` dictionary are then pickled into the
    ``test_models`` folder.

    Args:
        data: Surprise dataset to run the cross-validated search on

    Returns:
        The fitted GridSearchCV object
    '''
    import joblib
    from surprise import SVD
    from surprise.model_selection import GridSearchCV

    # Define parameter grid for grid search
    param_grid = {'n_factors': [50, 100, 150, 300],
                'n_epochs': [20, 30, 40, 50],
                'lr_all': [0.002, 0.005, 0.01, 0.1],
                'reg_all': [0.02, 0.05, 0.1]}

    # Create GridSearchCV object with SVD algorithm
    gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae','mse'], cv=5)

    # Fit GridSearchCV object to data
    gs.fit(data)

    # Print best RMSE, MSE and MAE scores, as well as corresponding parameters
    print("Best RMSE score:", gs.best_score['rmse'])
    print("Best MSE score:", gs.best_score['mse'])
    print("Best MAE score:", gs.best_score['mae'])
    print("Best parameters for RMSE:", gs.best_params['rmse'])
    print("Best parameters for MAE:", gs.best_params['mae'])
    print("Best parameters for MSE:", gs.best_params['mse'])

    # Save the whole fitted search object (scores and parameters of every combination)
    joblib.dump(gs,test_models + "/" + "SVD_test_model.pkl")

    # Save best parameters
    joblib.dump(gs.best_params,test_models + "/" + "SVD_best_params_test_model.pkl", compress = 1)

    return gs
# Note: despite its name, this variable holds the fitted GridSearchCV object;
# the best parameters themselves are available as best_params.best_params.
best_params = find_best_svd(data)

Best RMSE score: 1.1183239072148927
Best MAE score: 0.8364221843283999
Best parameters for RMSE: {'n_factors': 150, 'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.05}
Best parameters for MAE: {'n_factors': 150, 'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.05}


<surprise.model_selection.search.GridSearchCV at 0x1bd2c815b80>

### SVDpp

SVD++ is an extension of Singular Value Decomposition (SVD) that takes into account implicit feedback in recommendation systems. In addition to the user-item matrix used in SVD, SVD++ also considers a matrix of implicit feedback such as user interactions with items, item attributes, and user preferences. This additional matrix helps capture the influence of user and item biases on the recommendation process, resulting in more accurate and personalized recommendations. SVD++ also includes a regularization term to avoid overfitting and improve generalization. Overall, SVD++ is a more advanced and sophisticated technique for recommendation systems than SVD.

The SVDpp algorithm in the Surprise library has the following hyperparameters:

- **n_factors:** the number of latent factors. Default is 20.
- **n_epochs:** the number of iterations for the optimization algorithms. Default is 20.
- **lr_all:** the learning rate for all parameters. Default is 0.007.
- **reg_all:** the regularization parameter for all parameters. Default is 0.02.
- **init_mean:** the mean of the normal distribution for factor vectors initialization. Default is 0.
- **init_std_dev:** the standard deviation of the normal distribution for factor vectors initialization. Default is 0.1.
- **verbose:** whether to print details during the optimization process. Default is False.

In [3]:

def find_best_svdpp(data):
    '''
    Grid-search SVDpp hyperparameters on ``data`` with 5-fold cross-validation.

    The search covers ``n_factors``, ``n_epochs``, ``lr_all`` and ``reg_all``
    and tracks RMSE, MAE and MSE. The best score and the corresponding
    parameters for each measure are printed, then the fitted search object and
    its ``best_params`` dictionary are pickled into the ``test_models`` folder.

    Args:
        data: Surprise dataset to run the cross-validated search on

    Returns:
        The fitted GridSearchCV object
    '''
    import joblib
    from surprise import SVDpp
    from surprise.model_selection import GridSearchCV

    # Candidate values for every tuned hyperparameter
    search_space = {
        'n_factors': [50, 100, 150],
        'n_epochs': [20, 30, 40],
        'lr_all': [0.002, 0.005, 0.01],
        'reg_all': [0.02, 0.05, 0.1],
    }

    # Cross-validated search over the SVDpp algorithm
    search = GridSearchCV(SVDpp, search_space, measures=['rmse', 'mae', 'mse'], cv=5)
    search.fit(data)

    # Report the best score per measure, then the parameters that achieved it
    score_report = (
        ("Best RMSE score:", 'rmse'),
        ("Best MSE score:", 'mse'),
        ("Best MAE score:", 'mae'),
    )
    for label, measure in score_report:
        print(label, search.best_score[measure])

    params_report = (
        ("Best parameters for RMSE:", 'rmse'),
        ("Best parameters for MAE:", 'mae'),
        ("Best parameters for MSE:", 'mse'),
    )
    for label, measure in params_report:
        print(label, search.best_params[measure])

    # Persist the whole fitted search object, then just the best parameters
    search_file = test_models + "/" + "SVDpp_test_model.pkl"
    joblib.dump(search, search_file)

    params_file = test_models + "/" + "SVDpp_best_params_test_model.pkl"
    joblib.dump(search.best_params, params_file, compress=1)

    return search
find_best_svdpp(data)


### BaselineOnly

BaselineOnly is a simple but effective collaborative filtering algorithm used in recommendation systems. It predicts a user's rating for an item by taking into account the overall average rating of all items, the average rating of the user, and the average rating of the item. These three values are used as baseline estimates, and the difference between the actual rating and the baseline estimate is used as the prediction error. BaselineOnly then learns user and item biases that can improve the accuracy of the baseline estimates. The algorithm is simple and computationally efficient, making it a popular choice for recommendation systems with large datasets. However, it may not capture more complex relationships between users and items compared to more advanced techniques such as matrix factorization.

The BaselineOnly algorithm in the Surprise library has the following hyperparameters:
- **bsl_options:** a dictionary containing the following options for the baseline estimates:
    - **method:** the method used to compute the baseline estimates. Possible values are: als, sgd. Default is als.
    - **n_epochs:** the number of iterations for the optimization algorithms. Default is 20.
    - **reg_u:** the regularization parameter for users. Default is 15.
    - **reg_i:** the regularization parameter for items. Default is 10.
    - **verbose**: whether to print details during the optimization process. Default is False.
- **biased:** whether to include the baseline estimate in the prediction. Default is True.

In [None]:
%%capture cap --no-stderr

def find_best_BaselineOnly(data):
    '''
    Grid-search BaselineOnly options on ``data`` with 5-fold cross-validation.

    The search covers the ``bsl_options`` entries ``method``, ``reg``,
    ``learning_rate`` and ``n_epochs`` and tracks RMSE, MAE and MSE. The best
    score and the corresponding parameters for each measure are printed, then
    the fitted search object and its ``best_params`` dictionary are pickled
    into the ``test_models`` folder.

    Args:
        data: Surprise dataset to run the cross-validated search on

    Returns:
        The fitted GridSearchCV object
    '''
    import joblib
    from surprise import BaselineOnly
    from surprise.model_selection import GridSearchCV

    # Parameters docs and value ranges:
    # http://surprise.readthedocs.io/en/stable/prediction_algorithms.html#baseline-estimates-configuration
    # http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf

    # Candidate baseline options.
    # NOTE(review): 'reg' and 'learning_rate' look like SGD-only options; with
    # method 'als' they are presumably ignored, which would make those
    # combinations redundant - confirm against the Surprise documentation.
    baseline_space = {'method': ['sgd', 'als'],
                      'reg': [0.02, 0.05, 0.1],
                      'learning_rate': [0.001, 0.005, 0.01],
                      'n_epochs': [5, 10, 15],
                      'verbose': [True]}
    search_space = {'bsl_options': baseline_space,
                    'verbose': [True]}

    # Cross-validated search over the BaselineOnly algorithm, run in a single job
    search = GridSearchCV(BaselineOnly, search_space, measures=['rmse', 'mae', 'mse'], cv=5, n_jobs=1)
    search.fit(data)

    # Report the best score per measure, then the parameters that achieved it
    score_report = (
        ("Best RMSE score:", 'rmse'),
        ("Best MSE score:", 'mse'),
        ("Best MAE score:", 'mae'),
    )
    for label, measure in score_report:
        print(label, search.best_score[measure])

    params_report = (
        ("Best parameters for RMSE:", 'rmse'),
        ("Best parameters for MAE:", 'mae'),
        ("Best parameters for MSE:", 'mse'),
    )
    for label, measure in params_report:
        print(label, search.best_params[measure])

    # Persist the whole fitted search object, then just the best parameters
    search_file = test_models + "/" + "BaselineOnly_test_model.pkl"
    joblib.dump(search, search_file)

    params_file = test_models + "/" + "BaselineOnly_best_params_test_model.pkl"
    joblib.dump(search.best_params, params_file, compress=1)

    return search

find_best_BaselineOnly(data)

#### Saving the output of this cell to a file
# NOTE(review): the %%capture magic appears to bind `cap` only after the
# captured cell has finished running, so this code presumably has to run in a
# cell placed after the captured one - confirm.
capture_path = test_models + "/" + "capture_BaselineOnly.txt"

# Append mode creates the file when it does not exist yet and adds to the end
# when it does, so one open() call covers both the first and later runs.
with open(capture_path, "a") as f:

    # Write the captured output to the end of the file
    f.write(cap.stdout)

- Best RMSE score: 1.5566270718662043
- Best MSE score: 2.426553220392917
- Best MAE score: 1.2098539052864254
- Best parameters for RMSE: {'bsl_options': {'method': 'sgd', 'reg': 0.02, 'learning_rate': 0.01, 'n_epochs': 15, 'verbose': True}, 'verbose': True}
- Best parameters for MAE: {'bsl_options': {'method': 'sgd', 'reg': 0.02, 'learning_rate': 0.01, 'n_epochs': 15, 'verbose': True}, 'verbose': True}
- Best parameters for MSE: {'bsl_options': {'method': 'sgd', 'reg': 0.02, 'learning_rate': 0.01, 'n_epochs': 15, 'verbose': True}, 'verbose': True}

### CoClustering

CoClustering is a collaborative filtering algorithm for recommendation systems that groups users and items into clusters and then estimates the ratings based on the interactions between these clusters. The algorithm tries to find a block diagonal structure in the user-item matrix, where users and items are clustered together. This approach can be more effective than traditional matrix factorization techniques in cases where users or items have similar tastes or properties. The CoClustering algorithm is available in the Surprise library, which is a popular Python library for building and evaluating recommendation systems.

Here's a brief explanation of these hyperparameters:

- **n_cltr_u** and **n_cltr_i:** the number of user and item clusters, respectively. This is a crucial hyperparameter for CoClustering, as it determines the level of granularity of the clustering. Higher values will generally result in better performance but may also increase training time.
- **n_epochs:** the number of epochs or iterations to run the algorithm for. This is also an important hyperparameter, as it determines the number of times the algorithm will iterate over the data. More iterations can lead to better performance but may also increase training time.
- **verbose:** a boolean flag indicating whether or not to print progress messages during training.
- **random_state:** an integer value representing the random seed used to initialize the algorithm's parameters. This can be useful for reproducibility.

In [None]:
# Surprise libraries
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate
from surprise import CoClustering

In [None]:
%%capture cap --no-stderr
def find_best_coclustering(data):
    '''
    Grid-search CoClustering hyperparameters on ``data`` with 5-fold cross-validation.

    The search covers ``n_cltr_u``, ``n_cltr_i``, ``n_epochs`` and
    ``random_state`` and tracks RMSE, MAE and MSE. The best score and the
    corresponding parameters for each measure are printed, then the fitted
    search object and its ``best_params`` dictionary are pickled into the
    ``test_models`` folder.

    Args:
        data: Surprise dataset to run the cross-validated search on

    Returns:
        The fitted GridSearchCV object
    '''
    # Imported locally, like the other find_best_* helpers, so the function
    # does not depend on what earlier cells happened to import.
    import joblib
    from surprise import CoClustering
    from surprise.model_selection import GridSearchCV

    # Define parameter grid for grid search.
    # 'verbose' only switches progress messages on or off, so it is pinned to a
    # single value: searching over both True and False would double the number
    # of fits without changing any score.
    param_grid = {'n_cltr_u': [3, 5, 10],
                'n_cltr_i': [3, 5, 10],
                'n_epochs': [20, 30, 40],
                'verbose': [True],
                'random_state': [42, 123]
                }

    # Create GridSearchCV object with CoClustering algorithm
    gs = GridSearchCV(CoClustering, param_grid, measures=['rmse', 'mae','mse'], cv=5)

    # Fit GridSearchCV object to data
    gs.fit(data)

    # Print best RMSE, MSE and MAE scores, as well as corresponding parameters
    print("Best RMSE score:", gs.best_score['rmse'])
    print("Best MSE score:", gs.best_score['mse'])
    print("Best MAE score:", gs.best_score['mae'])
    print("Best parameters for RMSE:", gs.best_params['rmse'])
    print("Best parameters for MAE:", gs.best_params['mae'])
    print("Best parameters for MSE:", gs.best_params['mse'])

    # Save the whole fitted search object (scores and parameters of every combination)
    joblib.dump(gs,test_models + "/" + "CoClustering_test_model.pkl")

    # Save best parameters
    joblib.dump(gs.best_params,test_models + "/" + "CoClustering_best_params_test_model.pkl", compress = 1)

    return gs
find_best_coclustering(data)


#### Saving the output of this cell to a file
# NOTE(review): the %%capture magic appears to bind `cap` only after the
# captured cell has finished running, so this code presumably has to run in a
# cell placed after the captured one - confirm.
capture_path = test_models + "/" + "capture_CoClustering.txt"

# Append mode creates the file when it does not exist yet and adds to the end
# when it does, so one open() call covers both the first and later runs.
with open(capture_path, "a") as f:

    # Write the captured output to the end of the file
    f.write(cap.stdout)
