In [1]:
import pandas as pd
import numpy as np
from surprise import BaselineOnly, NormalPredictor
from surprise import Dataset, Reader
from surprise.model_selection import GridSearchCV, KFold, cross_validate
from tabulate import tabulate

# Read and load data

In [2]:
# Load both rating datasets into pandas DataFrames.
# The MovieLens-100K file is tab-separated and read without a header row, so the
# column names are supplied here; the PDA2018 training file is comma-separated
# and is read with pandas' default header handling.
movielens_df = pd.read_csv(
    "../data/u.data",
    sep="\t",
    header=None,
    names=["userID", "itemID", "rating", "timestamp"],
)
pda_df = pd.read_csv("../data/train-PDA2018.csv", sep=",")

# Preview the first rows of each frame, with blank lines in between
for preview in (movielens_df.head(), "\n\n", pda_df.head()):
    print(preview)

   userID  itemID  rating  timestamp
0     196     242       3  881250949
1     186     302       3  891717742
2      22     377       1  878887116
3     244      51       2  880606923
4     166     346       1  886397596



   userID  itemID  rating  timeStamp
0       5     648       5  978297876
1       5    1394       5  978298237
2       5    3534       5  978297149
3       5     104       4  978298558
4       5    2735       5  978297919


In [3]:
# Wrap both rating frames as Surprise datasets.
# Ratings run from 1 to 5, which is the scale handed to the Reader.
reader = Reader(rating_scale=(1, 5))

# Only the first three columns (userID, itemID, rating) are passed to Surprise
movielens_dataset = Dataset.load_from_df(movielens_df.iloc[:, :3], reader)
pda_dataset = Dataset.load_from_df(pda_df.iloc[:, :3], reader)

# Full trainsets are built here only to report summary statistics
mls_train = movielens_dataset.build_full_trainset()
pda_train = pda_dataset.build_full_trainset()

# Report basic statistics for both datasets side by side
print("General information on the training sets we will be using \n")
summary_rows = [
    ("1) Number of items in each dataset", mls_train.n_items, pda_train.n_items),
    ("2) Number of users in each dataset", mls_train.n_users, pda_train.n_users),
    ("3) Number of ratings in each dataset", mls_train.n_ratings, pda_train.n_ratings),
    ("4) Mean rating", mls_train.global_mean, pda_train.global_mean),
]
for label, ml_value, pda_value in summary_rows:
    print(label, " ML100k:", ml_value, "PDA:", pda_value)

General information on the training sets we will be using 

1) Number of items in each dataset  ML100k: 1682 PDA: 1824
2) Number of users in each dataset  ML100k: 943 PDA: 5690
3) Number of ratings in each dataset  ML100k: 100000 PDA: 470711
4) Mean rating  ML100k: 3.52986 PDA: 3.638361967321775


# Random (Normal) Predictor

In [4]:
# Accumulates one [recommender name, RMSE, MAE] row per experiment run below
results_table = list()

In [5]:
# Seed NumPy's global RNG so that the fold shuffling and the random ratings drawn
# by NormalPredictor are repeatable when the notebook is run top to bottom
# (this is the approach the Surprise FAQ recommends for reproducible experiments).
np.random.seed(0)

kf = KFold(n_splits=5)  # 5-fold splitter, reused by the PDA2018 random-predictor run
print("Running 5-fold cross validation with Random Predictor on MovieLens-100K dataset ...")
out = cross_validate(NormalPredictor(), movielens_dataset, measures=["rmse", "mae"], cv=kf)

# Average each metric over the folds and keep three decimals.
# No trailing period: the value must stay parseable as a number in the exported CSV.
mean_rmse = "{:.3f}".format(np.mean(out["test_rmse"]))
mean_mae = "{:.3f}".format(np.mean(out["test_mae"]))
new_line = ["ML100K-Random", mean_rmse, mean_mae]
results_table.append(new_line)

Running 5-fold cross validation with Random Predictor on MovieLens-100K dataset ...


In [6]:
# Same random-predictor experiment on the PDA2018 ratings, reusing the `kf`
# splitter defined in the MovieLens random-predictor cell.
print("Running 5-fold cross validation with Random Predictor on PDA2018 dataset ...")
out = cross_validate(NormalPredictor(), pda_dataset, measures=["rmse", "mae"], cv=kf)

# Average each metric over the folds and keep three decimals.
# No trailing period: the value must stay parseable as a number in the exported CSV.
mean_rmse = "{:.3f}".format(np.mean(out["test_rmse"]))
mean_mae = "{:.3f}".format(np.mean(out["test_mae"]))
new_line = ["PDA2018-Random", mean_rmse, mean_mae]
results_table.append(new_line)

Running 5-fold cross validation with Random Predictor on PDA2018 dataset ...


# Baseline (Mean + Biases) Predictor

In [8]:
# BaselineOnly has tunable options, so GridSearchCV is used to cross-validate
# every combination listed below and report the best RMSE and MAE together with
# the parameters that produced them.
# NOTE(review): Surprise documents the ALS baseline options as "reg_i"/"reg_u"
# rather than "reg", so the "reg" values here probably only influence the "sgd"
# runs -- confirm, and consider tuning "reg_i"/"reg_u" for "als".
param_grid = {
    "bsl_options": {
        "method": ["als", "sgd"],
        "reg": [0.01, 0.5],
        "n_epochs": [5, 20]
    }
}

## RUN 1: MOVIELENS

# Set up the grid search for the algorithm and the parameter grid
gs = GridSearchCV(BaselineOnly, param_grid, measures=["rmse", "mae"], cv=5)  # 5-fold CV
print("Running grid search for the BaselineOnly algorithm ...")
# Cross-validate every parameter combination on the MovieLens-100K ratings
gs.fit(movielens_dataset)
# Record the best scores with three decimals (no trailing period, so the values stay
# parseable as numbers), labelled with the same "ML100K-" prefix as the random-predictor row
results_table.append([
    "ML100K-Baseline",
    "{:.3f}".format(gs.best_score["rmse"]),
    "{:.3f}".format(gs.best_score["mae"]),
])
# Print out the best RMSE, MAE and the respective model parameters
print("\nResults:")
print("Best RMSE:", gs.best_score["rmse"])
print("Best params for RMSE", gs.best_params["rmse"])
print("Best MAE:", gs.best_score["mae"])
print("Best params for MAE", gs.best_params["mae"])
print("\n")

Running grid search for the BaselineOnly algorithm ...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating bias

In [9]:
## RUN 2: PDA2018

# Set up the grid search again, reusing the `param_grid` defined in the MovieLens run
gs = GridSearchCV(BaselineOnly, param_grid, measures=["rmse", "mae"], cv=5)  # 5-fold CV
print("Running grid search for the BaselineOnly algorithm ...")
# Cross-validate every parameter combination on the PDA2018 ratings
gs.fit(pda_dataset)
# Record the best scores with three decimals (no trailing period, so the values stay
# parseable as numbers), labelled with the same "PDA2018-" prefix as the random-predictor row
results_table.append([
    "PDA2018-Baseline",
    "{:.3f}".format(gs.best_score["rmse"]),
    "{:.3f}".format(gs.best_score["mae"]),
])
# Print out the best RMSE, MAE and the respective model parameters
print("\nResults:")
print("Best RMSE:", gs.best_score["rmse"])
print("Best params for RMSE", gs.best_params["rmse"])
print("Best MAE:", gs.best_score["mae"])
print("Best params for MAE", gs.best_params["mae"])
print("\n")

Running grid search for the BaselineOnly algorithm ...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating bias

In [1]:
# Render the collected [recommender, RMSE, MAE] rows as a pipe-style (Markdown) table.
# Requires the imports cell to have been run in the current kernel session.
results_table_headers = ["Recommender", "RMSE", "MAE"]
rendered_table = tabulate(results_table, headers=results_table_headers, tablefmt="pipe")
print(rendered_table)

NameError: name 'tabulate' is not defined

In [11]:
# Persist the results: build a DataFrame from the collected rows and write it as CSV
results_df = pd.DataFrame(data=results_table, columns=["Recommender", "RMSE", "MAE"])
results_df.to_csv(path_or_buf="../data/basic_algorithms_results.csv")