In [1]:
import pandas as pd

# Read the raw export, keep only the three columns used downstream, and
# persist the trimmed copy next to the original file.
df = pd.read_csv("./dataset.csv").loc[:, ["item", "user", "rating"]]

df.to_csv(path_or_buf="./dataset_clean.csv", index=False)

import os

from surprise import BaselineOnly, Dataset, Reader, NormalPredictor, KNNBasic
from surprise.model_selection import cross_validate

# When ratings come from a DataFrame, the Reader only has to declare the
# rating scale.
reader = Reader(rating_scale=(1, 10))

# load_from_df expects the columns in this exact order: user id, item id, rating.
data = Dataset.load_from_df(df.loc[:, ["user", "item", "rating"]], reader)

# Quick check of the freshly built dataset: 2-fold cross-validation of KNNBasic.
# Kept as the last expression so the notebook displays the returned metrics.
cross_validate(algo=KNNBasic(), data=data, cv=2)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([1.13536128, 1.13889562]),
 'test_mae': array([0.82837501, 0.83020064]),
 'fit_time': (0.01875138282775879, 0.029002904891967773),
 'test_time': (0.9367372989654541, 0.9275250434875488)}

In [3]:
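"""This cell runs a 5-Fold CV for all the algorithms listed in `algos` on the
`data` dataset built earlier in this notebook, and reports the mean and standard
deviation of RMSE and MAE together with the total computation time of each
algorithm.  The rows are printed as markdown (pipe) tables, the format used for
making tables in a README.md file."""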

# flake8: noqa

import datetime
import random
import time

import numpy as np

from surprise import (
    BaselineOnly,
    CoClustering,
    Dataset,
    KNNBaseline,
    KNNBasic,
    KNNWithMeans,
    NMF,
    NormalPredictor,
    SlopeOne,
    SVD,
    SVDpp,
)
from surprise.model_selection import cross_validate, KFold
from tabulate import tabulate

# Algorithms under comparison. They are cross-validated in exactly this order,
# which is also the row order of the printed tables.
algos = (
    # -- matrix factorisation (SVD++ appears twice: without and with rating caching)
    SVD(random_state=0),
    SVDpp(cache_ratings=False, random_state=0),
    SVDpp(cache_ratings=True, random_state=0),
    NMF(random_state=0),
    # -- slope one and the k-NN family
    SlopeOne(),
    KNNBasic(),
    KNNWithMeans(),
    KNNBaseline(),
    # -- co-clustering
    CoClustering(random_state=0),
    # -- simple reference predictors
    BaselineOnly(),
    NormalPredictor(),
)

# Markdown links into the Surprise documentation, keyed by algorithm class name.
# Each row below is (class name, label shown in the table, page + anchor that is
# appended to `stable`).
stable = "https://surprise.readthedocs.io/en/stable/"
LINK = {
    class_name: "[{}]({})".format(label, stable + anchor)
    for class_name, label, anchor in (
        (
            "SVD",
            "SVD",
            "matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD",
        ),
        (
            "SVDpp",
            "SVD++",
            "matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVDpp",
        ),
        (
            "NMF",
            "NMF",
            "matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.NMF",
        ),
        (
            "SlopeOne",
            "Slope One",
            "slope_one.html#surprise.prediction_algorithms.slope_one.SlopeOne",
        ),
        (
            "KNNBasic",
            "k-NN",
            "knn_inspired.html#surprise.prediction_algorithms.knns.KNNBasic",
        ),
        (
            "KNNWithMeans",
            "Centered k-NN",
            "knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans",
        ),
        (
            "KNNBaseline",
            "k-NN Baseline",
            "knn_inspired.html#surprise.prediction_algorithms.knns.KNNBaseline",
        ),
        (
            "CoClustering",
            "Co-Clustering",
            "co_clustering.html#surprise.prediction_algorithms.co_clustering.CoClustering",
        ),
        (
            "BaselineOnly",
            "Baseline",
            "basic_algorithms.html#surprise.prediction_algorithms.baseline_only.BaselineOnly",
        ),
        (
            "NormalPredictor",
            "Random",
            "basic_algorithms.html#surprise.prediction_algorithms.random_pred.NormalPredictor",
        ),
    )
}


# Seed NumPy's and Python's global RNGs so repeated runs of this cell start
# from the same random state.
np.random.seed(0)
random.seed(0)


# One splitter with a fixed random_state: folds will be the same for all algorithms.
kf = KFold(random_state=0)

table = []  # one formatted markdown row per algorithm
res = []  # raw cross_validate output per algorithm, kept for later cells

for algo in algos:
    algo_name = algo.__class__.__name__
    # NOTE: both SVDpp entries of `algos` share this class name, so they get
    # the same link and the same "algo" label in `res`.

    # perf_counter is a monotonic clock meant for measuring intervals;
    # time.time() is wall-clock time and can jump if the system clock changes.
    start = time.perf_counter()
    out = cross_validate(algo, data, ["rmse", "mae"], kf)
    elapsed = time.perf_counter() - start
    cv_time = str(datetime.timedelta(seconds=int(elapsed)))

    # Fall back to the bare class name so an algorithm without a LINK entry
    # does not raise KeyError after the (possibly long) cross-validation ran.
    link = LINK.get(algo_name, algo_name)

    mean_rmse = f"{np.mean(out['test_rmse']):.3f}"
    mean_mae = f"{np.mean(out['test_mae']):.3f}"

    std_rmse = f"{np.std(out['test_rmse']):.3f}"
    std_mae = f"{np.std(out['test_mae']):.3f}"

    res.append({"algo": algo_name, "out": out})

    new_line = [link, mean_rmse, std_rmse, mean_mae, std_mae, cv_time]
    print(tabulate([new_line], tablefmt="pipe"))  # print current algo perf
    table.append(new_line)

header = ["OUR", "RMSE", "RMSE std", "MAE", "MAE std", "Time"]
print(tabulate(table, header, tablefmt="pipe"))

|:-----------------------------------------------------------------------------------------------------------------------------------|------:|------:|------:|------:|:--------|
| [SVD](https://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD) | 0.978 | 0.001 | 0.765 | 0.002 | 0:00:09 |
|:---------------------------------------------------------------------------------------------------------------------------------------|------:|------:|------:|------:|:--------|
| [SVD++](https://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVDpp) | 0.849 | 0.002 | 0.636 | 0.002 | 0:04:52 |
|:---------------------------------------------------------------------------------------------------------------------------------------|------:|------:|------:|------:|:--------|
| [SVD++](https://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algor

In [4]:
import scipy.stats as st


def _ci_half_width(scores, confidence=0.95):
    """Return the half-width of the Student-t confidence interval of the mean.

    Args:
        scores: per-fold metric values (a 1-D sequence of numbers).
        confidence: two-sided confidence level, 0.95 by default.

    Returns:
        The distance from the sample mean to the upper bound of the interval
        (t quantile times the standard error of the mean) as a float; ``nan``
        when fewer than two scores are given, because the standard error is
        undefined in that case.
    """
    scores = np.asarray(scores, dtype=float)
    if scores.size < 2:
        return float("nan")
    # Same quantity as ``st.t.interval(...)[1] - mean``, computed directly from
    # the t quantile so it does not depend on the name of t.interval's first
    # keyword argument, which differs between SciPy releases.
    t_crit = st.t.ppf((1.0 + confidence) / 2.0, df=scores.size - 1)
    return float(t_crit * st.sem(scores))


# One plain record per algorithm; the frame is built once at the end so the
# rows get a regular 0..n-1 index instead of every row carrying index 0.
acc = []
for d in res:
    rmse_scores = d["out"]["test_rmse"]
    mae_scores = d["out"]["test_mae"]
    acc.append(
        {
            "name": d["algo"],
            "rmse": np.mean(rmse_scores),
            "mae": np.mean(mae_scores),
            "rmse_ci": _ci_half_width(rmse_scores),
            "mae_ci": _ci_half_width(mae_scores),
        }
    )

# Explicit columns keep the CSV header stable even when `res` is empty.
# `df` is rebound to the summary here because the next cell reads it from `df`.
df = pd.DataFrame(acc, columns=["name", "rmse", "mae", "rmse_ci", "mae_ci"])
df.to_csv("./results.csv", index=False)

In [10]:
# Keep two decimal places; `df` is rebound to the rounded copy.
df = df.round(decimals=2)
# Show the transposed frame (one row per column of `df`), first five rows.
df.transpose().head(n=5)

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.10
name,SVD,SVDpp,SVDpp,NMF,SlopeOne,KNNBasic,KNNWithMeans,KNNBaseline,CoClustering,BaselineOnly,NormalPredictor
rmse,0.98,0.85,0.85,1.54,1.13,1.08,1.01,1.01,0.98,1.15,2.13
mae,0.77,0.64,0.64,1.29,0.88,0.79,0.77,0.77,0.74,0.93,1.71
rmse_ci,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01
mae_ci,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01
