In [7]:
import pandas as pd

from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import KFold

# Relative path of the CSV file that is loaded with pd.read_csv further down in the notebook
csv_path = "../Data/sampled_data.csv"

In [8]:
def baseline_svd(data, n_splits, n_epochs, random_state = None, verbose = True):
    """Cross-validate a scikit-surprise SVD model and collect per-fold errors.

    Parameters
    ----------
    data
        Object handed to ``KFold.split`` (the notebook passes a surprise Dataset).
    n_splits : int
        Number of cross-validation folds.
    n_epochs : int
        Number of training epochs given to ``SVD``.
    random_state : int or None, optional
        Seed forwarded to both ``KFold`` and ``SVD`` so that the fold split and
        the model initialisation can be repeated between runs. The default
        ``None`` keeps the previous, unseeded behaviour.
    verbose : bool, optional
        Forwarded to ``SVD`` (per-epoch log lines) and to ``accuracy.rmse`` /
        ``accuracy.mae`` (per-fold metric lines). The default ``True`` keeps
        the previous output.

    Returns
    -------
    tuple
        ``(predictions, rmse, mae)`` where ``predictions`` holds the test
        predictions of the LAST fold only (earlier folds are overwritten), and
        ``rmse`` / ``mae`` are lists with one value per fold.
    """
    # Define a k-fold cross-validation iterator with the requested number of folds
    kf = KFold(n_splits = n_splits, random_state = random_state)
    # `predictions` is bound up front so the return statement never hits an unbound name
    predictions, rmse, mae = [], [], []
    # Calculate RMSE and MAE for every one of the n_splits folds
    for trainset, testset in kf.split(data):
        # Use a fresh model per fold so no fitted state can carry over between folds
        svd = SVD(n_epochs = n_epochs, random_state = random_state, verbose = verbose)
        # train and test algorithm
        svd.fit(trainset)
        predictions = svd.test(testset)
        # Compute and append Root Mean Squared Error
        rmse.append(accuracy.rmse(predictions, verbose = verbose))
        # Compute and append Mean Absolute Error
        mae.append(accuracy.mae(predictions, verbose = verbose))

    return (predictions, rmse, mae)

In [9]:
# Load the dataset into a Pandas Dataframe; malformed CSV lines are skipped instead of raising
df = pd.read_csv(csv_path, on_bad_lines = "skip")
print(df.head())

# Instantiate a Reader object that expects data in the format (user, item, rating) and specify a rating scale
reader = Reader(rating_scale = (1, 5))
# Dataset.load_from_df reads the columns positionally as (user, item, rating), so pick
# exactly those three by name instead of relying on the CSV's column count and order
ratings_df = df[["reviewerID", "asin", "overall"]]
# Create a scikit-surprise dataset from Pandas dataframe, using the Reader object
data = Dataset.load_from_df(ratings_df, reader)

# 4-fold cross-validation with 25 SVD epochs per fold
predictions, rmse, mae = baseline_svd(data, n_splits = 4, n_epochs = 25)

       reviewerID        asin  overall
0  A3T4SEC8L3Z2CD  B00MG73JV2      5.0
1  A2MZZGN34MOR92  B000VSM4MS      1.0
2   A44F16ME180US  B00N515IB2      5.0
3  A12LFM1HZ4JFHI  B003Y49R7G      3.0
4   A33KZ4CI67801  B0053F8D1U      1.0
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
RMSE: 1.1299
MAE:  0.8305
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12


In [10]:
# Report the mean of the per-fold errors collected in `rmse` and `mae`
# (the label previously read "RSME", a misspelling of RMSE)
print("Average RMSE: {}\nAverage MAE: {}".format( sum(rmse) / len(rmse), sum(mae) / len(mae) ))

Average RSME: 1.1303531586206228
Average MAE: 0.8312542012217421
