In [1]:
import pandas as pd
import numpy as np
from surprise import KNNBasic
from surprise import Dataset, Reader
from surprise.model_selection import KFold, cross_validate
from tabulate import tabulate

# Read and load data

In [2]:
# Read both the datasets from the files using pandas
movielens_df = pd.read_csv("../data/u.data", sep="\t", header=None)
movielens_df.columns = ["userID", "itemID", "rating", "timestamp"]
pda_df = pd.read_csv("../data/train-PDA2018.csv", sep=",")
print(movielens_df.head())
print("\n\n")
print(pda_df.head())

   userID  itemID  rating  timestamp
0     196     242       3  881250949
1     186     302       3  891717742
2      22     377       1  878887116
3     244      51       2  880606923
4     166     346       1  886397596



   userID  itemID  rating  timeStamp
0       5     648       5  978297876
1       5    1394       5  978298237
2       5    3534       5  978297149
3       5     104       4  978298558
4       5    2735       5  978297919


In [3]:
# Create the training datasets using Surprise's reader class
reader = Reader(rating_scale=(1,5)) # We have ratings from 1 to 5 so we create the rating scale

# Load the data from the dataframes
movielens_dataset = Dataset.load_from_df(movielens_df.iloc[:,0:3], reader)
pda_dataset = Dataset.load_from_df(pda_df.iloc[:,0:3], reader)

# Build full trainsets to print out the data loaded above
mls_train = movielens_dataset.build_full_trainset()
pda_train = pda_dataset.build_full_trainset()

# Print out some basic information about the datasets
print("General information on the training sets we will be using \n")
print("1) Number of items in each dataset", " ML100k:", mls_train.n_items, "PDA:", pda_train.n_items)
print("2) Number of users in each dataset", " ML100k:", mls_train.n_users, "PDA:", pda_train.n_users)
print("3) Number of ratings in each dataset", " ML100k:", mls_train.n_ratings, "PDA:", pda_train.n_ratings)
print("4) Mean rating", " ML100k:", mls_train.global_mean, "PDA:", pda_train.global_mean)

General information on the training sets we will be using 

1) Number of items in each dataset  ML100k: 1682 PDA: 1824
2) Number of users in each dataset  ML100k: 943 PDA: 5690
3) Number of ratings in each dataset  ML100k: 100000 PDA: 470711
4) Mean rating  ML100k: 3.52986 PDA: 3.638361967321775


# KNN Collaborative Filtering 

In [4]:
# List that will contain the RMSE and MAE results
results_table = []
# Define number of k splits for cross validation
kf = KFold(n_splits=5) 
# Define the two different options that will be used: User-based and Item-based similarity
knn_titles = ("ubKNN", "ibKNN")
ub_options = {'name': 'cosine', 'user_based': True}
ib_options = {'name': 'cosine', 'user_based': False}
knn_sim_options = {"ubKNN": ub_options , "ibKNN": ib_options}

In [5]:
# Run 5 fold cross validation to see how the user-based and item-based KNN cf algorithms will perform
# We need to do this because we can't separate the two algorithms when using the GridSearchCv function
# Hence we run the cross validation for both algorithms manually and then save the respective results

## RUN 1: MOVIELENS
print("Running 5-fold cross validation with Random Predictor on MovieLens-100K dataset ...")
fold_n = 0
# Run 5 fold cross validation
for title in knn_titles:    
    out = cross_validate(KNNBasic(k=50, min_k=1, sim_options=knn_sim_options[title]), movielens_dataset, ["rmse", "mae"], kf)
    mean_rmse = '{:.3f}.'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}.'.format(np.mean(out['test_mae']))
    new_line = ["ML100-"+title, mean_rmse, mean_mae]
    results_table.append(new_line)
    fold_n += 1

Running 5-fold cross validation with Random Predictor on MovieLens-100K dataset ...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [6]:
## RUN 1: PDA2018
print("Running 5-fold cross validation with Random Predictor on MovieLens-100K dataset ...")
fold_n = 0
# Run 5 fold cross validation
for title in knn_titles:    
    out = cross_validate(KNNBasic(k=50, min_k=1, sim_options=knn_sim_options[title]), pda_dataset, ["rmse", "mae"], kf)
    mean_rmse = '{:.3f}.'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}.'.format(np.mean(out['test_mae']))
    new_line = ["PDA-"+title, mean_rmse, mean_mae]
    results_table.append(new_line)
    fold_n += 1

Running 5-fold cross validation with Random Predictor on MovieLens-100K dataset ...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [8]:
# Display results of running the algorithms
results_table_headers = ["Recommender", "RMSE", "MAE"]
print(tabulate(results_table, results_table_headers, tablefmt="pipe"))

| Recommender   | RMSE   | MAE    |
|:--------------|:-------|:-------|
| ML100-ubKNN   | 1.016. | 0.804. |
| ML100-ibKNN   | 1.025. | 0.810. |
| PDA-ubKNN     | 0.974. | 0.765. |
| PDA-ibKNN     | 0.995. | 0.776. |


In [9]:
# Export the results to a csv file
results_df = pd.DataFrame(results_table, columns=["Recommender", "RMSE", "MAE"])
results_df.to_csv("../data/knn_algorithms_results.csv")