<a href="https://colab.research.google.com/github/Ruruthia/Boardgames-recommending-system/blob/master/Tests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import pickle
import os

import pandas as pd
import numpy as np

from surprise import SVD
from surprise import NMF
from surprise import SlopeOne
from surprise import Reader
from surprise.dataset import DatasetAutoFolds

from tqdm.auto import tqdm

from time import time

In [2]:
from tools import testing

# Preparing dataset

In [3]:
# Parse only the three columns used downstream (usecols skips the rest of the
# file while reading). usecols does not guarantee column order, so the frame is
# re-indexed with the same list to keep the order user, item, rating.
# Rows with any missing value are dropped.
rating_columns = ['bgg_user_name', 'bgg_id', 'bgg_user_rating']
ratings_df = pd.read_csv('./data/ratings.csv.gz', compression='gzip', usecols=rating_columns)[rating_columns].dropna(how='any')

In [4]:
ratings_df

Unnamed: 0,bgg_user_name,bgg_id,bgg_user_rating
0,mycroft,5,8.0
1,mycroft,13,9.0
8,mycroft,278,6.5
13,mycroft,823,7.0
25,mycroft,3284,4.0
...,...,...,...
22855491,æleksandr þræð,2223,8.0
22855492,æleksandr þræð,2399,7.0
22855493,æleksandr þræð,2932,6.0
22855494,æleksandr þræð,5451,8.0


In [5]:
# Keep only the rows whose rating is at least 1.
has_valid_rating = ratings_df['bgg_user_rating'].ge(1)
ratings_df = ratings_df[has_valid_rating]

In [6]:
train_df, test_df = testing.split_ratings_dataset(ratings_df, seed=42)

In [7]:
test_known, test_unknown = testing.split_testing_set(test_df, seed=42)

In [8]:
games_df = pd.read_json('./data/bgg_GameItem.jl', lines = True)

In [9]:
def get_errors(model, test_df):
    """Return the signed prediction errors of ``model`` on ``test_df``.

    Parameters
    ----------
    model
        Object exposing ``predict(uid=..., iid=...)`` whose result has an
        ``est`` attribute holding the estimated rating.
    test_df : pandas.DataFrame
        Must contain the columns ``bgg_user_name``, ``bgg_id`` and
        ``bgg_user_rating``.

    Returns
    -------
    numpy.ndarray
        1-D float array with ``estimate - true rating`` for every row, in row
        order. Empty when ``test_df`` has no rows.
    """
    # Iterate the three columns directly instead of using iterrows(): iterrows
    # builds a Series per row (slow on large frames) and upcasts each row to a
    # single common dtype.
    rows = zip(test_df['bgg_user_name'], test_df['bgg_id'], test_df['bgg_user_rating'])
    errors = [model.predict(uid=user, iid=item).est - true_rating
              for user, item, true_rating in rows]
    return np.array(errors, dtype=float)


def MSE(errors):
    """Return the mean squared error of ``errors``.

    ``errors`` may be a NumPy array or any sequence of numbers (list, tuple,
    pandas Series). It is converted to a float array first, so plain Python
    sequences work as well as arrays.
    """
    errors = np.asarray(errors, dtype=float)
    return np.mean(errors**2)


def RMSE(errors):
    """Return the root mean squared error: the square root of ``MSE(errors)``."""
    mean_squared_error = MSE(errors)
    return np.sqrt(mean_squared_error)


In [10]:
def get_top_n(model, trainset, n=20, it=None):
    """Build the top-``n`` recommendations for a set of users.

    For every inner user id in ``it``, the model scores each item the user has
    not rated in ``trainset``, and the ``n`` highest estimates are kept.

    Parameters
    ----------
    model
        Object exposing ``test(testset)``; every prediction it returns must
        unpack into five values ``(uid, iid, r_ui, est, details)``.
    trainset
        Object exposing ``n_users``, ``n_items``, ``ur`` (per-user list of
        ``(inner item id, rating)`` pairs), ``to_raw_uid`` and ``to_raw_iid``.
    n : int
        Number of recommendations kept per user.
    it : iterable of int, optional
        Inner user ids to process. Defaults to every user in ``trainset``.

    Returns
    -------
    pandas.DataFrame
        Columns ``bgg_user_name``, ``bgg_id``, ``estimate``. Rows are grouped
        per user in iteration order and sorted by estimate, highest first.
        The frame is empty (but has the columns) when nothing is recommended.
    """
    # Identity check: `it == None` is ambiguous (or raises) for array-like `it`.
    if it is None:
        it = range(trainset.n_users)

    top_n = []
    for u in tqdm(it):
        # Inner ids of the items this user already rated. fromiter also works
        # for a user without ratings, where 2-D slicing would raise.
        rated = np.fromiter((item for item, _ in trainset.ur[u]), dtype=int)
        # A boolean mask gives the unrated items in ascending order and stays
        # correct even if an item appears more than once in the user's ratings.
        unseen = np.ones(trainset.n_items, dtype=bool)
        unseen[rated] = False
        items = np.flatnonzero(unseen)

        raw_uid = trainset.to_raw_uid(u)  # loop-invariant, looked up once
        # The third element is a placeholder "true" rating; it is not used in
        # the result.
        user_anti_testset = [(raw_uid, trainset.to_raw_iid(i), 0) for i in items]
        predictions = model.test(user_anti_testset)

        users_top_n = [(uid, iid, est) for uid, iid, _, est, _ in predictions]
        users_top_n.sort(key=lambda x: x[2], reverse=True)
        top_n += users_top_n[:n]

    # Passing the column names to the constructor also works for an empty
    # result, where assigning `.columns` afterwards would raise.
    return pd.DataFrame(top_n, columns=['bgg_user_name', 'bgg_id', 'estimate'])

In [11]:
# Ratings are declared to lie on a 1-10 scale.
reader = Reader(rating_scale=(1,10))
# Fit on train_df together with test_known. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0;
# like append, it keeps the original row index.
trainset = DatasetAutoFolds.load_from_df(pd.concat([train_df, test_known]), reader).build_full_trainset()

# Tests

## SVD

In [12]:
algo = SVD()

In [13]:
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb919f4a0d0>

### Errors on test set

In [14]:
errors = get_errors(algo, test_unknown)

In [15]:
print(MSE(errors))
print(RMSE(errors))

1.402414273730857
1.184235734020409


### Coverage / diversity

In [19]:
# Top-10 recommendations for the first tenth of the users only
# (inner ids 0 .. n_users//10 - 1).
# NOTE(review): presumably subsampled to keep the run time manageable — confirm.
top_10_df = get_top_n(algo, trainset, n=10, it=range(0, trainset.n_users//10))

  0%|          | 0/24304 [00:00<?, ?it/s]

In [29]:
top_10_df.to_csv('./data/svd_top_10.csv', header=True, index=False)

In [14]:
testing.coverage(top_10_df)

1594

In [15]:
testing.diversity(top_10_df, games_df)

{'category': 19.997819289005925, 'mechanic': 41.60479756418697}

## NMF

In [16]:
algo = NMF()

In [17]:
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7f648a4faa00>

### Errors on test set

In [18]:
errors = get_errors(algo, test_unknown)

In [19]:
print(MSE(errors))
print(RMSE(errors))

3.5376357412922927
1.8808603726200126


### Coverage / diversity

In [20]:
# Top-10 recommendations for the first tenth of the users only
# (inner ids 0 .. n_users//10 - 1).
# NOTE(review): presumably subsampled to keep the run time manageable — confirm.
top_10_df = get_top_n(algo, trainset, n=10, it=range(0, trainset.n_users//10))

  0%|          | 0/24304 [00:00<?, ?it/s]

In [23]:
top_10_df.to_csv('./data/nmf_top_10.csv', header=True, index=False)

In [24]:
testing.coverage(top_10_df)

586

In [25]:
testing.diversity(top_10_df, games_df)

{'category': 19.869033903884134, 'mechanic': 40.18832290980909}

## Slope One

In [12]:
algo = SlopeOne()

In [13]:
algo.fit(trainset)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  algo.fit(trainset)


<surprise.prediction_algorithms.slope_one.SlopeOne at 0x7f00649694f0>

### Errors on test set

In [15]:
errors = get_errors(algo, test_unknown)

In [16]:
print(MSE(errors))
print(RMSE(errors))

1.470237539077332
1.2125335208056442


### Coverage / diversity

In [14]:
# Top-10 recommendations for the first tenth of the users only
# (inner ids 0 .. n_users//10 - 1).
# NOTE(review): presumably subsampled to keep the run time manageable — confirm.
top_10_df = get_top_n(algo, trainset, n=10, it=range(0, trainset.n_users//10))

  0%|          | 0/24304 [00:00<?, ?it/s]

In [15]:
top_10_df.to_csv('./data/slope_one_top_10.csv', header=True, index=False)

In [16]:
testing.coverage(top_10_df)

874

In [17]:
testing.diversity(top_10_df, games_df)

{'category': 16.71704246214615, 'mechanic': 33.20276497695853}