In [7]:
%matplotlib inline
import numpy as np
import scipy
import scipy.io
import scipy.sparse as sp
import matplotlib.pyplot as plt
import surprise
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# k-NN Baseline

### Movie-movie similarities

In [1]:
from surprise.model_selection import GridSearchCV
from surprise.model_selection import KFold
from surprise import KNNBaseline
from surprise import Dataset
from surprise import Reader

# path to dataset file
file_path = "../data/data_surprise.csv"

# Lines are "user,item,rating"; one line is skipped at the top of the file.
reader = Reader(line_format='user item rating', sep=',', skip_lines=1)

data = Dataset.load_from_file(file_path, reader=reader)

# Seed for the cross-validation shuffle, so that re-running this cell scores
# every parameter combination on the same folds.
CV_SEED = 0

# Item-based neighbours (user_based=False); baselines are estimated with SGD.
print('KNN Baseline')
param_grid = {'k': [20, 40, 60],
              'bsl_options': {'method': ['sgd']},
              'sim_options': {'name': ['pearson_baseline'],
                              'min_support': [2, 3, 4, 5],
                              'user_based': [False],
                              'shrinkage' : [100,110,120] }
              }

# Same 2-fold scheme as cv=2, but with a fixed random_state for reproducibility.
cv_folds = KFold(n_splits=2, random_state=CV_SEED, shuffle=True)
gs = GridSearchCV(KNNBaseline, param_grid, measures=['rmse', 'mae'], cv=cv_folds)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

KNN Baseline
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearso

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
1.0089916625867201
{'k': 60, 'bsl_options': {'method': 'sgd'}, 'sim_options': {'name': 'pearson_baseline', 'min_support': 2, 'user_based': False, 'shrinkage': 120}}


In [None]:
from surprise.model_selection import train_test_split
from surprise import accuracy

# Fixed seed so the 90/10 hold-out split (and the RMSE below) is the same on
# every run.
SPLIT_SEED = 0

# NOTE: the hold-out is drawn from the same `data` the grid search was fitted
# on, so this RMSE is not independent of the parameter selection.
trainset, testset = train_test_split(data, test_size=.10, random_state=SPLIT_SEED)

# Estimator configured with the parameters that gave the best RMSE in `gs`.
algo = gs.best_estimator['rmse']

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

### User-user similarities

In [4]:
from surprise.model_selection import GridSearchCV
from surprise.model_selection import KFold
from surprise import KNNBaseline
from surprise import Dataset
from surprise import Reader

# path to dataset file
file_path = "../data/data_surprise.csv"
# Lines are "user,item,rating"; one line is skipped at the top of the file.
reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
data = Dataset.load_from_file(file_path, reader=reader)

# Seed for the cross-validation shuffle, so that re-running this cell scores
# the configuration on the same folds.
CV_SEED = 0

# User-based neighbours (user_based=True); baselines are estimated with ALS.
print('KNN Baseline')
param_grid = {'k': [400],
              'bsl_options': {'method': ['als'],
                             'n_epochs': [20],
                             'reg_u': [15],
                             'reg_i': [0.01]},
              'sim_options': {'name': ['pearson_baseline'],
                              'min_support': [1],
                              'user_based': [True],
                              'shrinkage' : [100]}
              }

# Same 2-fold scheme as cv=2, but with a fixed random_state for reproducibility.
cv_folds = KFold(n_splits=2, random_state=CV_SEED, shuffle=True)
gs = GridSearchCV(KNNBaseline, param_grid, measures=['rmse', 'mae'], cv=cv_folds)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

KNN Baseline
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
0.9989276162454241
{'k': 380, 'bsl_options': {'method': 'als', 'n_epochs': 20, 'reg_u': 15, 'reg_i': 0.01}, 'sim_options': {'name': 'pearson_baseline', 'min_support': 1, 'user_based': True, 'shrinkage': 100}}


### Ratings that have to be predicted (items and users specified in sample_submission file)

In [8]:
from data_helpers import read_csv_sample

# Sample-submission file naming the entries whose ratings must be predicted.
INPUT_PATH = "../data/sample_submission.csv"
# `ids` is indexed as ids[0][i] / ids[1][i] by the prediction cell further down
# (presumably two parallel sequences of user and item ids; confirm in data_helpers).
ids = read_csv_sample(INPUT_PATH)

In [10]:
# Hyper-parameters of the final user-based model.
# Note: n_epochs is 50 here, whereas the user-user grid search used 20.
bsl_options = dict(method='als', n_epochs=50, reg_u=15, reg_i=0.01)
sim_options = dict(name='pearson_baseline', min_support=1,
                   user_based=True, shrinkage=100)

# Build the training set from the whole dataset (no hold-out split here).
trainset = data.build_full_trainset()

algo = KNNBaseline(k=400, bsl_options=bsl_options, sim_options=sim_options)
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x173bf848>

In [11]:
# Predict one rating per requested entry. ids[0][n] is passed to the model as
# the user id and ids[1][n] as the item id, both as strings; each estimate is
# rounded to the nearest integer.
first_ids, second_ids = ids[0], ids[1]
predictions = [
    round(algo.predict(str(first_ids[n]), str(second_ids[n])).est)
    for n in range(len(first_ids))
]

print(len(predictions))

1176952


### Save output for submission

In [12]:
from data_helpers import create_csv_submission

OUTPUT_PATH = "../data/submissionKNNBaseline.csv"
create_csv_submission(ids, predictions, OUTPUT_PATH)
# Build the message from OUTPUT_PATH so it always names the file actually written.
print("File {} ready to be submitted !".format(OUTPUT_PATH))

File submission.csv ready to be submitted !
