In [37]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os
import io
import numpy as np
import pandas as pd
import matplotlib
# Force matplotlib to not use any Xwindows backend.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import math
import cPickle
import os
import json
import pickle
from surprise import Dataset, evaluate, accuracy, print_perf
from surprise import KNNBasic
%matplotlib inline
import random
random.seed(0)
# Force matplotlib to not use any Xwindows backend.

In [2]:
# Load the MovieLens 100k dataset (downloaded on first use, see the prompt
# below) and build a training set from the whole dataset; nothing is held
# out at this point and no rating threshold is applied.
data = Dataset.load_builtin("ml-100k")
trainingSet = data.build_full_trainset()

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


In [3]:
# Similarity settings for the k-NN model: cosine similarity, and
# 'user_based': False asks for similarities between items rather than users.
# NOTE(review): no 'min_support' is set, so two items with very few common
# raters can presumably reach a similarity of 1.0 -- this may explain obscure
# titles showing up as nearest neighbors further down; confirm.
sim_options = {'name': 'cosine','user_based': False}
knn = KNNBasic(sim_options=sim_options)

In [4]:
# Fit the model on the full training set; this is the step that computes the
# cosine similarity matrix (see the output below).
# NOTE(review): `train` is the older Surprise entry point; newer releases
# name it `fit` -- confirm against the installed Surprise version.
knn.train(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [5]:
def read_item_names(file_name=None):
    """Read a MovieLens 100k ``u.item`` file and return two mappings.

    Each line of the file is ``|``-separated; the first field is the raw item
    id and the second field is the movie name.

    Args:
        file_name: Path of the ``u.item`` file. When omitted, the copy that
            Surprise downloads into ``~/.surprise_data/ml-100k/ml-100k/`` is
            used.

    Returns:
        A ``(rid_to_name, name_to_rid)`` tuple of dicts: raw id -> movie name
        and movie name -> raw id. Raw ids are kept as strings, exactly as
        read from the file. If several items share the same name, the one
        appearing last in the file wins in ``name_to_rid``.

    Raises:
        IOError / OSError: if the file cannot be opened.
    """
    if file_name is None:
        file_name = os.path.join(os.path.expanduser('~'), '.surprise_data',
                                 'ml-100k', 'ml-100k', 'u.item')

    rid_to_name = {}
    name_to_rid = {}
    # u.item is not UTF-8; it is decoded as ISO-8859-1 (Latin-1).
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            fields = line.split('|')
            # Skip blank or malformed lines instead of failing with an
            # IndexError when the name field is missing.
            if len(fields) < 2:
                continue
            raw_id, name = fields[0], fields[1]
            rid_to_name[raw_id] = name
            name_to_rid[name] = raw_id

    return rid_to_name, name_to_rid

In [6]:
# Read the mappings raw id <-> movie name from the downloaded u.item file.
# Both are dicts; raw ids are the strings found in the file.
rid_to_name, name_to_rid = read_item_names()

In [13]:
# Retrieve the inner id of the movie Toy Story: look up its raw id by title,
# then convert that raw id into the inner id used by the model's trainset.
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = knn.trainset.to_inner_iid(toy_story_raw_id)

In [23]:
# Retrieve the inner ids of the 10 nearest neighbors of Toy Story
# (k=10) according to the fitted similarity matrix.
toy_story_neighbors = knn.get_neighbors(toy_story_inner_id, k=10)

In [24]:
# Convert inner ids of the neighbors into names: inner id -> raw id -> title.
# Lists are built on purpose instead of generator expressions: a generator is
# exhausted after a single pass, so re-running the cell that prints the
# neighbors would silently print nothing.
# Note: this rebinds `toy_story_neighbors` from inner ids to titles, so the
# get_neighbors cell must be run again before re-running this one.
toy_story_neighbors = [knn.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors]
toy_story_neighbors = [rid_to_name[rid]
                       for rid in toy_story_neighbors]

In [25]:
# Print a blank separator line, a header, then one neighbor title per line.
# NOTE: the "10" in the header is hard-coded; keep it in sync with the `k`
# passed to get_neighbors.
print()
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)


The 10 nearest neighbors of Toy Story are:
So Dear to My Heart (1949)
My Life and Times With Antonin Artaud (En compagnie d'Antonin Artaud) (1993)
Somebody to Love (1994)
Crows and Sparrows (1949)
Total Eclipse (1995)
Mr. Jones (1993)
Convent, The (Convento, O) (1995)
Incognito (1997)
Every Other Weekend (1990)
Homage (1995)


In [29]:
# Evaluate performances of our algorithm on the dataset.
# data.split(n_folds=3) sets up 3 cross-validation folds on `data`;
# evaluate() then refits `knn` once per fold (hence the repeated
# "Computing the cosine similarity matrix..." lines in the output) and
# reports RMSE and MAE per fold together with their means.
data.split(n_folds=3)
perf = evaluate(knn, data, measures=['RMSE', 'MAE'])

print_perf(perf)

Evaluating RMSE, MAE of algorithm KNNBasic.

------------
Fold 1
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0362
MAE:  0.8211
------------
Fold 2
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0374
MAE:  0.8248
------------
Fold 3
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0346
MAE:  0.8220
------------
------------
Mean RMSE: 1.0361
Mean MAE : 0.8226
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
MAE     0.8211  0.8248  0.8220  0.8226  
RMSE    1.0362  1.0374  1.0346  1.0361  


In [48]:
# Manual cross-validation over the folds prepared by data.split(): refit the
# model on each fold and keep a running mean of the per-fold RMSE.
average_rmse = 0
count_folds = 0
for count_folds, (trainset, testset) in enumerate(data.folds(), 1):
    # Refit on this fold's training part, then predict its held-out part.
    knn.train(trainset)
    predictions = knn.test(testset)

    # accuracy.rmse prints the fold's RMSE (verbose=True) and returns it.
    rmse = accuracy.rmse(predictions, verbose=True)

    # Incremental mean: re-weight the previous mean by the number of folds
    # already seen, add the new value, divide by the new fold count.
    average_rmse = ((count_folds - 1) * average_rmse + rmse) / count_folds
print(average_rmse)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0362
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0374
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0346
1.03608180578


In [50]:
# Start from a copy of the similarity settings used for the model and attach
# the cross-validated RMSE, so both are reported together.
stats = dict(sim_options)
stats.update({'rmse': average_rmse})

In [51]:
# Persist the model and its evaluation stats into the directory named by the
# OUTPUT_DIR environment variable (a KeyError is raised if it is not set).
# NOTE(review): `knn` was last refit inside the cross-validation loop, so the
# pickled model is presumably the one trained on the final fold's training
# split rather than on `trainingSet` -- confirm this is intended.
output_dir = os.environ['OUTPUT_DIR']

# Use context managers so both files are flushed and closed even on error;
# the original left the model file handle open.
model_filename = os.path.join(output_dir, 'model.dat')
with open(model_filename, 'wb') as f:
    pickle.dump(knn, f)

# JSON is text: open in text mode and let json.dump write to the file
# (writing json.dumps() output to a binary-mode file fails on Python 3).
stats_filename = os.path.join(output_dir, 'stats.json')
with open(stats_filename, 'w') as f:
    json.dump(stats, f)