In [6]:
import pandas as pd
import os
import numpy as np
import pandas as pd
import random
from surprise import AlgoBase
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate


# naive collaborative filter
class NaiveCollaborativeFilter(AlgoBase):
    def __init__(self):
        AlgoBase.__init__(self)

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)

        # Compute rating means of each user
        
        self.trainset = trainset
        self.the_means = {}
        for key in self.trainset.ur:
            urs = self.trainset.ur[key]
            mean = np.mean([r for (_, r) in urs])
            self.the_means[key] = mean

        return self

    def estimate(self, u, i):
        if self.the_means.__contains__(u):
            return self.the_means[u]
        else:
            return 3


# calculate RMSE
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

# read csv file
file_path = os.path.expanduser('ratings.csv')
df = pd.read_csv(file_path)
del df['timestamp']

# shuffle data
df = df.sample(frac=1).reset_index(drop=True)

# 10 fold cross validation
def cross_validation(dataset):
    testlen = int(len(df) / 10)
    total_rmse = 0
    print(dataset + ':')
    for i in range(10):
        # split data into 10 pieces
        sp = int(len(df) / 10 * i)
        df1 = df[0:sp]
        df2 = df[sp:sp+testlen]
        df3 = df[sp+testlen:]

        traindf = pd.concat([df1,df3],ignore_index=True)
        reader1 = Reader(rating_scale=(0, 5))
        trainset = Dataset.load_from_df(traindf[['userId', 'movieId', 'rating']], reader1)

        alg = NaiveCollaborativeFilter()
        alg.fit(trainset.build_full_trainset())

        if dataset == "Popular Movies Dataset":
            df2['size'] = df2.groupby(['movieId']).movieId.transform(np.size)
            df2 = df2[df2['size'] > 2]

        if dataset == "Unpopular Movies Dataset":
            df2['size'] = df2.groupby(['movieId']).movieId.transform(np.size)
            df2 = df2[df2['size'] <= 2]

        if dataset == "High Variance Movies Dataset":
            df2['size'] = df2.groupby(['movieId']).movieId.transform(np.size)
            df2 = df2[df2['size'] >= 5]
            df2['var'] = df2['rating'].groupby(df2['movieId']).transform(lambda arr:np.mean((arr - arr.mean()) ** 2))
            df2 = df2[df2['var'] >= 2]
    
        reader2 = Reader(rating_scale=(0, 5))
        testset = Dataset.load_from_df(df2[['userId', 'movieId', 'rating']], reader2)
        testset = [(u, i, r) for (u, i, r) in testset.build_full_trainset().all_ratings()]

        prediction = alg.test(testset)
    
        real = []
        est = []

        for j in range(len(prediction)):
            if not prediction[j][4]['was_impossible']:
                real.append(prediction[j][2])
                est.append(prediction[j][3])
    
        cur_rmse = rmse(np.array(real), np.array(est))
        total_rmse += cur_rmse
        print(str(i) + 'th rmse: ' + str(cur_rmse))

    final_rmse = total_rmse / 10
    print('final_rmse: ' + str(final_rmse))
    print('')


# Q30
cross_validation("MovieLens Dataset")

# Q31
cross_validation("Popular Movies Dataset")

# Q32
cross_validation("Unpopular Movies Dataset")

# Q33
cross_validation("High Variance Movies Dataset")



MovieLens Dataset:
0th rmse: 1.15319463737
1th rmse: 1.14389480659
2th rmse: 1.15734477245
3th rmse: 1.21532744424
4th rmse: 1.15294208088
5th rmse: 1.17703937777
6th rmse: 1.1478984003
7th rmse: 1.16861599925
8th rmse: 1.15323474304
9th rmse: 1.18447609394
final_rmse: 1.16539683558

Popular Movies Dataset:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0th rmse: 1.11389432345
1th rmse: 1.11899046084
2th rmse: 1.13005495014
3th rmse: 1.14275835467
4th rmse: 1.12081443161
5th rmse: 1.12930641032
6th rmse: 1.11669462453
7th rmse: 1.16207680673
8th rmse: 1.12030956882
9th rmse: 1.13291330696
final_rmse: 1.12878132381

Unpopular Movies Dataset:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0th rmse: 1.26192887263
1th rmse: 1.24698972759
2th rmse: 1.21769804237
3th rmse: 1.2375469453
4th rmse: 1.27804932611
5th rmse: 1.20416857056
6th rmse: 1.19178394501
7th rmse: 1.22423543439
8th rmse: 1.29173062405
9th rmse: 1.24154045264
final_rmse: 1.23956719407

High Variance Movies Dataset:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0th rmse: 1.648106986
1th rmse: 1.73466275699
2th rmse: 1.6322325272
3th rmse: 1.81732086691
4th rmse: 1.75057562058
5th rmse: 1.62791123059
6th rmse: 1.5612770346
7th rmse: 1.71355764988
8th rmse: 1.70521625862
9th rmse: 1.73167886385
final_rmse: 1.69225397952

