In [7]:
import pandas as pd
import os
import numpy as np
import pandas as pd
import random
from surprise import AlgoBase
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate


# naive collaborative filter
class NaiveCollaborativeFilter(AlgoBase):
    def __init__(self):
        AlgoBase.__init__(self)

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)

        # Compute rating means of each user
        
        self.trainset = trainset
        self.the_means = {}
        for key in self.trainset.ur:
            urs = self.trainset.ur[key]
            mean = np.mean([r for (_, r) in urs])
            self.the_means[key] = mean

        return self

    def estimate(self, u, i):
        if self.the_means.__contains__(u):
            return self.the_means[u]
        else:
            return 0


# calculate RMSE
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

# read csv file
file_path = os.path.expanduser('ratings.csv')
df = pd.read_csv(file_path)
del df['timestamp']

# shuffle data
df = df.sample(frac=1).reset_index(drop=True)

# 10 fold cross validation
def cross_validation(dataset):
    testlen = int(len(df) / 10)
    total_rmse = 0
    print(dataset + ':')
    for i in range(10):
        # split data into 10 pieces
        sp = int(len(df) / 10 * i)
        df1 = df[0:sp]
        df2 = df[sp:sp+testlen]
        df3 = df[sp+testlen:]

        traindf = pd.concat([df1,df3],ignore_index=True)
        reader1 = Reader(rating_scale=(0, 5))
        trainset = Dataset.load_from_df(traindf[['userId', 'movieId', 'rating']], reader1)

        alg = NaiveCollaborativeFilter()
        alg.fit(trainset.build_full_trainset())

        if dataset == "Popular Movies Dataset":
            df2['size'] = df2.groupby(['movieId']).movieId.transform(np.size)
            df2 = df2[df2['size'] > 2]

        if dataset == "Unpopular Movies Dataset":
            df2['size'] = df2.groupby(['movieId']).movieId.transform(np.size)
            df2 = df2[df2['size'] <= 2]

        if dataset == "High Variance Movies Dataset":
            df2['size'] = df2.groupby(['movieId']).movieId.transform(np.size)
            df2 = df2[df2['size'] >= 5]
            df2['var'] = df2['rating'].groupby(df2['movieId']).transform(lambda arr:np.mean((arr - arr.mean()) ** 2))
            df2 = df2[df2['var'] >= 2]
    
        reader2 = Reader(rating_scale=(0, 5))
        testset = Dataset.load_from_df(df2[['userId', 'movieId', 'rating']], reader2)
        testset = [(u, i, r) for (u, i, r) in testset.build_full_trainset().all_ratings()]

        prediction = alg.test(testset)
    
        real = []
        est = []

        # remove impossible estimation
        for j in range(len(prediction)):
            if prediction[j][3] > 0:
                real.append(prediction[j][2])
                est.append(prediction[j][3])
    
        cur_rmse = rmse(np.array(real), np.array(est))
        total_rmse += cur_rmse
        print(str(i) + 'th rmse: ' + str(cur_rmse))

    final_rmse = total_rmse / 10
    print('final_rmse: ' + str(final_rmse))
    print('')


# Q30
cross_validation("MovieLens Dataset")

# Q31
cross_validation("Popular Movies Dataset")

# Q32
cross_validation("Unpopular Movies Dataset")

# Q33
cross_validation("High Variance Movies Dataset")



MovieLens Dataset:
0th rmse: 1.15476712817
1th rmse: 1.15735623678
2th rmse: 1.16859816372
3th rmse: 1.1819369104
4th rmse: 1.12457714958
5th rmse: 1.22594269149
6th rmse: 1.17261584038
7th rmse: 1.16500733211
8th rmse: 1.19078396501
9th rmse: 1.13704874591
final_rmse: 1.16786341636

Popular Movies Dataset:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0th rmse: 1.10594678538
1th rmse: 1.13424150769
2th rmse: 1.1151609141
3th rmse: 1.11840348601
4th rmse: 1.10617993931
5th rmse: 1.13109612316
6th rmse: 1.12801368843
7th rmse: 1.13889358829
8th rmse: 1.13915660282
9th rmse: 1.11556658922
final_rmse: 1.12326592244

Unpopular Movies Dataset:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0th rmse: 1.24162702659
1th rmse: 1.25476428266
2th rmse: 1.22036336335
3th rmse: 1.20212493835
4th rmse: 1.22338399487
5th rmse: 1.26857436984
6th rmse: 1.23352417641
7th rmse: 1.25429788402
8th rmse: 1.30843702421
9th rmse: 1.24017573797
final_rmse: 1.24472727983

High Variance Movies Dataset:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0th rmse: 1.7361370555
1th rmse: 1.73029678427
2th rmse: 1.85507264054
3th rmse: 1.64731206701
4th rmse: 1.73093119366
5th rmse: 1.72607151083
6th rmse: 1.68902134972
7th rmse: 1.60922841224
8th rmse: 1.73997652062
9th rmse: 1.68453520706
final_rmse: 1.71485827414

