In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression

ratings = pd.read_table('ml-1m/ratings.dat',  sep = '::', engine = 'python', header = None, names= ['UserID', 'MovieID', 'Rating', 'Timestamp'])

In [2]:
kf = KFold(n_splits = 5)
for train, test in kf.split(ratings):
    print(train, test)

[ 200042  200043  200044 ... 1000206 1000207 1000208] [     0      1      2 ... 200039 200040 200041]
[      0       1       2 ... 1000206 1000207 1000208] [200042 200043 200044 ... 400081 400082 400083]
[      0       1       2 ... 1000206 1000207 1000208] [400084 400085 400086 ... 600123 600124 600125]
[      0       1       2 ... 1000206 1000207 1000208] [600126 600127 600128 ... 800165 800166 800167]
[     0      1      2 ... 800165 800166 800167] [ 800168  800169  800170 ... 1000206 1000207 1000208]


In [3]:
rmseglobalavg = []
maeglobalavg = []
rmseuseravg = []
maeuseravg = []
rmseitemavg = []
maeitemavg = []

for trainid, testid in kf.split(ratings):
    
    X_train = ratings[['UserID','MovieID']].iloc[trainid]
    X_test = ratings[['UserID','MovieID']].iloc[testid]
    y_train = ratings['Rating'].iloc[trainid]
    y_test = ratings['Rating'].iloc[testid]
    
    # define averages
    global_avg = y_train.mean()
    user_avg = ratings[['UserID','Rating']].iloc[trainid].groupby(['UserID']).mean()
    item_avg = ratings[['MovieID','Rating']].iloc[trainid].groupby(['MovieID']).mean()
 
    # create lists to put results in
    global_avg_pred = []
    user_avg_pred = []
    item_avg_pred = []
    
    # loop over test set
    for x in testid:
       
        # predict using global average
        global_avg_pred.append(global_avg)
    
        # predict using user average
        if ratings['UserID'][x] in user_avg.index.tolist():
            user_rating = user_avg.loc[ratings['UserID'][x]]
            user_avg_pred.append(user_rating[0])
        else: user_avg_pred.append(global_avg)
        
        # predict using item average
        if ratings['MovieID'][x] in item_avg.index.tolist():
            item_rating = item_avg.loc[ratings['MovieID'][x]]
            item_avg_pred.append(item_rating[0])
        else: item_avg_pred.append(global_avg)

    # append results to lists
    rmseglobalavg.append(math.sqrt(mean_squared_error(y_test, global_avg_pred)))
    maeglobalavg.append(mean_absolute_error(y_test, global_avg_pred))
    
    rmseuseravg.append(math.sqrt(mean_squared_error(y_test,user_avg_pred)))
    maeuseravg.append(mean_absolute_error(y_test, user_avg_pred))
    
    rmseitemavg.append(math.sqrt(mean_squared_error(y_test, item_avg_pred)))
    maeitemavg.append(mean_absolute_error(y_test, item_avg_pred))

# print averages of lists
print('global rmse: ', sum(rmseglobalavg)/len(rmseglobalavg))
print('global mae: ', sum(maeglobalavg)/len(maeglobalavg))
print('user rmse: ', sum(rmseuseravg)/len(rmseuseravg))
print('user mae: ', sum(maeuseravg)/len(maeuseravg))
print('item rmse: ', sum(rmseitemavg)/len(rmseitemavg))
print('item mae: ', sum(rmseitemavg)/len(maeitemavg))

global rmse:  1.117236564425969
global mae:  0.9339855110655619
user rmse:  1.1172260232723568
user mae:  0.9339513695616997
item rmse:  0.9800497988312646
item mae:  0.9800497988312646


In [4]:
# lists to put results in
model1rmseavg = []
model2rmseavg = []
model1maeavg = []
model2maeavg = []

for trainid, testid in kf.split(ratings):
    
    X_train = ratings[['UserID','MovieID']].iloc[trainid]
    X_test = ratings[['UserID','MovieID']].iloc[testid]
    y_train = ratings['Rating'].iloc[trainid]
    y_test = ratings['Rating'].iloc[testid]
    
    # predict using alpha*user + beta*item
    model1 = LinearRegression(fit_intercept = False)
    model1.fit(X_train, y_train)
    pred1 = model1.predict(X_test)
    
    # predict using alpha*user + beta*item + gamma
    model2 = LinearRegression()
    model2.fit(X_train, y_train)
    pred2 = model2.predict(X_test)
    
    # compute rmse and mae
    rmse1 = math.sqrt(mean_squared_error(y_test, pred1))
    mae1 = mean_absolute_error(y_test, pred1)
    rmse2 = math.sqrt(mean_squared_error(y_test, pred2))
    mae2 = mean_absolute_error(y_test, pred2)
    
    # add results to list
    model1rmseavg.append(rmse1)
    model2rmseavg.append(rmse2)
    model1maeavg.append(mae1)
    model2maeavg.append(mae2)
    
# print averages of lists
print('model 1 rmse: ', sum(model1rmseavg)/len(model1rmseavg))
print('model 1 mae: ', sum(model1maeavg)/len(model1maeavg))
print('model 2 rmse: ', sum(model2rmseavg)/len(model2rmseavg))
print('model 2 mae: ', sum(model2maeavg)/len(model2maeavg))

model 1 rmse:  1.9266180062806786
model 1 mae:  1.6152489879455783
model 2 rmse:  1.1149420397630698
model 2 mae:  0.930941138313008
