In [47]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression

users = pd.read_table('ml-1m/users.dat',  sep = '::', engine = 'python', header = None, names= ['ID', 'Gender', 'Age', 'Occupation', 'Zip-code'])
ratings = pd.read_table('ml-1m/ratings.dat',  sep = '::', engine = 'python', header = None, names= ['UserID', 'MovieID', 'Rating', 'Timestamp'])
#movies = pd.read_table('ml-1m/movies.dat',  sep = '::', header = None, names= ['ID', 'Title', 'Genre'])

In [48]:
kf = KFold(n_splits = 5)
for train, test in kf.split(ratings):
    print(train, test)

[ 200042  200043  200044 ... 1000206 1000207 1000208] [     0      1      2 ... 200039 200040 200041]
[      0       1       2 ... 1000206 1000207 1000208] [200042 200043 200044 ... 400081 400082 400083]
[      0       1       2 ... 1000206 1000207 1000208] [400084 400085 400086 ... 600123 600124 600125]
[      0       1       2 ... 1000206 1000207 1000208] [600126 600127 600128 ... 800165 800166 800167]
[     0      1      2 ... 800165 800166 800167] [ 800168  800169  800170 ... 1000206 1000207 1000208]


In [None]:
rmseglobalavg = []
maeglobalavg = []
rmseuseravg = []
maeuseravg = []
rmseitemavg = []
maeitemavg = []

for trainid, testid in kf.split(ratings):
    
    X_train = ratings[['UserID','MovieID']].iloc[trainid]
    X_test = ratings[['UserID','MovieID']].iloc[testid]
    y_train = ratings['Rating'].iloc[trainid]
    y_test = ratings['Rating'].iloc[testid].to_numpy()[:500]
    
    global_avg = ratings['Rating'].iloc[trainid].mean()
    user_avg = ratings[['UserID','Rating']].iloc[trainid].groupby(['UserID']).mean()
    item_avg = ratings[['MovieID','Rating']].iloc[trainid].groupby(['MovieID']).mean()
 
    global_avg_pred = []
    user_avg_pred = []
    item_avg_pred = []
    
    for x in testid:
       
        # predicting using global average
        global_avg_pred.append(global_avg)
    
        # predicting using user average
        if ratings['UserID'][x] in user_avg.index.tolist():
            user_rating = user_avg.loc[ratings['UserID'][x]]
            user_avg_pred.append(user_rating[0])
        else: user_avg_pred.append(global_avg)
        
        # predicting using item average
        if ratings['MovieID'][x] in item_avg.index.tolist():
            item_rating = item_avg.loc[ratings['MovieID'][x]]
            item_avg_pred.append(item_rating[0])
        else: item_avg_pred.append(global_avg)

    rmseglobalavg.append(math.sqrt(mean_squared_error(y_test, global_avg_pred)))
    maeglobalavg.append(mean_absolute_error(y_test, global_avg_pred))
    
    rmseuseravg.append(math.sqrt(mean_squared_error(y_test,user_avg_pred)))
    maeuseravg.append(mean_absolute_error(y_test, user_avg_pred))
    
    rmseitemavg.append(math.sqrt(mean_squared_error(y_test, item_avg_pred)))
    maeitemavg.append(mean_absolute_error(y_test, item_avg_pred))

print('global rmse: ', sum(rmseglobalavg)/len(rmseglobalavg))
print('global mae: ', sum(maeglobalavg)/len(maeglobalavg))
print('user rmse: ', sum(rmseuseravg)/len(rmseuseravg))
print('user mae: ', sum(maeuseravg)/len(maeuseravg))
print('item rmse: ', sum(rmseitemavg)/len(rmseitemavg))
print('item mae: ', sum(rmseitemavg)/len(maeitemavg))


In [54]:
for trainid, testid in kf.split(ratings):
    
    X_train = ratings[['UserID','MovieID']].iloc[trainid]
    X_test = ratings[['UserID','MovieID']].iloc[testid]
    y_train = ratings['Rating'].iloc[trainid]
    y_test = ratings['Rating'].iloc[testid]
    
    global_avg = ratings['Rating'][trainid].mean()
    user_avg = ratings[['UserID','Rating']].iloc[trainid].groupby(['UserID']).mean()
    item_avg = ratings[['MovieID','Rating']].iloc[trainid].groupby(['MovieID']).mean()
    
    # predicting using alpha*user + beta*item
    model1 = LinearRegression(fit_intercept = False)
    model1.fit(X_train, y_train)
    pred1 = model1.predict(X_test)
    
    # predicting using alpha*user + beta*item + gamma
    model2 = LinearRegression()
    model2.fit(X_train, y_train)
    pred2 = model2.predict(X_test)
    
    print('model 1 rmse: ', (math.sqrt(mean_squared_error(y_test, pred1))))
    print('model 1 mae: ', (mean_absolute_error(y_test, pred1)))
    print('model 2 rmse: ', (math.sqrt(mean_squared_error(y_test, pred2))))
    print('model 2 mae: ', (mean_absolute_error(y_test, pred2)))
 

model 1 rmse:  2.594056502010988
model 1 mae:  2.308893299827645
model 2 rmse:  1.121259740322577
model 2 mae:  0.9381919125418448
model 1 rmse:  1.9008323938990948
model 1 mae:  1.5939250451705749
model 2 rmse:  1.1256128891853572
model 2 mae:  0.93902034588881
model 1 rmse:  1.613385366181772
model 1 mae:  1.3315984136700838
model 2 rmse:  1.1066897781565865
model 2 mae:  0.9242160160918663
model 1 rmse:  1.5182692200456613
model 1 mae:  1.2333623778367
model 2 rmse:  1.108556981531058
model 2 mae:  0.9222857710619399
model 1 rmse:  2.006546549265877
model 1 mae:  1.608465803222887
model 2 rmse:  1.1125908096197705
model 2 mae:  0.9309916459805792
