In [1]:
import sklearn
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import multiprocessing

In [2]:
path = "./ml-1m/ratings.dat" #read the dataset
table = pd.read_table(path, sep="::", names=["UserID", "MovieID", "Rating", "Timestamp"], 
                                   engine="python")
table

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [3]:
def row_col_mean(row_col, index,dataset): #average of the row and col in the dataset
    if row_col == "row":
        return(dataset.loc[(dataset["UserID"]==index), "Rating"].mean())
    else:
        return(dataset.loc[(dataset["MovieID"]==index), "Rating"].mean())

In [4]:
def DataRouter(dataset): 
    """
    data preprocess for linear regression
    dataset: train or test, pd dataframe
    return: X, Y for linear regression
    """
    user_mean = dict()
    movie_mean = dict()
    for i in dataset['UserID'].unique():
        user_mean[i] = row_col_mean("row", i, dataset)
    for i in dataset['MovieID'].unique():
        movie_mean[i] = row_col_mean("col", i, dataset)
    
    dataset['X1'] = '' 
    dataset['X2'] = ''

    for index, row in dataset.iterrows():
        dataset.loc[index, 'X1'] = user_mean[row['UserID']]
        dataset.loc[index, 'X2'] = movie_mean[row['MovieID']]
    X = dataset.loc[:,['X1','X2']].values
    Y = dataset.loc[:,['Rating']].values  
    return(X, Y)

In [5]:
def LinearReg(X, Y):
    reg = LinearRegression().fit(X, Y)
    coef = reg.coef_
    intercept = reg.intercept_
    return(coef, intercept)

In [6]:
def LinearRegNoIntercept(X, Y):
    reg = LinearRegression(fit_intercept=False).fit(X, Y)
    coef = reg.coef_
    intercept = reg.intercept_
    return(coef, intercept)

In [7]:
def evaluation(X, Y, coef, intercept): 
    """
    evaluation for linear regression
    """
    reg = LinearRegression().fit(X, Y)
    reg.coef_ = coef
    reg.intercept_ = intercept
    
    y_pred = np.array(reg.predict(X))
    y_pred = np.where(y_pred < 1, 1, y_pred)
    y_pred = np.where(y_pred > 5, 5, y_pred)

    rmse = np.sqrt(np.mean((Y - y_pred) ** 2))
    mae = np.mean(np.abs(Y - y_pred))
    return(rmse,mae)

In [8]:
#Kfold, test and train are set
kf = KFold(n_splits=5, shuffle=True, random_state=1) 

In [9]:
#Liniear regression of two averages with intercept
test_rmse_list = []
test_mae_list = []
train_rmse_list = []
train_mae_list = []
for train_index, test_index in kf.split(table):
    train = table.loc[train_index.tolist(), ["UserID", "MovieID", "Rating"]]
    test = table.loc[test_index.tolist(), ["UserID", "MovieID", "Rating"]]
    
    X_train, Y_train = DataRouter(train)
    X_test, Y_test = DataRouter(test)
    coef, interc = LinearReg(X_train, Y_train)
    test_rmse, test_mae = evaluation(X_test, Y_test, coef, interc)
    train_rmse, train_mae = evaluation(X_train, Y_train, coef, interc)
    test_rmse_list.append(test_rmse)
    test_mae_list.append(test_mae)
    train_rmse_list.append(train_rmse)
    train_mae_list.append(train_mae)
print(np.mean(test_rmse_list),np.mean(test_mae_list))
print(np.mean(train_rmse_list),np.mean(train_mae_list))

0.9001653681660222 0.7122092602307395
0.9145443262957137 0.7247995102650913


In [10]:
#Liniear regression of two averages without intercept
test_rmse_list = []
test_mae_list = []
train_rmse_list = []
train_mae_list = []
for train_index, test_index in kf.split(table):
    train = table.loc[train_index.tolist(), ["UserID", "MovieID", "Rating"]]
    test = table.loc[test_index.tolist(), ["UserID", "MovieID", "Rating"]]
    
    X_train, Y_train = DataRouter(train)
    X_test, Y_test = DataRouter(test)
    coef, interc = LinearRegNoIntercept(X_train, Y_train)
    test_rmse, test_mae = evaluation(X_test, Y_test, coef, interc)
    train_rmse, train_mae = evaluation(X_train, Y_train, coef, interc)
    test_rmse_list.append(test_rmse)
    test_mae_list.append(test_mae)
    train_rmse_list.append(train_rmse)
    train_mae_list.append(train_mae)
print(np.mean(test_rmse_list),np.mean(test_mae_list))
print(np.mean(train_rmse_list),np.mean(train_mae_list))

0.9344570734200918 0.748706056925681
0.9465499537839577 0.7585512269178818


In [11]:
def global_avg(dataset): #calculate Global average rating
    ga = dataset['Rating'].mean()
    return(ga)

In [12]:
test_rmse_list = []
test_mae_list = []
train_rmse_list = []
train_mae_list = []
for train_index, test_index in kf.split(table):
    train = table.loc[train_index.tolist(), ["UserID", "MovieID", "Rating"]]
    test = table.loc[test_index.tolist(), ["UserID", "MovieID", "Rating"]]
    
    test_rmse = np.sqrt(np.mean((test['Rating'] - global_avg(test))**2))
    test_mae = np.mean(np.abs(test['Rating']- global_avg(test)))
    train_rmse = np.sqrt(np.mean((train['Rating'] - global_avg(train))**2))
    train_mae = np.mean(np.abs(train['Rating'] - global_avg(train)))

    test_rmse_list.append(test_rmse)
    test_mae_list.append(test_mae)
    train_rmse_list.append(train_rmse)
    train_mae_list.append(train_mae)
print(np.mean(test_rmse_list),np.mean(test_mae_list))
print(np.mean(train_rmse_list),np.mean(train_mae_list))

1.1170984720424795 0.9338595477225654
1.1171011110023854 0.9338607806758029


In [13]:
#average rating per user
test_rmse_list = []
test_mae_list = []
train_rmse_list = []
train_mae_list = []
for train_index, test_index in kf.split(table):
    train = table.loc[train_index.tolist(), ["UserID", "MovieID", "Rating"]]
    test = table.loc[test_index.tolist(), ["UserID", "MovieID", "Rating"]]
    
    user_avg = train[['UserID','Rating']].groupby('UserID').mean().renmae(columns={'Rating':'Average'})
    train_merge = pd.merge(train, user_avg, on='UserID')
    test_merge = pd.merge(test, user_avg, on='UserID')

    test_rmse = np.sqrt(np.mean((test_merge['Rating'] - test_merge['Average'])**2))
    test_mae = np.mean(np.abs(test_merge['Rating']- test_merge['Average']))
    train_rmse = np.sqrt(np.mean((train_merge['Rating'] - train_merge['Average'])**2))
    train_mae = np.mean(np.abs(train_merge['Rating'] - train_merge['Average']))

    test_rmse_list.append(test_rmse)
    test_mae_list.append(test_mae)
    train_rmse_list.append(train_rmse)
    train_mae_list.append(train_mae)
print(np.mean(test_rmse_list),np.mean(test_mae_list))
print(np.mean(train_rmse_list),np.mean(train_mae_list))

1.0354887413559504 0.8290076950378905
1.0276718866687955 0.8227317798294085


In [14]:
#average rating per item
test_rmse_list = []
test_mae_list = []
train_rmse_list = []
train_mae_list = []
for train_index, test_index in kf.split(table):
    train = table.loc[train_index.tolist(), ["UserID", "MovieID", "Rating"]]
    test = table.loc[test_index.tolist(), ["UserID", "MovieID", "Rating"]]
    
    movie_avg = train[['MovieID','Rating']].groupby('MovieID').mean().renmae(columns={'Rating':'Average'})
    train_merge = pd.merge(train, movie_avg, on='MovieID')
    test_merge = pd.merge(test, movie_avg, on='MovieID')

    test_rmse = np.sqrt(np.mean((test_merge['Rating'] - test_merge['Average'])**2))
    test_mae = np.mean(np.abs(test_merge['Rating']- test_merge['Average']))
    train_rmse = np.sqrt(np.mean((train_merge['Rating'] - train_merge['Average'])**2))
    train_mae = np.mean(np.abs(train_merge['Rating'] - train_merge['Average']))

    test_rmse_list.append(test_rmse)
    test_mae_list.append(test_mae)
    train_rmse_list.append(train_rmse)
    train_mae_list.append(train_mae)
print(np.mean(test_rmse_list),np.mean(test_mae_list))
print(np.mean(train_rmse_list),np.mean(train_mae_list))

0.9794200889294983 0.782308674322367
0.9742112263767705 0.7783430056529332
