# Recommendation Systems

In [None]:
# importing libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import math
from sklearn import linear_model

In [None]:
# loading in the data
# adapted code from [1]

# All three .dat files are '::'-delimited and carry no header row, so the
# column names are supplied explicitly and the python parsing engine is used.

users_columns = ['user_id', 'gender', 'age', 'occupation', 'zip_code']
users = pd.read_table(
    'users.dat',
    sep='::',
    header=None,
    names=users_columns,
    engine='python',
)

ratings_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(
    'ratings.dat',
    sep='::',
    header=None,
    names=ratings_columns,
    engine='python',
)

movies_columns = ['movie_id', 'title', 'genres']
movies = pd.read_table(
    'movies.dat',
    sep='::',
    header=None,
    names=movies_columns,
    engine='python',
)

# one row per rating: ratings joined to users, then to movies, each time on
# the column name the two frames have in common
merged = ratings.merge(users).merge(movies)

In [None]:
# keep only the three columns the naive recommenders read
trim = merged.loc[:, ['user_id', 'title', 'rating']]

## Naive Approaches

#### Naive Approach 1

"global average rating" algorithm

In [None]:
# code partially adapted from [2]

def na_global_average(df, k=5):
    """Cross-validate the "global average rating" baseline.

    Every rating is predicted as the mean of all ratings in the training fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings table; only the 'rating' column is read.
    k : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    tuple of four lists
        (train RMSE, train MAE, test RMSE, test MAE), one value per fold.

    Notes
    -----
    The folds are shuffled without an explicit ``random_state``; call
    ``np.random.seed`` before this function to get repeatable splits.
    """
    # k-fold CV set-up
    kf = KFold(n_splits=k, shuffle=True)

    # KFold yields row *positions*. A plain array is indexed by position, so
    # the selection stays correct even when df does not carry a default
    # 0..n-1 index (the previous label-based lookup did not).
    ratings = df['rating'].to_numpy(dtype=float)

    # initializing lists
    train_RMSE_per_round = []
    train_MAE_per_round = []
    test_RMSE_per_round = []
    test_MAE_per_round = []

    for train_index, test_index in kf.split(df):
        y_train = ratings[train_index]
        y_test = ratings[test_index]

        # global average rating in training data; it is the prediction for
        # every train and test row
        global_mean = y_train.mean()

        train_error = y_train - global_mean
        test_error = y_test - global_mean

        # train accuracy
        train_RMSE_per_round.append(math.sqrt(np.mean(train_error ** 2)))
        train_MAE_per_round.append(float(np.mean(np.abs(train_error))))

        # test accuracy
        test_RMSE_per_round.append(math.sqrt(np.mean(test_error ** 2)))
        test_MAE_per_round.append(float(np.mean(np.abs(test_error))))

    return (train_RMSE_per_round, train_MAE_per_round,
            test_RMSE_per_round, test_MAE_per_round)

In [None]:
# seed NumPy's global RNG before building the shuffled folds
np.random.seed(123)

# 5-fold CV of naive approach 1
experiment_1 = na_global_average(trim, k=5)

In [None]:
# results for the "global average rating" algorithm

# experiment_1 holds four per-round lists, in this order:
#   train RMSE, train MAE, test RMSE, test MAE
# For each of them print the values of round 1-5 and then their mean
# (the cross-validated estimate), all rounded to 4 decimals.
for fold_scores in experiment_1:
    print(np.round(fold_scores, 4))
    print(np.round(np.mean(fold_scores), 4))

#### Naive Approach 2

"movie average rating" algorithm

In [None]:
# code partially adapted from [2]

def na_movie_average(df, k=5):
    """Cross-validate the "movie average rating" baseline.

    A rating is predicted as the mean training-fold rating of its title.
    Titles that do not occur in the training fold fall back to the global
    mean rating of the training fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings table with at least the columns 'title' and 'rating'.
    k : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    tuple of four lists
        (train RMSE, train MAE, test RMSE, test MAE), one value per fold.

    Notes
    -----
    The folds are shuffled without an explicit ``random_state``; call
    ``np.random.seed`` before this function to get repeatable splits.
    """
    # setting up k-fold cross validation
    kf = KFold(n_splits=k, shuffle=True)

    # initializing lists
    train_RMSE_per_round = []
    train_MAE_per_round = []
    test_RMSE_per_round = []
    test_MAE_per_round = []

    for train_index, test_index in kf.split(df):
        # KFold yields row positions, so select with .iloc; label-based .loc
        # is only equivalent when df has a default 0..n-1 index
        train = df.iloc[train_index]
        test = df.iloc[test_index]

        # fallback for titles that never occur in the training fold
        global_mean = train['rating'].mean()
        # average rating per title in training data
        title_means = train.groupby('title')['rating'].mean()

        # look up each row's title mean; only test rows can miss a title
        train_pred = train['title'].map(title_means)
        test_pred = test['title'].map(title_means).fillna(global_mean)

        # subtract as arrays so the result never depends on index alignment
        train_error = (train['rating'].to_numpy(dtype=float)
                       - train_pred.to_numpy(dtype=float))
        test_error = (test['rating'].to_numpy(dtype=float)
                      - test_pred.to_numpy(dtype=float))

        # train accuracy
        train_RMSE_per_round.append(math.sqrt(np.mean(train_error ** 2)))
        train_MAE_per_round.append(float(np.mean(np.abs(train_error))))

        # test accuracy
        test_RMSE_per_round.append(math.sqrt(np.mean(test_error ** 2)))
        test_MAE_per_round.append(float(np.mean(np.abs(test_error))))

    return (train_RMSE_per_round, train_MAE_per_round,
            test_RMSE_per_round, test_MAE_per_round)

In [None]:
# seed NumPy's global RNG before building the shuffled folds
np.random.seed(123)

# 5-fold CV of naive approach 2
experiment_2 = na_movie_average(trim, k=5)

In [None]:
# results for the "movie average rating" algorithm

# experiment_2 holds four per-round lists, in this order:
#   train RMSE, train MAE, test RMSE, test MAE
# For each of them print the values of round 1-5 and then their mean
# (the cross-validated estimate), all rounded to 4 decimals.
for fold_scores in experiment_2:
    print(np.round(fold_scores, 4))
    print(np.round(np.mean(fold_scores), 4))

#### Naive Approach 3

"user average rating" algorithm

In [None]:
# adapted code partially from [2]

def na_user_average(df, k=5):
    """Cross-validate the "user average rating" baseline.

    A rating is predicted as the mean training-fold rating of its user.
    Users that do not occur in the training fold fall back to the global
    mean rating of the training fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings table with at least the columns 'user_id' and 'rating'.
    k : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    tuple of four lists
        (train RMSE, train MAE, test RMSE, test MAE), one value per fold.

    Notes
    -----
    The folds are shuffled without an explicit ``random_state``; call
    ``np.random.seed`` before this function to get repeatable splits.
    """
    # setting up k-fold cross validation
    kf = KFold(n_splits=k, shuffle=True)

    # initializing lists
    train_RMSE_per_round = []
    train_MAE_per_round = []
    test_RMSE_per_round = []
    test_MAE_per_round = []

    for train_index, test_index in kf.split(df):
        # KFold yields row positions, so select with .iloc; label-based .loc
        # is only equivalent when df has a default 0..n-1 index
        train = df.iloc[train_index]
        test = df.iloc[test_index]

        # fallback for users that never occur in the training fold (the old
        # missing-user branch referenced an undefined name and crashed)
        global_mean = train['rating'].mean()
        # mean per user in training data
        user_means = train.groupby('user_id')['rating'].mean()

        # look up each row's user mean; only test rows can miss a user
        train_pred = train['user_id'].map(user_means)
        test_pred = test['user_id'].map(user_means).fillna(global_mean)

        # subtract as arrays so the result never depends on index alignment
        train_error = (train['rating'].to_numpy(dtype=float)
                       - train_pred.to_numpy(dtype=float))
        test_error = (test['rating'].to_numpy(dtype=float)
                      - test_pred.to_numpy(dtype=float))

        # train accuracy
        train_RMSE_per_round.append(math.sqrt(np.mean(train_error ** 2)))
        train_MAE_per_round.append(float(np.mean(np.abs(train_error))))

        # test accuracy
        test_RMSE_per_round.append(math.sqrt(np.mean(test_error ** 2)))
        test_MAE_per_round.append(float(np.mean(np.abs(test_error))))

    return (train_RMSE_per_round, train_MAE_per_round,
            test_RMSE_per_round, test_MAE_per_round)

In [None]:
# seed NumPy's global RNG before building the shuffled folds
np.random.seed(123)

# 5-fold CV of naive approach 3
experiment_3 = na_user_average(trim, k=5)

In [None]:
# results for the "user average rating" algorithm

# experiment_3 holds four per-round lists, in this order:
#   train RMSE, train MAE, test RMSE, test MAE
# For each of them print the values of round 1-5 and then their mean
# (the cross-validated estimate), all rounded to 4 decimals.
for fold_scores in experiment_3:
    print(np.round(fold_scores, 4))
    print(np.round(np.mean(fold_scores), 4))

#### Naive Approach 4 & 5

Naive approach 4: "simple linear combination" algorithm

Naive approach 5: "full linear combination" algorithm

In [None]:
# code partially adapted from [2] and [3]

def na_linear_combination(df, k=5, intcept=True):
    """Cross-validate a linear combination of user mean and title mean.

    For every fold a linear regression is fitted on two features computed
    from the training fold only: the mean rating of the row's user and the
    mean rating of the row's title. Users or titles that do not occur in the
    training fold get the global mean rating of the training fold instead.
    Predictions are clipped to the range [1, 5] before scoring.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings table with at least 'user_id', 'title' and 'rating'.
    k : int, default 5
        Number of cross-validation folds.
    intcept : bool, default True
        Passed to ``LinearRegression(fit_intercept=...)``: False gives the
        "simple" combination, True the "full" combination with intercept.

    Returns
    -------
    tuple of four lists
        (train RMSE, train MAE, test RMSE, test MAE), one value per fold.

    Notes
    -----
    The folds are shuffled without an explicit ``random_state``; call
    ``np.random.seed`` before this function to get repeatable splits.
    """

    def mean_features(part, user_means, title_means, fallback):
        """Return an (n_rows, 2) array: [user mean, title mean] per row."""
        user_col = part['user_id'].map(user_means).fillna(fallback)
        title_col = part['title'].map(title_means).fillna(fallback)
        return np.column_stack([user_col.to_numpy(dtype=float),
                                title_col.to_numpy(dtype=float)])

    # setting up k-fold cross validation
    kf = KFold(n_splits=k, shuffle=True)
    # linear model; the intercept is fitted only when intcept is True
    model = linear_model.LinearRegression(fit_intercept=intcept)

    # initializing lists
    train_RMSE_per_round = []
    train_MAE_per_round = []
    test_RMSE_per_round = []
    test_MAE_per_round = []

    for train_index, test_index in kf.split(df):
        # KFold yields row positions, so select with .iloc; label-based .loc
        # is only equivalent when df has a default 0..n-1 index
        train = df.iloc[train_index]
        test = df.iloc[test_index]

        # fallback for users/titles unseen in the training fold (the old
        # missing-user branch used a variable before assignment and crashed)
        global_mean = train['rating'].mean()
        # mean rating per user / per title in the training data
        user_means = train.groupby('user_id')['rating'].mean()
        title_means = train.groupby('title')['rating'].mean()

        x_train = mean_features(train, user_means, title_means, global_mean)
        y_train = train['rating'].to_numpy(dtype=float)
        x_test = mean_features(test, user_means, title_means, global_mean)
        y_test = test['rating'].to_numpy(dtype=float)

        # build training model
        model.fit(x_train, y_train)

        # clip predictions to the valid rating range [1, 5]
        y_pred_train = np.clip(model.predict(x_train), 1, 5)
        y_pred_test = np.clip(model.predict(x_test), 1, 5)

        train_error = y_pred_train - y_train
        test_error = y_pred_test - y_test

        # train accuracy (plain arrays, so np.mean always yields a scalar)
        train_RMSE_per_round.append(math.sqrt(np.mean(train_error ** 2)))
        train_MAE_per_round.append(float(np.mean(np.abs(train_error))))

        # test accuracy
        test_RMSE_per_round.append(math.sqrt(np.mean(test_error ** 2)))
        test_MAE_per_round.append(float(np.mean(np.abs(test_error))))

    return (train_RMSE_per_round, train_MAE_per_round,
            test_RMSE_per_round, test_MAE_per_round)

In [None]:
# results for linear combination without intercept (simple linear combination)

# set seed
np.random.seed(seed=123)

# naive approach 4; 5-fold CV
experiment_4 = na_linear_combination(df=trim, k=5, intcept=False)

# experiment_4 holds four per-round lists, in this order:
#   train RMSE, train MAE, test RMSE, test MAE
# For each of them print the values of round 1-5 and then their mean (the
# cross-validated estimate). Looping over experiment_4 keeps both lines tied
# to this experiment; the estimates used to be read from experiment_2.
for fold_scores in experiment_4:
    print(np.round(fold_scores, 4))
    print(np.round(np.mean(fold_scores), 4))

In [None]:
# results for linear combination with intercept (full linear combination)

# set seed
np.random.seed(seed=123)

# naive approach 5; 5-fold CV
experiment_5 = na_linear_combination(df=trim, k=5, intcept=True)

# experiment_5 holds four per-round lists, in this order:
#   train RMSE, train MAE, test RMSE, test MAE
# For each of them print the values of round 1-5 and then their mean (the
# cross-validated estimate). Looping over experiment_5 keeps both lines tied
# to this experiment; the per-round values used to be read from experiment_2.
for fold_scores in experiment_5:
    print(np.round(fold_scores, 4))
    print(np.round(np.mean(fold_scores), 4))

#### Comparing Naive Approaches

Comparing the achieved accuracy on the test data

1. global average
2. movie average
3. user average
4. linear combination without intercept
5. linear combination with intercept

In [None]:
# mean test-set RMSE / MAE of every naive approach, in the order they were run
print('RMSE and MAE on test data using 5-fold CV')
print()

# (heading, result tuple); index 2 is test RMSE per round, index 3 test MAE
summary = [
    ('Algorithm: "global average rating"', experiment_1),
    ('Algorithm: "movie average rating"', experiment_2),
    ('Algorithm: "user average rating"', experiment_3),
    ('Algorithm: "simple linear combination rating"', experiment_4),
    ('Algorithm: "full linear combination rating"', experiment_5),
]

for position, (heading, scores) in enumerate(summary):
    # blank line between algorithms, but none after the last one
    if position > 0:
        print()
    print(heading)
    print('RMSE:', np.round(np.mean(scores[2]), 4))
    print('MAE:', np.round(np.mean(scores[3]), 4))

### References

[1] https://datanerd.blog/data-analysis-with-python-movielens/

[2] https://github.com/codebasics/py/blob/master/ML/12_KFold_Cross_Validation/12_k_fold.ipynb

[3] https://www.youtube.com/watch?v=R15LjD8aCzc