# Recommendation Systems

In [None]:
# importing libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import math
from sklearn import linear_model

In [None]:
# loading in the data
# adapted code from [1]

users_columns = ['user_id', 'gender', 'age', 'occupation', 'zip_code']
users = pd.read_table('users.dat', sep='::', header=None, names=users_columns, engine='python')

ratings_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('ratings.dat', sep='::', header=None, names=ratings_columns, engine='python')

movies_columns = ['movie_id', 'title', 'genres']
movies = pd.read_table('movies.dat', sep='::', header=None, names=movies_columns, engine='python')

# merging dataframes
merged = pd.merge(pd.merge(ratings, users), movies)

In [None]:
# grabbing columns that we will use
trim = merged[['user_id', 'title','rating']]

## Matrix Facorization

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

In [4]:
ratings_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
df_ratings = pd.read_table('ratings.dat', sep='::', header=None, names=ratings_columns, engine='python')

movies_columns = ['movie_id', 'title', 'genres']
df_movies = pd.read_table('movies.dat', sep='::', header=None, names=movies_columns, engine='python')
df_merged = pd.merge(df_ratings, df_movies) 

In [5]:
num_factors=10
num_iter=75
regularization=0.05
learn_rate=0.005

In [6]:
np.random.seed(1938266)
X = df_merged.pivot(index = "user_id", columns = "title", values = "rating") 
X = np.array(X)

In [7]:
a = np.nanmean(X) 
init_val = (a/num_factors)**0.5
U = np.random.normal(init_val,0.1,(X.shape[0],num_factors))
M = np.random.normal(init_val,0.1,(num_factors,X.shape[1]))

In [8]:
init_val

0.5984617325301023

In [9]:
#Define function(s)

def Rmse(M,U,X):
    x_hat =np.dot(U,M)
    
    nominator = 0
    denominator = 0
    rows, cols =  np.where(~np.isnan(X))
    for element in zip(rows,cols):
        i = element[0]
        j = element[1]
        nominator +=(X[i,j]-x_hat[i,j])**2
        denominator += 1
    mean=float(nominator)/ denominator   #np.count_nonzero(~np.isnan(M))
    
    out = round(mean**(0.5),4)
    
    #print(out)
    
    return out
    

In [10]:
def Matrix_factorization(X,U,M):
    old_rmse = 0
    new_rmse = 1
    count = 0
    
    rows, cols =  np.where(~np.isnan(X))
    while round(old_rmse,2) != round(new_rmse,2):
        for element in zip(rows,cols):
            i = element[0]
            j = element[1]
            x_hat = np.dot(U[i,:],M[:,j])
            error = X[i,j] -  x_hat
            U[i,:] = U[i,:] + learn_rate * (2 * error * M[:,j] - regularization * U[i,:])
            M[:,j] = M[:,j] + learn_rate * (2 * error * U[i,:] - regularization * M[:,j])
            count += 1
            
        #if count % 5 == 0:
            #print(f"{count / total *100} percent")
        old_rmse = new_rmse
        new_rmse = Rmse(M,U,X)
        print(new_rmse)
        
    return

In [11]:
Rmse(M,U,X)

1.1433

In [12]:
def mf_cv(X, U, M, k):
    
    # initializing list
    rmse_per_fold = []
    
    # k-fold cross validation
    kf = KFold(n_splits=k, shuffle=True)
    
    # rows and columns where entries are recorded
    rows, cols =  np.where(~np.isnan(X))
    
    # initializing counter
    count = 1
    
    # Indices of training and test data 
    for train_index, test_index in kf.split(rows):
        
        # We'll make copied so that the original input values aren't changed
        Uc = np.copy(U)
        Mc = np.copy(M)
        # X matrix for training
        X_train = np.copy(X)
        
        # Training data row and column indices 
        train_rows = rows[train_index]
        train_cols = cols[train_index]
        
        # Test data row and column indices
        test_rows = rows[test_index]
        test_cols = cols[test_index]
        
        # The 'matrix_factorization' function ignores 'nan' entries when updating 'U' and 'M'
        # Changing entries that will be used for testing to 'nan' in the training matrix
        X_train[test_rows, test_cols] = np.nan
    
        print('Training RMSE fold', count, ':')
        # output printed ...
        Matrix_factorization(X=X_train, U=Uc, M=Mc) 
        
        # X matrix for testing
        X_test = np.copy(X)
        
        # The 'Rmse' function ignores 'nan' entries when calculating the RMSE
        # Changing entries that have been used for training to 'nan' in the test matrix
        X_test[train_rows, train_cols] = np.nan
        
        # appending test RMSE to list
        rmse_per_fold.append(Rmse(X=X_test, U=Uc, M=Mc))
        
        print('Test RMSE fold', count, ':', Rmse(X=X_test, U=Uc, M=Mc))
        print('')
        
        count += 1
        
    return(rmse_per_fold)

In [13]:
init_val = (np.nanmean(X)/num_factors)**0.5
U = np.random.normal(loc=init_val, scale=0.1, size=(X.shape[0], num_factors))
M = np.random.normal(loc=init_val, scale=0.1, size=(num_factors, X.shape[1]))

np.random.seed(1938266)
mf_5f_cv = mf_cv(X=X, U=U, M=M, k=5)

Training RMSE fold 1 :
0.9254
0.9167
0.9144
0.9108
Test RMSE fold 1 : 0.9267

Training RMSE fold 2 :
0.9245
0.916
Test RMSE fold 2 : 0.9294

Training RMSE fold 3 :
0.924
0.9154
Test RMSE fold 3 : 0.9272

Training RMSE fold 4 :
0.9245
0.9158
Test RMSE fold 4 : 0.9289

Training RMSE fold 5 :
0.925
0.9163
0.9139
0.9102
Test RMSE fold 5 : 0.9246



In [14]:
np.mean(mf_5f_cv)

0.92736