# Recommendation Systems

In [None]:
# importing libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import math
from sklearn import linear_model

In [None]:
# loading in the data
# adapted code from [1]

def _read_dat(path, columns):
    """Read one headerless, '::'-separated .dat file into a DataFrame with the given column names."""
    # A multi-character separator is treated as a regex, which needs the python parsing engine.
    # NOTE(review): the files are decoded with pandas' default encoding; if any of them contains
    # non-UTF-8 bytes an explicit `encoding=` would be required -- confirm against the data files.
    return pd.read_csv(path, sep='::', header=None, names=columns, engine='python')


# one table per file
users_columns = ['user_id', 'gender', 'age', 'occupation', 'zip_code']
ratings_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
movies_columns = ['movie_id', 'title', 'genres']

users = _read_dat('users.dat', users_columns)
ratings = _read_dat('ratings.dat', ratings_columns)
movies = _read_dat('movies.dat', movies_columns)

# ratings joined with users and then with movies (each join uses the columns the two frames share)
merged = ratings.merge(users).merge(movies)

In [None]:
# Keep only the user id, movie title and rating columns of the merged table.
# NOTE(review): `trim` is not referenced again in the visible part of this notebook.
trim = merged[['user_id', 'title','rating']]

## UV Decomposition

In [None]:
# `ratings` and `movies` were already parsed from ratings.dat / movies.dat in the data-loading
# cell at the top of the notebook (same files, same column names), so they are reused here
# instead of re-reading both files with the slow python parsing engine.
df_ratings = ratings
df_movies = movies

# one row per rating, with the movie title and genres attached
df_merged = pd.merge(df_ratings, df_movies)

In [None]:
np.random.seed(1938266)

# user x title matrix of ratings; (user, title) pairs without a rating are NaN
M = np.array(df_merged.pivot(index="user_id", columns="title", values="rating"))

d = 2  # number of latent factors: the short side of U and V (can be changed later)
a = np.nanmean(M)  # mean of all recorded ratings

# starting every entry of U and V near sqrt(a / d) makes each entry of U.V start near a
init_val = (a / d) ** 0.5

U = np.random.normal(init_val, 0.3, (M.shape[0], d))
V = np.random.normal(init_val, 0.3, (d, M.shape[1]))

In [None]:
# Define functions
def Normalize(data):
    """Centre a ratings matrix by its row means and then by its column means.

    The row means are now computed from `data` itself. The previous version read
    them from the global `M`, so normalising a training matrix used statistics
    that included the held-out ratings.

    Parameters
    ----------
    data : 2-D float array, NaN marks a missing rating. It is not modified.

    Returns
    -------
    Array of the same shape: `data - row_means - col_means`; missing entries stay NaN.

    Side effects
    ------------
    Stores the means in the module-level globals `row_means` (shape (n_rows, 1)) and
    `col_means` (shape (n_cols,)), which `Denormalize` reads. Means of rows/columns
    without any rating are stored as 0.
    """
    import warnings

    global row_means, col_means

    with warnings.catch_warnings():
        # np.nanmean emits a RuntimeWarning for rows/columns that contain no rating at all;
        # those cases are handled explicitly below, so the warning is only noise here.
        warnings.simplefilter("ignore", category=RuntimeWarning)

        row_means = np.nanmean(data, axis=1).reshape(-1, 1)
        row_means[np.isnan(row_means)] = 0
        row_norm = data - row_means

        # column means are taken after the row means have been removed
        col_means = np.nanmean(row_norm, axis=0)

    # a column whose mean is NaN has only NaN entries, so replacing the mean by 0
    # before the subtraction leaves the result unchanged (NaN - 0 is still NaN)
    col_means[np.isnan(col_means)] = 0
    return row_norm - col_means

def Denormalize(data):
    """Add the stored row means and then the stored column means back onto `data`.

    Reads the module-level globals `row_means` and `col_means`, which are set by the
    most recent call to `Normalize`. Returns a new array; `data` is not modified.
    """
    return (data + row_means) + col_means

def Rmse(M,U,V,denormalize = False):
    """Root-mean-square error between the recorded entries of M and the prediction U.V.

    Parameters
    ----------
    M : 2-D float array of ratings, NaN marks a missing rating (ignored in the error).
    U, V : factor matrices; the prediction is `np.dot(U, V)` and must have M's shape.
    denormalize : if true, the prediction is passed through `Denormalize` (i.e. the
        means stored by the last `Normalize` call are added back) before comparing.

    Returns
    -------
    float : the RMSE over the recorded entries, rounded to 4 decimals.

    Raises
    ------
    ZeroDivisionError : if M has no recorded (non-NaN) entry.
    """
    P = np.dot(U, V)
    if denormalize:
        P = Denormalize(P)

    M = np.asarray(M, dtype=float)
    P = np.asarray(P)

    # Boolean mask instead of a Python loop over every recorded rating.
    observed = ~np.isnan(M)
    n_observed = int(np.count_nonzero(observed))
    if n_observed == 0:
        # same exception type the element-wise version produced when dividing by a zero count
        raise ZeroDivisionError("Rmse: M contains no recorded (non-NaN) entries")

    residuals = M[observed] - P[observed]
    mean_squared_error = float(np.sum(np.square(residuals))) / n_observed

    return round(mean_squared_error ** 0.5, 4)
def Update_U(M,U,V,show_output = False):
    """Run one in-place sweep over every entry of U.

    For each row r of M that has at least one recorded rating and each factor s,
    U[r, s] is replaced by

        sum_j V[s, j] * (M[r, j] - sum_{k != s} U[r, k] * V[k, j])  /  sum_j V[s, j]**2

    where j runs over the columns in which M[r, j] is recorded (not NaN).
    Rows of M without any rating leave the corresponding row of U untouched.

    Parameters
    ----------
    M : 2-D array of ratings, NaN marks a missing rating.
    U : array of shape (M.shape[0], d); modified in place.
    V : array of shape (d, M.shape[1]); only read.
    show_output : when True, a progress line is printed after each row for which the
        number of updated entries so far is a multiple of 50.

    Returns None.
    """
    n_users, n_factors = U.shape
    total = n_users * n_factors
    last_factor = n_factors - 1
    count = 0

    for r in range(n_users):
        # the ratings of row r and the positions where they are missing do not depend on s
        ratings_r = np.array(M[r, :], dtype=float)
        missing = np.isnan(ratings_r)

        if not missing.all():
            for s in range(n_factors):
                # row s of V with NaN wherever M[r, :] is missing, so nansum skips those columns
                factor_row = np.array(V[s, :], dtype=float)
                factor_row[missing] = np.nan

                # contribution of all factors except s to the prediction of row r
                other_factors = np.dot(np.delete(U[r, :], s, axis=0),
                                       np.delete(V, s, axis=0))

                numerator = np.nansum(V[s, :] * (ratings_r - other_factors))
                denominator = np.nansum(np.square(factor_row))

                U[r, s] = numerator / denominator
                count += 1

        if show_output == True and count % 50 == 0:
            print(f" We are {count / total * 100} percent there , {U[r,last_factor]}")
    return

def Update_V(M,U,V,show_output = False):
    """Run one in-place sweep over every entry of V.

    For each column s of M that has at least one recorded rating and each factor r,
    V[r, s] is replaced by

        sum_i U[i, r] * (M[i, s] - sum_{k != r} U[i, k] * V[k, s])  /  sum_i U[i, r]**2

    where i runs over the rows in which M[i, s] is recorded (not NaN).
    Columns of M without any rating leave the corresponding column of V untouched.

    Parameters
    ----------
    M : 2-D array of ratings, NaN marks a missing rating.
    U : array of shape (M.shape[0], d); only read.
    V : array of shape (d, M.shape[1]); modified in place.
    show_output : when True, a progress line is printed after each column for which the
        number of updated entries so far is a multiple of 50.

    Returns None.
    """
    n_factors, n_items = V.shape
    total = n_factors * n_items
    last_factor = n_factors - 1
    count = 0

    for s in range(n_items):
        # the ratings of column s and the positions where they are missing do not depend on r
        ratings_s = np.array(M[:, s])
        missing = np.isnan(ratings_s)

        if not missing.all():
            for r in range(n_factors):
                # column r of U with NaN wherever M[:, s] is missing, so nansum skips those rows
                factor_col = np.array(U[:, r])
                factor_col[missing] = np.nan

                # contribution of all factors except r to the prediction of column s
                other_factors = np.dot(np.delete(U, r, axis=1),
                                       np.delete(V[:, s], r, axis=0))

                numerator = np.nansum(U[:, r] * (ratings_s - other_factors))
                denominator = np.nansum(np.square(factor_col))

                V[r, s] = numerator / denominator
                count += 1

        if show_output == True and count % 50 == 0:
            print(f"We are {count/total * 100} percent there, {V[last_factor,s]} ")
    return

def Train_function(data,U,V, denormalize = False, max_iter = None):
    """Alternate `Update_U` / `Update_V` sweeps until the RMSE stops changing.

    Training stops as soon as the RMSE of two consecutive sweeps agrees after
    rounding to 2 decimals. U and V are updated in place.

    Parameters
    ----------
    data : 2-D array of (training) ratings, NaN marks a missing rating.
    U, V : factor matrices, modified in place.
    denormalize : forwarded to `Rmse` for the convergence check.
    max_iter : optional upper bound on the number of sweeps; `None` (default)
        keeps the original behaviour of iterating until convergence.

    Returns None.
    """
    import warnings

    old_rmse = 0
    new_rmse = 1
    n_iter = 0

    while round(old_rmse, 2) != round(new_rmse, 2):
        if max_iter is not None and n_iter >= max_iter:
            break

        Update_U(data, U, V)
        Update_V(data, U, V)
        n_iter += 1

        old_rmse = new_rmse
        new_rmse = Rmse(data, U, V, denormalize = denormalize)

        if np.isnan(new_rmse):
            # NaN never compares equal to itself, so without this guard the loop
            # condition would stay true forever once the factors contain NaN.
            warnings.warn("Train_function: RMSE became NaN, stopping training",
                          RuntimeWarning)
            break
    return

def Uv_cv(M,U,V,k = 5):
    """k-fold cross-validation of the UV decomposition on the recorded entries of M.

    The recorded (non-NaN) entries of M are split into k shuffled folds
    (`KFold(..., random_state=6)`). For every fold the held-out entries are
    masked with NaN, copies of U and V are trained on the rest with
    `Train_function`, and the RMSE on the held-out entries is recorded.

    Parameters
    ----------
    M : 2-D float array of ratings, NaN marks a missing rating. Not modified.
    U, V : initial factor matrices. Not modified (each fold trains on copies).
    k : number of folds.

    Returns
    -------
    list of float : the test RMSE of each fold, in fold order.
    """
    rmse_per_fold = []

    kf = KFold(n_splits=k, shuffle = True, random_state=6)

    # positions of the recorded ratings; the folds are formed over these positions
    rows, cols = np.where(~np.isnan(M))

    for count, (train_index, test_index) in enumerate(kf.split(rows), start=1):
        # train on copies so that every fold starts from the same initial factors
        Uc = np.copy(U)
        Vc = np.copy(V)

        # training matrix: held-out entries are hidden
        M_train = np.copy(M)
        M_train[rows[test_index], cols[test_index]] = np.nan

        print(f'Training RMSE fold {count} :')

        Train_function(data = M_train, U = Uc, V = Vc, denormalize = False)

        # test matrix: training entries are hidden, `Rmse` ignores NaN entries
        M_test = np.copy(M)
        M_test[rows[train_index], cols[train_index]] = np.nan

        # evaluated once and reused for both the result list and the printout
        test_rmse = Rmse(M = M_test, U = Uc, V = Vc, denormalize = False)
        rmse_per_fold.append(test_rmse)

        print('Test RMSE fold', count, ':', test_rmse)
        print('')

    return rmse_per_fold

In [None]:
# RMSE on full data before training
Rmse(M=M, U=U, V=V, denormalize=False)

In [None]:
# Seed first: the seed only affects draws made after it, so setting it after U and V
# had been generated (as before) did not make the initialisation reproducible.
np.random.seed(1330)

init_val = (a/d)**0.5
# 0.9181 was noted here by the original author (its meaning is not recorded;
# it predates moving the seed above the draws)
U = np.random.normal(init_val,0.3,(M.shape[0],d))
V = np.random.normal(init_val,0.3,(d,M.shape[1]))

mf_5f_cv = Uv_cv(M = M, U=U, V=V, k=5)

In [None]:
np.asarray(mf_5f_cv).mean()  # average test RMSE over the folds; recorded value: 0.979

### CV with normalization

In [None]:
def Uv_normal_cv(M,U,V,k = 5):
    """k-fold cross-validation of the UV decomposition on a mean-normalised matrix.

    Same procedure as `Uv_cv`, except that the training matrix of every fold is
    passed through `Normalize` before training. `Normalize` stores the row and
    column means in module-level globals; the test RMSE is then computed with
    `denormalize = True`, so the prediction U.V gets those means added back
    before it is compared with the raw held-out ratings.

    Parameters
    ----------
    M : 2-D float array of raw ratings, NaN marks a missing rating. Not modified.
    U, V : initial factor matrices. Not modified (each fold trains on copies).
    k : number of folds.

    Returns
    -------
    list of float : the test RMSE of each fold, in fold order.
    """
    rmse_per_fold = []

    kf = KFold(n_splits=k, shuffle = True, random_state=6)

    # positions of the recorded ratings; the folds are formed over these positions
    rows, cols = np.where(~np.isnan(M))

    for count, (train_index, test_index) in enumerate(kf.split(rows), start=1):
        # train on copies so that every fold starts from the same initial factors
        Uc = np.copy(U)
        Vc = np.copy(V)

        # training matrix: held-out entries are hidden, then the means are removed
        M_train = np.copy(M)
        M_train[rows[test_index], cols[test_index]] = np.nan
        M_train = Normalize(M_train)

        print(f'Training RMSE fold {count} :')

        # M_train is already normalised, so the convergence check must compare it with
        # the raw product U.V. The previous `denormalize = True` added the means back
        # to the prediction only, i.e. it compared values on two different scales.
        Train_function(data = M_train, U = Uc, V = Vc, denormalize = False)

        # test matrix: raw ratings with the training entries hidden
        M_test = np.copy(M)
        M_test[rows[train_index], cols[train_index]] = np.nan

        # evaluated once and reused for both the result list and the printout
        test_rmse = Rmse(M = M_test, U = Uc, V = Vc, denormalize = True)
        rmse_per_fold.append(test_rmse)

        print('Test RMSE fold', count, ':', test_rmse)
        print('')

    return rmse_per_fold

In [None]:
# Seed first: the seed only affects draws made after it, so setting it after U and V
# had been generated (as before) did not make the initialisation reproducible.
np.random.seed(1337)

# the normalised ratings are centred, so the factors start around 0
init_val = 0
U = np.random.normal(init_val,0.3,(M.shape[0],d))
V = np.random.normal(init_val,0.3,(d,M.shape[1]))

mf_5f_cv = Uv_normal_cv(M = M, U=U, V=V, k=5)
# The CV run reports a problem (presumably NumPy's RuntimeWarning from np.nanmean) because
# some rows contain only NA values once their ratings have been removed from the training
# matrix and put into the test matrix. The algorithm still works, since the NA means are
# replaced by 0 in the normalizing stage, so the message can be ignored.

In [None]:
np.asarray(mf_5f_cv).mean()  # average test RMSE over the folds

In [13]:
# NOTE(review): `X`, `num_factors` and `mf_cv` are not defined in the visible part of this
# notebook, and this cell rebinds `M` (the ratings matrix in the cells above) to a factor
# matrix -- presumably a leftover from another version of the notebook; confirm before re-running.
init_val = (np.nanmean(X)/num_factors)**0.5
U = np.random.normal(loc=init_val, scale=0.1, size=(X.shape[0], num_factors))
M = np.random.normal(loc=init_val, scale=0.1, size=(num_factors, X.shape[1]))

# NOTE(review): the seed is set after U and M have been drawn, so it does not affect them.
np.random.seed(1938266)
mf_5f_cv = mf_cv(X=X, U=U, M=M, k=5)

Training RMSE fold 1 :
0.9254
0.9167
0.9144
0.9108
Test RMSE fold 1 : 0.9267

Training RMSE fold 2 :
0.9245
0.916
Test RMSE fold 2 : 0.9294

Training RMSE fold 3 :
0.924
0.9154
Test RMSE fold 3 : 0.9272

Training RMSE fold 4 :
0.9245
0.9158
Test RMSE fold 4 : 0.9289

Training RMSE fold 5 :
0.925
0.9163
0.9139
0.9102
Test RMSE fold 5 : 0.9246



In [14]:
np.asarray(mf_5f_cv).mean()  # average test RMSE over the folds

0.92736