In [None]:
import math
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# data.csv has no header row, so pandas labels the columns 0..n.
# The first column is kept aside as `valid_rating_count` (presumably the number
# of real ratings in each row -- it is checked against that in the splitter);
# the remaining columns form the ratings table.
df = pd.read_csv('data.csv', header=None)
valid_rating_count = df.iloc[:, 0]
df = df.iloc[:, 1:]

In [None]:
# Work on the raw numpy array from here on; the bare name echoes it in the cell output.
X = df.to_numpy()
X

In [None]:
# Echo the Series that was split off from the first CSV column.
valid_rating_count

In [None]:
# train test split

def train_test_split(X,no_rating=99,valid_rating_count=None,train_ratio=0.6,validation_ratio=0.2,test_ratio=0.2):
    """Split the observed ratings of every row into train/validation/test matrices.

    Each returned matrix has the same shape as ``X``. Every observed rating of
    ``X`` is copied into exactly one of the three matrices; all other cells
    hold ``no_rating``.

    Parameters
    ----------
    X : 2-D numpy array
        Ratings matrix; cells equal to ``no_rating`` are treated as missing.
    no_rating : scalar
        Sentinel marking a missing rating. It is used both to detect missing
        cells in ``X`` and to fill the unassigned cells of the outputs.
    valid_rating_count : indexable or None
        Optional expected number of observed ratings per row, looked up as
        ``valid_rating_count[i]``. A row whose actual count differs is reported
        with ``print`` and left entirely missing in all three outputs.
    train_ratio, validation_ratio : float
        Fraction of each row's observed ratings (rounded to the nearest
        integer) assigned to the training and validation matrices.
    test_ratio : float
        Accepted for interface compatibility but not used: the test matrix
        always receives whatever remains after the other two parts are taken.

    Returns
    -------
    (training_set, validation_set, test_set) : tuple of float arrays

    Notes
    -----
    The assignment is random and uses numpy's global random state.
    """
    N = X.shape[0]
    M = X.shape[1]

    training_set = np.ones((N,M))*no_rating
    validation_set = np.ones((N,M))*no_rating
    test_set = np.ones((N,M))*no_rating

    for i in range(N):
        valid_indices = [j for j in range(M) if X[i][j]!=no_rating]

        if valid_rating_count is not None and valid_rating_count[i]!=len(valid_indices):
            print("Error at row index ",i)
            continue

        np.random.shuffle(valid_indices) # uniform random order of the observed columns

        total_valid_indices_size = len(valid_indices)
        train_end = int(round(train_ratio*total_valid_indices_size))
        validation_end = train_end + int(round(validation_ratio*total_valid_indices_size))

        # Consecutive slices of the shuffled indices; the tail goes to the test set.
        partitions = (
            (training_set, valid_indices[:train_end]),
            (validation_set, valid_indices[train_end:validation_end]),
            (test_set, valid_indices[validation_end:]),
        )
        for target, columns in partitions:
            for j in columns:
                target[i][j] = X[i][j]

    return training_set,validation_set,test_set

In [None]:
# Distribute each row's ratings over the three sets; 99 marks "no rating".
train_set, val_set, test_set = train_test_split(
    X,
    no_rating=99,
    valid_rating_count=valid_rating_count,
)

In [None]:
def error(data, U, V, l_u, l_v):
    """Regularized squared-error objective of the factorization ``U V^T``.

    Parameters
    ----------
    data : 2-D array of ratings; cells equal to 99 are treated as missing and
        do not contribute to the error.
    U : (N, K) factor matrix, either ``np.matrix`` or plain ``ndarray``.
    V : (M, K) factor matrix, either ``np.matrix`` or plain ``ndarray``.
    l_u, l_v : weights of the squared Frobenius norms of ``U`` and ``V``.

    Returns
    -------
    float
        Sum of squared residuals over the observed cells plus
        ``l_u * ||U||^2 + l_v * ||V||^2``.
    """
    ratings = np.asarray(data)
    user_factors = np.asarray(U)
    item_factors = np.asarray(V)

    # Explicit dot product: `U*V.T` is only a matrix product for np.matrix and
    # would silently become element-wise for ndarray inputs.
    pred = user_factors.dot(item_factors.T)

    observed = ratings != 99
    residual = ratings[observed] - pred[observed]

    error_val = float(np.sum(np.square(residual)))
    error_val += l_u*float(np.sum(np.square(user_factors)))
    error_val += l_v*float(np.sum(np.square(item_factors)))
    return error_val


def r_error(data, U, V):
    """Root-mean-square error of ``U V^T`` over the observed ratings.

    Parameters
    ----------
    data : 2-D array of ratings; cells equal to 99 are treated as missing.
    U : (N, K) factor matrix, either ``np.matrix`` or plain ``ndarray``.
    V : (M, K) factor matrix, either ``np.matrix`` or plain ``ndarray``.

    Returns
    -------
    float
        The RMSE over observed cells, or ``nan`` when ``data`` contains no
        observed rating (the mean is undefined in that case).
    """
    ratings = np.asarray(data)

    # Explicit dot product so ndarray factors are multiplied as matrices too.
    pred = np.asarray(U).dot(np.asarray(V).T)

    observed = ratings != 99
    count = int(np.sum(observed))
    if count == 0:
        # Nothing to average over; avoid dividing by zero.
        return float('nan')

    residual = ratings[observed] - pred[observed]
    return math.sqrt(float(np.sum(np.square(residual)))/count)


def _als_half_step(ratings, observed, fixed, lam):
    """Solve the regularized least-squares problem for each row of ``ratings`` with ``fixed`` held constant."""
    K = fixed.shape[1]
    eye = np.identity(K)
    solved = np.zeros((ratings.shape[0], K))
    for r in range(ratings.shape[0]):
        cols = np.flatnonzero(observed[r])
        if cols.size == 0:
            # No observed rating: the normal matrix would be all zeros and
            # therefore singular. Leave the factor vector at zero.
            continue
        factors = fixed[cols]
        # lam is added once per observed rating, i.e. the regularizer is
        # scaled by the number of ratings in this row.
        normal = factors.T.dot(factors) + cols.size*lam*eye
        rhs = factors.T.dot(ratings[r, cols])
        solved[r] = np.linalg.solve(normal, rhs)
    return solved


def als(data, K, lambda_u, lambda_v, tolerance):
    """Factorize a ratings matrix as ``U V^T`` with alternating least squares.

    Parameters
    ----------
    data : 2-D array of ratings (N users x M items); cells equal to 99 are
        treated as missing.
    K : number of latent factors.
    lambda_u, lambda_v : regularization weights for the user and item factors.
        Each weight is applied once per observed rating of the user/item being
        solved. Note that the objective returned by ``error`` weights the norms
        only once, so it is used purely as a convergence monitor.
    tolerance : the loop stops when the relative change of the objective
        between two consecutive sweeps falls below this value.

    Returns
    -------
    (U, V) : tuple of ``np.matrix``
        ``U`` is (N, K) and ``V`` is (M, K). Users or items without any
        observed rating get an all-zero factor vector.

    Notes
    -----
    ``U`` is initialised uniformly in [-10, 10) from numpy's global random
    state. The objective is printed after every sweep. There is no iteration
    cap; the loop only ends through the tolerance test. With a zero
    regularization weight the per-row system may still be singular, in which
    case ``np.linalg.solve`` raises ``LinAlgError``.
    """
    ratings = np.asarray(data, dtype=float)
    observed = ratings != 99
    N = ratings.shape[0]
    M = ratings.shape[1]

    U = np.random.uniform(-10,10,(N,K))
    V = np.zeros((M,K))
    prev_error = None

    while True:
        # Items first (users fixed), then users (items fixed).
        V = _als_half_step(ratings.T, observed.T, U, lambda_v)
        U = _als_half_step(ratings, observed, V, lambda_u)

        # `error` multiplies its factors with `*`, so hand it matrix views.
        curr_error = error(data,np.asmatrix(U),np.asmatrix(V),lambda_u,lambda_v)
        if prev_error is not None:
            difference = abs(prev_error-curr_error)
            # A zero objective is a perfect fit; treat it as converged instead
            # of dividing by zero.
            if curr_error == 0 or difference/curr_error < tolerance:
                print(curr_error)
                break
        prev_error = curr_error
        print(prev_error)

    # np.asmatrix instead of np.mat: the latter was removed in NumPy 2.0.
    return np.asmatrix(U), np.asmatrix(V)


In [None]:
# Grid search over the number of latent factors and both regularization weights.
latent_factors = [5,10,20,40]
lambda_values = [10.0,1.0,0.1,0.01]
lowest_error = None
best_k = None
best_l_u = None
best_l_v = None
best_U = None
best_V = None

for k in latent_factors:
    for l_u in lambda_values:
        for l_v in lambda_values:
            U,V = als(train_set,k,l_u,l_v,0.00005)
            # Score on the held-out validation ratings. Scoring on the
            # training set would always favour the least-regularized,
            # highest-rank model.
            curr_error = r_error(val_set,U,V)
            print(k,l_u,l_v,curr_error)
            if lowest_error is None or lowest_error>curr_error:
                lowest_error = curr_error
                best_k = k
                best_l_u = l_u
                best_l_v = l_v
                best_U = U
                best_V = V

# Persist the factors of the best configuration found by the grid search
# (not the last one trained). `with` closes the file even if pickling fails.
with open('model.pk1','wb') as output:
    pickle.dump(best_U,output,pickle.HIGHEST_PROTOCOL)
    pickle.dump(best_V,output,pickle.HIGHEST_PROTOCOL)