In [335]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

In [336]:
# Read the CSV without a header row, so the columns are labelled by position.
df = pd.read_csv('data.csv',header=None)
# Set the first column aside; train_test_split later compares it, row by row,
# against the number of entries that are not the 99 sentinel.
valid_rating_count = df[df.columns[0]]
# Remove that first column so `df` keeps only the remaining columns.
df = df.drop(columns=df.columns[0])

In [337]:
# Take the remaining columns as a NumPy array and display it.
# In the output below, 99 is the value the later cells treat as "no rating".
X = df.values
X

array([[-7.82,  8.79, -9.66, ..., 99.  , 99.  , 99.  ],
       [ 4.08, -0.29,  6.36, ...,  0.34, -4.32,  1.07],
       [99.  , 99.  , 99.  , ..., 99.  , 99.  , 99.  ],
       ...,
       [99.  , 99.  , 99.  , ..., 99.  , 99.  , 99.  ],
       [99.  , 99.  , 99.  , ..., 99.  , 99.  , 99.  ],
       [ 2.43,  2.67, -3.98, ..., 99.  , 99.  , 99.  ]])

In [338]:
# Display the first CSV column that was set aside when the data was loaded.
valid_rating_count

0         74
1        100
2         49
3         48
4         91
        ... 
24978    100
24979     91
24980     39
24981     37
24982     72
Name: 0, Length: 24983, dtype: int64

In [339]:
# train test split

def train_test_split(X,no_rating=99,valid_rating_count=None,train_ratio=0.6,validation_ratio=0.2,test_ratio=0.2):
    """Randomly split each row's observed ratings into train / validation / test matrices.

    Parameters
    ----------
    X : array-like, shape (N, M)
        Rating matrix. Entries equal to ``no_rating`` are treated as missing.
    no_rating : scalar, default 99
        Sentinel that marks a missing rating. It is also the fill value of the
        three returned matrices. NaN is not supported as a sentinel because the
        comparison ``X != no_rating`` would be true everywhere.
    valid_rating_count : sequence or pandas Series, optional
        Expected number of observed ratings per row, indexed by row number.
        A row whose observed count differs is reported with a printed message
        and left completely unassigned (all ``no_rating``) in every output.
    train_ratio, validation_ratio : float
        Fractions of a row's observed ratings that go to the training and
        validation sets; each size is ``int(round(ratio * n_observed))``.
    test_ratio : float
        Accepted for backward compatibility only. The test set always receives
        whatever is left after the training and validation slices, so every
        observed rating lands in exactly one of the three outputs.

    Returns
    -------
    (training_set, validation_set, test_set) : tuple of float ndarrays
        Each has the shape of ``X`` and holds ``no_rating`` wherever no rating
        was assigned to that split.

    Notes
    -----
    The shuffle uses NumPy's global RNG (``np.random.shuffle``); seed it
    beforehand for a reproducible split.
    """
    X = np.asarray(X)
    N, M = X.shape

    training_set = np.full((N, M), no_rating, dtype=float)
    validation_set = np.full((N, M), no_rating, dtype=float)
    test_set = np.full((N, M), no_rating, dtype=float)

    for i in range(N):
        # Column positions of the ratings actually present in this row.
        valid_indices = np.flatnonzero(X[i] != no_rating)

        if valid_rating_count is not None and valid_rating_count[i]!=len(valid_indices):
            print("Error at row index ",i)
            continue

        np.random.shuffle(valid_indices)  # uniform random order of the indices

        total_valid_indices_size = len(valid_indices)
        training_set_size = int(round(train_ratio*total_valid_indices_size))
        validation_set_size = int(round(validation_ratio*total_valid_indices_size))
        validation_end = training_set_size + validation_set_size

        training_indices = valid_indices[:training_set_size]
        validation_indices = valid_indices[training_set_size:validation_end]
        # Remainder goes to the test set so that rounding never drops a rating.
        test_indices = valid_indices[validation_end:]

        training_set[i, training_indices] = X[i, training_indices]
        validation_set[i, validation_indices] = X[i, validation_indices]
        test_set[i, test_indices] = X[i, test_indices]

    return training_set,validation_set,test_set

In [340]:
# Seed NumPy's global RNG so the random split (and the random ALS initialisation,
# which draws from the same generator afterwards) is reproducible on a full re-run.
np.random.seed(42)
train_set,val_set,test_set = train_test_split(X,no_rating=99,valid_rating_count=valid_rating_count)

In [341]:
def error(data, U, V, l_u, l_v, no_rating=99):
    """Regularised squared-error objective of the factorisation ``U V^T``.

    Parameters
    ----------
    data : array-like, shape (N, M)
        Rating matrix; entries equal to ``no_rating`` are ignored.
    U : array-like or np.matrix, shape (N, K)
        Row (user) factors.
    V : array-like or np.matrix, shape (M, K)
        Column (item) factors.
    l_u, l_v : float
        Weights of the squared Frobenius norms of ``U`` and ``V``.
    no_rating : scalar, default 99
        Sentinel that marks a missing rating.

    Returns
    -------
    float
        Sum of squared residuals over the observed entries, plus
        ``l_u * sum(U**2) + l_v * sum(V**2)``.
    """
    ratings = np.asarray(data, dtype=float)
    # Convert to plain ndarrays so the product below is a true matrix product
    # whether the caller passes np.matrix objects or ordinary arrays.
    U_arr = np.asarray(U, dtype=float)
    V_arr = np.asarray(V, dtype=float)

    observed = ratings != no_rating
    residual = (ratings - U_arr.dot(V_arr.T))[observed]

    error_val = float(np.sum(np.square(residual)))
    error_val += l_u*float(np.sum(np.square(U_arr)))
    error_val += l_v*float(np.sum(np.square(V_arr)))
    return error_val


def r_error(data, U, V, no_rating=99):
    """Root-mean-square error of ``U V^T`` over the observed ratings.

    Parameters
    ----------
    data : array-like, shape (N, M)
        Rating matrix; entries equal to ``no_rating`` are ignored.
    U : array-like or np.matrix, shape (N, K)
        Row (user) factors.
    V : array-like or np.matrix, shape (M, K)
        Column (item) factors.
    no_rating : scalar, default 99
        Sentinel that marks a missing rating.

    Returns
    -------
    float
        ``sqrt(sum of squared residuals / number of observed ratings)``.

    Raises
    ------
    ZeroDivisionError
        If ``data`` contains no observed rating at all.
    """
    ratings = np.asarray(data, dtype=float)
    # Plain ndarrays make the product a matrix product for any input type.
    U_arr = np.asarray(U, dtype=float)
    V_arr = np.asarray(V, dtype=float)

    observed = ratings != no_rating
    count = int(np.count_nonzero(observed))
    if count == 0:
        # Same exception type the bare division would raise, with a clear message.
        raise ZeroDivisionError("r_error: data contains no observed ratings")

    residual = (ratings - U_arr.dot(V_arr.T))[observed]
    return math.sqrt(float(np.sum(np.square(residual)))/count)


def _als_solve_rows(ratings, observed, fixed, reg, identity, out):
    """Refit every row of ``out`` by regularised least squares against ``fixed``.

    For row ``r`` only the columns flagged in ``observed[r]`` take part. The
    ridge term is ``reg`` times the number of observed entries, i.e. ``reg * I``
    is added once per observed rating. A row without any observed entry is set
    to zeros because its linear system would be singular.
    """
    for r in range(ratings.shape[0]):
        seen = observed[r]
        n_seen = int(np.count_nonzero(seen))
        if n_seen == 0:
            out[r] = 0.0
            continue
        basis = fixed[seen]
        gram = basis.T.dot(basis) + (reg*n_seen)*identity
        rhs = basis.T.dot(ratings[r, seen])
        # Solving the system is cheaper and numerically safer than inverting it.
        out[r] = np.linalg.solve(gram, rhs)


def als(data, K, lambda_u, lambda_v, tolerance, max_iter=None):
    """Factorise a rating matrix with alternating least squares.

    Parameters
    ----------
    data : array-like, shape (N, M)
        Rating matrix; entries equal to 99 are treated as missing.
    K : int
        Number of latent factors.
    lambda_u, lambda_v : float
        Regularisation weights for the row factors ``U`` and the column
        factors ``V``. In each update the weight is multiplied by the number
        of observed ratings of the row/column being solved.
    tolerance : float
        Stop when the relative change of ``error(...)`` between two
        consecutive sweeps drops below this value.
    max_iter : int, optional
        Upper bound on the number of sweeps. ``None`` (the default) keeps
        iterating until the tolerance criterion is met.

    Returns
    -------
    (U, V) : tuple of np.matrix
        ``U`` has shape (N, K) and ``V`` has shape (M, K).

    Notes
    -----
    ``U`` is initialised from ``np.random.uniform(-10, 10)`` on NumPy's global
    RNG. The objective value is printed once after every sweep.
    """
    ratings = np.asarray(data, dtype=float)
    N, M = ratings.shape
    observed = ratings != 99
    identity = np.identity(K)

    U_arr = np.random.uniform(-10,10,(N,K))
    V_arr = np.zeros((M,K))

    prev_error = None
    sweeps = 0
    while True:
        # Column factors with U fixed, then row factors with V fixed.
        _als_solve_rows(ratings.T, observed.T, U_arr, lambda_v, identity, V_arr)
        _als_solve_rows(ratings, observed, V_arr, lambda_u, identity, U_arr)
        sweeps += 1

        # np.asmatrix (np.mat no longer exists in NumPy 2.0) wraps without
        # copying and keeps `error` working with its matrix-style product.
        curr_error = error(data, np.asmatrix(U_arr), np.asmatrix(V_arr), lambda_u, lambda_v)
        print(curr_error)

        if prev_error is not None:
            difference = abs(prev_error-curr_error)
            # A zero objective cannot improve further; it also avoids dividing by zero.
            if curr_error == 0 or difference/curr_error < tolerance:
                break
        if max_iter is not None and sweeps >= max_iter:
            break
        prev_error = curr_error

    return np.asmatrix(U_arr), np.asmatrix(V_arr)


In [None]:
# Keep the fitted factor matrices instead of discarding the return value of a long run.
U, V = als(train_set,10,0.1,0.1,0.0005)

30593032.80643414
14421254.392713971
