In [122]:
# imports
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
import scipy.sparse as sp
from scipy.sparse.linalg import svds
import os
import random

Loading the Data

In [123]:
# Folder holding the ml-100k files, expressed relative to this notebook
root_dataset_folder = os.path.join('..', 'data', 'ml-100k')


def _read_ml100k(file_name, separator, column_names):
    """Load one ml-100k file from root_dataset_folder as a DataFrame.

    Column names are supplied explicitly through ``column_names`` and the
    file is decoded as latin-1.
    """
    full_path = os.path.join(root_dataset_folder, file_name)
    return pd.read_csv(full_path, sep=separator, names=column_names, encoding='latin-1')


# u.user (Users), '|'-separated
user_columns = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = _read_ml100k('u.user', '|', user_columns)

# u.item (Movies), '|'-separated: descriptive fields first, then the
# genre-named columns
movie_columns = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
movies = _read_ml100k('u.item', '|', movie_columns)

# Ratings, tab-separated; both rating files share the same four columns
rating_columns = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
# ua.base
data_base = _read_ml100k('ua.base', '\t', rating_columns)
# ua.test
data_test = _read_ml100k('ua.test', '\t', rating_columns)

Generating the Sparse Rating Matrix

In [124]:
# Creating sparse_rating_matrix: one row per user, one column per movie,
# NaN wherever ua.base holds no rating for that (user, movie) pair.
n_users = users.user_id.unique().shape[0]
n_movies = movies.movie_id.unique().shape[0]

# Zero means "not rated yet" while the matrix is being filled.
ratings = np.zeros((n_users, n_movies))
# Copy every rating from data_base in one vectorized assignment.
# Like the original per-row loop, this assumes ids run 1..n without gaps,
# so (id - 1) is a valid 0-based position.
ratings[data_base['user_id'].values - 1,
        data_base['movie_id'].values - 1] = data_base['rating'].values

sparse_rating_matrix = pd.DataFrame(data=ratings, index=users['user_id'], columns=movies['movie_id'])
# Replace the numeric placeholder 0 with NaN. The matrix holds floats, so the
# value to match must be the number 0; matching the string '0' only works if
# pandas coerces it, and otherwise the zeros would stay in and bias every mean.
sparse_rating_matrix = sparse_rating_matrix.replace(0, np.nan)
print(sparse_rating_matrix)

movie_id  1     2     3     4     5     6     7     8     9     10    ...   \
user_id                                                               ...    
1          5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...    
2          4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   2.0  ...    
3          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
4          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
5          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
6          4.0   NaN   NaN   NaN   NaN   NaN   2.0   4.0   4.0   NaN  ...    
7          NaN   NaN   NaN   5.0   NaN   NaN   5.0   5.0   5.0   4.0  ...    
8          NaN   NaN   NaN   NaN   NaN   NaN   3.0   NaN   NaN   NaN  ...    
9          NaN   NaN   NaN   NaN   NaN   NaN   4.0   NaN   NaN   NaN  ...    
10         4.0   NaN   NaN   4.0   NaN   NaN   NaN   NaN   4.0   NaN  ...    
11         NaN   NaN   NaN   NaN   NaN   NaN   NaN   4.0   5.0  

Subtractive Normalization

In [125]:
# r_u: per-user mean rating (NaN cells skipped by pandas), laid out as an
# (n_users, n_movies) array in which every column repeats the user means
r_u = sparse_rating_matrix.mean(axis=1)
r_u = np.repeat(r_u.values.reshape(-1, 1), n_movies, axis=1)
print(r_u)

[[ 3.60305344  3.60305344  3.60305344 ...,  3.60305344  3.60305344
   3.60305344]
 [ 3.75        3.75        3.75       ...,  3.75        3.75        3.75      ]
 [ 2.90909091  2.90909091  2.90909091 ...,  2.90909091  2.90909091
   2.90909091]
 ..., 
 [ 3.83333333  3.83333333  3.83333333 ...,  3.83333333  3.83333333
   3.83333333]
 [ 4.30434783  4.30434783  4.30434783 ...,  4.30434783  4.30434783
   4.30434783]
 [ 3.39240506  3.39240506  3.39240506 ...,  3.39240506  3.39240506
   3.39240506]]


In [126]:
# r_i: per-movie mean rating (NaN cells skipped by pandas), laid out as an
# (n_users, n_movies) array in which every row repeats the movie means
r_i = sparse_rating_matrix.mean(axis=0).values
r_i = np.tile(r_i, (n_users, 1))
print(r_i)

[[ 3.85969388  3.19834711  3.05882353 ...,  2.          3.          3.        ]
 [ 3.85969388  3.19834711  3.05882353 ...,  2.          3.          3.        ]
 [ 3.85969388  3.19834711  3.05882353 ...,  2.          3.          3.        ]
 ..., 
 [ 3.85969388  3.19834711  3.05882353 ...,  2.          3.          3.        ]
 [ 3.85969388  3.19834711  3.05882353 ...,  2.          3.          3.        ]
 [ 3.85969388  3.19834711  3.05882353 ...,  2.          3.          3.        ]]


In [127]:
# r: a single overall rating level, broadcast to an (n_users, n_movies) array
# NOTE(review): this is the mean of the per-movie means, so every movie
# counts equally regardless of how many ratings it has; it is not the mean
# over all individual ratings. Confirm this is the intended baseline.
r_value = sparse_rating_matrix.mean().mean()
r = np.full((n_users, n_movies), r_value)
print(r)

[[ 3.06679224  3.06679224  3.06679224 ...,  3.06679224  3.06679224
   3.06679224]
 [ 3.06679224  3.06679224  3.06679224 ...,  3.06679224  3.06679224
   3.06679224]
 [ 3.06679224  3.06679224  3.06679224 ...,  3.06679224  3.06679224
   3.06679224]
 ..., 
 [ 3.06679224  3.06679224  3.06679224 ...,  3.06679224  3.06679224
   3.06679224]
 [ 3.06679224  3.06679224  3.06679224 ...,  3.06679224  3.06679224
   3.06679224]
 [ 3.06679224  3.06679224  3.06679224 ...,  3.06679224  3.06679224
   3.06679224]]


In [None]:
# Starting values for alpha, beta and gamma.
# They are drawn uniformly from [0, 1] using a dedicated, seeded generator so
# that a fresh kernel always starts gradient descent from the same point
# (and the global `random` state is left untouched).
# TODO think of how to start from a sensible point
RANDOM_SEED = 42
_param_rng = random.Random(RANDOM_SEED)
alpha = _param_rng.uniform(0, 1)
beta = _param_rng.uniform(0, 1)
gamma = _param_rng.uniform(0, 1)
print(alpha, beta, gamma)

(0.7402987531804672, 0.8963185766186846, 0.3454523247105159)


In [None]:
# N: the number of ratings that are not NaN.
# Counted with one vectorized pass instead of a Python loop over every cell;
# int() keeps N a plain Python integer as before.
N = int(np.count_nonzero(~np.isnan(sparse_rating_matrix.values)))

Error Matrix

In [None]:
def error_matrix():
    """Return the element-wise prediction error for the current parameters.

    The prediction is ``alpha*r + beta*r_u + gamma*r_i`` (all read from the
    notebook globals) and the result is ``observed - predicted``. Cells that
    are NaN in ``sparse_rating_matrix`` stay NaN in the result.

    Returns:
        numpy array with the same shape as ``sparse_rating_matrix.values``.
    """
    # Sum the three terms with '+': np.add takes only two operands and treats
    # a third positional argument as the output buffer, which would silently
    # drop the gamma*r_i term from the prediction.
    predicted = alpha * r + beta * r_u + gamma * r_i
    return sparse_rating_matrix.values - predicted

Square Error

In [None]:
def square_error():
    """Return the sum of squared entries of error_matrix(), skipping NaN cells."""
    residuals = error_matrix()
    squared = np.square(residuals)
    return np.nansum(squared)

Gradient Descent (single iteration)   

In [None]:
def step_gradient(learning_rate=0.01):
    """Perform one gradient-descent update of the globals alpha, beta, gamma.

    The loss is the mean squared error over the N non-NaN ratings,
    ``L = (1/N) * sum(err**2)`` with ``err = rating - (alpha*r + beta*r_u +
    gamma*r_i)``, so each partial derivative is ``-(2/N) * sum(err * x)``
    where ``x`` is the term multiplied by that parameter.

    Args:
        learning_rate: step size applied to every gradient. The default keeps
            a no-argument call working.

    Raises:
        ZeroDivisionError: if N is 0 (no observed ratings).
    """
    global alpha, beta, gamma

    # Evaluate the error once; all three gradients use the same residuals.
    err = error_matrix()
    scale = 2.0 / N

    # calculating gradients
    # nansum skips unobserved cells (NaN in err, and possibly in r_i as well).
    # r is the constant r_value everywhere, so it factors out of the sum.
    alpha_grad = -scale * r_value * np.nansum(err)
    # Element-wise products: each residual is paired with the feature value of
    # the same (user, movie) cell, not with every other user's row.
    beta_grad = -scale * np.nansum(err * r_u)
    gamma_grad = -scale * np.nansum(err * r_i)

    # updating parameters
    alpha -= learning_rate * alpha_grad
    beta -= learning_rate * beta_grad
    gamma -= learning_rate * gamma_grad

Gradient Descent (Linear Regression)

In [None]:
# Repeatedly apply step_gradient(), printing a progress line every
# REPORT_EVERY iterations instead of two lines on every iteration.
N_ITERATIONS = 1000
REPORT_EVERY = 100

for iteration in range(N_ITERATIONS):
    if iteration % REPORT_EVERY == 0:
        print("iteration %d: square_error=%s alpha=%s beta=%s gamma=%s"
              % (iteration, square_error(), alpha, beta, gamma))
    step_gradient()

# State after the last update
print("final: square_error=%s alpha=%s beta=%s gamma=%s"
      % (square_error(), alpha, beta, gamma))