In [140]:
# imports
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
import scipy.sparse as sp
from scipy.sparse.linalg import svds
import os
import random

Loading the Data

In [141]:
# dataset root
root_dataset_folder = os.path.join('..', 'data', 'ml-100k')

# u.user (Users)
user_columns = [
    'user_id', 
    'age', 
    'sex', 
    'occupation', 
    'zip_code'
]
users = pd.read_csv(os.path.join(root_dataset_folder, 'u.user'), sep='|', names=user_columns, encoding='latin-1')

# u.items (Movies)
movie_columns = [
    'movie_id', 
    'movie_title' ,
    'release_date',
    'video_release_date', 
    'IMDb_URL', 
    'unknown', 
    'Action', 
    'Adventure', 
    'Animation', 
    'Children\'s', 
    'Comedy', 
    'Crime', 
    'Documentary', 
    'Drama', 
    'Fantasy', 
    'Film-Noir', 
    'Horror', 
    'Musical', 
    'Mystery', 
    'Romance', 
    'Sci-Fi', 
    'Thriller', 
    'War',
    'Western'
]
movies = pd.read_csv(os.path.join(root_dataset_folder, 'u.item'), sep='|', names=movie_columns, encoding='latin-1')

# (Ratings)
rating_columns = [
    'user_id', 
    'movie_id', 
    'rating', 
    'unix_timestamp'
]
# ua.base
data_base = pd.read_csv(os.path.join(root_dataset_folder, 'ua.base'), sep='\t', names=rating_columns, encoding='latin-1')
# ua.test
data_test = pd.read_csv(os.path.join(root_dataset_folder, 'ua.test'), sep='\t', names=rating_columns, encoding='latin-1')

Generating the Sparse Rating Matrix

In [142]:
#Creating sparse_rating_matrix:
n_users = users.user_id.unique().shape[0]
n_movies = movies.movie_id.unique().shape[0]

ratings = np.zeros((n_users, n_movies))
# reading ratings from 
for row in data_base.itertuples():
    ratings[row[1]-1 , row[2] -1] = row[3]

sparse_rating_matrix = pd.DataFrame(data=ratings,index=users['user_id'],columns=movies['movie_id'])
sparse_rating_matrix = sparse_rating_matrix.replace('0',np.nan)
print(sparse_rating_matrix)

movie_id  1     2     3     4     5     6     7     8     9     10    ...   \
user_id                                                               ...    
1          5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...    
2          4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   2.0  ...    
3          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
4          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
5          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
6          4.0   NaN   NaN   NaN   NaN   NaN   2.0   4.0   4.0   NaN  ...    
7          NaN   NaN   NaN   5.0   NaN   NaN   5.0   5.0   5.0   4.0  ...    
8          NaN   NaN   NaN   NaN   NaN   NaN   3.0   NaN   NaN   NaN  ...    
9          NaN   NaN   NaN   NaN   NaN   NaN   4.0   NaN   NaN   NaN  ...    
10         4.0   NaN   NaN   4.0   NaN   NaN   NaN   NaN   4.0   NaN  ...    
11         NaN   NaN   NaN   NaN   NaN   NaN   NaN   4.0   5.0  

Subtractive Normalization

In [143]:
# r_u
r_u = sparse_rating_matrix.mean(axis=1)
r_u = np.asarray([[element for j in range(n_movies)] for element in r_u])
print(r_u)

[[ 3.60305344  3.60305344  3.60305344 ...,  3.60305344  3.60305344
   3.60305344]
 [ 3.75        3.75        3.75       ...,  3.75        3.75        3.75      ]
 [ 2.90909091  2.90909091  2.90909091 ...,  2.90909091  2.90909091
   2.90909091]
 ..., 
 [ 3.83333333  3.83333333  3.83333333 ...,  3.83333333  3.83333333
   3.83333333]
 [ 4.30434783  4.30434783  4.30434783 ...,  4.30434783  4.30434783
   4.30434783]
 [ 3.39240506  3.39240506  3.39240506 ...,  3.39240506  3.39240506
   3.39240506]]


In [144]:
# r_i
r_i = sparse_rating_matrix.mean(axis=0).tolist()
r_i = np.asarray([r_i for i in range(n_users)])
print(r_i)

[[ 3.85969388  3.19834711  3.05882353 ...,  2.          3.          3.        ]
 [ 3.85969388  3.19834711  3.05882353 ...,  2.          3.          3.        ]
 [ 3.85969388  3.19834711  3.05882353 ...,  2.          3.          3.        ]
 ..., 
 [ 3.85969388  3.19834711  3.05882353 ...,  2.          3.          3.        ]
 [ 3.85969388  3.19834711  3.05882353 ...,  2.          3.          3.        ]
 [ 3.85969388  3.19834711  3.05882353 ...,  2.          3.          3.        ]]


In [145]:
# r
r_value = sparse_rating_matrix.mean().mean()
r   = np.zeros((n_users, n_movies))
r.fill(r_value) 
print(r)

[[ 3.06679224  3.06679224  3.06679224 ...,  3.06679224  3.06679224
   3.06679224]
 [ 3.06679224  3.06679224  3.06679224 ...,  3.06679224  3.06679224
   3.06679224]
 [ 3.06679224  3.06679224  3.06679224 ...,  3.06679224  3.06679224
   3.06679224]
 ..., 
 [ 3.06679224  3.06679224  3.06679224 ...,  3.06679224  3.06679224
   3.06679224]
 [ 3.06679224  3.06679224  3.06679224 ...,  3.06679224  3.06679224
   3.06679224]
 [ 3.06679224  3.06679224  3.06679224 ...,  3.06679224  3.06679224
   3.06679224]]


In [146]:
# starting values for alpha, beta and gamma
# TODO think of how to start from a sensible point 
alpha = random.uniform(0, 1)
beta = random.uniform(0, 1)
gamma = random.uniform(0, 1)
print(alpha, beta, gamma)

(0.580332870725874, 0.5163198650266884, 0.6618013203132993)


In [147]:
N = 0 # the number of ratings that are not NaN
for element in sparse_rating_matrix.values :
    for e in element : 
        if(not np.isnan(e)) : 
            N += 1

Error Matrix

In [148]:
def error_matrix() : 
    return np.subtract(sparse_rating_matrix.values, np.add((alpha*r), (beta*r_u), (gamma*r_i)))

Square Error

In [149]:
def square_error() :
    err_matrix = error_matrix()
    return np.nansum(np.multiply(err_matrix, err_matrix))

Gradient Descent (single iteration)   

In [150]:
learning_rate = 1

def step_gradient() :
    global alpha, beta, gamma
    
    # calculating gradients
    alpha_grad = -(2.0/N)*((square_error()*r_value))
    beta_grad = -(2.0/N)*np.dot(np.nan_to_num(error_matrix()), np.nan_to_num(np.transpose(r_u))).sum()
    gamma_grad = -(2.0/N)*np.dot(np.nan_to_num(error_matrix()), np.nan_to_num(np.transpose(r_i))).sum()
    
    # updating parameters
    alpha -= alpha_grad
    beta -= beta_grad
    gamma -= gamma_grad

Gradient Descent (Linear Regression)

In [151]:
for i in range(1000) : 
    print(square_error())
    step_gradient()
    print(alpha, beta, gamma)

101114.617244
(7.4280202677143121, -509.37477375157783, -25.223058338938042)
290418456165.0
(19667735.560056552, 12014700.667028045, 11843700.860062812)
9.57107541043e+20
(64817268036764032.0, -694599574549.14807, -683216379618.92029)
3.57868793246e+39
(2.4235602053149083e+35, -1.3450344983767055e+21, -1.3210704055883127e+21)
5.00334794022e+76
(3.3883689190302567e+72, -5.0292342165859193e+39, -4.9396300115951677e+39)
9.77990100069e+150
(6.6231477358463312e+146, -7.0313503533489029e+76, -6.9060751064046528e+76)
3.73664607725e+299
(2.5305326714887532e+295, -1.3743979267184752e+151, -1.3499107327917346e+151)
inf


  app.launch_new_instance()


(inf, -5.2512173907337482e+299, -5.1576581848453944e+299)
inf
(inf, -inf, -inf)
0.0


  from ipykernel import kernelapp as app


(inf, -inf, -inf)
0.0
(inf, -inf, -inf)
0.0
(inf, -inf, -inf)
0.0
(inf, -inf, -inf)
0.0
(inf, -inf, -inf)
0.0
(inf, -inf, -inf)
0.0
(inf, -inf, -inf)
0.0
(inf, -inf, -inf)
0.0
(inf, -inf, -inf)
0.0
(inf, -inf, -inf)
0.0
(inf, -inf, -inf)
0.0
(inf, -inf, -inf)
0.0
(inf, -inf, -inf)
0.0
(inf, -inf, -inf)
0.0
(inf, -inf, -inf)
0.0
(inf, -inf, -inf)
0.0


KeyboardInterrupt: 

Need regularization (as seen from above run)

In [152]:
# starting values for alpha, beta and gamma
# TODO think of how to start from a sensible point 
alpha = random.uniform(0, 1)
beta = random.uniform(0, 1)
gamma = random.uniform(0, 1)
print(alpha, beta, gamma)

(0.7096946440082428, 0.5328473728708358, 0.07457650759813483)


In [154]:
def error_matrix_with_L2_regularization() : 
    return np.subtract(sparse_rating_matrix.values, np.add((alpha*r), (beta*r_u), (gamma*r_i))) + (alpha**2) + (beta**2) + (gamma**2)

def square_error_with_L2_regularization() :
    err_matrix = error_matrix()
    return np.nansum(np.multiply(err_matrix, err_matrix))

learning_rate = 1
regularization_parameter = 10

def step_gradient_with_L2_regularization() :
    global alpha, beta, gamma
    
    # calculating gradients
    alpha_grad = -(2.0/N)*(square_error_with_L2_regularization()*((2*alpha) - r_value))
    beta_grad = -(2.0/N)*np.dot(np.nan_to_num(error_matrix()), (2*beta - np.nan_to_num(np.transpose(r_u)))).sum()
    gamma_grad = -(2.0/N)*np.dot(np.nan_to_num(error_matrix()), (2*gamma - np.nan_to_num(np.transpose(r_i)))).sum()
    
    # updating parameters
    alpha -= alpha_grad
    beta -= beta_grad
    gamma -= gamma_grad

In [155]:
for i in range(1000) : 
    print(square_error_with_L2_regularization())

    step_gradient_with_L2_regularization()
    print(alpha, beta, gamma)

125780.048635
(-3.866002026897311, 2523.0545351726937, 2901.8177612923459)
7.25276800395e+12
(-1729516714.9017489, -84407043928.21994, -97088183331.452896)
8.43335129414e+27
(-6.4417012427022932e+32, -9.638734257071231e+25, -1.108683772205853e+26)
3.53471487145e+71
(-1.00561232991024e+100, -7.1825176277816219e+62, -8.2616041952417076e+62)
8.6141862807e+205
(-3.8257842270122607e+301, -8.3553246693852132e+166, -9.6106113369218989e+166)
inf




(-inf, -inf, -inf)
inf
(-inf, nan, nan)
0.0




(nan, nan, nan)
0.0
(nan, nan, nan)
0.0
(nan, nan, nan)
0.0


KeyboardInterrupt: 

In [161]:
def r(u, i) : 
    return sparse_rating_matrix.values[u][i]

r_val = sparse_rating_matrix.mean().mean()
def r_() : 
    return r_val

r_u_value = sparse_rating_matrix.mean(axis=1)
def r_u(i) : 
    return r_u_value[i]

# r_i
r_i_value = sparse_rating_matrix.mean(axis=0).tolist()
def r_i(u) : 
    return r_i_value[u]

print(r(1,2))
print(r_())
print(r_u(10))
print(r_i(100))

nan
3.06679224255
4.20114942529
3.2676056338
