In [2]:
# imports
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
import scipy.sparse as sp
from scipy.sparse.linalg import svds
import os
import random

In [3]:
# Folder holding the dataset files, relative to the notebook's working directory.
root_dataset_folder = os.path.join('..', 'data', 'ml-100k')

# Users table (u.user): pipe-separated, no header row in the file.
user_columns = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    os.path.join(root_dataset_folder, 'u.user'),
    sep='|',
    names=user_columns,
    encoding='latin-1',
)

# Movies table (u.item): descriptive columns followed by one column per genre.
movie_columns = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
] + [
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
movies = pd.read_csv(
    os.path.join(root_dataset_folder, 'u.item'),
    sep='|',
    names=movie_columns,
    encoding='latin-1',
)

# Ratings tables: tab-separated, same four columns for both files.
rating_columns = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
# ua.base -> data_base (used later to fill the rating matrix)
data_base = pd.read_csv(
    os.path.join(root_dataset_folder, 'ua.base'),
    sep='\t',
    names=rating_columns,
    encoding='latin-1',
)
# ua.test -> data_test
data_test = pd.read_csv(
    os.path.join(root_dataset_folder, 'ua.test'),
    sep='\t',
    names=rating_columns,
    encoding='latin-1',
)

In [4]:
# Build the user x movie rating matrix from the ratings in data_base.
n_users = users.user_id.unique().shape[0]
n_movies = movies.movie_id.unique().shape[0]

# Dense matrix where 0 means "no rating recorded".
# Assumes user_id / movie_id are contiguous 1-based ids, so (id - 1) is a valid
# row / column position -- TODO confirm against the dataset.
ratings = np.zeros((n_users, n_movies))
# Single indexed assignment instead of a Python loop over every rating row;
# columns are addressed by name rather than by tuple position.
ratings[data_base['user_id'].values - 1, data_base['movie_id'].values - 1] = data_base['rating'].values

sparse_rating_matrix = pd.DataFrame(data=ratings, index=users['user_id'], columns=movies['movie_id'])
# Turn the "no rating" zeros into NaN. The value to replace must be the number 0:
# the string '0' is not equal to the float 0.0 stored in the frame.
sparse_rating_matrix = sparse_rating_matrix.replace(0, np.nan)
print(sparse_rating_matrix)

movie_id  1     2     3     4     5     6     7     8     9     10    ...   \
user_id                                                               ...    
1          5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...    
2          4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   2.0  ...    
3          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
4          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
5          NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...    
6          4.0   NaN   NaN   NaN   NaN   NaN   2.0   4.0   4.0   NaN  ...    
7          NaN   NaN   NaN   5.0   NaN   NaN   5.0   5.0   5.0   4.0  ...    
8          NaN   NaN   NaN   NaN   NaN   NaN   3.0   NaN   NaN   NaN  ...    
9          NaN   NaN   NaN   NaN   NaN   NaN   4.0   NaN   NaN   NaN  ...    
10         4.0   NaN   NaN   4.0   NaN   NaN   NaN   NaN   4.0   NaN  ...    
11         NaN   NaN   NaN   NaN   NaN   NaN   NaN   4.0   5.0  

In [5]:
def r(u, i):
    """Return the entry of sparse_rating_matrix at row position u, column position i.

    u and i are 0-based positions (not user/movie ids). The value is NaN
    where the matrix holds no rating.
    """
    return sparse_rating_matrix.values[u][i]


# NOTE(review): this is the mean of the per-column (per-movie) means, which is
# generally not equal to the mean over all individual ratings -- confirm which
# of the two is intended as the "global" average.
r_val = sparse_rating_matrix.mean().mean()
def r_():
    """Return the precomputed overall average r_val."""
    return r_val


# Row-wise means (one per row of the matrix, i.e. per user), NaN cells skipped by pandas.
r_u_value = sparse_rating_matrix.mean(axis=1).tolist()
def r_u(u):
    """Return the mean rating of row position u."""
    return r_u_value[u]


# Column-wise means (one per column of the matrix, i.e. per movie).
r_i_value = sparse_rating_matrix.mean(axis=0).tolist()
def r_i(i):
    """Return the mean rating of column position i."""
    return r_i_value[i]


# N: the number of ratings that are not NaN (counted in one vectorised pass
# instead of a nested Python loop over every cell).
N = int(np.count_nonzero(~np.isnan(sparse_rating_matrix.values)))

Equation 1

In [6]:
# Equation 1: every term is multiplied by the global mean r_().
#   val1      = sum(r(u,i)  * r_()) / N
#   equation1 = [sum(r_()**2), sum(r_u(u) * r_()), sum(r_i(i) * r_())] / N
# Each sum skips NaN terms, exactly like the original per-cell loop, but is
# computed with NumPy reductions instead of visiting every cell in Python.
#
# NOTE(review): the three coefficient sums run over EVERY (u, i) cell, whereas
# the rating sum only picks up cells that hold a rating (NaN cells are
# skipped). In effect unrated cells are treated as rating 0 -- confirm that
# this is intended rather than summing over rated cells only.
rating_grid = sparse_rating_matrix.values
n_rows, n_cols = rating_grid.shape
global_mean = r_()
user_means = np.array([r_u(u) for u in range(n_rows)], dtype=float)
movie_means = np.array([r_i(i) for i in range(n_cols)], dtype=float)

# Rating term: only cells with a rating contribute.
t1 = np.nansum(rating_grid * global_mean) / N
# Constant term: the same value for each of the n_rows * n_cols cells.
t2 = np.nansum(global_mean ** 2) * n_rows * n_cols / N
# User term: constant along a row, so each user's value counts n_cols times.
t3 = np.nansum(user_means * global_mean) * n_cols / N
# Movie term: constant along a column, so each movie's value counts n_rows times.
t4 = np.nansum(movie_means * global_mean) * n_rows / N

equation1 = np.array([t2, t3, t4])
val1 = t1
print(equation1)
print(val1)

[ 164.71078179  192.69022191  164.51493068]
10.806844922


Equation 2

In [7]:
# Equation 2: every term is multiplied by the row (user) mean r_u(u).
#   val2      = sum(r(u,i) * r_u(u)) / N
#   equation2 = [sum(r_() * r_u(u)), sum(r_u(u)**2), sum(r_i(i) * r_u(u))] / N
# Each sum skips NaN terms, exactly like the original per-cell loop, but is
# computed with NumPy reductions instead of visiting every cell in Python.
#
# NOTE(review): the three coefficient sums run over EVERY (u, i) cell, whereas
# the rating sum only picks up cells that hold a rating (NaN cells are
# skipped). In effect unrated cells are treated as rating 0 -- confirm that
# this is intended rather than summing over rated cells only.
rating_grid = sparse_rating_matrix.values
n_rows, n_cols = rating_grid.shape
global_mean = r_()
user_means = np.array([r_u(u) for u in range(n_rows)], dtype=float)
movie_means = np.array([r_i(i) for i in range(n_cols)], dtype=float)

# Rating term: only cells with a rating contribute.
t1 = np.nansum(rating_grid * user_means[:, np.newaxis]) / N
# Global-mean term: constant along a row, so each user's value counts n_cols times.
t2 = np.nansum(global_mean * user_means) * n_cols / N
# Squared user term: also constant along a row.
t3 = np.nansum(user_means ** 2) * n_cols / N
# Cross term: one product per (user, movie) cell.
t4 = np.nansum(np.outer(user_means, movie_means)) / N

equation2 = np.array([t2, t3, t4])
val2 = t1
print(equation2)
print(val2)

[ 192.69022191  229.0078886   192.46110155]
12.6226665549


Equation 3

In [8]:
# Equation 3: every term is multiplied by the column (movie) mean r_i(i).
#   val3      = sum(r(u,i) * r_i(i)) / N
#   equation3 = [sum(r_() * r_i(i)), sum(r_u(u) * r_i(i)), sum(r_i(i)**2)] / N
# Each sum skips NaN terms, exactly like the original per-cell loop, but is
# computed with NumPy reductions instead of visiting every cell in Python.
#
# NOTE(review): the three coefficient sums run over EVERY (u, i) cell, whereas
# the rating sum only picks up cells that hold a rating (NaN cells are
# skipped). In effect unrated cells are treated as rating 0 -- confirm that
# this is intended rather than summing over rated cells only.
rating_grid = sparse_rating_matrix.values
n_rows, n_cols = rating_grid.shape
global_mean = r_()
user_means = np.array([r_u(u) for u in range(n_rows)], dtype=float)
movie_means = np.array([r_i(i) for i in range(n_cols)], dtype=float)

# Rating term: only cells with a rating contribute.
t1 = np.nansum(rating_grid * movie_means[np.newaxis, :]) / N
# Global-mean term: constant along a column, so each movie's value counts n_rows times.
t2 = np.nansum(global_mean * movie_means) * n_rows / N
# Cross term: one product per (user, movie) cell.
t3 = np.nansum(np.outer(user_means, movie_means)) / N
# Squared movie term: also constant along a column.
t4 = np.nansum(movie_means ** 2) * n_rows / N

equation3 = np.array([t2, t3, t4])
val3 = t1
print(equation3)
print(val3)

[ 164.51493068  192.46110155  175.428263  ]
12.6917804296


In [9]:
def square_error():
    """Return the summed squared error of the current (alpha, beta, gamma), divided by N.

    Reads the module-level names alpha, beta, gamma, N, sparse_rating_matrix,
    r_, r_u and r_i. For every cell (u, i) of the rating matrix the error is

        r(u, i) - alpha * r_() - beta * r_u(u) - gamma * r_i(i)

    where any term that is NaN contributes 0 (same rule as the original
    per-cell loop). The squared errors of all cells are summed and the total
    is divided by N.

    NOTE(review): because a NaN rating contributes 0, cells without a rating
    are scored as if their rating were 0, while the divisor N counts only the
    rated cells -- confirm this is the intended error measure.
    """
    def _zero_if_nan(values):
        # NaN terms are dropped from the per-cell error, i.e. they count as 0.
        values = np.asarray(values, dtype=float)
        return np.where(np.isnan(values), 0.0, values)

    rating_grid = sparse_rating_matrix.values
    n_rows, n_cols = rating_grid.shape
    global_term = alpha * r_()
    user_term = np.array([beta * r_u(u) for u in range(n_rows)], dtype=float)
    movie_term = np.array([gamma * r_i(i) for i in range(n_cols)], dtype=float)

    # Broadcast the per-user column and per-movie row across the full grid.
    residual = (
        _zero_if_nan(rating_grid)
        - _zero_if_nan(global_term)
        - _zero_if_nan(user_term)[:, np.newaxis]
        - _zero_if_nan(movie_term)[np.newaxis, :]
    )
    return np.sum(residual ** 2) / N

Checking the squared error for different configurations of alpha, beta, and gamma

In [10]:
# Report the squared error for three coefficient triples, in this order:
#   1. the solution of the 3x3 system built from equation1..3 / val1..3,
#   2. a hand-entered triple,
#   3. equal weights of 0.333.
# square_error() reads alpha, beta and gamma from module scope, so the loop
# variables deliberately carry those names; they keep the last triple afterwards.
for alpha, beta, gamma in (
    tuple(np.linalg.solve(np.array([equation1, equation2, equation3]), np.array([val1, val2, val3]))),
    (0.8144, 0.8700, -0.6485),
    (0.333, 0.333, 0.333),
):
    print(alpha, beta, gamma)
    print(square_error())

(-0.098514020763096941, -0.0055610667228073382, 0.1708339137649959)
12.6520161854
(0.8144, 0.87, -0.6485)
229.046152847
(0.333, 0.333, 0.333)
174.644617869


In [15]:
# Solve the 3x3 system for the coefficients, then compute the residual matrix
#   r_tilda[u][i] = r(u,i) - alpha*r_() - beta*r_u(u) - gamma*r_i(i)
# which is NaN wherever the rating or any of the three terms is NaN.
alpha, beta, gamma = np.linalg.solve(np.array([equation1, equation2, equation3]), np.array([val1, val2, val3]))

rating_grid = sparse_rating_matrix.values
n_rows, n_cols = rating_grid.shape
# Per-user column and per-movie row, broadcast across the grid below.
user_term = np.array([beta * r_u(u) for u in range(n_rows)], dtype=float)[:, np.newaxis]
movie_term = np.array([gamma * r_i(i) for i in range(n_cols)], dtype=float)[np.newaxis, :]

# NaN propagates through the subtraction, so no explicit NaN guard is needed.
# (The previous guard tested the accumulators t3/t4 left over from another cell
# instead of this cell's own terms.) The result is a fresh array; the earlier
# `ratings` matrix is no longer rebound or overwritten here.
r_tilda = rating_grid - alpha * r_() - user_term - movie_term

print(r_tilda)

[[ 4.66279224  2.7757727   3.79960806 ...,         nan         nan
          nan]
 [ 3.66360942         nan         nan ...,         nan         nan
          nan]
 [        nan         nan         nan ...,         nan         nan
          nan]
 ..., 
 [ 4.66407285         nan         nan ...,         nan         nan
          nan]
 [        nan         nan         nan ...,         nan         nan
          nan]
 [        nan  4.77460127         nan ...,         nan         nan
          nan]]
