In [2]:
# constants
GENRE_FEATURE_WEIGHT = 41

In [3]:
# imports
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
import scipy.sparse as sp
from scipy.sparse.linalg import svds
import os
import random
import scipy

# dataset root
root_dataset_folder = os.path.join('..', 'data', 'ml-100k')

# u.user (Users)
user_columns = [
    'user_id', 
    'age', 
    'sex', 
    'occupation', 
    'zip_code'
]
users = pd.read_csv(os.path.join(root_dataset_folder, 'u.user'), sep='|', names=user_columns, encoding='latin-1')

# u.items (Movies)
movie_columns = [
    'movie_id', 
    'movie_title' ,
    'release_date',
    'video_release_date', 
    'IMDb_URL', 
    'unknown', 
    'Action', 
    'Adventure', 
    'Animation', 
    'Children\'s', 
    'Comedy', 
    'Crime', 
    'Documentary', 
    'Drama', 
    'Fantasy', 
    'Film-Noir', 
    'Horror', 
    'Musical', 
    'Mystery', 
    'Romance', 
    'Sci-Fi', 
    'Thriller', 
    'War',
    'Western'
]
movies = pd.read_csv(os.path.join(root_dataset_folder, 'u.item'), sep='|', names=movie_columns, encoding='latin-1')
genre_matrix = movies.drop('movie_id', 1).drop('movie_title', 1).drop('release_date', 1).drop('video_release_date', 1).drop('IMDb_URL', 1).drop('unknown', 1)

In [4]:
# Expects numpy array as input
# Returns normalized numpy array
def multiplicative_normalization(genre_matrix):
    variable_genre_sum = genre_matrix.sum(axis=1)
    variable_movies_sum = genre_matrix.sum(axis=0)
    temp_1 = 1 / np.sqrt(variable_movies_sum) * genre_matrix
    movie_genre_normalized = 1 / np.sqrt(variable_genre_sum) * temp_1.transpose()
    return movie_genre_normalized*GENRE_FEATURE_WEIGHT
    
genre_matrix = multiplicative_normalization(genre_matrix)
print(genre_matrix)

                 0         1         2         3         4         5     \
Action       0.000000  1.494123  0.000000  1.494123  0.000000  0.000000   
Adventure    0.000000  2.037306  0.000000  0.000000  0.000000  0.000000   
Animation    3.652570  0.000000  0.000000  0.000000  0.000000  0.000000   
Children's   2.143104  0.000000  0.000000  0.000000  0.000000  0.000000   
Comedy       1.053362  0.000000  0.000000  1.053362  0.000000  0.000000   
Crime        0.000000  0.000000  0.000000  0.000000  2.267305  0.000000   
Documentary  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
Drama        0.000000  0.000000  0.000000  0.879132  0.879132  1.522702   
Fantasy      0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
Film-Noir    0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
Horror       0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
Musical      0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
Mystery      0.000000  0.

In [5]:
# imports
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
import scipy.sparse as sp
from scipy.sparse.linalg import svds
import os
import random

# dataset root
root_dataset_folder = os.path.join('..', 'data', 'ml-100k')

# u.user (Users)
user_columns = [
    'user_id', 
    'age', 
    'sex', 
    'occupation', 
    'zip_code'
]
users = pd.read_csv(os.path.join(root_dataset_folder, 'u.user'), sep='|', names=user_columns, encoding='latin-1')

# u.items (Movies)
movie_columns = [
    'movie_id', 
    'movie_title' ,
    'release_date',
    'video_release_date', 
    'IMDb_URL', 
    'unknown', 
    'Action', 
    'Adventure', 
    'Animation', 
    'Children\'s', 
    'Comedy', 
    'Crime', 
    'Documentary', 
    'Drama', 
    'Fantasy', 
    'Film-Noir', 
    'Horror', 
    'Musical', 
    'Mystery', 
    'Romance', 
    'Sci-Fi', 
    'Thriller', 
    'War',
    'Western'
]
movies = pd.read_csv(os.path.join(root_dataset_folder, 'u.item'), sep='|', names=movie_columns, encoding='latin-1')

#Creating sparse_rating_matrix:
M = users.user_id.unique().shape[0]
N = movies.movie_id.unique().shape[0]

ratings = np.zeros((M, N))
# reading ratings from 
# (Ratings)
rating_columns = [
    'user_id', 
    'movie_id', 
    'rating', 
    'unix_timestamp'
]
# ua.base
data_base = pd.read_csv(os.path.join(root_dataset_folder, 'ua.base'), sep='\t', names=rating_columns, encoding='latin-1')

for row in data_base.itertuples():
    ratings[row[1]-1 , row[2] -1] = row[3]

sparse_rating_matrix = pd.DataFrame(data=ratings,index=users['user_id'],columns=movies['movie_id'])
sparse_rating_matrix = sparse_rating_matrix.replace('0',np.nan)
sparse_rating_matrix = sparse_rating_matrix.values
print(sparse_rating_matrix.shape)
print(sparse_rating_matrix)

(943, 1682)
[[  5.   3.   4. ...,  nan  nan  nan]
 [  4.  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]
 ..., 
 [  5.  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]
 [ nan   5.  nan ...,  nan  nan  nan]]


In [6]:
normalized_sparse_rating_matrix = sparse_rating_matrix - np.nanmean(sparse_rating_matrix)
normalized_sparse_rating_matrix = pd.DataFrame(normalized_sparse_rating_matrix / np.nanmax(abs(normalized_sparse_rating_matrix)))
print(normalized_sparse_rating_matrix)

         0         1         2         3         4         5         6     \
0    0.584895 -0.207553  0.188671 -0.207553 -0.207553  0.584895  0.188671   
1    0.188671       NaN       NaN       NaN       NaN       NaN       NaN   
2         NaN       NaN       NaN       NaN       NaN       NaN       NaN   
3         NaN       NaN       NaN       NaN       NaN       NaN       NaN   
4         NaN       NaN       NaN       NaN       NaN       NaN       NaN   
5    0.188671       NaN       NaN       NaN       NaN       NaN -0.603776   
6         NaN       NaN       NaN  0.584895       NaN       NaN  0.584895   
7         NaN       NaN       NaN       NaN       NaN       NaN -0.207553   
8         NaN       NaN       NaN       NaN       NaN       NaN  0.188671   
9    0.188671       NaN       NaN  0.188671       NaN       NaN       NaN   
10        NaN       NaN       NaN       NaN       NaN       NaN       NaN   
11        NaN       NaN       NaN  0.584895       NaN       NaN       NaN   

In [7]:
result_final = pd.concat([normalized_sparse_rating_matrix, np.transpose(genre_matrix)], axis=1)
result_final = result_final.replace(np.nan,float(0))
print(result_final)
print(result_final.shape)

             0         1         2         3         4         5         6  \
0     0.584895 -0.207553  0.188671 -0.207553 -0.207553  0.584895  0.188671   
1     0.188671  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
2     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
3     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
5     0.188671  0.000000  0.000000  0.000000  0.000000  0.000000 -0.603776   
6     0.000000  0.000000  0.000000  0.584895  0.000000  0.000000  0.584895   
7     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000 -0.207553   
8     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.188671   
9     0.188671  0.000000  0.000000  0.188671  0.000000  0.000000  0.000000   
10    0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
11    0.000000  0.000000  0.000000  0.584895  0.000000  0.000000

In [8]:
u, s, vt = svds(result_final, k=500)
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
#print(X_pred)
#print(X_pred.shape)
print(X_pred[1].tolist())

[0.21011465975765417, -0.023842210221211187, -0.03227932409547619, -0.03252567307215197, -0.018267091588704796, -0.036954690845779875, -0.005227005529356656, -0.007032213801545388, 0.024905821780748752, -0.535434349512646, 0.00217621609014853, -0.005729931033782429, 0.004966702903927499, 0.2012031226326051, -0.020344062855405576, -0.003751857089981034, 0.003209002225868996, 0.0027012871264518127, -0.13534440322226193, 0.02876644480723465, 0.011406092643473996, -0.006519726705306086, -0.002357196200115576, 0.010060929667659445, 0.19691428533343208, 0.007355118203961409, 0.018882250080337927, -0.05148054819385075, -0.024814624723764417, 0.00870484153897881, -0.027774590991736476, -0.015366109903960392, -0.006763991445394689, -0.013939809664178971, 0.02341894548969227, 0.010024808554090481, -0.007841803330889519, 0.008046419191145274, -0.01015103207030673, 0.011627146315744435, 0.019984469859791593, -0.004998478847656862, 0.0076650743003139135, 0.027289811084768374, 0.02896982661729785, -