In [143]:
import sklearn
import numpy as np
import pandas as pd
import time
np.set_printoptions(edgeitems=10, linewidth=100)

In [4]:
train_data = pd.read_csv('./data/small_train_data.csv', index_col=0)
test_data = pd.read_csv('./data/small_test_data.csv', index_col=0)

In [6]:
len(train_data.userId.unique()), len(test_data.userId.unique())

(671, 671)

In [19]:
# get list of users and movies
users = list(set(train_data.userId) | set(test_data.userId))
movies = list(set(train_data.movieId) | set(test_data.movieId))
num_users = len(users)
num_movies = len(movies)
user2idx = {user: idx for (idx, user) in enumerate(users)}
movie2idx = {movie: idx for (idx, movie) in enumerate(movies)}

In [43]:
# construct matrix
R_mat = np.zeros((num_users, num_movies))
for row in train_data.iterrows():
    R_mat[user2idx[row[1]['userId']], movie2idx[row[1]['movieId']]] = row[1]['rating']

In [68]:
R_mat

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 5.,  0.,  0., ...,  0.,  0.,  0.]])

In [46]:
sparsity = float(len(R_mat.nonzero()[0]))
sparsity /= (R_mat.shape[0] * R_mat.shape[1])
sparsity *= 100
print('Sparsity: {:4.2f}%'.format(sparsity))

Sparsity: 1.47%


In [66]:
# 0-1 weight as indicator of missing value
weight_mat = R_mat>0
weight_mat[weight_mat == True] = 1.0
weight_mat[weight_mat == False] = 0.0
weight_mat = weight_mat.astype(float, copy=False)

In [67]:
weight_mat

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.]])

In [151]:
test_weight_mat = np.zeros((num_users, num_movies))
for row in test_data.iterrows():
    test_weight_mat[user2idx[row[1]['userId']], movie2idx[row[1]['movieId']]] = 1.0

In [236]:
lambda_ = 0.
n_factors = 150
n_iterations = 10

In [237]:
user_vec = np.random.rand(num_users, n_factors) 
movie_vec = np.random.rand(n_factors, num_movies)

In [238]:
def get_error(Q, X, Y, W):
    diff = W * (Q - np.dot(X, Y))
    return np.sqrt(np.sum(diff**2)/(len(W.nonzero()[0])))

In [243]:
movie_vec.dot(movie_vec.T).shape

(150, 150)

In [242]:
n_factors

150

In [258]:
errors = []
for ii in range(n_iterations):
    user_vec = np.linalg.solve(np.dot(movie_vec, movie_vec.T) + lambda_ * np.eye(n_factors), 
                        np.dot(movie_vec, R_mat.T)).T
    movie_vec = np.linalg.solve(np.dot(user_vec.T, user_vec) + lambda_ * np.eye(n_factors),
                        np.dot(user_vec.T, R_mat))
    if ii % 100 == 0:
        print('{}th iteration is completed'.format(ii))
    errors.append(get_error(R_mat, user_vec, movie_vec, weight_mat))
R_hat = np.dot(user_vec, movie_vec)

0th iteration is completed


In [259]:
print('Training error of rated movies: {}'.format(get_error(R_mat, X, Y, weight_mat)))
print('Test error of rated movies: {}'.format(get_error(R_mat, X, Y, test_weight_mat)))

Training error of rated movies: 936.9149738625559
Test error of rated movies: 940.1520150134185


In [260]:
test_mat = Q_hat*test_weight_mat
test_pred = []
for row in test_data.iterrows():
    test_pred.append(test_mat[user2idx[row[1]['userId']], movie2idx[row[1]['movieId']]])

In [219]:
train_mat = Q_hat*weight_mat
train_pred = []
for row in train_data.iterrows():
    train_pred.append(train_mat[user2idx[row[1]['userId']], movie2idx[row[1]['movieId']]])

In [220]:
np.sqrt(np.sum((test_pred - test_data.rating.as_matrix())**2)/len(test_pred))

3.456221546036252

In [221]:
np.sqrt(np.sum((train_pred - train_data.rating.as_matrix())**2)/len(train_pred))

1.3873762833757652

In [263]:
user_vec[0,:].dot(movie_vec[:,movie2idx[2294]])

0.27830353727323637

In [251]:
user_vec.shape

(671, 150)

In [262]:
movie_vec.shape

(150, 9066)

In [303]:
import ALSmodel
import importlib
importlib.reload(ALSmodel)

<module 'ALSmodel' from '/Users/timhsu/Documents/COURSES/DS1003/project/ALSmodel.py'>

In [304]:
model = ALSmodel.ALSRecommender(max_iter=20, n_factors=50)

In [305]:
model.prepare(train_data, test_data)

ALSRecommender(item_col='movieId', lambda_reg=1, max_iter=20, n_factors=50,
        rating_col='rating', user_col='userId')

In [306]:
model.fit(100)

0th iteration
10th iteration
20th iteration
30th iteration
40th iteration
50th iteration
60th iteration
70th iteration
80th iteration
90th iteration


ALSRecommender(item_col='movieId', lambda_reg=1, max_iter=20, n_factors=50,
        rating_col='rating', user_col='userId')

In [310]:
predictions = model.predict(test_data)

In [316]:
np.sqrt(sum((predictions - test_data.rating.as_matrix())**2)/len(predictions))

3.2500615615227559