In [1]:
import os 
import sys
# Make the project's `src/` packages importable from this notebook.
# The trailing slash is kept so `src_dir` has exactly the same value as before.
src_dir = os.path.join(os.path.abspath('..'), 'src/')
# Re-running this cell must not pile up duplicate entries on sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
import time
from common.preprocess.data_process import generate_userByItem, split, load_movies
from models.als.model import implicit_als, implicit_als_cg
from models.als.model_jit import implicit_als_cg_jit
from common.serving.recommder_als import recommend, real_watched_movie
from evaluation.evaluate import MAP, precision

## Preprocess data

In [3]:
# Resolve the ratings file relative to the project root (the parent of the
# notebook's working directory, the same root `src_dir` is built from)
# instead of a user-specific absolute path, so the notebook runs on any checkout.
ratings_path = os.path.join(os.path.abspath('..'), 'data', 'ratings.csv')
full_sparse = generate_userByItem(ratings_path, ['userId', 'movieId', 'rating'],
                                  dense=True, info=True)

Loading and pivoting ratings data...
Maximum number of movies watched by a user: 2698
Minimum number of movies watched by a user: 20
Done


In [4]:
# Split the interaction matrix into train and test parts. Per the helper's own
# log output below, 10 movies per user are assigned to test and the rest to train.
train_sparse, test_sparse = split(full_sparse, dense = True)

Assign 10 movies for each user to test and the rest to train
Splitting data into train and test...
Done


In [5]:
# Resolve the movies file relative to the project root (parent of the notebook's
# working directory) rather than a user-specific absolute path.
movies_path = os.path.join(os.path.abspath('..'), 'data', 'movies.csv')
movies, movieId_lookup = load_movies(movies_path)

Loading and preprocessing movies data...
Generating lookup table for movie index...
Done


## Model

### ALS without conjugate gradient

In [50]:
# Time the plain ALS training run.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock and can jump if the system clock changes.
start = time.perf_counter()
user_vecs, item_vecs = implicit_als(train_sparse, alpha_val=15, iterations=20, lambda_val=0.1, features=20)
end = time.perf_counter()
print('training time without conjugate gradient: {}s'.format(end-start))

iteration 1 of 20
iteration 2 of 20
iteration 3 of 20
iteration 4 of 20
iteration 5 of 20
iteration 6 of 20
iteration 7 of 20
iteration 8 of 20
iteration 9 of 20
iteration 10 of 20
iteration 11 of 20
iteration 12 of 20
iteration 13 of 20
iteration 14 of 20
iteration 15 of 20
iteration 16 of 20
iteration 17 of 20
iteration 18 of 20
iteration 19 of 20
iteration 20 of 20
training time without conjugate gradient: 470.45930099487305s


In [59]:
## Save models
# sparse.save_npz("/Users/zihaoguo/NYU/ADPY/DS-GA-3001-Advanced-Python/src/models/als/saved_models/users.npz", user_vecs)
# sparse.save_npz("/Users/zihaoguo/NYU/ADPY/DS-GA-3001-Advanced-Python/src/models/als/saved_models/movies.npz", item_vecs)

In [6]:
## reload models
# `src_dir` (set in the first cell) is the project's src/ directory, which holds
# models/als/saved_models; build the paths from it instead of hard-coding a
# user-specific absolute path.
saved_models_dir = os.path.join(src_dir, 'models', 'als', 'saved_models')
user_vecs = sparse.load_npz(os.path.join(saved_models_dir, 'users.npz'))
item_vecs = sparse.load_npz(os.path.join(saved_models_dir, 'movies.npz'))

### Recommendation

In [7]:
# Build the two inputs to the evaluation cells:
#   rec_arr  - recommendations from the basic ALS factors, 10 items requested (num_items = 10)
#   true_arr - what `real_watched_movie` extracts from the test split
# Both are later indexed by user row (e.g. rec_arr[123,:]).
rec_arr = recommend(train_sparse, user_vecs, item_vecs, num_items = 10)
true_arr = real_watched_movie(test_sparse)

In [8]:
# Precision of the basic ALS recommendations against the test items.
# The call prints its own summary line ("The average precision: ...").
# The trailing 10 is presumably the cutoff k, matching num_items above - TODO confirm.
p = precision(true_arr, rec_arr,10)

The average precision: 0.113


In [9]:
# MAP score of the basic ALS recommendations; the call prints its own summary line.
# The trailing 10 is presumably the cutoff k - TODO confirm.
m = MAP(true_arr, rec_arr,10)

The mean average prevision(MAP): 0.073


### ALS with Conjugate Gradient

In [92]:
# Time the conjugate-gradient ALS training run with the same hyper-parameters
# as the plain ALS run, so the two timings are comparable.
# perf_counter() is monotonic and intended for elapsed-time measurement,
# unlike the wall-clock time.time().
start = time.perf_counter()
user_vecs_cg, item_vecs_cg = implicit_als_cg(train_sparse, alpha_val=15, iterations=20, lambda_val=0.1, features=20)
end = time.perf_counter()
print('training time with conjugate gradient: {}s'.format(end-start))

iteration 1 of 20
iteration 2 of 20
iteration 3 of 20
iteration 4 of 20
iteration 5 of 20
iteration 6 of 20
iteration 7 of 20
iteration 8 of 20
iteration 9 of 20
iteration 10 of 20
iteration 11 of 20
iteration 12 of 20
iteration 13 of 20
iteration 14 of 20
iteration 15 of 20
iteration 16 of 20
iteration 17 of 20
iteration 18 of 20
iteration 19 of 20
iteration 20 of 20
training time with conjugate gradient: 73.73181581497192s


In [94]:
## Save models
# sparse.save_npz("/Users/zihaoguo/NYU/ADPY/DS-GA-3001-Advanced-Python/src/models/als/saved_models/users_cg.npz", user_vecs_cg)
# sparse.save_npz("/Users/zihaoguo/NYU/ADPY/DS-GA-3001-Advanced-Python/src/models/als/saved_models/movies_cg.npz", item_vecs_cg)

In [10]:
## Reload models
# Build the paths from `src_dir` (the project's src/ directory, set in the first
# cell) instead of hard-coding a user-specific absolute path.
saved_models_dir = os.path.join(src_dir, 'models', 'als', 'saved_models')
user_vecs_cg = sparse.load_npz(os.path.join(saved_models_dir, 'users_cg.npz'))
item_vecs_cg = sparse.load_npz(os.path.join(saved_models_dir, 'movies_cg.npz'))

### Recommendation

In [11]:
# Recommendations from the conjugate-gradient ALS factors, 10 items requested.
# The ground truth is not recomputed here; the evaluation cells reuse `true_arr`.
rec_arr_cg = recommend(train_sparse, user_vecs_cg, item_vecs_cg, num_items = 10)

In [12]:
# Precision of the CG recommendations, scored against the same `true_arr`
# as the basic ALS model so the two numbers are directly comparable.
p_cg = precision(true_arr, rec_arr_cg, 10)

The average precision: 0.118


0.1180327868852459

In [13]:
# MAP score of the CG recommendations against the same `true_arr`.
m_cg = MAP(true_arr, rec_arr_cg,10)

The mean average prevision(MAP): 0.081


0.08086137132448606

## ALS using Conjugate Gradient with Numba JIT

In [6]:
# Time the Numba-JIT ALS run. This variant is given a dense array
# (train_sparse.toarray()) rather than the sparse matrix.
# perf_counter() is monotonic and intended for elapsed-time measurement,
# unlike the wall-clock time.time().
start = time.perf_counter()
user_vecs_cg_jit, item_vecs_cg_jit = implicit_als_cg_jit(train_sparse.toarray(), alpha_val=15, iterations=20, lambda_val=0.1, features=20)
end = time.perf_counter()
print('Training time of ALS using conjugate gradient with Numba JIT optimization: {}s'.format(end-start))

Training time of ALS using conjugate gradient with Numba JIT optimization: 10.04761791229248s


In [9]:
## Save models
# u = sparse.csr_matrix(user_vecs_cg_jit)
# m = sparse.csr_matrix(item_vecs_cg_jit)
# sparse.save_npz("/Users/zihaoguo/NYU/ADPY/DS-GA-3001-Advanced-Python/src/models/als/saved_models/users_cg_jit.npz", u)
# sparse.save_npz("/Users/zihaoguo/NYU/ADPY/DS-GA-3001-Advanced-Python/src/models/als/saved_models/movies_cg_jit.npz", m)

In [11]:
# Reload the saved Numba-JIT factors. Paths are built from `src_dir` (the
# project's src/ directory, set in the first cell) instead of a user-specific
# absolute path.
saved_models_dir = os.path.join(src_dir, 'models', 'als', 'saved_models')
user_vecs_cg_jit = sparse.load_npz(os.path.join(saved_models_dir, 'users_cg_jit.npz'))
item_vecs_cg_jit = sparse.load_npz(os.path.join(saved_models_dir, 'movies_cg_jit.npz'))

In [12]:
# Recommendations from the Numba-JIT factors, plus the test-split ground truth.
# The factors are wrapped in csr_matrix before being passed to `recommend`.
# `true_arr_cg_jit` comes from the same call as `true_arr` earlier
# (real_watched_movie(test_sparse)).
rec_arr_cg_jit = recommend(train_sparse, sparse.csr_matrix(user_vecs_cg_jit), sparse.csr_matrix(item_vecs_cg_jit), num_items = 10)
true_arr_cg_jit = real_watched_movie(test_sparse)

In [13]:
# Precision and MAP for the Numba-JIT model; each call prints its own summary line.
p_cg_jit = precision(true_arr_cg_jit, rec_arr_cg_jit, 10)
map_cg_jit = MAP(true_arr_cg_jit, rec_arr_cg_jit,10)

The average precision: 0.118
The mean average prevision(MAP): 0.081


## ALS using Conjugate Gradient with Cython Implementation

In [6]:
%reload_ext Cython

In [7]:
%%cython
import numpy as np
cimport numpy as np

## Cython implementation of ALS
def implicit_als_cg_cython(np.ndarray[double, ndim = 2] data, int alpha_val = 15,
                           int iterations=20, double lambda_val=0.1, int features=20):
    '''
    Implicit-feedback ALS whose per-row solves use conjugate gradient.

    Parameters
    ----------
    data : 2-D float64 numpy array; rows are treated as users, columns as items.
    alpha_val : factor `data` is multiplied by to form the confidence matrix.
    iterations : number of alternating sweeps (user update, then item update).
    lambda_val : regularization strength passed to least_squares_cg.
    features : number of latent factors per user / item.

    Returns
    -------
    (X, Y) : user factors of shape (n_users, features) and item factors of
             shape (n_items, features), as dense numpy arrays.
    '''
    cdef np.ndarray[double, ndim = 2] Cui = data * alpha_val
    cdef int user_size = Cui.shape[0]
    cdef int item_size = Cui.shape[1]
    cdef int iteration

    # A private generator seeded with 123 draws exactly the same numbers as
    # np.random.seed(123) followed by np.random.rand(...), but leaves NumPy's
    # global random state untouched for the rest of the notebook.
    rng = np.random.RandomState(123)
    cdef np.ndarray[double, ndim = 2] X = rng.rand(user_size, features) * 0.01
    cdef np.ndarray[double, ndim = 2] Y = rng.rand(item_size, features) * 0.01

    # Transposed view (items x users) used for the item-side update.
    cdef np.ndarray[double, ndim = 2] Ciu = Cui.T

    for iteration in range(iterations):
        print('iteration {} of {}'.format(iteration+1, iterations))
        # least_squares_cg overwrites the rows of its second argument in place:
        # first the user factors with items fixed, then the item factors.
        least_squares_cg(Cui, X, Y, lambda_val)
        least_squares_cg(Ciu, Y, X, lambda_val)

    return X, Y

##### helper functions for implicit_als_cg_cython
def nonzeros(np.ndarray[double, ndim = 2] m, int row):
    '''
    Return the non-zero entries of row `row` of `m` as an (nnz, 2) array
    whose columns are (column index, value).
    '''
    cdef np.ndarray row_values = m[row]
    cdef np.ndarray cols = np.flatnonzero(row_values)
    return np.stack((cols, row_values[cols]), axis=1)
        
def least_squares_cg(np.ndarray[double, ndim = 2] Cui, np.ndarray[double, ndim = 2] X,
                     np.ndarray[double, ndim = 2] Y, double lambda_val, int cg_steps=3):
    '''
    One half-sweep of implicit ALS: refine every row of X in place with Y held
    fixed, running at most `cg_steps` conjugate-gradient steps per row.

    For row u, with c_i the non-zero entries of Cui[u], the system being solved is
        (Y^T Y + lambda_val * I + sum_i (c_i - 1) y_i y_i^T) x_u = sum_i c_i y_i

    Parameters
    ----------
    Cui : 2-D float64 array; row u is paired with X[u], column i with Y[i].
    X : factors to update; modified in place.
    Y : factors held fixed.
    lambda_val : value added to the diagonal of Y^T Y.
    cg_steps : maximum conjugate-gradient steps per row.

    Returns None.
    '''
    cdef int users = X.shape[0]
    cdef int features = X.shape[1]

    cdef np.ndarray[double, ndim = 2] YtY = Y.T.dot(Y) + lambda_val * np.eye(features)

    # stating the type of the loop variables allows
    # for a more optimized conversion to a C loop
    cdef int u
    cdef int it
    cdef int i

    ## type of variables used inside the loops
    cdef double confidence
    cdef double rsold
    cdef double rsnew
    cdef double alpha
    cdef np.ndarray x
    cdef np.ndarray r
    cdef np.ndarray p
    cdef np.ndarray Ap
    cdef np.ndarray row
    cdef np.ndarray idx
    cdef np.ndarray conf

    for u in range(users):
        # Gather this row's non-zeros once; they do not change across CG steps.
        # Indices stay integers instead of being packed into a float array.
        row = Cui[u]
        idx = row.nonzero()[0]
        conf = row[idx]

        # start from the current estimate; x is a view, so updates land in X
        x = X[u]

        # residual r = b - A x, without ever forming A explicitly
        r = -YtY.dot(x)
        for i, confidence in zip(idx, conf):
            r += (confidence - (confidence - 1) * Y[i].dot(x)) * Y[i]
        p = r.copy()
        rsold = r.dot(r)
        # Already converged: skipping avoids 0/0 below, which would turn the
        # whole row into NaN.
        if rsold < 1e-20:
            continue

        for it in range(cg_steps):
            # Ap = A p, again without forming A
            Ap = YtY.dot(p)
            for i, confidence in zip(idx, conf):
                Ap += (confidence - 1) * Y[i].dot(p) * Y[i]

            # standard conjugate-gradient update
            alpha = rsold / p.dot(Ap)
            x += alpha * p
            r -= alpha * Ap

            rsnew = r.dot(r)
            # Converged: stop before the next step divides by a ~zero quantity.
            if rsnew < 1e-20:
                break
            p = r + (rsnew / rsold) * p
            rsold = rsnew

        X[u] = x

In [8]:
# Time the Cython ALS run. It is given a dense array (train_sparse.toarray())
# because implicit_als_cg_cython takes a 2-D numpy array.
# perf_counter() is monotonic and intended for elapsed-time measurement,
# unlike the wall-clock time.time().
start = time.perf_counter()
user_vecs_cg_cython, item_vecs_cg_cython = implicit_als_cg_cython(train_sparse.toarray(), alpha_val=15, iterations=20, lambda_val=0.1, features=20)
end = time.perf_counter()
print('Training time of ALS using conjugate gradient with Cython optimization: {}s'.format(end-start))

iteration 1 of 20
iteration 2 of 20
iteration 3 of 20
iteration 4 of 20
iteration 5 of 20
iteration 6 of 20
iteration 7 of 20
iteration 8 of 20
iteration 9 of 20
iteration 10 of 20
iteration 11 of 20
iteration 12 of 20
iteration 13 of 20
iteration 14 of 20
iteration 15 of 20
iteration 16 of 20
iteration 17 of 20
iteration 18 of 20
iteration 19 of 20
iteration 20 of 20
Training time of ALS using conjugate gradient with Cython optimization: 82.66664981842041s


In [9]:
# Recommendations from the Cython factors, plus the test-split ground truth.
# implicit_als_cg_cython returns dense arrays, so they are wrapped in
# csr_matrix before being passed to `recommend`.
rec_arr_cg_cython = recommend(train_sparse, sparse.csr_matrix(user_vecs_cg_cython), sparse.csr_matrix(item_vecs_cg_cython), num_items = 10)
true_arr_cg_cython = real_watched_movie(test_sparse)

In [10]:
# Precision and MAP for the Cython model; each call prints its own summary line.
p_cg_cython = precision(true_arr_cg_cython, rec_arr_cg_cython, 10)
map_cg_cython = MAP(true_arr_cg_cython, rec_arr_cg_cython,10)

The average precision: 0.118
The mean average prevision(MAP): 0.081


## Examine recommendation for user 123 using 2 ALS algorithms

In [32]:
## check the recommended movies for user 123 using basic ALS
# Map each recommended entry through movieId_lookup to a movieId, then show the
# matching rows of `movies`. isin() keeps the frame's own row order, so the
# table is not sorted by recommendation rank.
movies_rec_idx = [movieId_lookup[r] for r in rec_arr[123,:]]
movies.loc[movies['movieId'].isin(movies_rec_idx)]

Unnamed: 0,movieId,title,genres
314,356,Forrest Gump (1994),Comedy|Drama|Romance|War
461,527,Schindler's List (1993),Drama|War
613,778,Trainspotting (1996),Comedy|Crime|Drama
659,858,"Godfather, The (1972)",Crime|Drama
1283,1703,For Richer or Poorer (1997),Comedy
1290,1719,"Sweet Hereafter, The (1997)",Drama
1502,2027,Jane Austen's Mafia! (1998),Comedy|Crime
1938,2570,"Walk on the Moon, A (1999)",Drama|Romance
2077,2761,"Iron Giant, The (1999)",Adventure|Animation|Children|Drama|Sci-Fi
2224,2953,Home Alone 2: Lost in New York (1992),Children|Comedy


In [33]:
## check the recommended movies for user 123 using improved (conjugate-gradient) ALS
# Same lookup as for the basic model, applied to rec_arr_cg.
movies_rec_idx_cg = [movieId_lookup[r] for r in rec_arr_cg[123,:]]
movies.loc[movies['movieId'].isin(movies_rec_idx_cg)]

Unnamed: 0,movieId,title,genres
314,356,Forrest Gump (1994),Comedy|Drama|Romance|War
461,527,Schindler's List (1993),Drama|War
510,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
613,778,Trainspotting (1996),Comedy|Crime|Drama
659,858,"Godfather, The (1972)",Crime|Drama
1283,1703,For Richer or Poorer (1997),Comedy
1502,2027,Jane Austen's Mafia! (1998),Comedy|Crime
1938,2570,"Walk on the Moon, A (1999)",Drama|Romance
2077,2761,"Iron Giant, The (1999)",Adventure|Animation|Children|Drama|Sci-Fi
2224,2953,Home Alone 2: Lost in New York (1992),Children|Comedy


In [34]:
## check the real movies watched by user 123 in test set
# Same lookup applied to the ground-truth row, for comparison with the two
# recommendation tables above.
movies_real_idx = [movieId_lookup[r] for r in true_arr[123,:]]
movies.loc[movies['movieId'].isin(movies_real_idx)]

Unnamed: 0,movieId,title,genres
314,356,Forrest Gump (1994),Comedy|Drama|Romance|War
455,520,Robin Hood: Men in Tights (1993),Comedy
510,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
830,1091,Weekend at Bernie's (1989),Comedy
2013,2682,Limbo (1999),Drama
2370,3145,Cradle Will Rock (1999),Drama
2448,3261,Singles (1992),Comedy|Drama|Romance
2492,3326,What Planet Are You From? (2000),Comedy|Sci-Fi
3970,5601,"Yearling, The (1946)",Children|Drama
7374,79139,"Sorcerer's Apprentice, The (2010)",Action|Adventure|Children|Comedy|Fantasy
