## Recommendation Engines (Memory Based and Content Based Collaborating)

#### Importing Users, Ratings and Items(movies) data

In [2]:
import pandas as pd
import numpy as np
#from surprise import Dataset, evaluate



In [3]:
# Pipe separated files without headers, headers in readme file

#Reading users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,
 encoding='latin-1')

#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,
 encoding='latin-1')

#Reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols,
 encoding='latin-1')

In [4]:
print(users.shape)
ratings.head()
#users['user_id'].nunique()

(943, 5)


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
print(ratings.shape)
n_users = ratings["user_id"].nunique() 
n_items = ratings.movie_id.unique().shape[0]
print('Number of users = ' + str(n_users) + ', Number of movies = ' + str(n_items))

(100000, 4)
Number of users = 943, Number of movies = 1682


In [9]:
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(ratings, test_size = 0.25)
train_data.shape
test_data.shape

(25000, 4)

# Memory Based Collaborative Filtering

### User - Item Matrix

In [20]:
# On training data
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]
    
# On test data
test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix.shape

(943, 1682)

### Similarity Matrix

In [43]:
#The similarity values between items in Item-Item Collaborative Filtering are measured by 
    #observing all the users who have rated both items.
#For User-Item Collaborative Filtering the similarity values between users are measured by 
    #observing all the items that are rated by both users.

from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric = 'cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric = 'cosine' )
user_similarity.shape,item_similarity

((943, 943), array([[ 0.        ,  0.7066502 ,  0.74796731, ...,  1.        ,
          0.94603457,  0.94603457],
        [ 0.7066502 ,  0.        ,  0.75997785, ...,  1.        ,
          0.91161165,  0.91161165],
        [ 0.74796731,  0.75997785,  0.        , ...,  1.        ,
          1.        ,  1.        ],
        ..., 
        [ 1.        ,  1.        ,  1.        , ...,  0.        ,
          1.        ,  1.        ],
        [ 0.94603457,  0.91161165,  1.        , ...,  1.        ,
          0.        ,  1.        ],
        [ 0.94603457,  0.91161165,  1.        , ...,  1.        ,
          1.        ,  0.        ]]))

In [39]:
def predict(data_set, similarity, type ):
    if type == 'user':
        mean_user_rating = data_set.mean(axis = 1)
        ratings_diff = (data_set - mean_user_rating[:,np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = data_set.dot(similarity)/np.array([np.abs(similarity).sum(axis = 1)])
    return pred



### Predictions

In [42]:
user_pred = predict(train_data_matrix, user_similarity, type = 'user')
item_pred = predict(train_data_matrix, item_similarity, type = 'item')

item_pred

array([[ 0.3781726 ,  0.39392969,  0.4089755 , ...,  0.46124592,
         0.45154028,  0.44758764],
       [ 0.08783472,  0.1023545 ,  0.09777779, ...,  0.10245516,
         0.10378713,  0.10421821],
       [ 0.06094242,  0.06497983,  0.0622878 , ...,  0.05996565,
         0.06315453,  0.06416884],
       ..., 
       [ 0.03184742,  0.04138803,  0.03992477, ...,  0.04706634,
         0.04597135,  0.04669642],
       [ 0.12351107,  0.1302579 ,  0.13831144, ...,  0.14620302,
         0.14525661,  0.14628238],
       [ 0.21146278,  0.20873026,  0.22595254, ...,  0.26277553,
         0.25308964,  0.25461364]])

### Evaluations - RMSE

In [34]:
# Filtering all other elements in the prediction matrix with prediction[ground_truth.nonzero()], to consider only predicted ratings that are in test dataset alone
from sklearn.metrics import mean_squared_error 
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [50]:
print('User-based CF RMSE: ' + str(rmse(user_pred, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_pred, test_data_matrix)))
user_pred, test_data_matrix


User-based CF RMSE: 3.109797426127411
Item-based CF RMSE: 3.434313183304934


(array([[ 1.62312468,  0.60043021,  0.51806797, ...,  0.30973912,
          0.30965671,  0.30950072],
        [ 1.34760164,  0.30591746,  0.17625981, ..., -0.06408924,
         -0.06277564, -0.06262515],
        [ 1.34846586,  0.25698775,  0.14064516, ..., -0.10458397,
         -0.10287704, -0.10268272],
        ..., 
        [ 1.22062805,  0.23285858,  0.11118711, ..., -0.11819223,
         -0.11742158, -0.11719318],
        [ 1.38268065,  0.31777384,  0.22353008, ..., -0.01272153,
         -0.011932  , -0.01168851],
        [ 1.44555524,  0.40023948,  0.32053333, ...,  0.10972483,
          0.10963908,  0.10982778]]),
 array([[ 0.,  3.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  5.,  0., ...,  0.,  0.,  0.]]))

## Model Based Collaborative Filtering - Matrix Factorization

In [74]:
# calculating sparsity level on movieLens data

sparsity = round(1- len(ratings)/(float(n_users*n_items)), 3)
print ('sparsity of MovieLens data is ' + str(sparsity*100) + "%")

sparsity of MovieLens data is 93.7%


In [88]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix, choose k

u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix = np.diag(s)
x_pred = np.dot(np.dot(u, s_diag_matrix), vt)
x_pred.shape

print('User based CF RMSE: ' + str(rmse(x_pred, test_data_matrix)))

User based CF RMSE: 2.7137187913810057


In [93]:
x_pred

array([[  3.42880104e+00,   1.80118337e+00,   1.01827999e+00, ...,
         -6.47669137e-03,   2.22787709e-02,   7.74222629e-02],
       [  1.47574537e+00,  -2.37259905e-01,   1.18014061e-01, ...,
          1.34555966e-02,  -2.71754892e-03,  -1.45638592e-02],
       [  2.77339709e-01,  -6.51713826e-02,   1.14309958e-01, ...,
          1.74621150e-02,   7.40767355e-03,  -3.88694240e-03],
       ..., 
       [  2.10338322e+00,   9.72620199e-02,   3.58678032e-01, ...,
         -2.16838011e-03,   6.58243393e-03,   3.07306869e-03],
       [  8.82233021e-01,   2.07283952e-01,  -2.77358881e-01, ...,
          9.00223341e-03,   4.07239495e-03,  -4.19411370e-02],
       [  1.50992730e+00,   1.57594725e+00,   1.02041739e+00, ...,
         -7.36682314e-03,   2.23404788e-02,   3.09707645e-02]])

#### References

https://cambridgespark.com/content/tutorials/implementing-your-own-recommender-systems-in-Python/index.html

https://github.com/ogerhsou/Youtube-Recommendation-Tensorflow/blob/master/youtube_recommendation.py

https://github.com/robi56/Deep-Learning-for-Recommendation-Systems