In [1]:
import pandas as pd

In [2]:
movies = pd.read_csv('E:\\UC CS\\Machine Learning\\Recommender Systems\\ml-latest-small\\ml-latest-small\\movies.csv')

In [3]:
ratings = pd.read_csv('E:\\UC CS\\Machine Learning\\Recommender Systems\\ml-latest-small\\ml-latest-small\\ratings.csv')

In [4]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [5]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [6]:
mergedFrame = pd.merge(ratings, movies, on = 'movieId')

In [7]:
mergedFrame

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama
1,7,31,3.0,851868750,Dangerous Minds (1995),Drama
2,31,31,4.0,1273541953,Dangerous Minds (1995),Drama
3,32,31,4.0,834828440,Dangerous Minds (1995),Drama
4,36,31,3.0,847057202,Dangerous Minds (1995),Drama
5,39,31,3.0,832525157,Dangerous Minds (1995),Drama
6,73,31,3.5,1255591860,Dangerous Minds (1995),Drama
7,88,31,3.0,1239755559,Dangerous Minds (1995),Drama
8,96,31,2.5,1223256331,Dangerous Minds (1995),Drama
9,110,31,4.0,840100695,Dangerous Minds (1995),Drama


In [8]:
len(mergedFrame)

100004

In [9]:
n_users = mergedFrame.userId.nunique()
n_items = mergedFrame.movieId.nunique()

print('Num. of Users: '+ str(n_users))
print('Num of Movies: '+str(n_items))

Num. of Users: 671
Num of Movies: 9066


In [10]:
aliasDict = {}
counter = 1
for movieID in mergedFrame['movieId']:
    if movieID not in aliasDict:
        aliasDict[movieID] = counter
        counter += 1

In [12]:
from sklearn.cross_validation import train_test_split
train_data, test_data = train_test_split(mergedFrame, test_size=0.25)

# Memory-Based Collaborative Filtering

In [22]:
#Create two user-item matrices, one for training and another for testing
import numpy as np
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, aliasDict[line[2]]-1] = line[3]  

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, aliasDict[line[2]]-1] = line[3]

In [24]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [25]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [26]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [38]:
test_data_matrix.nonzero()

(array([  0,   0,   0, ..., 670, 670, 670], dtype=int64),
 array([   7,    9,   12, ..., 2252, 4823, 7005], dtype=int64))

In [30]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [31]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 3.3674038720715243
Item-based CF RMSE: 3.581451167101445


# Model-based Collaborative Filtering 

In [33]:
sparsity=round(1.0-len(mergedFrame)/float(n_users*n_items),3)
print('The sparsity level of MovieLens100K is ' +  str(sparsity*100) + '%')

The sparsity level of MovieLens100K is 98.4%


In [34]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix)))

User-based CF MSE: 3.091095869143205
