In [58]:
import numpy as np
import pandas as pd
from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
import scipy.sparse as sp
from scipy.sparse.linalg import svds

In [59]:
header = ["user_id", "item_id", "rating", "timestamp"]
df = pd.read_csv("ml-100k/u.data", sep="\t", names=header)

print(df.shape)
print(df.head(20))
print(df.describe())
print(df.groupby("rating").size())

(100000, 4)
    user_id  item_id  rating  timestamp
0       196      242       3  881250949
1       186      302       3  891717742
2        22      377       1  878887116
3       244       51       2  880606923
4       166      346       1  886397596
5       298      474       4  884182806
6       115      265       2  881171488
7       253      465       5  891628467
8       305      451       3  886324817
9         6       86       3  883603013
10       62      257       2  879372434
11      286     1014       5  879781125
12      200      222       5  876042340
13      210       40       3  891035994
14      224       29       3  888104457
15      303      785       3  879485318
16      122      387       5  879270459
17      194      274       2  879539794
18      291     1042       4  874834944
19      234     1184       2  892079237
            user_id        item_id         rating     timestamp
count  100000.00000  100000.000000  100000.000000  1.000000e+05
mean      462.48475 

In [60]:
users = df.user_id.unique().shape[0]
items = df.item_id.unique().shape[0]
print ("Number of users = " + str(n_users) +  "| Number of movies = " + str(n_items))

Number of users = 943| Number of movies = 1682


In [61]:
# memory based collaborative filtering
train_data, test_data = cv.train_test_split(df, test_size = 0.25)

#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]
    
user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")


In [62]:
def predict(ratings, similarity, type="user"):
    if type == "user":
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == "item":
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

item_prediction = predict(test_data_matrix, item_similarity, type="item")
user_prediction = predict(test_data_matrix, user_similarity, type="user")


[[0.12375953 0.1315311  0.13650524 ... 0.15511266 0.15407496 0.15021334]
 [0.02980407 0.03473661 0.03360406 ... 0.03587864 0.03628792 0.0364258 ]
 [0.02255015 0.0233308  0.02233456 ... 0.02123372 0.02260559 0.02282947]
 ...
 [0.01042492 0.0144176  0.0140885  ... 0.01603315 0.01606187 0.01617702]
 [0.02970731 0.03100654 0.0336486  ... 0.03548763 0.03569304 0.03542818]
 [0.071809   0.07143164 0.07610416 ... 0.08813239 0.08744795 0.08466378]]
[[ 0.51763911  0.18159406  0.15877603 ...  0.1034872   0.1058924
   0.1034872 ]
 [ 0.44495142  0.07955645  0.04436222 ... -0.02017859 -0.01689813
  -0.02017859]
 [ 0.43585504  0.06580768  0.03321059 ... -0.03396229 -0.03081489
  -0.03396229]
 ...
 [ 0.39975812  0.05696732  0.02067821 ... -0.0392717  -0.03626905
  -0.0392717 ]
 [ 0.42773433  0.07105965  0.04679214 ... -0.01790008 -0.01515188
  -0.01790008]
 [ 0.45685076  0.11478469  0.0940757  ...  0.03639404  0.0387077
   0.03639404]]


In [63]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

print ("User-based CF RMSE: " + str(rmse(user_prediction, test_data_matrix)))
print ("Item-based CF RMSE: " + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 3.5007936265197896
Item-based CF RMSE: 3.6153128696669103


In [64]:
#Model based collaborative filtering 
sparsity = round(1.0-len(df)/ float(users*items),3)
sparseper = sparsity*100
print("the sparsity level of Movielens100K data is {0}% ".format(sparseper))

the sparsity level of Movielens100K data is 93.7% 


In [66]:
#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print("User-based CF MSE: " + str(rmse(X_pred, test_data_matrix)))

User-based CF MSE: 2.7198631029235094
