In [1]:
import numpy as np
import pandas as pd
from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt



In [2]:
header = ['item_id', 'rating', 'user_id']
df = pd.read_csv('./csv/Pre_100k_2.csv', sep = ',', names = header)

In [3]:
df

Unnamed: 0,item_id,rating,user_id
0,1121,2.0,6765
1,1121,5.0,6328
2,1121,5.0,3245
3,473,5.0,5313
4,473,5.0,2916
5,473,5.0,641
6,473,5.0,1898
7,473,5.0,576
8,473,2.0,7783
9,473,4.0,2194


In [4]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)

Number of users = 8019 | Number of movies = 2940


In [5]:
data_matrix = np.zeros((n_users, n_items))
for line in df.itertuples():
    data_matrix[line[3] - 1, line[1] - 1] = line[2]

In [9]:
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [10]:
def predict(ratings, similarity, type = 'user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [11]:
item_prediction = predict(data_matrix, item_similarity, type = 'item')
user_prediction = predict(data_matrix, user_similarity, type = 'user')

In [12]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [14]:
print 'User-based CF RMSE: ' + str(rmse(user_prediction, data_matrix))
print 'Item-based CF RMSE: ' + str(rmse(item_prediction, data_matrix))

User-based CF RMSE: 4.35870295832
Item-based CF RMSE: 4.35904720108


In [15]:
sparsity=round(1.0-len(df)/float(n_users * n_items),3)
print 'The sparsity level of 100k is ' +  str(sparsity*100) + '%'

The sparsity level of 100k is 100.0%


In [18]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(data_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print 'User-based CF MSE: ' + str(rmse(X_pred, data_matrix))

User-based CF MSE: 4.00297378845


In [19]:
user_prediction

array([[ 0.00073695,  0.00086169,  0.00036275, ...,  0.00086169,
         0.00086169,  0.00086169],
       [ 0.00073686,  0.00086158,  0.0003627 , ...,  0.00086158,
         0.00086158,  0.00086158],
       [-0.00062333, -0.00049848, -0.00099786, ..., -0.00049848,
        -0.00049848, -0.00049848],
       ..., 
       [ 0.0007397 ,  0.00086505,  0.00036367, ...,  0.00086505,
         0.00086505,  0.00086505],
       [-0.00062384, -0.00049911, -0.00099805, ..., -0.00049911,
        -0.00049911, -0.00049911],
       [ 0.00073751,  0.00086236,  0.00036298, ...,  0.00086236,
         0.00086236,  0.00086236]])