In [1]:
#Memory-Based Collaborative Filtering implementation using MovieLens dataset
import pandas as pd
import numpy as np

In [3]:
columns = ['userId','movieId','rating','timestamp']
filename = '/users/sneha/documents/RecommenderSystems/ml-100k/u.data'
print"reading data.."
df = pd.read_csv(filename, sep = '\t', names = columns)
print"reading complete.."
#print df.shape

reading data..
reading complete..


In [4]:
#curious to know how many users are there, and how many unique movies!!!
n_users = df.userId.unique().shape[0]
n_movies = df.movieId.unique().shape[0]
print "Number of unique users: " + str(n_users)
print "Number of unique movies: " + str(n_movies)

Number of unique users: 943
Number of unique movies: 1682


In [6]:
#split train and test records
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(df, test_size = 0.25)
print train_data[0:5]

       userId  movieId  rating  timestamp
48181     739       69       5  886959069
51618     211      199       5  879459952
6088      201      357       4  884111217
19546     160      864       1  876770673
96931     474      411       2  887915684


In [7]:
#Create two user-item matrices
user_item_matrix_train = np.zeros((n_users,n_movies))
for line in train_data.itertuples():
    # will contain ratings of users in order of user id
    user_item_matrix_train[line[1]-1, line[2]-1] = line[3]
user_item_matrix_test = np.zeros((n_users, n_movies))
for line in test_data.itertuples():
    user_item_matrix_test[line[1]-1, line[2]-1] = line[3]

In [8]:
from sklearn.metrics import pairwise_distances
user_similarity = pairwise_distances(user_item_matrix_train, metric = 'cosine')
item_similarity = pairwise_distances(user_item_matrix_train.T, metric = 'cosine')
# X = [[5, 2],
#      [1, 3]]
# sim = pairwise_distances(X)


In [9]:
#The idea for USER BASED CF is that some users may tend always to give high or low ratings to all movies.
#Suppose, user k gives 4 stars to his favourite movies and 3 stars to all other good movies. 
#Suppose now that another user t rates movies that he/she likes with 5 stars, and other movies over with 3 stars. 
#These two users could have a very similar taste but treat the "rating system" differently.

#The above processsing is not needed for ITEM BASED CF since query user itself is used to do predictions.
def predict(ratings, similarity, type='user'):
    if type == 'user':
        #Find the mean of ratings - (later used to find non-biased data matrix)
        mean_user_rating = ratings.mean(axis=1)
      
        #center the data by subtracting mean. 
        ##Use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        
        #use this unbiased ratings_diff matrix to find the dot product. Then add it to the mean user ratings to receive original matrix alike.
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

item_prediction = predict(user_item_matrix_train, item_similarity, type='item')
user_prediction = predict(user_item_matrix_train, user_similarity, type='user')

In [10]:
#Evaluation metrics - RMSE
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, actual):
    #flatten - Returns a copy of the array collapsed into one dimension.
    # nonzero() returns a tuple of arrays, one for each dimension, containing the indices of 
    #the non-zero elements in that dimension.
    prediction = prediction[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, actual))

print 'User-based CF RMSE: ' + str(rmse(user_prediction, user_item_matrix_test))
print 'Item-based CF RMSE: ' + str(rmse(item_prediction, user_item_matrix_test))

User-based CF RMSE: 3.12795278292
Item-based CF RMSE: 3.45395158892
