To Build a Movie recommender using 100k Dataset from 100 users on 1700 movies

In [2]:
#Necessary Imports for further use 

import pandas as pd
import numpy as np
from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt



In [13]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
#Reading data file u.data
ratings = pd.read_csv('u.data', sep='\t', names=r_cols,
 encoding='latin-1')


num_users = ratings.user_id.unique().shape[0]
num_items = ratings.movie_id.unique().shape[0]
print(num_users) 
print(num_items)

943
1682


In [15]:
#Spliting the data into training and testing
training_ratings_data, testing_ratings_data = cv.train_test_split(ratings, test_size=0.25)

print("Training Ratings Data - 75%")
print(training_ratings_data.shape)
print(training_ratings_data.head())
print("Testing Ratings Data - 25%")
print(testing_ratings_data.shape)
print(testing_ratings_data.head())

Training Ratings Data - 75%
(75000, 4)
       user_id  movie_id  rating  unix_timestamp
88905      751        90       3       889298528
58231      700       168       3       884494420
74853      234       119       3       892335261
9858       288       300       5       886372155
17441      320       895       4       884748346
Testing Ratings Data - 25%
(25000, 4)
       user_id  movie_id  rating  unix_timestamp
94367      532       127       5       893119438
32954      456      1324       4       881371720
93323      912       418       4       875966694
35574      474        87       4       887925916
21770       65       365       3       879216672


In [16]:
#Building a USer-Item Matrix

training_ratings_matrix = np.zeros((num_users, num_items))

for row in training_ratings_data.itertuples():
    training_ratings_matrix[row[1]-1, row[2]-1] = row[3]

testing_ratings_matrix = np.zeros((num_users, num_items))

for row in testing_ratings_data.itertuples():
    testing_ratings_matrix[row[1]-1, row[2]-1] = row[3]

print("User-Item Matrix")
print(training_ratings_matrix)

User-Item Matrix
[[ 0.  3.  4. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 5.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  5.  0. ...,  0.  0.  0.]]


In [17]:
#Using Cosine similarity 

user_similarity = pairwise_distances(training_ratings_matrix, metric='cosine')
item_similarity = pairwise_distances(training_ratings_matrix.T, metric='cosine')
print("User Similarity")
print(user_similarity)
print("Item Similarity")
print(item_similarity)

User Similarity
[[ 0.          0.90481063  0.97274101 ...,  0.88315881  0.84083529
   0.70969974]
 [ 0.90481063  0.          0.90053396 ...,  0.89109211  0.85843482
   0.901996  ]
 [ 0.97274101  0.90053396  0.         ...,  0.94670455  0.88065311  1.        ]
 ..., 
 [ 0.88315881  0.89109211  0.94670455 ...,  0.          0.89789043
   0.91957465]
 [ 0.84083529  0.85843482  0.88065311 ...,  0.89789043  0.          0.85609458]
 [ 0.70969974  0.901996    1.         ...,  0.91957465  0.85609458  0.        ]]
Item Similarity
[[ 0.          0.71131279  0.72435452 ...,  1.          0.94503503  1.        ]
 [ 0.71131279  0.          0.77943001 ...,  1.          0.90392311
   0.90392311]
 [ 0.72435452  0.77943001  0.         ...,  1.          1.          0.88709351]
 ..., 
 [ 1.          1.          1.         ...,  0.          1.          1.        ]
 [ 0.94503503  0.90392311  1.         ...,  1.          0.          1.        ]
 [ 1.          0.90392311  0.88709351 ...,  1.          1.       

In [19]:
item_prediction = training_ratings_matrix.dot(item_similarity) / np.array([np.abs(item_similarity).sum(axis=1)])
print("Item Prediction")
print(item_prediction)
mean_user_rating = training_ratings_matrix.mean(axis=1)
print("Mean User Rating")
print(mean_user_rating[0:10])
ratings_diff = (training_ratings_matrix - mean_user_rating[:, np.newaxis])
user_prediction = mean_user_rating[:, np.newaxis] + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
print("User Prediction")
print(user_prediction)

Item Prediction
[[ 0.35499727  0.36317483  0.3746334  ...,  0.42115224  0.41374968
   0.4057741 ]
 [ 0.07747142  0.08888193  0.08646652 ...,  0.09009255  0.09062327
   0.09129991]
 [ 0.06336656  0.06516192  0.06364823 ...,  0.06021094  0.06450894
   0.06510225]
 ..., 
 [ 0.02399605  0.03187696  0.03097951 ...,  0.03581373  0.03493938
   0.03558608]
 [ 0.13152535  0.14230441  0.1489871  ...,  0.1537318   0.15258944
   0.15373413]
 [ 0.22018025  0.21674162  0.23927344 ...,  0.27650289  0.26735128
   0.26670669]]
Mean User Rating
[ 0.41854935  0.09096314  0.06420927  0.04994055  0.22057075  0.34423306
  0.71403092  0.09155767  0.04340071  0.35196195]
User Prediction
[[ 1.55463588  0.51576974  0.45205378 ...,  0.26857007  0.26877703
   0.26823667]
 [ 1.27721783  0.23230337  0.1343431  ..., -0.07546768 -0.07434415
  -0.07414556]
 [ 1.30867068  0.20405734  0.11136409 ..., -0.10521367 -0.103243
  -0.10310411]
 ..., 
 [ 1.15709858  0.16751898  0.07251936 ..., -0.1297104  -0.12886245
  -0.12868

In [20]:
#Prediction for test dataset ans scaling user rating to five

user_ratings_prediction = user_prediction[testing_ratings_matrix.nonzero()].flatten()
ratings_five = [min(round(i*5), 5) for i in user_ratings_prediction]
user_ratings_prediction = ratings_five
user_testing_ratings_prediction = testing_ratings_matrix[testing_ratings_matrix.nonzero()].flatten()
print("User Ratings Prediction for Test Data Set")
print(user_ratings_prediction[0:10])
print("User Test Data Set")
print(user_testing_ratings_prediction[0:10])

item_ratings_prediction = item_prediction[testing_ratings_matrix.nonzero()].flatten()
ratings_five = [min(round(i*5), 5) for i in item_ratings_prediction]
item_ratings_prediction = ratings_five
item_testing_ratings_prediction = testing_ratings_matrix[testing_ratings_matrix.nonzero()].flatten()
print("Item Ratings Prediction for Test Data Set")
print(item_ratings_prediction[0:10])
print("Item Test Data Set")
print(item_testing_ratings_prediction[0:10])

User Ratings Prediction for Test Data Set
[5, 2.0, 5, 4.0, 3.0, 5.0, 4.0, 5.0, 4.0, 2.0]
User Test Data Set
[ 5.  3.  4.  1.  3.  5.  5.  5.  4.  2.]
Item Ratings Prediction for Test Data Set
[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]
Item Test Data Set
[ 5.  3.  4.  1.  3.  5.  5.  5.  4.  2.]


In [21]:
#Used Root Mean Squared Error(RMSE)

user_prediction_error_eval = sqrt(mean_squared_error(user_ratings_prediction, user_testing_ratings_prediction))
print('User-based CF RMSE')
print(user_prediction_error_eval)
item_prediction_error_eval = sqrt(mean_squared_error(item_ratings_prediction, item_testing_ratings_prediction))
print('User-based CF RMSE')
print(item_prediction_error_eval)

User-based CF RMSE
1.83518936353
User-based CF RMSE
2.67400822736


In [10]:
sparsity=round(1.0-len(ratings)/float(num_users*num_items),3)
print('The sparsity level of MovieLens100K is ' +  str(sparsity*100) + '%')

The sparsity level of MovieLens100K is 93.7%
