To Build a Movie recommender using 100k Dataset from 100 users on 1700 movies

In [1]:
#Necessary Imports for further use 

import pandas as pd
import numpy as np
from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt



In [2]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
#Reading data file u.data
ratings = pd.read_csv('u.data', sep='\t', names=r_cols,
 encoding='latin-1')


num_users = ratings.user_id.unique().shape[0]
num_items = ratings.movie_id.unique().shape[0]
print(num_users) 
print(num_items)

943
1682


There are 943 users and 1632 movies in the dataset

Splitting the data into training and testing 

In [3]:
#Spliting the data into training and testing
training_ratings_data, testing_ratings_data = cv.train_test_split(ratings, test_size=0.25)

print("Training Ratings Data - 75%")
print(training_ratings_data.shape)
print(training_ratings_data.head())
print("Testing Ratings Data - 25%")
print(testing_ratings_data.shape)
print(testing_ratings_data.head())

Training Ratings Data - 75%
(75000, 4)
       user_id  movie_id  rating  unix_timestamp
25674      468       258       4       875279126
199        130       216       4       875216545
12929       85        94       3       882995966
45829      378       918       3       892383162
19920      234       589       3       892078567
Testing Ratings Data - 25%
(25000, 4)
       user_id  movie_id  rating  unix_timestamp
7126       233       286       3       876690514
74049      937       224       4       876769480
64692      788        38       3       880871359
33467      200       501       4       884129504
65749      902       295       2       879465128


# Creating a User-Item Matrix


The training set contains 943 users and 1682 movies. We are now creating the test_matrix and the train_matrix in which the number of rows is equal to the number of unique users and the number of columns is equal to the number of unique movies. The matrix cells are filled with the corresponding rating the user has given to the movie. The matrix cell has the value 0 if the user has not rated the movie.

In [4]:
#Building a USer-Item Matrix

training_ratings_matrix = np.zeros((num_users, num_items))

for row in training_ratings_data.itertuples():
    training_ratings_matrix[row[1]-1, row[2]-1] = row[3]

testing_ratings_matrix = np.zeros((num_users, num_items))

for row in testing_ratings_data.itertuples():
    testing_ratings_matrix[row[1]-1, row[2]-1] = row[3]

#print("User-Item Matrix")
#print(training_ratings_matrix)

# User-Item based collabrative filter

We create a similarity matrix which specifies the similarity between two users and items based on the ratings they have given to different movies.

In [5]:
#Using Cosine similarity 

user_similarity = pairwise_distances(training_ratings_matrix, metric='cosine')
item_similarity = pairwise_distances(training_ratings_matrix.T, metric='cosine')
print("User Similarity")
print(user_similarity.shape)
print(user_similarity)
print("")
print("Item Similarity")
print(item_similarity.shape)
print(item_similarity)

User Similarity
(943, 943)
[[ 0.          0.86008979  0.94443082 ...,  0.8719025   0.84370138
   0.67046989]
 [ 0.86008979  0.          0.86751336 ...,  0.90024625  0.89801998
   0.90851575]
 [ 0.94443082  0.86751336  0.         ...,  0.95111763  0.8321209
   0.98524725]
 ..., 
 [ 0.8719025   0.90024625  0.95111763 ...,  0.          0.9383455
   0.92500046]
 [ 0.84370138  0.89801998  0.8321209  ...,  0.9383455   0.          0.83529012]
 [ 0.67046989  0.90851575  0.98524725 ...,  0.92500046  0.83529012  0.        ]]

Item Similarity
(1682, 1682)
[[ 0.          0.68703644  0.75843361 ...,  1.          0.94550631  1.        ]
 [ 0.68703644  0.          0.79736039 ...,  1.          1.          0.90883943]
 [ 0.75843361  0.79736039  0.         ...,  1.          1.          1.        ]
 ..., 
 [ 1.          1.          1.         ...,  0.          1.          1.        ]
 [ 0.94550631  1.          1.         ...,  1.          0.          1.        ]
 [ 1.          0.90883943  1.         ...,


The shape of similarity user-item matrix is 943 x 943 and 1682 x 1682 with each cell corresponding to the similarity between a user and item. Now we will use a prediction function that will predict the values in the user-item matrix. We will only consider the top n users which are similar to a user that are similar to a user to make predictions for that user. In the formula we normalise the ratings of users by subtracting the mean rating of a user from every rating given by the user. In the same way we will consider the item similarity and follow the above steps.

In [6]:
item_prediction = training_ratings_matrix.dot(item_similarity) / np.array([np.abs(item_similarity).sum(axis=1)])
print("Item Prediction")
print(item_prediction)
mean_user_rating = training_ratings_matrix.mean(axis=1)
print("Mean User Rating")
print(mean_user_rating[0:10])
ratings_diff = (training_ratings_matrix - mean_user_rating[:, np.newaxis])
user_prediction = mean_user_rating[:, np.newaxis] + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
print("User Prediction")
print(user_prediction)

Item Prediction
[[ 0.33250492  0.34864853  0.35860168 ...,  0.40333135  0.39885405
   0.39582887]
 [ 0.08777485  0.09976398  0.09545823 ...,  0.10113028  0.10096916
   0.10136909]
 [ 0.06527194  0.06835039  0.06639059 ...,  0.06781678  0.06843155
   0.06879325]
 ..., 
 [ 0.02820858  0.0350054   0.0347065  ...,  0.03985723  0.03932109
   0.03909386]
 [ 0.13751286  0.14779831  0.15586676 ...,  0.16299822  0.16171268
   0.16279745]
 [ 0.21833392  0.21747084  0.23164857 ...,  0.26710291  0.2605822
   0.26061165]]
Mean User Rating
[ 0.40309156  0.10107015  0.06777646  0.04340071  0.22770511  0.33709869
  0.70154578  0.10404281  0.03686088  0.35077289]
User Prediction
[[ 1.55008668  0.53270301  0.42686604 ...,  0.25030149  0.25293445
   0.25287333]
 [ 1.33266229  0.27647717  0.13336343 ..., -0.06727641 -0.0640784
  -0.06387835]
 [ 1.33869556  0.2402373   0.10545868 ..., -0.1023436  -0.09896578
  -0.09885783]
 ..., 
 [ 1.19672861  0.19944682  0.06936336 ..., -0.1265913  -0.12343978
  -0.12349

In [7]:
#Prediction for test dataset ans scaling user rating to five

user_ratings_prediction = user_prediction[testing_ratings_matrix.nonzero()].flatten()
ratings_five = [min(round(i*5), 5) for i in user_ratings_prediction]
user_ratings_prediction = ratings_five
user_testing_ratings_prediction = testing_ratings_matrix[testing_ratings_matrix.nonzero()].flatten()
print("User Ratings Prediction for Test Data Set")
print(user_ratings_prediction[0:10])
print("User Test Data Set")
print(user_testing_ratings_prediction[0:10])

item_ratings_prediction = item_prediction[testing_ratings_matrix.nonzero()].flatten()
ratings_five = [min(round(i*5), 5) for i in item_ratings_prediction]
item_ratings_prediction = ratings_five
item_testing_ratings_prediction = testing_ratings_matrix[testing_ratings_matrix.nonzero()].flatten()
print("Item Ratings Prediction for Test Data Set")
print(item_ratings_prediction[0:10])
print("Item Test Data Set")
print(item_testing_ratings_prediction[0:10])

User Ratings Prediction for Test Data Set
[2.0, 4.0, 4.0, 5, 2.0, 2.0, 4.0, 2.0, 2.0, 1.0]
User Test Data Set
[ 5.  2.  5.  5.  5.  4.  4.  3.  3.  2.]
Item Ratings Prediction for Test Data Set
[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]
Item Test Data Set
[ 5.  2.  5.  5.  5.  4.  4.  3.  3.  2.]


# Root Mean Squared Error 

Using the root mean squared error to calculate the accuracy of the predicted ratings.

In [8]:
#Used Root Mean Squared Error(RMSE)

user_prediction_error_eval = sqrt(mean_squared_error(user_ratings_prediction, user_testing_ratings_prediction))
print('User-based CF RMSE')
print(user_prediction_error_eval)
item_prediction_error_eval = sqrt(mean_squared_error(item_ratings_prediction, item_testing_ratings_prediction))
print('Item-based CF RMSE')
print(item_prediction_error_eval)

User-based CF RMSE
1.83198253267
Item-based CF RMSE
2.68016417408


In [9]:
sparsity=round(1.0-len(ratings)/float(num_users*num_items),3)
print('The sparsity level of MovieLens100K is ' +  str(sparsity*100) + '%')

The sparsity level of MovieLens100K is 93.7%


# Getting recommendation for the user

getting movie recommendation for the user with user_id 25 user-item collaborative filter

In [10]:
user_id = 25
user_ratings_prediction = item_prediction[user_id-1,:]
train_unkown_indices = np.where(training_ratings_matrix[user_id-1,:] == 0)[0]
train_unkown_indices

array([   1,    2,    3, ..., 1679, 1680, 1681])

In [11]:
user_recommendations = user_ratings_prediction[train_unkown_indices]
print('\n Recommendation for the user {} are the movies:\n'.format(user_id))
for movie_id in user_recommendations.argsort()[-5:][: : -1]:
    print(movie_id +1)


 Recommendation for the user 25 are the movies:

1325
1322
1195
1308
1528


# Singular value Decomposition

SVD is a model-based method. It is a mathematical technique to find the missing values in a matrix. It decomposes matrix into three matrices two of which are rectangular and the middle one is a diagonal matrix.

In [12]:
import scipy.sparse as sp
import math
from scipy.sparse.linalg import svds
u, s, vt = svds(training_ratings_matrix, k = 20)
u.shape, s.shape, vt.shape

((943, 20), (20,), (20, 1682))

In [13]:
s_diag_matrix = np.diag(s)
predictions_svd = np.dot(np.dot(u,s_diag_matrix),vt)
predictions_svd.shape

(943, 1682)

In [14]:
predicted_ratings_svd = predictions_svd[testing_ratings_matrix.nonzero()]
test_truth = testing_ratings_matrix[testing_ratings_matrix.nonzero()]
math.sqrt(mean_squared_error(predicted_ratings_svd,test_truth))


2.7175485558019754

RMSE for the SVD is calculated and further on we find the recommendation for the user with ID 25

In [20]:
user_id = 25
user_ratings = predictions_svd[user_id-1,:]
train_unkown_indices = np.where(training_ratings_matrix[user_id-1,:] == 0)[0]
user_recommendations = user_ratings[train_unkown_indices]
user_recommendations.shape

(1620,)

In [18]:
print('\nRecommendations for user {} are the movies: \n'.format(user_id))
for movie_id in user_recommendations.argsort()[-5:][: : -1]:
    print(movie_id +1)


Recommendations for user 25 are the movies: 

91
44
149
168
549
