In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [17]:
rating = pd.read_csv('ratings_small.csv')
movie_names = pd.read_csv('movies_metadata.csv')
rating.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182


In [18]:
movie_names = movie_names[['id','title']]

In [19]:
movie_names = movie_names.rename({'id':'movieId'},axis=1)

In [20]:
movie_names['movieId'] = pd.to_numeric(movie_names['movieId'])

In [21]:
movie_names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  45463 non-null  float64
 1   title    45460 non-null  object 
dtypes: float64(1), object(1)
memory usage: 532.9+ KB


In [22]:
movie_rat = pd.merge(rating,movie_names,on='movieId')
movie_rat.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,1371,2.5,1260759135,Rocky III
1,4,1371,4.0,949810302,Rocky III
2,7,1371,3.0,851869160,Rocky III
3,19,1371,4.0,855193404,Rocky III
4,21,1371,3.0,853852263,Rocky III


In [23]:
movie_rat = movie_rat[['userId','movieId','rating','title']]

In [24]:
movie_rat.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1371,2.5,Rocky III
1,4,1371,4.0,Rocky III
2,7,1371,3.0,Rocky III
3,19,1371,4.0,Rocky III
4,21,1371,3.0,Rocky III


In [34]:
n_movies = movie_rat['movieId'].nunique()
n_users = movie_rat['userId'].nunique()
print(n_movies,n_users)

2830 671


In [39]:
train_movie_mat = np.zeros((n_users,n_movies))
test_movie_mat = np.zeros((n_users,n_movies))
user_movie_mat.shape

(671, 2830)

In [40]:
from sklearn.model_selection import train_test_split
train_data,test_data = train_test_split(movie_rat,train_size=0.75)

In [45]:
for row in train_data.itertuples():
    train_movie_mat[row[1]-1,row[2-1]] = row[3]
    
for row in test_data.itertuples():
    test_movie_mat[row[1]-1,row[2-1]] = row[3]
    
train_movie_mat

array([[0. , 2.5, 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 3. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [44]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_movie_mat,metric='cosine')
item_similarity = pairwise_distances(train_movie_mat.T,metric='cosine')

In [55]:
def pred(ratings,similarity,typ='user'):
    if typ=='user':
        mean_rating = ratings.mean(axis=1)
        rating_diff = ratings-mean_rating[:,np.newaxis]
        prediction = mean_rating[:,np.newaxis] + similarity.dot(rating_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
        
    elif typ=='item':
        prediction = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
        
    return prediction

In [65]:
user_pred = pred(train_movie_mat,user_similarity,'user')
item_pred = pred(train_movie_mat,item_similarity,'item')

In [82]:
from math import sqrt
from sklearn.metrics import mean_squared_error

def err(predict_rating,actual_rating):
    pred_rat = predict_rating[actual_rating.nonzero()].flatten()
    actual_rat = actual_rating[actual_rating.nonzero()].flatten()
    error = sqrt(mean_squared_error(pred_rat,actual_rat))
    return error

In [83]:
user_pred.shape

(671, 2830)

In [84]:
test_movie_mat.shape

(671, 2830)

In [85]:
user_rat_err = err(user_pred,test_movie_mat)
item_rat_err = err(item_pred,test_movie_mat)

In [88]:
print('user based CF error: ' + str(user_rat_err))
print('item based CF error: ' + str(item_rat_err))

user based CF error: 3.816022636578966
item based CF error: 3.8160473260390964
