In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

This Notebook implemented a collaborative based movie recommendation system using users' rating dataset.

#### Load Data

In [2]:
ratings = pd.read_csv('data/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


#### Data Cleaning

In [3]:
ratings = ratings[['userId','movieId','rating']] # remove timestamp column

In [4]:
# Look for the users who rated more than 50 movies
usercount = ratings[['userId','movieId']].groupby("userId").count()
usercount = usercount[usercount["movieId"] >= 50]
print('user count:', usercount.shape[0])

user count: 427


In [5]:
# Look for the movies reviewed by more than 50 users
moviecount = ratings[['userId','movieId']].groupby("movieId").count()
moviecount = moviecount[moviecount["userId"] >= 50]
print('movie count:', moviecount.shape[0])

movie count: 453


In [6]:
# Keep only the popular movies and active users
ratings = ratings[ratings["userId"].isin(usercount.index) & ratings["movieId"].isin(moviecount.index)]
#print(ratings)

#### Train/Test Split

 20% of each user's ratings will be used for testing, and the remaining will be used for training.

In [7]:
# initilize training and testing dataframes
df_train, df_test = pd.DataFrame(), pd.DataFrame()

In [8]:
# loop each user, random select 80% of ratings for traning
for user in ratings.userId.unique():
    user_df = ratings[ratings['userId']==user].reset_index(drop=True)
    user_train = user_df.sample(frac=0.8, random_state=42)
    user_test = user_df.loc[~user_df.index.isin(user_train.index)]
    
    df_train = pd.concat([df_train, user_train])
    df_test = pd.concat([df_test, user_test])

In [9]:
df_train.reset_index(drop=True)
df_test.reset_index(drop=True)
print(ratings.shape, df_train.shape, df_test.shape)

(38538, 3) (30824, 3) (7714, 3)


In [10]:
# create the rating matrix
user_movie_rating = ratings.pivot(index="userId", columns="movieId", values="rating").fillna(0)
rating_matrix = user_movie_rating.values
print(rating_matrix.shape)

(427, 453)


In [11]:
user_movie_rating.head()

movieId,1,2,3,5,6,7,10,11,16,17,...,59315,60069,63082,68157,68358,68954,70286,72998,74458,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
rating_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 3., 3., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

#### Collaborative Filtering

Collaborative Filtering (CF) is a type of recommendation technique that uses similarities between users/items to infer the possible level of interest of a user to a item unrated by him. Similarities are computed using existing user ratings for items, no metadata needed.

There are two general CF approaches:
- User-based, which exploits similarities between users. A rating prediction of an user to an item is computed using the item ratings given by similar users.
- Item-based, which exploits similarities between items. A rating prediction of an user to an item is computed using ratings of similar items.

In [13]:
def similarity_rating_matrix(rating_matrix, similarity_type, eps=1e-9):
    # epsilon -> small number for handling dived-by-zero errors
    
    # if similarity type is user, use rows
    if similarity_type == 'user':
        similarity_matrix = rating_matrix.dot(rating_matrix.T)
    # if similarity type is item, use columns
    elif similarity_type == 'item':
        similarity_matrix = rating_matrix.T.dot(rating_matrix)
    
    norms = np.sqrt(similarity_matrix.diagonal()) + eps
    return similarity_matrix / (norms[np.newaxis, :] * norms[:, np.newaxis]) # cosine-similarity

In [14]:
user_similarity_matrix = similarity_rating_matrix(rating_matrix, similarity_type='user')
item_similarity_matrix = similarity_rating_matrix(rating_matrix, similarity_type='item')

In [15]:
print(f"User similarity matrix shape: {user_similarity_matrix.shape}\nUser similarity matrix sample:\n{user_similarity_matrix[:5, :5]}")
print("-" * 60)
print(f"Item similarity matrix shape: {item_similarity_matrix.shape}\nItem similarity matrix sample:\n{item_similarity_matrix[:5, :5]}")

User similarity matrix shape: (427, 427)
User similarity matrix sample:
[[1.         0.18462176 0.17078823 0.13796451 0.25279235]
 [0.18462176 1.         0.14054924 0.18888497 0.22341257]
 [0.17078823 0.14054924 1.         0.20156541 0.44588482]
 [0.13796451 0.18888497 0.20156541 1.         0.13135761]
 [0.25279235 0.22341257 0.44588482 0.13135761 1.        ]]
------------------------------------------------------------
Item similarity matrix shape: (453, 453)
Item similarity matrix sample:
[[1.         0.43540041 0.28118069 0.26292885 0.38861735]
 [0.43540041 1.         0.25414901 0.30962295 0.24492061]
 [0.28118069 0.25414901 1.         0.34157135 0.22014279]
 [0.26292885 0.30962295 0.34157135 1.         0.2361602 ]
 [0.38861735 0.24492061 0.22014279 0.2361602  1.        ]]


#### Predictions

- For user_based approach, taking normalized weighted sum of all ratings of other users to this item.
- For item_based approach, taking the normalized weighted sum of all other ratings of this user to the other items.

In [16]:
def rating_prediction(ratings, similarity_type, eps=1e-9):
    user_movie_rating = ratings.pivot(index="userId", columns="movieId", values="rating").fillna(0)
    rating_matrix = user_movie_rating.values
    similarity_matrix = similarity_rating_matrix(rating_matrix, similarity_type)
    if similarity_type == 'user':
        predictions = similarity_matrix.dot(rating_matrix) / np.abs(similarity_matrix + eps).sum(axis=0)[:, np.newaxis]
    elif similarity_type == 'item':
        predictions = rating_matrix.dot(similarity_matrix) / np.abs(similarity_matrix + eps).sum(axis=0)[np.newaxis, :]
    return predictions

In [17]:
print("User-based predictions sample:")
print(rating_prediction(ratings, similarity_type='user')[:5, :5])
print("-" * 60)
print("Item-based predictions sample:")
print(rating_prediction(ratings, similarity_type='item')[:5, :5])

User-based predictions sample:
[[1.97209755 1.21996497 0.4045873  0.45427915 1.04410508]
 [2.05445039 0.84196323 0.25730089 0.31319016 0.79463144]
 [2.12839014 0.90713218 0.3573541  0.356598   0.8761276 ]
 [2.20617095 1.00847349 0.40348583 0.43263645 0.82750243]
 [2.19243885 0.96467562 0.3672821  0.36235499 0.88486951]]
------------------------------------------------------------
Item-based predictions sample:
[[0.46797709 0.63625792 0.59477655 0.58064496 0.52803738]
 [0.31446943 0.29049206 0.24255443 0.25298521 0.2821184 ]
 [1.18712592 1.17988761 1.21691211 1.11025686 1.10486582]
 [0.71557491 0.73170828 0.78769375 0.74403359 0.62131598]
 [0.58531624 0.58691275 0.60644504 0.52312613 0.53080547]]


#### Recommendations

In [18]:
user_based_rating = rating_prediction(ratings, similarity_type='user')
item_based_rating = rating_prediction(ratings, similarity_type='item')

In [19]:
def top_n_recommendations(ratings, rating_prediction, user_id, n_movies):
    user_movie_rating = ratings.pivot(index="userId", columns="movieId", values="rating").fillna(0)
    loc = user_movie_rating.index.get_loc(user_id) # get the loc of user id
    ratings = rating_prediction[loc] # get the list of ratings of user id
    idx = sorted(range(len(ratings)), key=lambda i: ratings[i], reverse=True)[:n_movies] # get the loc of top n ratings
    movie_ids = [list(user_movie_rating.columns)[i] for i in idx] # get the movie ids of top n ratings
    return movie_ids # here the output is the movie id

In [20]:
print("Top 5 movie id recommendations for user id = 15")
print("-" * 60)
print("User-based approach recommendations:")
print(top_n_recommendations(ratings, user_based_rating, 15, 5))
print("-" * 60)
print("Item-based approach recommendations:")
print(top_n_recommendations(ratings, item_based_rating, 15, 5))

Top 5 movie id recommendations for user id = 15
------------------------------------------------------------
User-based approach recommendations:
[296, 356, 318, 260, 593]
------------------------------------------------------------
Item-based approach recommendations:
[68157, 70286, 2019, 1244, 3949]


#### Model Evaluation

Here use MSE for model prediction evaluation.

In [21]:
def prediction_df(rating_matrix, prediction_matrix, dataframe):
    preds = []
    for row_id, user_id, movie_id, _ in dataframe.itertuples():
        r_loc = rating_matrix.index.get_loc(user_id) # get the row number in prediction matrix
        c_loc = list(rating_matrix.columns).index(movie_id) # get the col number in prediction matrix
        preds.append(prediction_matrix[r_loc, c_loc]) # get the predict rating
    df_preds = pd.DataFrame(data={"user_id": dataframe.userId, "movie_id": dataframe.movieId, "rating": preds})
    return df_preds

In [22]:
user_based_train_preds = prediction_df(user_movie_rating, user_based_rating, df_train)
user_based_test_preds = prediction_df(user_movie_rating, user_based_rating, df_test)
item_based_train_preds = prediction_df(user_movie_rating, item_based_rating, df_train)
item_based_test_preds = prediction_df(user_movie_rating, item_based_rating, df_test)

In [23]:
user_based_train_mse = mean_squared_error(df_train.rating, user_based_train_preds.rating)
user_based_test_mse = mean_squared_error(df_test.rating, user_based_test_preds.rating)
item_based_train_mse = mean_squared_error(df_train.rating, item_based_train_preds.rating)
item_based_test_mse = mean_squared_error(df_test.rating, item_based_test_preds.rating)

In [24]:
print(f"User-based train MSE: {user_based_train_mse} -- User-based test MSE: {user_based_test_mse}")
print("-" * 80)
print(f"Item-based train MSE: {item_based_train_mse} -- Item-based test MSE: {item_based_test_mse}")

User-based train MSE: 7.134469959470162 -- User-based test MSE: 7.389303345099254
--------------------------------------------------------------------------------
Item-based train MSE: 7.212942319730497 -- Item-based test MSE: 7.28419607599301
