# Collaborative Filtering

In [1]:
# Imports required packages
import pandas as pd
import numpy as np

from sklearn.metrics import pairwise_distances

## The Data Set
The latest small version of the MovieLens data set (see https://grouplens.org/datasets/movielens/), containing ratings for ~9000 movies by ~600 users, is used for finding similar users based on the movies they have watched in common and how they have rated those movies.

In [2]:
# Loads the MovieLens ratings from the CSV file
ratings_csv_path = "./../../../Data/ml-latest-small/ratings.csv"
ratings = pd.read_csv(ratings_csv_path)

# Previews the first 10 ratings
display(ratings.head(10))

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [3]:
# Prints a summary of the data set (columns, non-null counts, dtypes, memory usage).
# DataFrame.info() writes the summary itself and returns None, so it must not be
# wrapped in print() -- doing so prints a stray "None" after the summary.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None


In [4]:
# Counts the distinct users and the distinct movies present in the ratings
unique_user_count = ratings["userId"].nunique()
unique_movie_count = ratings["movieId"].nunique()

print("Unique Users Count: {}\n\nUnique Movies Count: {}".format(
    unique_user_count, unique_movie_count))

Unique Users Count: 610

Unique Movies Count: 9724


In [5]:
# Removes the timestamp field as it is not considered in this case.
# The result is re-bound instead of dropped in place, and errors="ignore" makes
# the cell safe to re-run: a second execution no longer raises KeyError because
# the column is already gone.
ratings = ratings.drop(columns=["timestamp"], errors="ignore")

## User-based Similarity

### Data Transformations

In [6]:
# Reshapes the long ratings table into a user x movie matrix:
# one row per userId, one column per movieId, cell value = rating.
users_movies_pivot = pd.pivot(
    ratings,
    index="userId",
    columns="movieId",
    values="rating",
)

In [7]:
# Displays the first 10 rows (users) of the user x movie matrix;
# cells are empty (missing) where the user has not rated the movie
display(users_movies_pivot.head(10))

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
6,,4.0,5.0,3.0,5.0,4.0,4.0,3.0,,3.0,...,,,,,,,,,,
7,4.5,,,,,,,,,,...,,,,,,,,,,
8,,4.0,,,,,,,,2.0,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Replaces every missing rating (user has not rated the movie) with 0
# so that the matrix can be passed to the distance computation
users_movies_pivot = users_movies_pivot.fillna(0)

In [9]:
# Cosine similarity between every pair of users, obtained as
# 1 - cosine distance of their rating vectors.
# Rows and columns are both labelled with the user ids.
user_ids = users_movies_pivot.index

user_similarity = pd.DataFrame(
    1 - pairwise_distances(users_movies_pivot, metric="cosine"),
    index=user_ids,
    columns=user_ids,
)

In [10]:
# Displays the first 10 rows of the user-to-user similarity matrix
# (the diagonal is still 1.0 at this point: each user compared with itself)
display(user_similarity.head(10))

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.027283,0.05972,0.194395,0.12908,0.128152,0.158744,0.136968,0.064263,0.016875,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
2,0.027283,1.0,0.0,0.003726,0.016614,0.025333,0.027585,0.027257,0.0,0.067445,...,0.202671,0.016866,0.011997,0.0,0.0,0.028429,0.012948,0.046211,0.027565,0.102427
3,0.05972,0.0,1.0,0.002251,0.00502,0.003936,0.0,0.004941,0.0,0.0,...,0.005048,0.004892,0.024992,0.0,0.010694,0.012993,0.019247,0.021128,0.0,0.032119
4,0.194395,0.003726,0.002251,1.0,0.128659,0.088491,0.11512,0.062969,0.011361,0.031163,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
5,0.12908,0.016614,0.00502,0.128659,1.0,0.300349,0.108342,0.429075,0.0,0.030611,...,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792
6,0.128152,0.025333,0.003936,0.088491,0.300349,1.0,0.075843,0.370488,0.013904,0.020385,...,0.021415,0.448927,0.098002,0.396582,0.104541,0.102123,0.162182,0.178809,0.214234,0.052668
7,0.158744,0.027585,0.0,0.11512,0.108342,0.075843,1.0,0.114885,0.099463,0.132099,...,0.206405,0.125182,0.103664,0.062025,0.219586,0.200035,0.186114,0.323541,0.09084,0.193219
8,0.136968,0.027257,0.004941,0.062969,0.429075,0.370488,0.114885,1.0,0.0,0.023914,...,0.080203,0.550552,0.101126,0.324495,0.120425,0.099388,0.185142,0.187233,0.423993,0.078153
9,0.064263,0.0,0.0,0.011361,0.0,0.013904,0.099463,0.0,1.0,0.040959,...,0.081138,0.009257,0.044268,0.018969,0.09679,0.075898,0.011844,0.100435,0.0,0.074399
10,0.016875,0.067445,0.0,0.031163,0.030611,0.020385,0.132099,0.023914,0.040959,1.0,...,0.187179,0.013318,0.025047,0.011598,0.10413,0.088963,0.010451,0.077424,0.021766,0.121072


In [11]:
# Checks the shape of the similarity matrix.
# Both dimensions should equal the number of unique users.
print(user_similarity.shape)

(610, 610)


In [12]:
# The above matrix has a diagonal of all 1's, as a user is fully similar to
# himself/herself. The diagonal is set to 0 so that the most similar *other*
# user can be found for each user.
#
# The diagonal is zeroed on an explicit writable copy of the data and the
# DataFrame is rebuilt from it. Writing through `user_similarity.values` relies
# on pandas handing out a writable view of its internal buffer; with
# Copy-on-Write enabled that array is read-only and np.fill_diagonal raises.
similarity_without_self = user_similarity.to_numpy(copy=True)
np.fill_diagonal(similarity_without_self, 0)

user_similarity = pd.DataFrame(
    similarity_without_self,
    index=user_similarity.index,
    columns=user_similarity.columns,
)

### Finding Similarity between Users

In [13]:
# For every user (row), picks the column label (user id) holding the
# highest similarity score
most_similar_user = user_similarity.idxmax(axis="columns")

# Prints the resulting user -> most similar user mapping
print(most_similar_user)

userId
1      266
2      366
3      313
4      391
5      470
      ... 
606    474
607    570
608    480
609    340
610    249
Length: 610, dtype: int64


To sanity-check the similarity computed above, the movies data set is loaded below so that the movies two users have both rated can be inspected by title.

In [14]:
# Loads the MovieLens movies (id, title, genres) from the CSV file
movies_csv_path = "./../../../Data/ml-latest-small/movies.csv"
movies = pd.read_csv(movies_csv_path)

In [15]:
# Displays the first 10 movies
display(movies.head(10))

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [16]:
# Column "genres" is removed for not being considered in this case.
# The result is re-bound instead of dropped in place, and errors="ignore" makes
# the cell safe to re-run: a second execution no longer raises KeyError because
# the column is already gone.
movies = movies.drop(columns=["genres"], errors="ignore")

In [17]:
def get_common_movies(user1, user2, min_rating):
    """
    Lists the movies that two users have both rated at or above a threshold

    Reads the notebook-level ``ratings`` and ``movies`` DataFrames.

    Parameters
    ----------
    user1: int
        Id of the first user

    user2: int
        Id of the second user

    min_rating: float
        Lowest rating that each of the two users must have given to a movie
        for that movie to be included in the result

    Returns
    -------
    DataFrame
        One row per movie rated by both users with a rating of at least
        ``min_rating`` from each of them. Columns carrying the ``_x`` suffix
        belong to user1 and those carrying ``_y`` belong to user2; the movie
        title is included as well.

    """

    first_user_ratings = ratings[ratings["userId"] == user1]
    second_user_ratings = ratings[ratings["userId"] == user2]

    # Keeps only the movies rated by both users, then attaches the titles
    rated_by_both = first_user_ratings.merge(
        second_user_ratings, on="movieId", how="inner")
    rated_by_both = rated_by_both.merge(movies, on="movieId")

    # Keeps only the movies that each user rated at least min_rating
    first_user_ok = rated_by_both["rating_x"] >= min_rating
    second_user_ok = rated_by_both["rating_y"] >= min_rating

    return rated_by_both[first_user_ok & second_user_ok]

In [18]:
# Example: for two arbitrarily chosen users (user Id 6 and user Id 117),
# lists the movies (with titles) that both of them have rated 4.0 or more

get_common_movies(user1=6, user2=117, min_rating=4)

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y,title
4,6,7,4.0,117,4.0,Sabrina (1995)
6,6,11,4.0,117,4.0,"American President, The (1995)"
10,6,26,4.0,117,4.0,Othello (1995)
15,6,41,4.0,117,4.0,Richard III (1995)
16,6,47,4.0,117,4.0,Seven (a.k.a. Se7en) (1995)
19,6,62,4.0,117,4.0,Mr. Holland's Opus (1995)
21,6,110,5.0,117,5.0,Braveheart (1995)
22,6,141,4.0,117,4.0,"Birdcage, The (1996)"
23,6,150,4.0,117,4.0,Apollo 13 (1995)
40,6,252,4.0,117,4.0,I.Q. (1994)


## Item-based Similarity 

### Data Transformation

In [19]:
# Reshapes the long ratings table into a movie x user matrix:
# one row per movieId, one column per userId, cell value = rating.
movies_users_pivot = pd.pivot(
    ratings,
    index="movieId",
    columns="userId",
    values="rating",
)

In [20]:
# Displays the first 10 rows (movies) of the movie x user matrix;
# cells are empty (missing) where the user has not rated the movie
display(movies_users_pivot.head(10))

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,
6,4.0,,,,,4.0,,,,,...,,3.0,4.0,3.0,,,,,,5.0
7,,,,,,4.0,,,,,...,,,,,,2.5,,,,
8,,,,,,3.0,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,3.0,,2.0,,,...,,3.0,,,,,,4.0,4.0,


In [21]:
# Shape of the movie x user matrix: (number of movies, number of users)
movies_users_pivot.shape

(9724, 610)

In [22]:
# Replaces every missing rating (movie not rated by the user) with 0
# so that the matrix can be passed to the distance computation
movies_users_pivot = movies_users_pivot.fillna(0)

In [23]:
# Cosine similarity between every pair of movies, obtained as
# 1 - cosine distance of their rating vectors.
# Rows and columns are both labelled with the movie ids.
movie_ids = movies_users_pivot.index

movie_similarity = pd.DataFrame(
    1 - pairwise_distances(movies_users_pivot, metric="cosine"),
    index=movie_ids,
    columns=movie_ids,
)

In [24]:
# Displays the first 10 rows of the movie-to-movie similarity matrix
# (the diagonal is still 1.0 at this point: each movie compared with itself)
display(movie_similarity.head(10))

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.410562,0.296917,0.035573,0.308762,0.376316,0.277491,0.131629,0.232586,0.395573,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.410562,1.0,0.282438,0.106415,0.287795,0.297009,0.228576,0.172498,0.044835,0.417693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.296917,0.282438,1.0,0.092406,0.417802,0.284257,0.402831,0.313434,0.30484,0.242954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.035573,0.106415,0.092406,1.0,0.188376,0.089685,0.275035,0.158022,0.0,0.095598,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.308762,0.287795,0.417802,0.188376,1.0,0.298969,0.474002,0.283523,0.335058,0.218061,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.376316,0.297009,0.284257,0.089685,0.298969,1.0,0.244105,0.147562,0.214088,0.386414,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.277491,0.228576,0.402831,0.275035,0.474002,0.244105,1.0,0.273757,0.162,0.238949,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.131629,0.172498,0.313434,0.158022,0.283523,0.147562,0.273757,1.0,0.0,0.189867,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.232586,0.044835,0.30484,0.0,0.335058,0.214088,0.162,0.0,1.0,0.048611,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.395573,0.417693,0.242954,0.095598,0.218061,0.386414,0.238949,0.189867,0.048611,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.072542


In [25]:
# Checks the shape of the similarity matrix.
# Both dimensions should equal the number of unique movies.
print(movie_similarity.shape)

(9724, 9724)


In [26]:
# The above matrix has a diagonal of all 1's, as a movie is fully similar to
# itself. The diagonal is set to 0 so that the most similar *other* movies can
# be found for a specific movie.
#
# The diagonal is zeroed on an explicit writable copy of the data and the
# DataFrame is rebuilt from it. Writing through `movie_similarity.values` relies
# on pandas handing out a writable view of its internal buffer; with
# Copy-on-Write enabled that array is read-only and np.fill_diagonal raises.
# Note: the copy temporarily doubles the memory used by this (large) matrix.
similarity_without_self = movie_similarity.to_numpy(copy=True)
np.fill_diagonal(similarity_without_self, 0)

movie_similarity = pd.DataFrame(
    similarity_without_self,
    index=movie_similarity.index,
    columns=movie_similarity.columns,
    copy=False,  # the array above is already a private copy
)

In [27]:
# Displays the first 10 rows of movie_similarity again,
# now with the diagonal set to 0
display(movie_similarity.head(10))

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.410562,0.296917,0.035573,0.308762,0.376316,0.277491,0.131629,0.232586,0.395573,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.410562,0.0,0.282438,0.106415,0.287795,0.297009,0.228576,0.172498,0.044835,0.417693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.296917,0.282438,0.0,0.092406,0.417802,0.284257,0.402831,0.313434,0.30484,0.242954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.035573,0.106415,0.092406,0.0,0.188376,0.089685,0.275035,0.158022,0.0,0.095598,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.308762,0.287795,0.417802,0.188376,0.0,0.298969,0.474002,0.283523,0.335058,0.218061,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.376316,0.297009,0.284257,0.089685,0.298969,0.0,0.244105,0.147562,0.214088,0.386414,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.277491,0.228576,0.402831,0.275035,0.474002,0.244105,0.0,0.273757,0.162,0.238949,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.131629,0.172498,0.313434,0.158022,0.283523,0.147562,0.273757,0.0,0.0,0.189867,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.232586,0.044835,0.30484,0.0,0.335058,0.214088,0.162,0.0,0.0,0.048611,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.395573,0.417693,0.242954,0.095598,0.218061,0.386414,0.238949,0.189867,0.048611,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.072542


### Finding Similarity between Movies

In [28]:
def get_similar_movies(movieId, topN = 5):
    """
    Gets movies similar to the specified one based on similar rating they have received from users

    Reads the notebook-level ``movie_similarity`` matrix and ``movies`` DataFrame.
    The specified movie itself is never part of the result, whether or not the
    diagonal of ``movie_similarity`` has been zeroed beforehand.

    Parameters
    ----------
    movieId: int
        Id of the movie similar movies to be searched for

    topN: int, default: 5
        Maximum number of similar movies to return

    Returns
    -------
    DataFrame:
        A DataFrame containing the most similar movies with Id, title and
        similarity score, ordered from most to least similar

    Raises
    ------
    KeyError
        If ``movieId`` is not present in the similarity matrix

    """

    if movieId not in movie_similarity.columns:
        raise KeyError(
            "movieId {!r} is not present in the similarity matrix".format(movieId))

    # Scores of every other movie against the requested one, best first.
    # The movie itself is dropped explicitly so the result does not depend on
    # another cell having zeroed the diagonal of the matrix.
    similar_movies = (
        movie_similarity[movieId]
        .drop(labels=movieId, errors="ignore")
        .sort_values(ascending=False)
        .head(topN)
        .rename("similarity")  # becomes the score column name after the merge
    )

    # Joins the movie DataFrame for movie title to be returned; the merge
    # follows the row order of `movies`, hence the final sort by score
    return movies.merge(
        similar_movies,
        how="inner",
        on="movieId").sort_values(["similarity"], ascending=False)
    

### Receiving Movie Recommendations

In [29]:
# Recommends the 5 movies most similar to "Godfather, The (1972)"
# (movie Id 858) for someone who has already watched it.

similar_to_godfather = get_similar_movies(movieId = 858, topN = 5)
display(similar_to_godfather)

Unnamed: 0,movieId,title,similarity
4,1221,"Godfather: Part II, The (1974)",0.821773
3,1213,Goodfellas (1990),0.664841
2,1193,One Flew Over the Cuckoo's Nest (1975),0.620536
0,260,Star Wars: Episode IV - A New Hope (1977),0.595317
1,608,Fargo (1996),0.588614


In [30]:
# Recommends the 5 movies most similar to
# "Dumb & Dumber (Dumb and Dumber) (1994)" (movie Id 231)
# for someone who has already watched it.

similar_to_dumb_and_dumber = get_similar_movies(movieId = 231, topN = 5)
display(similar_to_dumb_and_dumber)

Unnamed: 0,movieId,title,similarity
2,344,Ace Ventura: Pet Detective (1994),0.670194
1,165,Die Hard: With a Vengeance (1995),0.575156
3,380,True Lies (1994),0.550168
0,153,Batman Forever (1995),0.544826
4,588,Aladdin (1992),0.539576
