In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
import numpy as np

In [2]:
# Load Movies Metadata
metadata = pd.read_csv('movies_metadata.csv', low_memory=False)

In [3]:
metadata

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,...,,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0
45462,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,2011-11-17,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0
45463,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,2003-08-01,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0
45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,1917-10-21,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0


In [4]:
metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [5]:
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


# Simple Recommenders

In [6]:
# Calculate the mean of vote average column
C = metadata['vote_average'].mean()
print(C)

5.618207215134185


# The average rating of a movie on IMDB is around 5.6 on a scale of 10.

# Find the value below which 90% of the vote counts fall.

In [7]:
m = metadata['vote_count'].quantile(0.90)
print(m)

160.0


In [8]:
metadata['vote_count'].value_counts()

vote_count
1.0       3264
2.0       3132
0.0       2899
3.0       2787
4.0       2480
          ... 
2755.0       1
1187.0       1
4200.0       1
3322.0       1
2712.0       1
Name: count, Length: 1820, dtype: int64

In [9]:
# Filter out all qualified movies into a new DataFrame
q_movies = metadata.copy().loc[metadata['vote_count'] >= m]
q_movies.shape

(4555, 24)

In [10]:
q_movies

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
5,False,,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,1995-12-15,187436818.0,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0
8,False,,35000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,9091,tt0114576,en,Sudden Death,International action superstar Jean Claude Van...,...,1995-12-22,64350171.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Terror goes into overtime.,Sudden Death,False,5.5,174.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45177,False,"{'id': 442352, 'name': 'Brice Collection', 'po...",0,"[{'id': 35, 'name': 'Comedy'}]",,375798,tt5029602,fr,Brice 3,"Brice is back. The world has changed, but not ...",...,2016-10-19,0.0,95.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,,Brice 3,False,4.3,160.0
45204,False,,0,"[{'id': 35, 'name': 'Comedy'}]",,417870,tt3564472,en,Girls Trip,Four girlfriends take a trip to New Orleans fo...,...,2017-07-21,0.0,122.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"""Forgive us in advance for this wild weekend""",Girls Trip,False,7.1,393.0
45258,False,"{'id': 466463, 'name': 'Descendants Collection...",0,"[{'id': 10770, 'name': 'TV Movie'}, {'id': 107...",,417320,tt5117876,en,Descendants 2,When the pressure to be royal becomes too much...,...,2017-07-21,0.0,111.0,"[{'iso_639_1': 'da', 'name': 'Dansk'}]",Released,Long live evil.,Descendants 2,False,7.5,171.0
45265,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,265189,tt2121382,sv,Turist,"While holidaying in the French Alps, a Swedish...",...,2014-08-15,1359497.0,118.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,,Force Majeure,False,6.8,255.0


In [11]:
# Function that computes the weighted rating of each movie
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)


In [12]:
# Define a new feature 'score' and calculate its value with `weighted_rating()` function 
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)

In [13]:
q_movies

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,score
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,7.640253
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,6.820293
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,5.660700
5,False,,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,187436818.0,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0,7.537201
8,False,,35000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,9091,tt0114576,en,Sudden Death,International action superstar Jean Claude Van...,...,64350171.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Terror goes into overtime.,Sudden Death,False,5.5,174.0,5.556626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45177,False,"{'id': 442352, 'name': 'Brice Collection', 'po...",0,"[{'id': 35, 'name': 'Comedy'}]",,375798,tt5029602,fr,Brice 3,"Brice is back. The world has changed, but not ...",...,0.0,95.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,,Brice 3,False,4.3,160.0,4.959104
45204,False,,0,"[{'id': 35, 'name': 'Comedy'}]",,417870,tt3564472,en,Girls Trip,Four girlfriends take a trip to New Orleans fo...,...,0.0,122.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"""Forgive us in advance for this wild weekend""",Girls Trip,False,7.1,393.0,6.671272
45258,False,"{'id': 466463, 'name': 'Descendants Collection...",0,"[{'id': 10770, 'name': 'TV Movie'}, {'id': 107...",,417320,tt5117876,en,Descendants 2,When the pressure to be royal becomes too much...,...,0.0,111.0,"[{'iso_639_1': 'da', 'name': 'Dansk'}]",Released,Long live evil.,Descendants 2,False,7.5,171.0,6.590372
45265,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,265189,tt2121382,sv,Turist,"While holidaying in the French Alps, a Swedish...",...,1359497.0,118.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,,Force Majeure,False,6.8,255.0,6.344369


# The source value takes into account the number of voters and the average votes. For the movie

In [14]:
#Sort movies based on a score calculated above
q_movies = q_movies.sort_values('score', ascending=False)

In [15]:
#Print the top 15 movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(20)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


# Here we can only recommend these movies which got more sources to all the users

# Content-Based Recommender

In [16]:
#  build a system that recommends movies that are similar to a particular movie.

In [17]:
metadata['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

# We want to calculate the similarity between texts, so we need to convert them into vectors and calculate the cosine similarity

In [18]:
tfidf = TfidfVectorizer(stop_words='english') # Remove all english stop words such as 'the', 'a'

In [19]:
#Replace NaN with an empty string
metadata['overview'] = metadata['overview'].fillna('')

In [20]:
#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

In [21]:
#Output the shape of tfidf_matrix
tfidf_matrix.shape

(45466, 75827)

In [22]:
tfidf.get_feature_names_out()[5000:5010]

array(['avails', 'avaks', 'avalanche', 'avalanches', 'avallone', 'avalon',
       'avant', 'avanthika', 'avanti', 'avaracious'], dtype=object)

In [23]:
# use sklearn's linear_kernel() instead of cosine_similarities() since it is faster.

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# each movie overview cosine similarity score with every other movie overview

In [24]:
cosine_sim.shape

(45466, 45466)

In [25]:
len(cosine_sim[1])

45466

In [26]:
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()

In [27]:
indices

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                               ...  
Subdue                         45461
Century of Birthing            45462
Betrayal                       45463
Satan Triumphant               45464
Queerama                       45465
Length: 45466, dtype: int64

In [30]:
metadata['title']

0                          Toy Story
1                            Jumanji
2                   Grumpier Old Men
3                  Waiting to Exhale
4        Father of the Bride Part II
                    ...             
45461                         Subdue
45462            Century of Birthing
45463                       Betrayal
45464               Satan Triumphant
45465                       Queerama
Name: title, Length: 45466, dtype: object

In [28]:
# Function that takes in movie title as input and outputs most similar movies to it 
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]


In [29]:
get_recommendations('The Dark Knight Rises')

12481                                      The Dark Knight
150                                         Batman Forever
1328                                        Batman Returns
15511                           Batman: Under the Red Hood
585                                                 Batman
21194    Batman Unmasked: The Psychology of the Dark Kn...
9230                    Batman Beyond: Return of the Joker
18035                                     Batman: Year One
19792              Batman: The Dark Knight Returns, Part 1
3095                          Batman: Mask of the Phantasm
Name: title, dtype: object

In [31]:
get_recommendations('Toy Story')

15348                                     Toy Story 3
2997                                      Toy Story 2
10301                          The 40 Year Old Virgin
24523                                       Small Fry
23843                     Andy Hardy's Blonde Trouble
29202                                      Hot Splash
43427                Andy Kaufman Plays Carnegie Hall
38476    Superstar: The Life and Times of Andy Warhol
42721    Andy Peters: Exclamation Mark Question Point
8327                                        The Champ
Name: title, dtype: object

In [32]:
get_recommendations('Jumanji')

21633         Table No. 21
45253                 Quiz
41573         Snowed Under
35509             The Mend
44376    Liar Game: Reborn
17223       The Dark Angel
8801               Quintet
6166             Brainscan
30981         Turkey Shoot
9503             Word Wars
Name: title, dtype: object

# The quality of your recommender would be increased with the usage of better metadata and by capturing more of the finer details.

In [64]:
"""
" We can build a collaborative filtering recommender system by having data about users and finding similar users who have similar interests "
Conclusion

After finishing this tutorial, I have learned that:
-The three types of recommender systems are:
Collaborative Filtering
Content-Based Filtering
simpler recommender systems

-How to build a simpler recommender system and content-based recommender system:

This system returns the top movies based on specific computed scores and recommends them to users.
The content-based recommender returns many movies that are similar to a specific movie.

-We must convert the text into vectors to compute the similarity between them.
"""

'\nConclusion\n\nAfter finishing this tutorial, I have learned that:\n-The three types of recommender systems are:\nCollaborative Filtering\nContent-Based Filtering\nsimpler recommender systems\n\n-How to build a simpler recommender system and content-based recommender system:\n\nThe simple recommender returns the top number of movies based on specific computed scores for all users.\nThe content-based recommender returns many movies that are similar to a specific movie.\n\n-We must convert the text into vectors to compute the similarity between them.\n'