In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the movie dataset from a CSV file (path is relative to the working
# directory) and preview the first rows.
df = pd.read_csv('movies_tmdb_popular.csv')
df.head()

Unnamed: 0,title,overview,original_lang,rel_date,popularity,vote_count,vote_average
0,John Wick,Ex-hitman John Wick comes out of retirement to...,en,2014-10-22,818.916,12063,7.2
1,Ad Astra,"The near future, a time when both hope and har...",en,2019-09-17,427.078,2778,6.0
2,Bad Boys for Life,Marcus and Mike are forced to confront new thr...,en,2020-01-15,372.268,2062,6.9
3,The Lion King,"Simba idolizes his father, King Mufasa, and ta...",en,2019-07-12,366.904,5043,7.1
4,Jurassic World: Fallen Kingdom,Three years after the demise of Jurassic World...,en,2018-06-06,355.501,7153,6.5


In [3]:
df.describe()

Unnamed: 0,popularity,vote_count,vote_average
count,10000.0,10000.0,10000.0
mean,14.82711,1010.2634,6.32993
std,15.819246,1987.076099,1.285163
min,8.862,0.0,0.0
25%,10.23975,136.0,5.8
50%,11.912,316.0,6.5
75%,15.14025,916.25,7.1
max,818.916,25060.0,10.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title          10000 non-null  object 
 1   overview       9979 non-null   object 
 2   original_lang  10000 non-null  object 
 3   rel_date       9993 non-null   object 
 4   popularity     10000 non-null  float64
 5   vote_count     10000 non-null  int64  
 6   vote_average   10000 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 547.0+ KB


In [5]:
# C: mean of `vote_average` over every movie in the dataset; it is the
# default `C` argument of `weighted_rating()`.
C = df['vote_average'].mean()
print(C)

6.329930000000007


In [6]:
# m: the 90th-percentile value of `vote_count`; used as the minimum number of
# votes a movie needs to be ranked.
m = df['vote_count'].quantile(0.90)
print(m)

2533.1000000000004


In [7]:
# Keep only the movies with at least `m` votes. Filtering first and copying
# afterwards duplicates just the selected rows (instead of the whole frame)
# and still yields an independent DataFrame that is safe to add columns to.
df_copy = df.loc[df['vote_count'] >= m].copy()
df_copy.shape

(1001, 7)

In [8]:
df.shape

(10000, 7)

In [9]:
# Function that computes the weighted rating of each movie
def weighted_rating(x, m=m, C=C):
    """Return the IMDB-style weighted rating for ``x``.

    The score blends the item's own average rating with ``C``, weighted by
    how many votes the item has relative to ``m``::

        score = v / (v + m) * R  +  m / (m + v) * C

    Parameters
    ----------
    x : mapping-like
        Object supporting ``x['vote_count']`` (v) and ``x['vote_average']``
        (R), e.g. a DataFrame row when used with ``apply(axis=1)``.
    m : number
        Vote-count weight; defaults to the module-level ``m`` as it was when
        this function was defined.
    C : number
        Rating the score is pulled towards for low vote counts; defaults to
        the module-level ``C`` as it was when this function was defined.

    Returns
    -------
    The weighted rating, of whatever numeric type the arithmetic produces.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    # Share contributed by the item's own average rating
    own_term = votes / (votes + m) * rating
    # Share contributed by the reference rating C
    prior_term = m / (m + votes) * C
    return own_term + prior_term

In [10]:
# Define a new feature 'score'. `weighted_rating()` consists only of column
# lookups and element-wise arithmetic, so it is evaluated on the whole frame
# in one vectorized call instead of row by row via `DataFrame.apply(axis=1)`.
df_copy['score'] = weighted_rating(df_copy)

In [11]:
# Order the qualifying movies from highest to lowest weighted score
df_copy = df_copy.sort_values(by='score', ascending=False)

In [12]:
# Display the first 20 rows of the frame, restricted to the columns relevant
# to the ranking
df_copy.head(20)[['title', 'vote_count', 'vote_average', 'score']]

Unnamed: 0,title,vote_count,vote_average,score
88,The Shawshank Redemption,15423,8.7,8.36565
153,The Godfather,11715,8.7,8.278637
155,Pulp Fiction,17926,8.5,8.231317
148,The Dark Knight,21489,8.4,8.181714
210,Fight Club,18601,8.4,8.151885
247,Forrest Gump,17282,8.4,8.135369
135,Inception,25060,8.3,8.119144
189,Schindler's List,9325,8.6,8.115073
78,The Lord of the Rings: The Return of the King,14940,8.4,8.099899
102,Interstellar,21493,8.3,8.092293


In [13]:
df.overview.head(5)

0    Ex-hitman John Wick comes out of retirement to...
1    The near future, a time when both hope and har...
2    Marcus and Mike are forced to confront new thr...
3    Simba idolizes his father, King Mufasa, and ta...
4    Three years after the demise of Jurassic World...
Name: overview, dtype: object

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace missing overviews with empty strings so that every row passed to the
# vectorizer is text.
df['overview'] = df['overview'].fillna('')

# Build TF-IDF vectors of the overviews with English stop words removed; the
# resulting matrix has one row per movie.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['overview'])
tfidf_matrix.shape

(10000, 28972)

In [15]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise similarity between all movie overviews. TfidfVectorizer is used
# with its default normalisation (L2 per the scikit-learn docs), so the plain
# dot product computed by `linear_kernel` equals the cosine similarity.
# With a single argument the kernel is computed between the matrix and itself.
cosine_sim = linear_kernel(tfidf_matrix)

In [16]:
cosine_sim.shape

(10000, 10000)

In [17]:
# Construct a reverse map from movie title to row index.
# Titles are not guaranteed to be unique. `Series.drop_duplicates()` would
# de-duplicate the *values* (row indices, which are already unique) and leave
# repeated titles in place, so de-duplicate on the index labels instead and
# keep the first row of each title. `indices[title]` is then always a single
# integer.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [18]:
indices[:10]

title
John Wick                                                               0
Ad Astra                                                                1
Bad Boys for Life                                                       2
The Lion King                                                           3
Jurassic World: Fallen Kingdom                                          4
Bloodshot                                                               5
Motherless Brooklyn                                                     6
Transformers: The Last Knight                                           7
Birds of Prey (and the Fantabulous Emancipation of One Harley Quinn)    8
Barbie and the Diamond Castle                                           9
dtype: int64

In [21]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title, looked up in the module-level ``indices`` map.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix in which row ``i`` holds the similarity of
        movie ``i`` to every movie. Defaults to the module-level
        ``cosine_sim`` as it was when this function was defined.

    Returns
    -------
    pandas.Series
        Up to 10 entries of ``df['title']``, most similar first. The queried
        movie itself is never part of the result.

    Raises
    ------
    KeyError
        Propagated from the ``indices`` lookup when ``title`` is unknown.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]
    # A title shared by several movies yields a Series of row indices rather
    # than a scalar; fall back to the first matching movie in that case.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Get the pairwise similarity scores of all movies with that movie
    scores = np.asarray(cosine_sim[idx])

    # Rank every movie by descending similarity. The stable sort keeps tied
    # scores in row order, matching the behaviour of a stable `sorted()`.
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the queried movie explicitly instead of assuming it is ranked
    # first: a movie with an empty overview has zero self-similarity, and a
    # movie with an identical overview may tie with it.
    ranked = ranked[ranked != idx]

    # Return the top 10 most similar movies
    return df['title'].iloc[ranked[:10]]

In [22]:
get_recommendations('Avengers: Endgame')

57                 Avengers: Infinity War
5701    Next Avengers: Heroes of Tomorrow
131            Captain America: Civil War
92                         Captain Marvel
1212                    Super Mario Bros.
8733      My Little Pony: Equestria Girls
8538                              Haunter
485                  Thor: The Dark World
4270            The NeverEnding Story III
8120            Bill & Ted Face the Music
Name: title, dtype: object

In [24]:
get_recommendations('Forrest Gump')

8059                           Ca$h
1732          Mr. Peabody & Sherman
9191                The Borderlands
739             Catch Me If You Can
3433              It Might Get Loud
8203                  Another Earth
7228                  The Wrong Man
139                            Coco
9019                      The Alibi
6047    Love in the Time of Cholera
Name: title, dtype: object