In [47]:
#Importing the necessary libraries
import pandas as pd
import numpy as np
from warnings import filterwarnings
filterwarnings("ignore")

In [10]:
# Reading the datasets
# Both paths are relative, so the CSV files must be in the notebook's working directory.
df1 = pd.read_csv('tmdb_5000_movies.csv')   # movie metadata table
df2 = pd.read_csv('tmdb_5000_credits.csv')   # credits table (movie_id, title, cast, crew)

In [11]:
# Summarize df1: column names, dtypes and non-null counts
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [12]:
# Summarize df2: column names, dtypes and non-null counts
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


In [13]:
# Inner-join the two tables: a row is kept when df1.id equals df2.movie_id
movies = df1.merge(df2, how = 'inner', left_on = 'id', right_on = 'movie_id')

In [14]:
# Summarize the merged frame: row count, columns, dtypes and non-null counts
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [15]:
# Keep only the id, original title, genres, overview and crew columns
columns = ['id','original_title','genres','overview','crew']
# .copy() makes `movies` an independent frame, so the in-place rename and the
# column assignments further down modify it directly rather than a slice of the merged frame.
movies = movies[columns].copy()

In [16]:
# Give the retained columns their presentation names (old name -> new name)
new_names = {
    'id': 'Movie_Id',
    'original_title': 'Title',
    'genres': 'Genres',
    'overview': 'Storyline',
    'crew': 'Director',
}
movies = movies.rename(columns = new_names)

In [17]:
# Check the renamed five-column frame
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4803 entries, 0 to 4802
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Movie_Id   4803 non-null   int64 
 1   Title      4803 non-null   object
 2   Genres     4803 non-null   object
 3   Storyline  4800 non-null   object
 4   Director   4803 non-null   object
dtypes: int64(1), object(4)
memory usage: 225.1+ KB


In [18]:
# Peek at the first rows; Genres and Director still hold stringified lists of dicts here
movies.head(3)

Unnamed: 0,Movie_Id,Title,Genres,Storyline,Director
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","In the 22nd century, a paraplegic Marine is di...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","Captain Barbossa, long believed to be dead, ha...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",A cryptic message from Bond’s past sends him o...,"[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


In [27]:
# Extracting the list of genres of each movie
def get_genres(obj):
    """Return the genre names held in a stringified list of genre records.

    Parameters
    ----------
    obj : str
        Python-literal text of a list of dicts, each having a 'name' key.

    Returns
    -------
    list
        The 'name' value of every dict, in the order they appear.
    """
    import ast
    records = ast.literal_eval(obj)  # turn the text into real Python objects
    return [record['name'] for record in records]

In [28]:
# One list of genre names per row of `movies`, in row order
genre_list = [get_genres(raw_genres) for raw_genres in movies['Genres']]

In [32]:
# Replacing the genre column with the values in genre_list
# genre_list is in row order, so the Series is given the frame's own index.
# A default 0..n-1 index would be matched by label and could misplace values
# or insert NaN if the frame's index were anything else.
movies['Genres'] = pd.Series(genre_list, index = movies.index)

In [33]:
# Extracting the director of a movie
import ast
def get_director(obj):
    """Return the names of all crew members whose job is 'Director'.

    Parameters
    ----------
    obj : str
        Python-literal text of a list of dicts, each having 'job' and 'name' keys.

    Returns
    -------
    list
        Names of the matching crew members, in the order they appear
        (empty when no record has job == 'Director').
    """
    crew = ast.literal_eval(obj)  # turn the text into real Python objects
    return [member['name'] for member in crew if member['job'] == 'Director']

In [34]:
# One list of director names per row of `movies`, in row order
directors_list = [get_director(raw_crew) for raw_crew in movies['Director']]

In [35]:
# Replacing the director column with the values in directors_list
# directors_list is in row order, so the Series is given the frame's own index.
# A default 0..n-1 index would be matched by label and could misplace values
# or insert NaN if the frame's index were anything else.
movies['Director'] = pd.Series(directors_list, index = movies.index)

In [36]:
# Determining the unique genres
# dict.fromkeys drops repeated names while keeping first-seen order
unique_genres = list(dict.fromkeys(
    genre for movie_genre in genre_list for genre in movie_genre
))

In [37]:
# Determining the unique movies
unique_movies = [] # Distinct titles, in first-seen order
ug = [] # ug[k] is the genre list of unique_movies[k]
seen_titles = set() # Constant-time membership test instead of rescanning unique_movies

# Walk titles and genre lists together by position; only the first row of a
# repeated title contributes its genres.
for movie, genre in zip(movies['Title'], movies['Genres']):
    if movie not in seen_titles:
        seen_titles.add(movie)
        unique_movies.append(movie)
        ug.append(genre)

In [38]:
# Creating the term frequency matrix using a DataFrame
# Rows are the unique movies (documents) and columns are the genres (words).
# Passing the scalar 0 to the constructor yields integer columns straight away,
# instead of an empty object-dtype frame that is then filled in place.
term_frequency = pd.DataFrame(0, index = unique_movies, columns = unique_genres)

In [40]:
# Preview the freshly created matrix (all cells are 0 at this point)
term_frequency.head(3)

Unnamed: 0,Action,Adventure,Fantasy,Science Fiction,Crime,Drama,Thriller,Animation,Family,Western,Comedy,Romance,Horror,Mystery,History,War,Music,Documentary,Foreign,TV Movie
Avatar,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Pirates of the Caribbean: At World's End,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Spectre,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [41]:
# Filling the term frequency matrix
# unique_movies and ug are parallel lists, so zip pairs each title with its genres.
for title, movie_genres in zip(unique_movies, ug):
    for genre in movie_genres:
        # One-step label access writes into the frame itself. The chained form
        # frame[col][row] = value assigns to an intermediate object and is not
        # guaranteed to reach the frame.
        term_frequency.at[title, genre] = 1

In [42]:
# Preview the matrix after marking each movie's genres with 1
term_frequency.head(3)

Unnamed: 0,Action,Adventure,Fantasy,Science Fiction,Crime,Drama,Thriller,Animation,Family,Western,Comedy,Romance,Horror,Mystery,History,War,Music,Documentary,Foreign,TV Movie
Avatar,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Pirates of the Caribbean: At World's End,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Spectre,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [43]:
# Calculating the inverse document frequency
n = len(term_frequency) # Number of movies (rows of the matrix)
df = list(term_frequency.sum(axis = 0)) # Column sums: one total per genre, in column order
idf = [np.log(n / genre_total) for genre_total in df] # idf = ln(n / df), one value per genre

In [46]:
# Converting the datatype to float to accommodate the normalized values of tf
term_frequency = term_frequency.astype(float)

In [18]:
# Normalizing the term frequency matrix
# Each cell belonging to one of a movie's genres becomes 1/sqrt(k), where k is
# the number of genres listed for that movie.
import math
for title, movie_genres in zip(unique_movies, ug):
    if not movie_genres:
        continue # No genres: nothing to set, and 1/sqrt(0) must not be evaluated
    weight = 1/math.sqrt(len(movie_genres)) # Same for every genre of this movie, so compute once
    for genre in movie_genres:
        # One-step label access writes into the frame itself. The chained form
        # frame[col][row] = value is not guaranteed to reach the frame.
        term_frequency.at[title, genre] = weight

In [19]:
# Preview the matrix after normalization
term_frequency.head(3)

Unnamed: 0,Action,Adventure,Fantasy,Science Fiction,Crime,Drama,Thriller,Animation,Family,Western,Comedy,Romance,Horror,Mystery,History,War,Music,Documentary,Foreign,TV Movie
Avatar,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pirates of the Caribbean: At World's End,0.57735,0.57735,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Spectre,0.57735,0.57735,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Creating the TF - IDF matrix
# idf holds one weight per column, in column order, so pair them up and scale.
# NOTE: this overwrites term_frequency, so running the cell again scales it again.
for genre_column, weight in zip(term_frequency.columns, idf):
    term_frequency[genre_column] = term_frequency[genre_column] * weight

In [22]:
# Preview the matrix after scaling every column by its idf value
term_frequency.head(3)

Unnamed: 0,Action,Adventure,Fantasy,Science Fiction,Crime,Drama,Thriller,Animation,Family,Western,Comedy,Romance,Horror,Mystery,History,War,Music,Documentary,Foreign,TV Movie
Avatar,0.712795,0.902907,1.213423,1.098092,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pirates of the Caribbean: At World's End,0.823065,1.042587,1.40114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Spectre,0.823065,1.042587,0.0,0.0,1.115826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Implementing Cosine Similarity
import math
def cosine_similarity(movie1,movie2):
    """Cosine of the angle between two equal-length numeric vectors.

    Parameters
    ----------
    movie1, movie2 : sequence of numbers
        Anything numpy can convert to a 1-D float array (list, ndarray, Series).

    Returns
    -------
    float
        dot(movie1, movie2) / (|movie1| * |movie2|).
        0.0 when either vector has zero length, because the ratio would
        otherwise be 0/0 = NaN, and NaN scores cannot be ordered reliably
        when the results are sorted.
    """
    v1 = np.asarray(movie1, dtype = float)
    v2 = np.asarray(movie2, dtype = float)
    norm_product = math.sqrt(np.dot(v1, v1)) * math.sqrt(np.dot(v2, v2))
    if norm_product == 0:
        return 0.0
    return np.dot(v1, v2) / norm_product

In [26]:
# Sorting criteria for the movie is based on similarity score
def sorting_criteria(similarity_score):
    """Sort key: return the element at position 1 of the given entry.

    Used on (movie name, similarity score) pairs so that sorting orders by score.
    """
    score = similarity_score[1]
    return score

In [43]:
# Recommendation function 
def recommend_similar_to(movie):
    """Rank every other movie by cosine similarity to `movie`.

    Parameters
    ----------
    movie : str
        Title to look up in `unique_movies` (the row labels of `term_frequency`).

    Returns
    -------
    list of (title, score) tuples
        All other titles, sorted by score from highest to lowest.
        Empty when `movie` is not in the dataset; a message is printed in that case.
    """
    if movie not in unique_movies: # Guard: nothing to compare against
        print("Movie not in the dataset.\nCannot recommend a similar movie.")
        return []

    print("Recommended movies for you: \n")
    # The chosen movie's row is the same for every comparison, so fetch it once
    target_vector = term_frequency.loc[movie]
    similarity_score = [
        (other, cosine_similarity(target_vector, term_frequency.loc[other]))
        for other in unique_movies
        if other != movie # Skip the movie itself
    ]
    similarity_score.sort(key = sorting_criteria, reverse = True) # Highest score first
    return similarity_score

In [46]:
# Ask for the title to base recommendations on; it is looked up exactly as typed
movie = input("Enter a movie name: ")

Enter a movie name: Avatar


In [47]:
recommended_movies = recommend_similar_to(movie)[:5] # Top 5 (title, similarity score) pairs
# The loop variable is deliberately not called `movie`: reusing that name would
# overwrite the title the user entered and break a re-run of this cell.
for rank, recommendation in enumerate(recommended_movies, start = 1):
    print(rank,')',recommendation[0]) # Prints only the movie name (ignores the similarity score)

Recommended movies for you: 

1 ) Superman Returns
2 ) Man of Steel
3 ) X-Men: Days of Future Past
4 ) Jupiter Ascending
5 ) The Wolverine
