In [1]:
import pandas as pd

# loading datasets
df_cred = pd.read_csv("tmdb_5000_credits.csv")
df_mov = pd.read_csv("tmdb_5000_movies.csv")

# check the size of the datasets
df_cred.shape, df_mov.shape

((4803, 4), (4803, 20))

In [2]:
'''
to verify the consistency and synchronocity of values before merging the 2 
Dataframes together.
'''
df_cred = df_cred.iloc[:2500,:]
df_mov = df_mov.iloc[:2500, :]
(df_cred.movie_id != df_mov.id).any().sum()

0

In [3]:
# merging and renaming the columns as follows
# rename column name
df_cred.rename(columns = {'movie_id':'id'}, inplace = True)

# merge the 2 dataframes and store in df
df = df_cred.merge(df_mov, on = 'id')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    2500 non-null   int64  
 1   title_x               2500 non-null   object 
 2   cast                  2500 non-null   object 
 3   crew                  2500 non-null   object 
 4   budget                2500 non-null   int64  
 5   genres                2500 non-null   object 
 6   homepage              1027 non-null   object 
 7   keywords              2500 non-null   object 
 8   original_language     2500 non-null   object 
 9   original_title        2500 non-null   object 
 10  overview              2500 non-null   object 
 11  popularity            2500 non-null   float64
 12  production_companies  2500 non-null   object 
 13  production_countries  2500 non-null   object 
 14  release_date          2500 non-null   object 
 15  revenue              

In [4]:
''' 
Data CLeaning: 
generating a comprehensive corpus of keywords that encapsulate the essence of
each movie. it is done by eliminating unwanted and null columns
'''
# drop null overviews
df.dropna(subset = ['overview'], inplace = True)

# filter out the target columns
df = df[['id', 'title_x', 'genres', 'overview', 'cast', 'crew']]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        2500 non-null   int64 
 1   title_x   2500 non-null   object
 2   genres    2500 non-null   object
 3   overview  2500 non-null   object
 4   cast      2500 non-null   object
 5   crew      2500 non-null   object
dtypes: int64(1), object(5)
memory usage: 117.3+ KB


In [5]:
# analysing the data
''' 
Observations:
1. The overview and title columns contain simple string values, which makes them straightforward to handle.
2. the genres, cast, and crew columns follow a similar structure, as they consist of lists of dictionaries (approach by taking top 3 cast members)
'''
df

Unnamed: 0,id,title_x,genres,overview,cast,crew
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","Captain Barbossa, long believed to be dead, ha...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",A cryptic message from Bond’s past sends him o...,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",Following the death of District Attorney Harve...,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","John Carter is a war-weary, former military ca...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."
...,...,...,...,...,...,...
2495,68727,Trance,"[{""id"": 53, ""name"": ""Thriller""}, {""id"": 80, ""n...","A fine art auctioneer mixed up with a gang, jo...","[{""cast_id"": 1, ""character"": ""Simon"", ""credit_...","[{""credit_id"": ""52fe47a1c3a368484e0d195f"", ""de..."
2496,12657,Soul Plane,"[{""id"": 10749, ""name"": ""Romance""}, {""id"": 35, ...",Following a ridiculously awful flight that lea...,"[{""cast_id"": 1, ""character"": ""Mr. Hunkee"", ""cr...","[{""credit_id"": ""52fe45109251416c7504a7c3"", ""de..."
2497,8265,Welcome to the Sticks,"[{""id"": 35, ""name"": ""Comedy""}]",Although living a comfortable life in Salon-de...,"[{""cast_id"": 2, ""character"": ""Philippe Abrams""...","[{""credit_id"": ""52fe4499c3a36847f809f3ef"", ""de..."
2498,12410,Good,"[{""id"": 18, ""name"": ""Drama""}]",The rise of national socialism in Germany shou...,"[{""cast_id"": 2, ""character"": ""Halder"", ""credit...","[{""credit_id"": ""52fe44dd9251416c75043a39"", ""de..."


In [6]:
# genres
print(df.genres[0])
print(' '.join([i['name'] for i in eval(df.genres[0])]))
# taking top 3 cast
print(' '.join([i['name'] for i in eval(df.cast[0])[:3]]))
# taking crew (director & producer)
print(' '.join(list(set([i['name'] for i in eval(df.crew[0]) if i['job']=='Director' or i['job']=='Producer']))))

[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]
Action Adventure Fantasy Science Fiction
Sam Worthington Zoe Saldana Sigourney Weaver
James Cameron Jon Landau


In [7]:

# functions to generate corpus
def generate_corpus(overview, genre, cast, crew):
    corpus = ""
    genre = ' '.join([i['name'] for i in eval(genre)])
    cast = ' '.join([i['name'] for i in eval(cast)[:5]])
    crew = ' '.join(list(set([i['name'] for i in eval(crew) if i['job']=='Director' or i['job']=='Producer'])))
    corpus += overview+" "+genre+" "+cast+" "+crew
    return corpus

corpus = []
for i in range(len(df)):
    corpus.append(generate_corpus(df.iloc[i].overview, df.iloc[i].genres, df.iloc[i].cast, df.iloc[i].crew))

print(len(corpus))
corpus[0]

2500


'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy Science Fiction Sam Worthington Zoe Saldana Sigourney Weaver Stephen Lang Michelle Rodriguez James Cameron Jon Landau'

In [8]:
# rename the column
df.rename(columns = {'title_x':'title'}, inplace = True)
# drop old columns
df.drop(columns = ['genres', 'overview', 'cast', 'crew'], inplace = True)
# add corpus
df['corpus'] = corpus
df

Unnamed: 0,id,title,corpus
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."
...,...,...,...
2495,68727,Trance,"A fine art auctioneer mixed up with a gang, jo..."
2496,12657,Soul Plane,Following a ridiculously awful flight that lea...
2497,8265,Welcome to the Sticks,Although living a comfortable life in Salon-de...
2498,12410,Good,The rise of national socialism in Germany shou...


In [9]:
# Text Representation and text similarity
''' 
convert textual information into a mathematical form by representing them as vectors and then decide the method for comparing similarities or differences
'''
''' 
utilizing Bag of Words (BoW)
utilizing TF-IDF for finding relative importance of words in document in order to reduce the size of the array by converting similar words as one ex eat, eating, eats as eat generating a feature matrix
'''
# using the cosine similarity for quantifying the similarity between vector representations

from sklearn.feature_extraction.text import TfidfVectorizer

# initialize object and remove stopwords
tfidf = TfidfVectorizer(stop_words = 'english')
tfidf_matrix = tfidf.fit_transform(df['corpus'])

# compare shapes
print(df.shape)
print(tfidf_matrix.shape)

(2500, 3)
(2500, 20406)


In [10]:
# generating cosine-similarity matrix via linear kernel
from sklearn.metrics.pairwise import linear_kernel

# compute the similarity matrix
cos_mat = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cos_mat.shape)

''' 
The diagonal elements of the matrix should be 1 since each movie is being compared to itself. To verify this, if we sum up all the diagonal elements of the matrix, it should yield a value of 4800.
'''
diag = 0
for i in range(len(cos_mat)):
    diag += cos_mat[i][i]
print(diag)

(2500, 2500)
2500.0


In [11]:
''' 
When the user provides a movie name, the model will locate the corresponding index of the movie in our DataFrame. We can then use this index to retrieve the same index from the similarity matrix. As the DataFrame and the cosine matrix are aligned, this step yields an array containing the cosine similarity scores of that movie with all other movies in the database.

However, the array is not sorted in any particular order, and we want to showcase the most similar movies. To achieve this, we need to sort the array in descending order. The first element will always correspond to the movie itself, with a similarity score of 1, followed by the other movies in descending order of similarity. Here lies a challenge: Sorting the array will disrupt the original order, making it difficult to fetch the movie titles from the database.

To overcome this, we can store the movie index and similarity score as tuples. Then, we can perform the sorting based on the score alone while keeping the index intact. Subsequently, we can retrieve the movie details using the index from the tuple, ensuring we maintain the correct movie-title association. This approach allows us to obtain the desired similarity rankings while preserving the necessary information for fetching movie details. We can also pass a parameter n for slicing i.e. to fetch top n similar movies.
'''
def get_rec(movie, n):
    # get index from dataframe
    index = df[df['title'] == movie].index[0]
    # sort top n similar movies
    sim_movies = sorted(list(enumerate(cos_mat[index])), reverse=True, key=lambda x: x[1])
    # extracting names form dataframe and return the movie names
    rec = []
    for i in sim_movies[1: n+1]:
        rec.append(df.iloc[i[0]].title)
    return rec

# testing the function
print(get_rec("The Dark Knight", 3))
print(get_rec("Mission: Impossible", 3))

['The Dark Knight Rises', 'Batman Begins', 'Batman Returns']
['Mission: Impossible III', 'Mission: Impossible II', 'Mission: Impossible - Ghost Protocol']


In [12]:
''' 
overcomming the following limitation: if a movie name is not present in the DataFrame, or what if a user wants to get recommendations based on cast and crews?
solution: Although the cast and crew details are not directly included in our DataFrame, we can still make use of them. Here’s where TF-IDF comes to the rescue. By applying TF-IDF transformation to the keywords or tags, we can convert them into vectors of the same length as our cosine matrix.
'''
from sklearn.metrics.pairwise import cosine_similarity
def get_keywords_rec(keywords, n):
    keywords = keywords.split()
    keywords = " ".join(keywords)
    # transform the string to vector representation
    key_tfidf = tfidf.transform([keywords])
    # compute the cosine-similarity
    result = cosine_similarity(key_tfidf, tfidf_matrix)
    # sort the top n similar movies
    sim_key_movies = sorted(list(enumerate(result[0])), reverse=True, key=lambda x: x[1])
    # extract names from the dataframe and return the movie name
    rec = []
    for i in sim_key_movies[1: n+1]:
        rec.append(df.iloc[i[0]].title)
    return rec

# testing
get_keywords_rec("Christopher Nolan", 4)

['Insomnia', 'Man of Steel', 'Batman Begins', 'Interstellar']

In [13]:
# utilizing the similarity matrix and DataFrame to save it as binary files using joblib
import joblib
joblib.dump(df, '../models/movie_db.df')
joblib.dump(cos_mat, '../models/cos_mat.mt')
joblib.dump(tfidf, '../models/vectorizer.tf')
joblib.dump(tfidf_matrix, '../models/tfidf_mat.tf')

['../models/tfidf_mat.tf']