# <u><b> Objective </b></u>
## <b>Build a recommender system which would recommend the 5 most similar movies to a movie query. </b>






Using TMDB 5000 Movie Dataset from Kaggle: https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata

In [None]:
# Basic Dependencies
import pandas as pd
import numpy as np
# For basic Ploting graph and charts
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
# For more plotting options
import seaborn as sns
# To just ignore all the warnings.
import warnings
warnings.filterwarnings("ignore")
# For machine learning modeling
import ast
from sklearn.feature_extraction.text import CountVectorizer
# to measure distance between vectors
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
import nltk
# nltk.download('all')

In [None]:
# setting up output limits
# The fully-qualified option names are used because a bare 'max_rows' pattern can match
# more than one pandas option (e.g. the Styler one) and then raises an OptionError.
pd.set_option('display.max_rows', None) # None prints every row of a dataframe in the output (no truncation)
pd.set_option('display.max_columns', None) # None prints every column of a dataframe in the output

In [None]:
# Loading file from drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# File directory paths (kept under their own names so the path strings are not
# overwritten by the DataFrames loaded from them)
movies_path = '/content/drive/MyDrive/EDA Projects/Kaggle EDA Projects/Movie Recommender System/tmdb_5000_movies.csv'
credits_path = '/content/drive/MyDrive/EDA Projects/Kaggle EDA Projects/Movie Recommender System/tmdb_5000_credits.csv'
# Load the TMDB movies and credits tables
movies = pd.read_csv(movies_path)
credits = pd.read_csv(credits_path)
# Shape of the movies data.
print(f'Movie dataset Diamentions: {movies.shape[0]} x {movies.shape[1]}')
# Shape of the credits data.
print(f'Credit dataset Diamentions: {credits.shape[0]} x {credits.shape[1]}')

Movie dataset Diamentions: 4803 x 20
Credit dataset Diamentions: 4803 x 4


In [None]:
print(movies.info())
print(credits.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [None]:
# Merging the two main dataframes on the shared 'title' column.
# NOTE(review): the merged frame has 4809 rows although each input has 4803 rows, so some
# titles match more than one row and get duplicated. Joining on the movie id
# (movies['id'] / credits['movie_id']) would presumably avoid this - TODO confirm.
movie_df = pd.merge(movies, credits, on ='title')
# Shape of the merged data.
print(f'Diamentions: {movie_df.shape[0]} x {movie_df.shape[1]}')

Diamentions: 4809 x 23


In [None]:
# Drop unneccessory columns from dataframe
movie_df = movie_df[['movie_id', 'title', 'overview', 'keywords', 'genres', 'cast', 'crew']]

In [None]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4809 non-null   int64 
 1   title     4809 non-null   object
 2   overview  4806 non-null   object
 3   keywords  4809 non-null   object
 4   genres    4809 non-null   object
 5   cast      4809 non-null   object
 6   crew      4809 non-null   object
dtypes: int64(1), object(6)
memory usage: 300.6+ KB


In [None]:
# dropping rows that contains atleast 1 Null value
movie_df = movie_df.dropna()
print(movie_df.shape)

(4806, 7)


In [None]:
# Creating a function to extract genres & keywords name from respective row
def extract_elements(text):
  """Parse a stringified list of dicts and return every entry's 'name' value, in order."""
  return [entry['name'] for entry in ast.literal_eval(text)]

In [None]:
movie_df['genres'] = movie_df['genres'].apply(extract_elements)
movie_df['keywords'] = movie_df['keywords'].apply(extract_elements)

In [None]:
# Creating a function to extract top 3 actors names from respective row
def extract_top3_elemenets(text, limit=3):
  """Return the 'name' of the first `limit` entries of a stringified list of dicts.

  Parameters:
    text: string holding a Python-literal list of dicts, each with a 'name' key.
    limit: maximum number of names to return (non-negative); defaults to 3, the
      value that used to be hard-coded.

  Returns:
    A list with at most `limit` names, in their original order.
  """
  entries = ast.literal_eval(text)
  # Slice before extracting so the remaining entries are never visited.
  return [entry['name'] for entry in entries[:limit]]

In [None]:
# extract_top3_elemenets already caps every list at three names, so no extra slicing is needed.
movie_df['cast'] = movie_df['cast'].apply(extract_top3_elemenets)

In [None]:
# Creating a function to extract director name from respective row
def fetch_director(text):
  """Parse a stringified crew list and return the 'name' of every member whose 'job' is 'Director'."""
  return [member['name'] for member in ast.literal_eval(text) if member['job'] == 'Director']

In [None]:
movie_df['crew'] = movie_df['crew'].apply(fetch_director)

In [None]:
# Creating a function to remove the white space between the words and make them a single entity
def collapse(text):
  """Return a new list in which every item of `text` has its space characters removed,
  so that multi-word names become a single token."""
  return [item.replace(" ", "") for item in text]

In [None]:
movie_df.head(3)

Unnamed: 0,movie_id,title,overview,keywords,genres,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[culture clash, future, space war, space colon...","[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[ocean, drug abuse, exotic island, east india ...","[Adventure, Fantasy, Action]","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[spy, based on novel, secret agent, sequel, mi...","[Action, Adventure, Crime]","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]


In [None]:
movie_df['cast'] = movie_df['cast'].apply(collapse)
movie_df['crew'] = movie_df['crew'].apply(collapse)
movie_df['genres'] = movie_df['genres'].apply(collapse)
movie_df['keywords'] = movie_df['keywords'].apply(collapse)

In [None]:
movie_df.head(3)

Unnamed: 0,movie_id,title,overview,keywords,genres,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[cultureclash, future, spacewar, spacecolony, ...","[Action, Adventure, Fantasy, ScienceFiction]","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[ocean, drugabuse, exoticisland, eastindiatrad...","[Adventure, Fantasy, Action]","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[spy, basedonnovel, secretagent, sequel, mi6, ...","[Action, Adventure, Crime]","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]


In [None]:
movie_df['overview'] = movie_df['overview'].apply(lambda x:x.split())

In [None]:
movie_df['tags'] = movie_df['overview'] + movie_df['keywords'] + movie_df['genres'] + movie_df['cast'] + movie_df['crew'] 

In [None]:
# Dropping the columns below: they have already been combined into 'tags' and are of no further use
modified_movie_df = movie_df.drop(columns=['overview','genres','keywords','cast','crew'])

# Joining all the elements of each tags list into a single space-separated string
modified_movie_df['tags'] = modified_movie_df['tags'].apply(lambda x: " ".join(x))
modified_movie_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [None]:
modified_movie_df['tags'][2]

'A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. While M battles political forces to keep the secret service alive, Bond peels back the layers of deceit to reveal the terrible truth behind SPECTRE. spy basedonnovel secretagent sequel mi6 britishsecretservice unitedkingdom Action Adventure Crime DanielCraig ChristophWaltz LéaSeydoux SamMendes'

***As we can see above, many words appear in both singular and plural (or otherwise similar) forms, which can affect the accuracy of the model: they represent tags with the same meaning but different spellings. Therefore, I plan to use a process called stemming.***

***Stemming is the process of reducing the morphological variants of a word to a common root/base form. Stemming programs are commonly referred to as stemming algorithms or stemmers. For example, a stemming algorithm reduces the words “chocolates”, “chocolatey”, and “choco” to the root word “chocolate”, and “retrieval”, “retrieved”, and “retrieves” to the stem “retrieve”.***

In [None]:
from nltk.stem.porter import PorterStemmer
port_st = PorterStemmer()

In [None]:
def stem_converter(text):
  """Stem every whitespace-separated token of `text` with the module-level
  PorterStemmer instance (`port_st`) and return the stems joined by single spaces."""
  return " ".join(port_st.stem(token) for token in text.split())

In [None]:
modified_movie_df['tags'] = modified_movie_df['tags'].apply(stem_converter)

In [None]:
modified_movie_df['tags'][2]

'a cryptic messag from bond’ past send him on a trail to uncov a sinist organization. while m battl polit forc to keep the secret servic alive, bond peel back the layer of deceit to reveal the terribl truth behind spectre. spi basedonnovel secretag sequel mi6 britishsecretservic unitedkingdom action adventur crime danielcraig christophwaltz léaseydoux sammend'

***Now, we will Convert a collection of text documents to a matrix of token counts and apply this on tags column. In this process, I have also removed stop words like are, is, the, from, etc. to reduce the word count while keeping the meaningful words intact.***

In [None]:
cv = CountVectorizer(max_features = 5000, stop_words='english', lowercase=True)

In [None]:
vector = cv.fit_transform(modified_movie_df['tags']).toarray()

In [None]:
vector.shape

(4806, 5000)

In [None]:
vector[0]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# get_feature_names() was removed from recent scikit-learn releases; get_feature_names_out()
# is its replacement. It returns an array, so .tolist() keeps the plain-list display.
cv.get_feature_names_out()[500:525].tolist()

['biographi',
 'biolog',
 'bird',
 'birth',
 'birthday',
 'bisexu',
 'bishop',
 'bit',
 'bite',
 'bitter',
 'bizarr',
 'black',
 'blackmag',
 'blackmail',
 'blackpeopl',
 'blacksmith',
 'blade',
 'blame',
 'blend',
 'blind',
 'bliss',
 'blizzard',
 'block',
 'blond',
 'blood']

***We have now vectorized the movie tags in 5000 dimensions. To find movies with similar tags, we have to measure how close their vectors are in this 5000-dimensional space. For that, we will use scikit-learn's cosine_similarity function, which computes the cosine similarity between samples in X and Y. Cosine similarity, or the cosine kernel, computes similarity as the normalized dot product of X and Y.***

***When we hear "distance between vectors", Euclidean distance usually comes to mind. However, it is a good choice mainly in low-dimensional (2-D or 3-D) spaces. In high dimensions, Euclidean distance loses its significance. This phenomenon is also known as the curse of dimensionality.***

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
similarity = cosine_similarity(vector)

In [None]:
similarity[0]

array([1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
       0.        ])

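***The above array holds the cosine similarity between the first movie's tag vector and the tag vector of every movie in the dataset, including itself (hence the leading 1.0). Higher values mean more similar movies.***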

In [None]:
# Fetching the index of a sample movie to show how indexing works.
# Note: .index[0] is the dataframe's index *label* of the first matching row.
modified_movie_df[modified_movie_df['title'] == 'The Lego Movie'].index[0]

744

In [None]:
def recommendations(movie, top_n=5):
  """Print the titles of the `top_n` movies most similar to `movie`.

  The title is looked up in `modified_movie_df` and converted to a row *position*,
  which is then used to read the matching row of the `similarity` matrix.

  Parameters:
    movie: exact title to look up in modified_movie_df['title'].
    top_n: number of recommendations to print; defaults to 5.

  Raises:
    IndexError: if no row has the given title.
  """
  # The similarity matrix is ordered by row position, while the dataframe's index labels
  # are not necessarily 0..n-1 (rows were dropped without resetting the index), so the
  # lookup must use the position rather than the label.
  matches = (modified_movie_df['title'] == movie).to_numpy().nonzero()[0]
  if matches.size == 0:
    raise IndexError(f"Movie {movie!r} was not found in modified_movie_df['title']")
  position = int(matches[0])

  ranked = sorted(enumerate(similarity[position]), key=lambda pair: pair[1], reverse=True)
  # Skip the queried movie explicitly instead of assuming it always ranks first.
  neighbours = [pos for pos, _ in ranked if pos != position][:top_n]
  for pos in neighbours:
    print(modified_movie_df['title'].iloc[pos])

In [None]:
recommendations('Thor')

Thor: The Dark World
Clash of the Titans
After Earth
Iron Man 2
Ant-Man


In [212]:
import pickle

In [213]:
# Exporting the dataframe (as a dictionary) and the similarity matrix.
# Context managers make sure each file is flushed and closed before it is downloaded.
with open('movie_dict.pkl', 'wb') as movie_file:
  pickle.dump(modified_movie_df.to_dict(), movie_file)
with open('similarity.pkl', 'wb') as similarity_file:
  pickle.dump(similarity, similarity_file)

In [214]:
from google.colab import files
files.download('movie_dict.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
files.download('similarity.pkl')