Importing libraries

In [42]:
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

Loading the dataset

In [43]:
movies = pd.read_csv('tmdb_5000_movies.csv')

In [44]:
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [45]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [46]:
# Updating movies with required (string/ text) features
movies = movies[['id', 'title', 'genres' , 'keywords', 'overview']]

In [47]:
movies.head(2)

Unnamed: 0,id,title,genres,keywords,overview
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha..."


Data Pre-processing

In [48]:
#check for null or empty values
(movies.isnull() | movies.eq('')).sum()

Unnamed: 0,0
id,0
title,0
genres,0
keywords,0
overview,3


In [49]:
# Drop every row that has a null or empty-string cell in any column, then
# renumber the rows from 0. The fresh 0..n-1 index matters later: row labels are
# used as positions into the similarity matrix.
has_missing = (movies.isna() | (movies == '')).any(axis=1)
movies = movies.loc[~has_missing].reset_index(drop=True)

In [50]:
movies.head()

Unnamed: 0,id,title,genres,keywords,overview
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",Following the death of District Attorney Harve...
4,49529,John Carter,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","John Carter is a war-weary, former military ca..."


In [51]:
#Check for duplicate values
movies.duplicated().sum()

0

Working with the genres and keywords columns

In [52]:
movies['genres']

Unnamed: 0,genres
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""..."
2,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam..."
3,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam..."
4,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam..."
...,...
4795,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam..."
4796,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""..."
4797,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam..."
4798,[]


In [53]:
# Define a function to extract the values of "name" from the dictionaries in the genres/keywords columns
import ast
def extract(text):
  """Return the ``name`` value of every dictionary encoded in ``text``.

  Parameters
  ----------
  text : str or list
      Usually the raw cell value: a string holding a list of dictionaries,
      e.g. '[{"id": 28, "name": "Action"}]'. An already-parsed list/tuple is
      also accepted so that re-running the cell on a processed column does
      not fail; entries that are not dictionaries are passed through as-is.

  Returns
  -------
  list
      The ``name`` values in their original order (``[]`` for an empty list).

  Raises
  ------
  ValueError, SyntaxError
      Propagated from ``ast.literal_eval`` when ``text`` is not a valid literal.
  KeyError
      If a dictionary entry has no ``name`` key.
  """
  # literal_eval only evaluates literals, so cell content is parsed, never executed.
  entries = text if isinstance(text, (list, tuple)) else ast.literal_eval(text)
  return [entry['name'] if isinstance(entry, dict) else entry for entry in entries]

In [54]:
movies['genres'] = movies['genres'].apply(extract)

In [55]:
movies['genres'] = movies['genres'].apply(lambda x: [item.lower() for item in x])


In [56]:
movies['genres']

Unnamed: 0,genres
0,"[action, adventure, fantasy, science fiction]"
1,"[adventure, fantasy, action]"
2,"[action, adventure, crime]"
3,"[action, crime, drama, thriller]"
4,"[action, adventure, science fiction]"
...,...
4795,"[action, crime, thriller]"
4796,"[comedy, romance]"
4797,"[comedy, drama, romance, tv movie]"
4798,[]


In [57]:
# applying extract function on keywords
movies['keywords'] = movies['keywords'].apply(extract)

In [58]:
movies['keywords'] = movies['keywords'].apply(lambda x: [item.lower() for item in x])

In [59]:
movies['keywords']

Unnamed: 0,keywords
0,"[culture clash, future, space war, space colon..."
1,"[ocean, drug abuse, exotic island, east india ..."
2,"[spy, based on novel, secret agent, sequel, mi..."
3,"[dc comics, crime fighter, terrorist, secret i..."
4,"[based on novel, mars, medallion, space travel..."
...,...
4795,"[united states–mexico barrier, legs, arms, pap..."
4796,[]
4797,"[date, love at first sight, narration, investi..."
4798,[]


In [60]:
# Remove the spaces inside each keyword so that a multi-word keyword becomes a
# single token (e.g. "culture clash" -> "cultureclash"). New lists are assigned
# back to the column instead of mutating the stored lists in place.
movies['keywords'] = movies['keywords'].apply(
    lambda keywords: [keyword.replace(" ", "") for keyword in keywords]
)

In [61]:
movies['keywords']

Unnamed: 0,keywords
0,"[cultureclash, future, spacewar, spacecolony, ..."
1,"[ocean, drugabuse, exoticisland, eastindiatrad..."
2,"[spy, basedonnovel, secretagent, sequel, mi6, ..."
3,"[dccomics, crimefighter, terrorist, secretiden..."
4,"[basedonnovel, mars, medallion, spacetravel, p..."
...,...
4795,"[unitedstates–mexicobarrier, legs, arms, paper..."
4796,[]
4797,"[date, loveatfirstsight, narration, investigat..."
4798,[]


Working with the overview column

In [62]:
movies['overview']

Unnamed: 0,overview
0,"In the 22nd century, a paraplegic Marine is di..."
1,"Captain Barbossa, long believed to be dead, ha..."
2,A cryptic message from Bond’s past sends him o...
3,Following the death of District Attorney Harve...
4,"John Carter is a war-weary, former military ca..."
...,...
4795,El Mariachi just wants to play his guitar and ...
4796,A newlywed couple's honeymoon is upended by th...
4797,"""Signed, Sealed, Delivered"" introduces a dedic..."
4798,When ambitious New York attorney Sam is sent t...


In [63]:
# Defining a function for text cleaning
import re

def clean_text(text):
  """Normalize free text for tokenization.

  Punctuation and other non-word characters are replaced by spaces, the text
  is lower-cased, runs of whitespace are collapsed to a single space, and
  leading/trailing whitespace is removed.

  Parameters
  ----------
  text : object
      The value to clean. Anything that is not a ``str`` (e.g. NaN, None)
      yields an empty string.

  Returns
  -------
  str
      The cleaned text, or ``''`` for non-string input.
  """
  # Only strings can be cleaned; everything else maps to an empty string.
  if not isinstance(text, str):
    return ''
  # Replace punctuation / special characters with a space so words stay separated.
  text = re.sub(r'[^\w\s]', ' ', text)
  text = text.lower()
  # Collapse whitespace runs, then drop the space left behind when the text
  # started or ended with punctuation (e.g. a quoted title).
  return re.sub(r'\s+', ' ', text).strip()

In [64]:
movies['overview'] = movies['overview'].apply(clean_text)

In [65]:
movies['overview']

Unnamed: 0,overview
0,in the 22nd century a paraplegic marine is dis...
1,captain barbossa long believed to be dead has ...
2,a cryptic message from bond s past sends him o...
3,following the death of district attorney harve...
4,john carter is a war weary former military cap...
...,...
4795,el mariachi just wants to play his guitar and ...
4796,a newlywed couple s honeymoon is upended by th...
4797,signed sealed delivered introduces a dedicate...
4798,when ambitious new york attorney sam is sent t...


In [66]:
#checking the type
print(type(movies['overview'][0]))
print(type(movies['genres'][0]))
print(type(movies['keywords'][0]))

<class 'str'>
<class 'list'>
<class 'list'>


Tokenization of overview column

In [67]:
#The genres and keywords are already in the form of tokens

In [68]:
# 1. Fetch the NLTK resources used by this notebook (tokenizer models,
#    stop-word list, and WordNet for the lemmatizer).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Build the stop-word set once; reloading it inside the function would repeat
# the same work for every row the function is applied to.
ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

# 2. Define a function to tokenize and preprocess text
def preprocess_text(text):
    """Tokenize ``text`` and drop English stop words.

    Parameters
    ----------
    text : str
        Text to tokenize. The stop-word comparison is case-sensitive, so the
        text is expected to be lower-cased already.

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize`` that are not in NLTK's
        English stop-word list, in their original order.
    """
    return [token for token in word_tokenize(text) if token not in ENGLISH_STOP_WORDS]



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [69]:
# 3. Apply the function to the 'overview' column
movies['overview'] = movies['overview'].apply(preprocess_text)
print(movies['overview'])

0       [22nd, century, paraplegic, marine, dispatched...
1       [captain, barbossa, long, believed, dead, come...
2       [cryptic, message, bond, past, sends, trail, u...
3       [following, death, district, attorney, harvey,...
4       [john, carter, war, weary, former, military, c...
                              ...                        
4795    [el, mariachi, wants, play, guitar, carry, fam...
4796    [newlywed, couple, honeymoon, upended, arrival...
4797    [signed, sealed, delivered, introduces, dedica...
4798    [ambitious, new, york, attorney, sam, sent, sh...
4799    [ever, since, second, grade, first, saw, e, ex...
Name: overview, Length: 4800, dtype: object


Creating a new tags column

In [70]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords']
movies_new = movies[['id' ,'title' ,'tags']]

In [71]:
movies_new

Unnamed: 0,id,title,tags
0,19995,Avatar,"[22nd, century, paraplegic, marine, dispatched..."
1,285,Pirates of the Caribbean: At World's End,"[captain, barbossa, long, believed, dead, come..."
2,206647,Spectre,"[cryptic, message, bond, past, sends, trail, u..."
3,49026,The Dark Knight Rises,"[following, death, district, attorney, harvey,..."
4,49529,John Carter,"[john, carter, war, weary, former, military, c..."
...,...,...,...
4795,9367,El Mariachi,"[el, mariachi, wants, play, guitar, carry, fam..."
4796,72766,Newlyweds,"[newlywed, couple, honeymoon, upended, arrival..."
4797,231617,"Signed, Sealed, Delivered","[signed, sealed, delivered, introduces, dedica..."
4798,126186,Shanghai Calling,"[ambitious, new, york, attorney, sam, sent, sh..."


In [72]:
print(type(movies_new['tags'][0]))

<class 'list'>


Lemmatization

In [73]:
# Lemmatization: reduce each tag to its WordNet base form. `lemmatize` is called
# without a part-of-speech argument, so tokens are lemmatized as nouns: plural-like
# forms collapse (e.g. 'wants' -> 'want') while verb forms such as 'sends' or
# 'believed' are left unchanged.
lemmatizer = WordNetLemmatizer()
# Write the lemmatized lists back to `movies` explicitly (a later cell joins
# movies['tags']) rather than mutating list objects through `movies_new` and
# relying on both frames sharing them; then refresh the derived view.
movies['tags'] = movies['tags'].apply(
    lambda words: [lemmatizer.lemmatize(word) for word in words]
)
movies_new = movies[['id', 'title', 'tags']]

In [74]:
movies_new

Unnamed: 0,id,title,tags
0,19995,Avatar,"[22nd, century, paraplegic, marine, dispatched..."
1,285,Pirates of the Caribbean: At World's End,"[captain, barbossa, long, believed, dead, come..."
2,206647,Spectre,"[cryptic, message, bond, past, sends, trail, u..."
3,49026,The Dark Knight Rises,"[following, death, district, attorney, harvey,..."
4,49529,John Carter,"[john, carter, war, weary, former, military, c..."
...,...,...,...
4795,9367,El Mariachi,"[el, mariachi, want, play, guitar, carry, fami..."
4796,72766,Newlyweds,"[newlywed, couple, honeymoon, upended, arrival..."
4797,231617,"Signed, Sealed, Delivered","[signed, sealed, delivered, introduces, dedica..."
4798,126186,Shanghai Calling,"[ambitious, new, york, attorney, sam, sent, sh..."


In [75]:
# Converting tag list to string. Cells that are already strings are kept as-is,
# so re-running this cell does not split them into space-separated characters.
movies['tags'] = movies['tags'].apply(lambda x: x if isinstance(x, str) else ' '.join(x))

In [76]:
movies['tags']

Unnamed: 0,tags
0,22nd century paraplegic marine dispatched moon...
1,captain barbossa long believed dead come back ...
2,cryptic message bond past sends trail uncover ...
3,following death district attorney harvey dent ...
4,john carter war weary former military captain ...
...,...
4795,el mariachi want play guitar carry family trad...
4796,newlywed couple honeymoon upended arrival resp...
4797,signed sealed delivered introduces dedicated q...
4798,ambitious new york attorney sam sent shanghai ...


Vectorization

In [77]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(max_features = 5000)

# Fit and transform the tags column
vectors = count_vectorizer.fit_transform(movies['tags']).toarray()
features = count_vectorizer.get_feature_names_out().reshape(-1,1)

Cosine similarity

In [78]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(vectors)

Recommendation Function

In [79]:
# Build the Recommendation Function
def get_recommendations(title, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Similarity is read from the module-level ``cosine_sim`` matrix, whose rows
    and columns are expected to follow the row order of ``movies``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``movies['title']``. If several rows share
        the title, the first one is used.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar movies, most similar first.
        The queried movie itself is never included.

    Raises
    ------
    IndexError
        If no movie has the given title.
    """
    # Locate the movie by row position, so the lookup does not depend on the
    # DataFrame's index labels matching positions in cosine_sim.
    positions = np.flatnonzero((movies['title'] == title).to_numpy())
    if positions.size == 0:
        raise IndexError(f"No movie titled {title!r} found in the dataset")
    idx = int(positions[0])

    # Pairwise similarity of every movie with the selected one
    scores = cosine_sim[idx]

    # Rank all movies by descending similarity; sorted() is stable, so ties
    # keep their original row order.
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)

    # Exclude the queried movie explicitly. Dropping "the first entry" is not
    # reliable: another movie can tie with it and sort ahead of it.
    movie_indices = [i for i in ranked if i != idx][:top_n]

    # Return the titles of the most similar movies
    return movies['title'].iloc[movie_indices]


In [80]:
# Set the desired movie title here; the printed header is built from it so the
# label always matches the movie that was actually queried.
query_title = "Avatar"
recommended_movies = get_recommendations(query_title)
print(f"Top {len(recommended_movies)} similar movies to '{query_title}':")
print(recommended_movies)

Top 5 similar movies to 'Movie A':
539                     Titan A.E.
61               Jupiter Ascending
2994    Mad Max Beyond Thunderdome
507               Independence Day
1191                Small Soldiers
Name: title, dtype: object
