In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Importing Libraries and dependencies**

In [None]:
from bs4 import BeautifulSoup as bs
from bs4 import ResultSet
import re
import requests
import pandas as pd
import json
import numpy as np
import pickle
# !pip install bertopic
from bertopic import BERTopic
from umap import UMAP
import spacy
import nltk
import en_core_web_sm
from nltk.corpus import stopwords
nlp = spacy.load("en_core_web_sm")
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Functions to scrape data from the wikipedia page https://en.wikipedia.org/wiki/List_of_years_in_film**

In [None]:
#function to scrape plot
def plot_gen(url):
    """Fetch a Wikipedia film article and return the text of its "Plot" section.

    Args:
        url: Absolute URL of the article to download.

    Returns:
        The text of every ``<p>`` found between the "Plot" heading and the
        next ``<h2>`` (or the end of the page when no ``<h2>`` follows),
        with paragraphs separated by a single space and newlines replaced by
        spaces. Returns the sentinel string ``"No Plot Found"`` when the page
        has no ``span.mw-headline#Plot`` element.
    """
    response = requests.get(url)
    # Parser left unspecified, exactly as before, so the parse tree is unchanged.
    soup = bs(response.content)
    headline = soup.find('span', {'class': "mw-headline", 'id': 'Plot'})
    if headline is None:
        return "No Plot Found"

    # Walk the heading's following siblings once and stop at the next <h2>.
    # A plain loop (instead of indexing until an <h2> shows up) cannot run
    # past the end of the sibling list when the Plot section is the last one.
    section = []
    for sibling in headline.parent.find_next_siblings():
        if sibling.name == 'h2':
            break
        section.append(sibling)

    # Re-parse the collected fragment so nested <p> tags are found as well.
    paragraphs = bs(''.join(str(tag) for tag in section), 'html.parser').find_all('p')

    # Join with a space so the last word of one paragraph is not glued to the
    # first word of the next.
    movie_plot = ' '.join(
        paragraph.get_text(separator=" ", strip=True) for paragraph in paragraphs
    )
    return movie_plot.strip().replace("\n", " ")

In [None]:
def movies_dict(url):
    """Scrape every italicised film link on a page and fetch each film's plot.

    Written after the per-director scraper; it maps movie titles to their plots.

    Args:
        url: Absolute URL of a Wikipedia page whose film titles are rendered
            as links inside ``<i>`` tags.

    Returns:
        A DataFrame with columns ``Title`` and ``Plot`` and a fresh 0..n-1
        index. Films whose article has no plot section are left out.
    """
    response = requests.get(url)
    soup = bs(response.content)

    # title -> relative link; a later anchor with the same title overwrites
    # the earlier one, as in the original dictionary assignment.
    films_link = {}
    for anchor in soup.select('i a'):
        title = anchor.get('title')
        href = anchor.get('href')
        # Anchors lacking either attribute can be neither keyed nor fetched.
        if title and href:
            films_link[title] = href

    base_url = 'https://en.wikipedia.org'
    rows = [(title, plot_gen(base_url + href)) for title, href in films_link.items()]

    # Passing the column names up front keeps this valid when `rows` is empty.
    movie_df = pd.DataFrame(rows, columns=['Title', 'Plot'])
    movie_df = movie_df[movie_df['Plot'] != 'No Plot Found'].reset_index(drop=True)
    return movie_df

**Functions to retrieve a director's filmography**

In [None]:
# Function to get the filmography of a director, i.e. a mapping from each of the director's film titles to its Wikipedia link.
def _section_after(heading_parent, stop_tags):
    """Parse the siblings following `heading_parent` up to the first tag named in `stop_tags`."""
    section = []
    for sibling in heading_parent.find_next_siblings():
        if sibling.name in stop_tags:
            break
        section.append(sibling)
    return bs(''.join(str(tag) for tag in section), 'html.parser')


def _film_links(section, table_selector):
    """Return the italicised anchors of a section, taking tables when `table_selector` matches, list items otherwise."""
    if section.select(table_selector):
        tables = section.select('.wikitable')
        links = tables[0].select('i a')
        if not links:
            # The first table had no italic links: look at the first two tables together.
            links = [anchor for table in tables[:2] for anchor in table.select('i a')]
        return links
    return section.select('li i a')


def filmography_dir(url):
    """Return a director's films as a ``{title: relative_link}`` dictionary.

    The "Filmography" section of the article is searched up to three times,
    stopping as soon as at least one film has been found:

    1. section ends at the next ``<h2>`` or ``<h3>``; tables are used when any
       ``.wikitable`` exists, list items otherwise;
    2. section ends at the next ``<h2>``; same table/list rule;
    3. section ends at the next ``<h2>``; tables are used only when a
       ``.wikitable.sortable`` exists, list items otherwise.

    Args:
        url: Absolute URL of the director's Wikipedia article.

    Returns:
        Dictionary mapping the ``title`` attribute of each film link to its
        ``href``. Empty when the page has no ``span#Filmography`` element or
        no italicised links were found.
    """
    response = requests.get(url)
    soup = bs(response.content)
    heading = soup.select_one('span#Filmography')
    if heading is None:
        # No filmography section on this page: nothing to collect.
        return {}
    parent = heading.parent

    attempts = (
        (('h2', 'h3'), '.wikitable'),
        (('h2',), '.wikitable'),
        (('h2',), '.wikitable.sortable'),
    )

    films = {}
    for stop_tags, table_selector in attempts:
        section = _section_after(parent, stop_tags)
        for anchor in _film_links(section, table_selector):
            title = anchor.get('title')
            href = anchor.get('href')
            if title and href:
                # setdefault keeps the first link seen for a given title.
                films.setdefault(title, href)
        if films:
            break
    return films

In [None]:
#function to map plot to movies
def movie_plot_map(url):
    """Map every film in a director's filmography to its scraped plot text.

    Args:
        url: Absolute URL of the director's Wikipedia article.

    Returns:
        Dictionary ``{film title: plot text}``; the plot is whatever
        ``plot_gen`` returns for the film's article.
    """
    wiki_root = 'https://en.wikipedia.org'
    return {
        title: plot_gen(wiki_root + href)
        for title, href in filmography_dir(url).items()
    }

In [None]:
# This function combines all the functions above to build a DataFrame of directors and the plots of their movies.
def movie_plot_director(directors):
    """Build one DataFrame of film plots for a collection of directors.

    Args:
        directors: Iterable of director names; each name is substituted into
            ``https://en.wikipedia.org/wiki/{}`` to locate the article.

    Returns:
        DataFrame with columns ``movie``, ``plot`` and ``director``. Rows
        whose plot is the ``'No Plot Found'`` sentinel are removed; the index
        is the 0..n-1 numbering assigned before that removal, so it may have
        gaps. An empty frame with the same three columns is returned when
        `directors` is empty.
    """
    columns = ['movie', 'plot', 'director']

    # Collect one frame per director and concatenate once at the end;
    # concatenating inside the loop copies the accumulated rows every time.
    frames = []
    for director in directors:
        film_plot = movie_plot_map('https://en.wikipedia.org/wiki/{}'.format(director))
        df = pd.DataFrame(list(film_plot.items()), columns=['movie', 'plot'])
        df['director'] = director
        frames.append(df)

    if not frames:
        return pd.DataFrame(columns=columns)

    all_dir = pd.concat(frames, ignore_index=True)
    missing = all_dir.index[all_dir['plot'] == 'No Plot Found']
    return all_dir.drop(index=missing)

**Scraping the wiki page https://en.wikipedia.org/wiki/List_of_years_in_film to get all the movies along with their plot in a dataframe**

In [None]:
movie_df=movies_dict('https://en.wikipedia.org/wiki/List_of_years_in_film')

In [None]:
with open('/content/drive/MyDrive/movie_2000', 'rb') as f:
    movie_df=pickle.load(f)

In [None]:
movie_df

Unnamed: 0,Title,Plot
0,Golmaal 3,"The story follows Pritam, an elderly bus drive..."
1,Bernie (2011 film),"In small-town Carthage, Texas , in 1996, local..."
2,Ride with the Devil (film),Jake Roedel and Jack Bull Chiles are friends i...
3,Exodus: Gods and Kings,"In 1300 BC , Moses , a general and accepted me..."
4,Sleeper (1973 film),Miles Monroe (Woody Allen) is a jazz musician ...
...,...,...
2029,Matilda the Musical (film),"In a hospital, while other parents adore their..."
2030,The Whale (2022 film),"Charlie, a morbidly obese and reclusive Englis..."
2031,Women Talking (film),"A young woman sleeps alone, in bed. There are ..."
2032,"Lyle, Lyle, Crocodile (film)","In New York City , a charismatic magician name..."


In [None]:
with open('/content/drive/MyDrive/all_dir.pkl', 'rb') as f:
    all_dir_loaded = pickle.load(f)
all_dir_loaded

Unnamed: 0,movie,plot,director
0,Following,"A struggling, unemployed young writer (credite...",Christopher Nolan
1,Memento (film),The film starts with a Polaroid photograph of ...,Christopher Nolan
2,Insomnia (2002 film),"In the small fishing town of Nightmute, Alaska...",Christopher Nolan
3,Batman Begins,"In Gotham City , a young Bruce Wayne falls dow...",Christopher Nolan
4,The Prestige (film),"In 1890s London, Robert Angier and Alfred Bord...",Christopher Nolan
...,...,...,...
783,American Graffiti,On their last evening of summer vacation in 19...,George_Lucas
784,Star Wars (film),"Amid a galactic civil war, Rebel Alliance spie...",George_Lucas
785,Star Wars: Episode I – The Phantom Menace,The Trade Federation creates turmoil in the Ga...,George_Lucas
786,Star Wars: Episode II – Attack of the Clones,"Ten years after the battle at Naboo , [a] the ...",George_Lucas


In [None]:
#  new_movies={}
#  movies_dir=set(list(all_dir_loaded['movie']))
#  movies_wiki=set(list(movie_df['Title']))
#  new=list(movies_dir-movies_wiki)
#  new

In [None]:
# for movie in new:
#   new_movies[movie]=all_dir_loaded[all_dir_loaded['movie']==movie]['plot'].values[0]
# new_movie_df=pd.DataFrame(new_movies.items())
# new_movie_df.columns=['Title','Plot']
# movie_df=pd.concat([new_movie_df,movie_df])
# movie_df.drop_duplicates(inplace=True)
# movie_df.reset_index(drop=True,inplace=True)
# movie_df

In [None]:
with open('/content/drive/MyDrive/movie_2000', 'wb') as file:
    pickle.dump(movie_df, file)

In [None]:
plot=movie_df['Plot']
len(plot)

2034

In [None]:
# The assignment below rebinds `stopwords` from the NLTK corpus reader to a
# plain list. Importing the reader under a private alias first keeps this cell
# re-runnable; otherwise a second run fails because a list has no `.words`.
from nltk.corpus import stopwords as _nltk_stopwords
stopwords = _nltk_stopwords.words('english')


**Data Preprocessing**

In [None]:
def lower_case(row):
    """Return the string `row` converted to lower case."""
    return row.lower()

In [None]:
def remove_stopwords(row):
    """Drop every whitespace-separated word of `row` that is a stop word.

    Args:
        row: Text to filter. The comparison is exact, so the text is expected
            to be in the same case as the entries of `stopwords`.

    Returns:
        The remaining words joined by single spaces.
    """
    # Snapshot the notebook-level `stopwords` list as a set: membership tests
    # become O(1) instead of scanning the list for every word.
    stop_set = set(stopwords)
    return ' '.join(word for word in row.split() if word not in stop_set)

In [None]:
def ner(row):
    """Remove tokens that spaCy tags as part of a PERSON or ORG entity.

    Args:
        row: Text to process.

    Returns:
        The remaining tokens joined by single spaces (punctuation tokens
        therefore end up surrounded by spaces).
    """
    # Reuse the notebook-level `nlp` pipeline (en_core_web_sm) instead of
    # loading the model again on every call, which dominated the run time
    # when this function is applied row by row.
    doc = nlp(row)
    kept = [str(token) for token in doc if token.ent_type_ not in ('PERSON', 'ORG')]
    return ' '.join(kept)

In [None]:
def lemmatize(row):
    """Replace every token of `row` by its spaCy lemma.

    Args:
        row: Text to process with the notebook-level `nlp` pipeline.

    Returns:
        The lemmas joined by single spaces.
    """
    lemmas = []
    for token in nlp(row):
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [None]:
#regular expression for cleaning the data
def cleaning(row):
    """Strip parentheticals, apostrophes and punctuation from a plot string.

    Steps, in order:
      1. remove every non-empty ``( ... )`` group;
      2. delete straight and curly apostrophes, so "it's" becomes "its";
      3. replace each run of characters that are neither word characters nor
         whitespace with a space;
      4. collapse every whitespace run to one space and trim both ends.

    Args:
        row: Text to clean.

    Returns:
        The cleaned text with single spaces between words and no leading or
        trailing whitespace.
    """
    row = re.sub(r'\([^)]+\)', '', row)
    row = re.sub(r"['’]", '', row)
    row = re.sub(r'[^\w\s]+', ' ', row)
    # `\s` also matches the non-breaking space (\xa0), so it is turned into an
    # ordinary space here. Deleting it outright fused the words on either side.
    row = re.sub(r"\s+", " ", row)
    return row.strip()

**Plot after every step of preprocessing**

In [None]:
plot=plot.apply(ner)
plot

0       The story follows Pritam , an elderly bus driv...
1       In small - town Carthage , Texas , in 1996 , l...
2       and are friends in Missouri when the Civil War...
3       In 1300 , Moses , a general and accepted membe...
4       ( ) is a jazz musician and owner of health - f...
                              ...                        
2029    In a hospital , while other parents adore thei...
2030    , a morbidly obese and reclusive English instr...
2031    A young woman sleeps alone , in bed . There ar...
2032    In New York City , a charismatic magician name...
2033    Italian - American brothers and have recently ...
Name: Plot, Length: 2034, dtype: object

In [None]:
plot_ner=plot

In [None]:
with open('/content/drive/MyDrive/plot_ner', 'wb') as file:
    pickle.dump(plot_ner, file)

In [None]:
with open('/content/drive/MyDrive/plot_ner', 'rb') as f:
    plot_ner=pickle.load(f)

In [None]:
plot=plot_ner.apply(lower_case)
plot

0       the story follows pritam , an elderly bus driv...
1       in small - town carthage , texas , in 1996 , l...
2       and are friends in missouri when the civil war...
3       in 1300 , moses , a general and accepted membe...
4       ( ) is a jazz musician and owner of health - f...
                              ...                        
2029    in a hospital , while other parents adore thei...
2030    , a morbidly obese and reclusive english instr...
2031    a young woman sleeps alone , in bed . there ar...
2032    in new york city , a charismatic magician name...
2033    italian - american brothers and have recently ...
Name: Plot, Length: 2034, dtype: object

In [None]:
plot=plot.apply(remove_stopwords)
plot

0       story follows pritam , elderly bus driver . th...
1       small - town carthage , texas , 1996 , local a...
2       friends missouri civil war breaks . american -...
3       1300 , moses , general accepted member egyptia...
4       ( ) jazz musician owner health - food store ne...
                              ...                        
2029    hospital , parents adore newborn children , mr...
2030    , morbidly obese reclusive english instructor ...
2031    young woman sleeps alone , bed . visible bruis...
2032    new york city , charismatic magician named wan...
2033    italian - american brothers recently started p...
Name: Plot, Length: 2034, dtype: object

In [None]:
plot=plot.apply(lemmatize)
plot


0       story follow pritam , elderly bus driver . thr...
1       small - town carthage , texas , 1996 , local a...
2       friend missouri civil war break . american - a...
3       1300 , moses , general accepted member egyptia...
4       ( ) jazz musician owner health - food store ne...
                              ...                        
2029    hospital , parent adore newborn child , mrs . ...
2030    , morbidly obese reclusive english instructor ...
2031    young woman sleep alone , bed . visible bruise...
2032    new york city , charismatic magician name want...
2033    italian - american brother recently start plum...
Name: Plot, Length: 2034, dtype: object

In [None]:
plot=plot.apply(cleaning)
plot

0       story follow pritam elderly bus driver three g...
1       small town carthage texas 1996 local assistant...
2       friend missouri civil war break american assim...
3       1300 moses general accepted member egyptian ro...
4        jazz musician owner health food store new yor...
                              ...                        
2029    hospital parent adore newborn child mrs distra...
2030     morbidly obese reclusive english instructor t...
2031    young woman sleep alone bed visible bruise wou...
2032    new york city charismatic magician name want t...
2033    italian american brother recently start plumb ...
Name: Plot, Length: 2034, dtype: object

In [None]:
# Corpus-specific words (mostly character names) to treat as stop words.
# NOTE(review): in the cell order shown, `plot` has already been passed through
# remove_stopwords before this cell, so the additions only affect later calls.
extra_stopwords = ['rahul','raj','one','two','marcellus','maximus','marina','madhi','jacinto','mr','nisha','claudia','poelzig','praful','pritam','katrina','katerina','lon','encolpius','singh','leave','take']
# Add only the words that are not present yet, so re-running the cell does not
# keep growing the list with duplicates.
stopwords.extend([word for word in extra_stopwords if word not in stopwords])

 **BERTopic Model and Hyperparameter Tuning**

**Model-1**

In [None]:
# UMAP configuration for model 1 (2 components, cosine metric, fixed seed)
umap_model = UMAP(
    n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=100
)

# HDBSCAN configuration for model 1
from hdbscan import HDBSCAN
hdbscan_model = HDBSCAN(
    min_cluster_size=30,
    min_samples=20,
    metric='euclidean',
    prediction_data=True,
)

# Build BERTopic from the two components above
topic_model1 = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    language="english",
    calculate_probabilities=True,
)

# Fit on the preprocessed plots and show the per-topic summary
topics, probabilities = topic_model1.fit_transform(plot)
topic_model1.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,467,-1_find_leave_kill_take,"[find, leave, kill, take, tell, return, one, g...",[1981 man name recount life story stranger hap...
1,0,958,0_leave_find_tell_take,"[leave, find, tell, take, go, kill, man, retur...",[englishman name travel los angeles investigat...
2,1,136,1_film_wife_love_man,"[film, wife, love, man, one, life, marry, youn...",[playboy meet like like back not tell father r...
3,2,122,2_german_war_soldier_kill,"[german, war, soldier, kill, man, order, leave...",[1983 hector aged world war ii veteran work po...
4,3,84,3_earth_alien_planet_use,"[earth, alien, planet, use, kill, ship, human,...",[prologue business magnate speak newly activat...
5,4,67,4_king_castle_take_prince,"[king, castle, take, prince, return, princess,...",[king roderick tyrant send lord slaughter roya...
6,5,59,5_dog_take_leave_return,"[dog, take, leave, return, home, find, woody, ...",[late 1860 leave family wife teenage son small...
7,6,57,6_police_take_kill_rahul,"[police, take, kill, rahul, brother, find, get...",[madly love nisha beautiful model gets kidnap ...
8,7,46,7_ship_island_sea_crew,"[ship, island, sea, crew, find, captain, boat,...",[one night portsmouth england 1787 press gang ...
9,8,38,8_school_student_friend_party,"[school, student, friend, party, class, make, ...",[start film drive school father drive erratica...




In [None]:
topic_model1.visualize_topics()

In [None]:
topic_model1.visualize_barchart()

In [None]:
topic_model1.visualize_documents(plot)

In [None]:
topic_model1.save('topic_model1')

**Model-2**

In [None]:
# UMAP configuration for model 2 (8 components, cosine metric, fixed seed)
umap_model = UMAP(
    n_neighbors=15, n_components=8, min_dist=0.0, metric='cosine', random_state=100
)

# HDBSCAN configuration for model 2
from hdbscan import HDBSCAN
hdbscan_model = HDBSCAN(
    min_cluster_size=40,
    min_samples=15,
    metric='euclidean',
    prediction_data=True,
)

# Build BERTopic from the two components above
topic_model2 = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    language="english",
    calculate_probabilities=True,
)

# Fit on the preprocessed plots and show the per-topic summary
topics2, probabilities2 = topic_model2.fit_transform(plot)
topic_model2.get_topic_info()

NameError: ignored

In [None]:
topic_model2.visualize_topics()

In [None]:
topic_model2.visualize_barchart(n_words=10)

In [None]:
# `tarining_data` was never defined (NameError); topic_model2 was fitted on `plot`.
topic_model2.visualize_documents(plot)

NameError: ignored

In [None]:
topic_model2.save('topic_model2')

**Model-3**

In [None]:
# UMAP configuration for model 3 (10 components, euclidean metric, fixed seed)
umap_model = UMAP(n_neighbors=15,
                  n_components=10,
                  min_dist=0.0,
                  metric='euclidean',
                  random_state=100)
# Clustering model: k-means with a fixed number of clusters.
# random_state is pinned (same seed as UMAP) so the centroid initialisation,
# and therefore the topics, are reproducible across runs.
from sklearn.cluster import KMeans
kmeans_model = KMeans(n_clusters=15, random_state=100)
# Initiate BERTopic; the k-means model is passed through the `hdbscan_model` slot
topic_model3 = BERTopic(umap_model=umap_model, hdbscan_model=kmeans_model)
# Run BERTopic model
topics3, probabilities3 = topic_model3.fit_transform(plot)
topic_model3.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,245,0_leave_tell_home_father,"[leave, tell, home, father, go, see, family, m...",[bloom s wedding party 2000 father recall day ...
1,1,213,1_police_kill_man_money,"[police, kill, man, money, take, leave, find, ...",[1980 cuban refugee ex convict arrive miami pa...
2,2,205,2_find_leave_go_kill,"[find, leave, go, kill, tell, man, take, polic...",[divide eight episode prologue epilogue loosel...
3,3,155,3_film_life_wife_love,"[film, life, wife, love, one, man, marry, fath...",[portray womanize city man meet country fight ...
4,4,138,4_school_friend_tell_student,"[school, friend, tell, student, leave, home, h...",[nineteen year old call big boy parent live gr...
5,5,134,5_band_play_show_film,"[band, play, show, film, new, perform, music, ...",[voiceover open film comment whenever get gloo...
6,6,131,6_return_find_take_home,"[return, find, take, home, house, leave, dog, ...",[happily marry couple live new york city inten...
7,7,122,7_kill_man_leave_town,"[kill, man, leave, town, take, return, find, t...",[1868 return eight year absence home brother w...
8,8,120,8_king_castle_moses_prince,"[king, castle, moses, prince, kill, take, retu...",[widow schoolteacher arrive bangkok young son ...
9,9,115,9_german_war_soldier_kill,"[german, war, soldier, kill, man, officer, tak...",[1983 hector aged world war ii veteran work po...


In [None]:
topic_model3.get_topic(1)

[('police', 0.026696272050625552),
 ('kill', 0.0237016887248328),
 ('man', 0.01526703893498546),
 ('money', 0.015059739823951318),
 ('take', 0.01484357065161176),
 ('leave', 0.014458014158273068),
 ('find', 0.01398434916982942),
 ('gang', 0.013095243860730677),
 ('shoot', 0.012901462464123182),
 ('escape', 0.012521926520338369)]

In [None]:
topic_model3.visualize_topics()

In [None]:
topic_model3.visualize_barchart()

In [None]:
topic_model3.visualize_documents(plot)

In [None]:
topic_model3.save('topic_model3')

**Performing topic modelling on a new film director's filmography**


In [None]:
def analyze_director_filmography(director_name, topic_model, all_dir_df, num_of_topics=6):
    """Find the topics of a fitted model that best match a director's film plots.

    Args:
        director_name: Name as stored in ``all_dir_df['director']``; also used
            to build the Wikipedia URL when the director has to be scraped.
        topic_model: Fitted model exposing ``find_topics(..., top_n=...)``.
        all_dir_df: DataFrame with at least the columns ``director`` and
            ``plot`` holding already scraped plots.
        num_of_topics: How many of the most similar topics to report.

    Returns:
        The tuple ``(similar_topics, similarity)`` returned by
        ``topic_model.find_topics``. The same information is printed.
    """
    if director_name in all_dir_df['director'].values:
        # Use the plots already scraped for this director.
        raw_plots = all_dir_df.loc[all_dir_df['director'] == director_name, 'plot']
    else:
        # Director not cached yet: scrape the filmography now.
        raw_plots = movie_plot_director([director_name])['plot']

    # Apply the same preprocessing chain that was used on the training plots,
    # in the same order. It runs for cached plots too, because they are stored
    # as raw text, and it includes the lower-casing step that stop-word
    # removal relies on.
    new_plot = raw_plots
    for step in (ner, lower_case, remove_stopwords, lemmatize, cleaning):
        new_plot = new_plot.apply(step)

    # Find similar topics.
    # NOTE(review): the whole Series is handed to find_topics unchanged from the
    # original code; confirm against the BERTopic API whether a single search
    # string is expected here.
    similar_topics, similarity = topic_model.find_topics(new_plot, top_n=num_of_topics)
    print(f'The top {num_of_topics} similar topics are {similar_topics}, and the similarities are {np.round(similarity,2)}')
    return similar_topics, similarity




In [None]:
director_name='Sanjay Leela Bhansali'
topic_model=topic_model2
all_dir_df=all_dir_loaded
analyze_director_filmography(director_name, topic_model, all_dir_df)


The top 6 similar topics are [9, -1, 5, 4, 3, 0], and the similarities are [0.65 0.5  0.49 0.48 0.47 0.47]


In [None]:
all_dir_loaded

Unnamed: 0,movie,plot,director
0,Following,"A struggling, unemployed young writer (credite...",Christopher Nolan
1,Memento (film),The film starts with a Polaroid photograph of ...,Christopher Nolan
2,Insomnia (2002 film),"In the small fishing town of Nightmute, Alaska...",Christopher Nolan
3,Batman Begins,"In Gotham City , a young Bruce Wayne falls dow...",Christopher Nolan
4,The Prestige (film),"In 1890s London, Robert Angier and Alfred Bord...",Christopher Nolan
...,...,...,...
783,American Graffiti,On their last evening of summer vacation in 19...,George_Lucas
784,Star Wars (film),"Amid a galactic civil war, Rebel Alliance spie...",George_Lucas
785,Star Wars: Episode I – The Phantom Menace,The Trade Federation creates turmoil in the Ga...,George_Lucas
786,Star Wars: Episode II – Attack of the Clones,"Ten years after the battle at Naboo , [a] the ...",George_Lucas
