In [75]:
# Importing the necessary libraries

# Selenium is used for automating web browser interaction
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By

# BeautifulSoup for extracting data
import bs4
from bs4 import BeautifulSoup

# Used for making HTTP request to web page
import requests

# time is used to add delays
import time

import pandas as pd #importing pandas

In [77]:
print("pandas version:", pd.__version__)
print("selenium version:", selenium.__version__)
print("requests version:", requests.__version__)
print("beautifulsoup4 version:", bs4.__version__)

pandas version: 2.2.2
selenium version: 4.31.0
requests version: 2.32.3
beautifulsoup4 version: 4.12.3


In [3]:
# Launching new Chrome Browser
driver = webdriver.Chrome()

In [5]:
# Navigating to the IMDb page that lists Nepali films
driver.get('https://www.imdb.com/search/title/?primary_language=ne&title_type=feature')

#Maximizing the browser window
driver.maximize_window()

In [7]:
for i in range(1):  # put the range value based on the number of pages in imdb
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")     # Scroll to the bottom of the page
    time.sleep(2)
    driver.execute_script("window.scrollBy(0,-300);")    # Scroll slightly up to make sure the "Load More" button is in view
    time.sleep(1)
    try:
        # click the "Load More" button using its XPath
        driver.find_element(By.XPATH, value='//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/div[2]/div/span/button').click()
        time.sleep(3)
    except:
        break                #If the button is not found break the loop

In [9]:
html = driver.page_source   #Getting the full HTML source code

In [11]:
soup = BeautifulSoup(html,'lxml')          # Parsing the HTML content with BeautifulSoup using the 'lxml' parser for easy extraction of data

In [13]:
# Using BeautifulSoup to find all list items of class 'ipc-metadata-list-summary-item'
list_items = soup.find_all('li',class_="ipc-metadata-list-summary-item")

In [15]:
href = []
for i in list_items:
    title = i.find('a',class_="ipc-title-link-wrapper")    # Finding the anchor ('a') tag that contains the movie title link, using its class name

    href.append(title.get('href'))         # Appending the urls to the href list

In [17]:
# Initializing empty lists to store specific movie information

Director = []
Stars = []
Writer = []
Writers = []
Directors = []
Release_date = []
Title = []
Summaries = []
Star = []
Genre = []
Image = []
list1 = {
    'Director': Director,
    'Directors': Directors,
    'Writer' : Writer,
    'Writers' : Writers,
    'Stars' : Stars,
    'Star': Star,
    'Release_date': Release_date,
    'Summaries': Summaries,
    'Genre': Genre,
    'Image':Image
}

In [21]:
for i in href:
    # Defining a header to mimic a real browser request
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36'}
    
    url = 'https://www.imdb.com/' + i
    request = requests.get(url,headers = headers)
    soup = BeautifulSoup(request.text,'lxml')
                         
     # Trying to find the unordered list that contains metadata like directors, writers, stars, etc.
    unorder = soup.find('ul',class_="ipc-metadata-list ipc-metadata-list--dividers-all title-pc-list ipc-metadata-list--baseAlt")

    # Check if the 'unorder' list exists and has at least 3 items(Director, Writers and Stars)
    if unorder is not None and len(unorder)==3:
        for i in unorder:
            try:

                # Extracting the role (e.g., Director, Writer, Star) from the metadata list
                role = i.find('span',class_="ipc-metadata-list-item__label ipc-metadata-list-item__label--btn ipc-btn--not-interactable").text
                if role in list1.keys():    
                    names = []

                    # Extracting the names associated with the role (Directors, Writers, Stars, etc.)
                    name = i.find_all('a',class_ = "ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link")
                    for i in name:
                        names.append(i.text)

                 # Appending the extracted names to the appropriate list based on the role
                    if role == 'Director' or role == 'Directors':
                        Director.append(names)
                    elif role == 'Writer' or role == 'Writers':
                        Writer.append(names)
                    elif role == 'Stars' or role == 'Star':
                        Stars.append(names)
                    
            except Exception as e:
                print(f'{url} has Director or Writer issue {e}')       # Handle any issues encountered during director or writer extraction
            try:

                # Extracting additional roles (e.g., Star, Writer) if available
                role = i.find('a',class_='ipc-metadata-list-item__label ipc-metadata-list-item__label--link').text    
                if role in list1.keys():
                    names = []
                    name = i.find_all('a',class_="ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link")
                    for i in name:
                        names.append(i.text)
                    if role == 'Stars' or role == 'Star':
                        Stars.append(names)
                    elif role == 'Writer' or role == 'Writers':
                        Writer.append(names)
            except Exception as e:
                print(f'{url} has Stars issue {e}')
    else:
        print('Less than 3')
        continue          # If less than 3 items found, skip to the next URL
    try:
        
        # Extracting movie title and release date from the page
        title_date = soup.find('div', class_="sc-70a366cc-0 bxYZmb")
        title = title_date.find('span', class_="hero__primary-text").text
        Title.append(title)
        date = title_date.find('a',class_="ipc-link ipc-link--baseAlt ipc-link--inherit-color").text
        Release_date.append(date)
    except Exception as e:

        # If there is an error extracting the title or release date, append 'N/A'
        Title.append('N/A')
        Release_date.append('N/A')
        print(f"{url} has Date or Title issue {e}")
    try:

         # Extracting movie genres
        genre = soup.find_all('span',class_="ipc-chip__text")
        list2 = []
        for i in genre:
            list2.append(i.text)
        Genre.append(list2[:-1])    # Removing the last element (not a genre)
    except Exception as e:

         # If there is an error extracting the title or release date, append 'N/A'
        Genre.append('N/A')
        print(f"{url} has Genre issue {e}")
    try:

         # Extracting movie image 
        image = soup.find('img', "ipc-image" )['src']
        Image.append(image)
    except Exception as e:
        Image.append('N/A')
        print(f'{url} has Image issue {e}')
    try:

        # Trying to fetch the movie plot summary by navigating to the 'plotsummary' page
        url2 = 'plotsummary/'
        url3 = url.replace('/?ref','/plotsummary/?ref')
        request = requests.get(url3,headers = headers)
        soup = BeautifulSoup(request.text,'lxml')
        summary = soup.find('div', class_="ipc-html-content-inner-div").text
        Summaries.append(summary)
    except Exception as e:
        Summaries.append('N/A')
        print(f'{url} has Summary issue {e}')

https://www.imdb.com//title/tt28495418/?ref_=sr_t_1 has Stars issue 'NoneType' object has no attribute 'text'
https://www.imdb.com//title/tt28495418/?ref_=sr_t_1 has Stars issue 'NoneType' object has no attribute 'text'
https://www.imdb.com//title/tt28495418/?ref_=sr_t_1 has Director or Writer issue 'NoneType' object has no attribute 'text'
https://www.imdb.com//title/tt31495279/?ref_=sr_t_2 has Stars issue 'NoneType' object has no attribute 'text'
https://www.imdb.com//title/tt31495279/?ref_=sr_t_2 has Stars issue 'NoneType' object has no attribute 'text'
https://www.imdb.com//title/tt31495279/?ref_=sr_t_2 has Director or Writer issue 'NoneType' object has no attribute 'text'
https://www.imdb.com//title/tt32332133/?ref_=sr_t_3 has Stars issue 'NoneType' object has no attribute 'text'
https://www.imdb.com//title/tt32332133/?ref_=sr_t_3 has Stars issue 'NoneType' object has no attribute 'text'
https://www.imdb.com//title/tt32332133/?ref_=sr_t_3 has Director or Writer issue 'NoneType' ob

In [23]:
Title = [item for item in Title if item != 'N/A']   # Filtering out 'N/A' values from the 'Title' list to keep only valid movie titles
len(Title)    #Checking the length of the 'Title' list

90

In [25]:
len(Image)   #Checking the length of the 'Image' list

90

In [27]:
len(Stars)     #Checking the length of the 'Stars' list

90

In [29]:
len(Writer)      #Checking the length of the 'Writer' list

90

In [31]:
len(Director)     #Checking the length of the 'Director' list

90

In [33]:
len(Release_date)   #Checking the length of the 'Release_date' list

90

In [35]:
len(Summaries)      #Checking the length of the 'Summaries' list

90

In [37]:
len(Genre)            #Checking the length of the 'Genre' list

90

In [39]:
# Creating a Pandas DataFrame from the extracted data
df = pd.DataFrame({
    'Title':Title,
    'Director':Director,
    'Writer':Writer,
    'Stars':Stars,
    'Genre':Genre,
    'Release_Date':Release_date,
    'Overview':Summaries,
    'Image':Image
})

In [41]:
df.head()

Unnamed: 0,Title,Director,Writer,Stars,Genre,Release_Date,Overview,Image
0,The Red Suitcase,[Fidel Devkota],[Fidel Devkota],"[Saugat Malla, Prabin Khatiwada, Shristi Shres...","[Drama, Mystery]",2023,A pick-up truck driver leaves Kathmandu airpor...,https://m.media-amazon.com/images/M/MV5BYmY0Zj...
1,Pitaambar,[Krisha Chaulagain],"[Pradeep Bhardwaj, Abhimanyu Nirabi]","[Dayahang Rai, Pradeep Khadka, Supushpa Bhatta]",[Action],2025,Pitambar is a story of a fearless and tactful ...,https://m.media-amazon.com/images/M/MV5BNmYxMD...
2,Boksi Ko Ghar,[Sulakshyan Bharati],[Sulakshyan Bharati],"[Keki Adhikari, Shupala Sapkota, Swechchha Raut]",[Thriller],2024,The plot follows a journalist investigating a ...,https://m.media-amazon.com/images/M/MV5BZWY4ZD...
3,Sherpa,[Jennifer Peedom],[Jennifer Peedom],"[Russell Brice, Tim Medvetz, Pasang Tenzing Sh...",[Documentary],2015,"In 2014, director Jennifer Peedom was working ...",https://m.media-amazon.com/images/M/MV5BMjAwOD...
4,Purna Bahadur Ko Sarangi,[Saroj Poudel],"[Mahesh Dawadi, Saroj Poudel]","[Bijay Baral, Prakash Saput, Swayam KC]",[Drama],2024,"A struggling single father battles poverty, us...",https://m.media-amazon.com/images/M/MV5BNzY4ND...


In [43]:
def clean_column(column):

 # If the column is a list, join the elements with a comma to create a single string,Otherwise, return the column as it is
    return ', '.join(column) if isinstance(column, list) else column

In [45]:

# Applying the clean_column function to 'Director', 'Writer', 'Stars' ,'Genre' columns
df['Director'] = df['Director'].apply(clean_column)
df['Writer'] = df['Writer'].apply(clean_column)
df['Stars'] = df['Stars'].apply(clean_column)
df['Genre'] = df['Genre'].apply(clean_column)

In [47]:
df.head()   # Displaying 1st 5 rows

Unnamed: 0,Title,Director,Writer,Stars,Genre,Release_Date,Overview,Image
0,The Red Suitcase,Fidel Devkota,Fidel Devkota,"Saugat Malla, Prabin Khatiwada, Shristi Shrestha","Drama, Mystery",2023,A pick-up truck driver leaves Kathmandu airpor...,https://m.media-amazon.com/images/M/MV5BYmY0Zj...
1,Pitaambar,Krisha Chaulagain,"Pradeep Bhardwaj, Abhimanyu Nirabi","Dayahang Rai, Pradeep Khadka, Supushpa Bhatta",Action,2025,Pitambar is a story of a fearless and tactful ...,https://m.media-amazon.com/images/M/MV5BNmYxMD...
2,Boksi Ko Ghar,Sulakshyan Bharati,Sulakshyan Bharati,"Keki Adhikari, Shupala Sapkota, Swechchha Raut",Thriller,2024,The plot follows a journalist investigating a ...,https://m.media-amazon.com/images/M/MV5BZWY4ZD...
3,Sherpa,Jennifer Peedom,Jennifer Peedom,"Russell Brice, Tim Medvetz, Pasang Tenzing Sherpa",Documentary,2015,"In 2014, director Jennifer Peedom was working ...",https://m.media-amazon.com/images/M/MV5BMjAwOD...
4,Purna Bahadur Ko Sarangi,Saroj Poudel,"Mahesh Dawadi, Saroj Poudel","Bijay Baral, Prakash Saput, Swayam KC",Drama,2024,"A struggling single father battles poverty, us...",https://m.media-amazon.com/images/M/MV5BNzY4ND...


In [49]:
df.duplicated().sum()  # check for duplicate data 

0

In [51]:
df.drop_duplicates(inplace=True) # Removing the duplicates data

In [53]:
df.shape # checking the shape of the dataset

(90, 8)

In [57]:
df.to_csv('Nepali_movie.csv')  #converting to csv file named 'Nepali_movie.csv'