# Ugwumsi Egbuna
## Data Scientist
## IMDb Web Scraper
## March 11, 2023.

# This script scrapes data on movies from January 2000 to September 2022
## It captures the following data points
1. Movie Title
2. Release Date
3. Rating
4. Total Number of votes in the rating
5. Award Wins and Nominations
6. Country of Origin
7. Directors


Import Statements

In [1]:
import os
import csv
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

Call Web Driver

In [2]:
# Start a Chrome browser session. ChromeDriverManager().install() supplies the
# driver path that is handed to the Service wrapper (the cell output below
# shows it downloading the driver).
my_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
)

[WDM] - Downloading: 100%|█████████████████| 8.84M/8.84M [00:01<00:00, 8.37MB/s]


Navigate to the search results page on the IMDb website.

In [3]:
# Search URL: feature films released between 2000-01-01 and 2022-09-14,
# listed 100 results per page.
feature_films = (
    'https://www.imdb.com/search/title/?title_type=feature&release_date=2000-01-01,2022-09-14&count=100'
)

# Load the search results page in the browser session.
my_driver.get(feature_films)

## The following steps were taken in this scraping exercise;
1. Install and set up a Chrome WebDriver, then launch the Chrome browser window.
2. Navigate to the search results page.
3. Extract the movie titles and movie links on each page.
4. Check whether the last movie on the current page has been reached and, if so, go to the next page to continue scraping.
5. Loop through the list of movie URLs collected in steps 3 and 4.
6. Navigate to the url in the browser, and get all the relevant details from there such as Directors, Award Wins, Ratings etc.
7. Add them to their respective lists.
8. Create a CSV file in the present working directory and write all the data to the file.


In [4]:

# Column order shared by the CSV file and the returned DataFrame.
_CSV_COLUMNS = ['Title', 'Link', 'Release Date', 'Rating', 'Total Votes',
                'Awards Wins & Nominations', 'Country of Origin', 'Director']

# Number of results per search page; must match the `count=100` query
# parameter in the `feature_films` URL.
_RESULTS_PER_PAGE = 100


def _text_at(elements, index=0):
    """Return the text of elements[index], or None if the list is too short."""
    return elements[index].text if len(elements) > index else None


def _load_known_links(csv_path):
    """Return the set of links already in csv_path, creating the file with a header row if it is missing or empty."""
    if not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0:
        with open(csv_path, 'a', newline='', encoding='utf-8') as handle:
            csv.writer(handle).writerow(_CSV_COLUMNS)
        return set()
    with open(csv_path, newline='', encoding='utf-8') as handle:
        return {row.get('Link') for row in csv.DictReader(handle)}


def _collect_search_results(max_movies):
    """Walk the search result pages and return parallel lists (titles, links) for up to max_movies movies."""
    titles, links = [], []
    for movie_index in range(max_movies):
        headers = my_driver.find_elements(By.CSS_SELECTOR, 'h3.lister-item-header')
        # The index restarts on every page, so map the running counter
        # onto a position within the current page.
        position = movie_index % _RESULTS_PER_PAGE
        if position >= len(headers):
            break  # The page holds fewer movies than expected; nothing left to read.
        header = headers[position]

        # Record the movie BEFORE handling pagination so the last entry on a
        # page is not lost (and before the click makes `header` stale).
        anchor = header.find_element(By.CSS_SELECTOR, 'a')
        titles.append(anchor.text)
        links.append(anchor.get_attribute('href'))

        # The header's <span> text is stripped of '.' and compared with the
        # upper bound taken from the results summary (the part after '-' in
        # its first space-separated token).
        rank = header.find_element(By.CSS_SELECTOR, 'span').text.strip('.')
        summary = my_driver.find_element(By.XPATH, '//*[@id="main"]/div/div[4]/span[1]').text
        last_rank_on_page = summary.split(' ')[0].split('-')[1]

        if rank == last_rank_on_page:
            # find_elements returns an empty list (instead of raising) when
            # there is no "Next" link, i.e. on the final results page.
            next_links = my_driver.find_elements(By.CSS_SELECTOR, 'div.desc > a.lister-page-next.next-page')
            if not next_links:
                break
            next_links[0].click()
            time.sleep(4)  # Give the next results page time to load.
    return titles, links


def _scrape_movie_page(url):
    """Open url and return (release_date, rating, total_votes, awards, country, director); missing fields are None."""
    my_driver.get(url)

    # NOTE(review): these selectors rely on generated class names and absolute
    # XPaths, so they presumably break when the site layout changes -- confirm
    # against the live page. Missing elements are recorded as None.
    release = my_driver.find_elements(By.CSS_SELECTOR, 'div.sc-f65f65be-0.ktSkVi ul.ipc-inline-list.ipc-inline-list--show-dividers.ipc-inline-list--inline.ipc-metadata-list-item__list-content.base a')
    rates = my_driver.find_elements(By.CSS_SELECTOR, 'div.ipc-button__text > div.sc-f6306ea-3.loTxjn')
    wins = my_driver.find_elements(By.XPATH, '//*[@id="__next"]/main/div/section[1]/div/section/div/div[1]/section[1]')
    direct = my_driver.find_elements(By.XPATH, '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/div[3]/ul/li[1]/div')

    rate_text = _text_at(rates)
    if rate_text is None or rate_text == 'Rate':
        # Unrated movie: the button only shows 'Rate', so there is neither a
        # score nor a vote count to extract.
        movie_rating, votes = None, None
    else:
        # The first three characters are kept as the rating and everything
        # from offset 8 as the vote count.
        movie_rating, votes = rate_text[:3], rate_text[8:]

    wins_text = _text_at(wins)
    awards = wins_text if wins_text and 'Award' in wins_text else None

    return (_text_at(release, 0), movie_rating, votes, awards,
            _text_at(release, 1), _text_at(direct))


def scrape_data(max_movies=300,
                output_dir='/Users/macbookpro2018/data_science/web_scraping/IMDb',
                output_file='imdb_feature_films.csv'):
    '''Scrape movie data from the search results and append it to a CSV file.

       The function opens the search page stored in the module-level
       'feature_films' using the module-level 'my_driver', collects the title
       and link of up to 'max_movies' movies (clicking "Next" and waiting
       4 seconds whenever the last movie of a page is reached), then visits
       every link that is not yet in the CSV file and records its release
       date, rating, total votes, award summary, country of origin and
       director. Each scraped movie is appended to the CSV immediately, so
       an interrupted run can be resumed without re-scraping.

       Parameters:
           max_movies  -- maximum number of search results to collect.
           output_dir  -- directory holding the CSV file (created if missing).
           output_file -- name of the CSV file inside 'output_dir'.

       Side effects:
           Rebinds the module-level lists 'title_', 'title_element',
           'release_date', 'rating', 'total_votes', 'award_nom_wins',
           'country_of_origin', 'directors' and the module-level 'df'.
           'title_' and 'title_element' hold every collected search result;
           the other lists hold only the movies scraped during this call.

       Returns a pandas DataFrame with one row per movie scraped during this
       call (movies already present in the CSV are skipped).
    '''
    global title_
    global title_element
    global release_date
    global rating
    global total_votes
    global award_nom_wins
    global country_of_origin
    global directors
    global df

    release_date = []
    rating = []
    total_votes = []
    award_nom_wins = []
    country_of_origin = []
    directors = []

    os.makedirs(output_dir, exist_ok=True)
    csv_path = os.path.join(output_dir, output_file)

    # Open the search result page and gather the movie titles and links.
    my_driver.get(feature_films)
    title_, title_element = _collect_search_results(max_movies)

    # Read the stored links once; a set gives constant-time duplicate checks.
    known_links = _load_known_links(csv_path)

    new_rows = []
    with open(csv_path, 'a', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle)
        for movie_title, url in zip(title_, title_element):
            if url in known_links:
                continue  # Already scraped in an earlier run (or earlier in this one).

            details = _scrape_movie_page(url)
            released, movie_rating, votes, awards, country, director = details

            release_date.append(released)
            rating.append(movie_rating)
            total_votes.append(votes)
            award_nom_wins.append(awards)
            country_of_origin.append(country)
            directors.append(director)

            row = [movie_title, url, released, movie_rating, votes, awards, country, director]
            writer.writerow(row)
            handle.flush()  # Persist progress row by row in case a later page fails.
            known_links.add(url)
            new_rows.append(row)

    # Build the frame from complete rows so every column has the same length.
    df = pd.DataFrame(new_rows, columns=_CSV_COLUMNS)
    return df

#Run the Scrape Data function
#scrape_data()


# While the direction for analysis on this data is yet to be decided, the dataset is a very interesting one and there are a lot of areas to explore with it.

## Updates will follow on what insights I would like to glean from this dataset.
#Anticipate!!