### Web scraping code for 'Top Lifetime Grosses' by Box Office Mojo (IMDbPro)

In [122]:
# The website URL_link: https://www.boxofficemojo.com/chart/top_lifetime_gross/?area=XWW
from selenium import webdriver
from selenium.webdriver.chrome.service import Service 
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests 
from bs4 import BeautifulSoup
from datetime import datetime
from time import sleep
import pandas as pd
from tqdm import tqdm, trange

In [123]:
# Location of the local ChromeDriver binary; adjust this path for your machine.
CHROMEDRIVER_PATH = "C:/Users/Владислав/Desktop/ChromeDriver/chromedriver.exe"
BASE_URL = "https://www.boxofficemojo.com"
CHART_URL = BASE_URL + "/chart/top_lifetime_gross/?area=XWW"
# CSS selector of the "next page" link inside the pagination bar.
NEXT_PAGE_SELECTOR = "#a-page > main > div > div > div.a-text-center.mojo-pagination > ul > li.a-last > a"


def extract_title_links(html):
    """Return the absolute film URLs found in the HTML of one chart page.

    Only anchors with the class "a-link-normal" whose href starts with
    "/title/" are kept. The page stores relative hrefs, so BASE_URL is
    prepended to each of them.

    Parameters:
        html: the page source of a chart page (str).

    Returns:
        A list of absolute URLs, in document order.
    """
    soup = BeautifulSoup(html, "html.parser")
    # href=True skips anchors without an href attribute; calling .startswith() on
    # the None returned by .get("href") would raise AttributeError.
    return [
        BASE_URL + anchor["href"]
        for anchor in soup.find_all("a", class_="a-link-normal", href=True)
        if anchor["href"].startswith("/title/")
    ]


s = Service(CHROMEDRIVER_PATH)
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--disable-notifications")
# chrome_options.add_argument("--headless")  # Uncomment to run the browser without a visible window
browser = webdriver.Chrome(service=s, options=chrome_options)
try:
    browser.maximize_window()
    browser.get(CHART_URL)
    browser.implicitly_wait(10)
    sleep(5)  # Give the first page time to render completely

    start = datetime.now()  # Measure how long it takes to collect the links of all films

    # Links from the first chart page; every following page extends this list.
    URL_links = extract_title_links(browser.page_source)

    # Click the "next page" link for as long as it is available.
    while True:
        try:
            next_page = browser.find_element(By.CSS_SELECTOR, NEXT_PAGE_SELECTOR)
            next_page.click()
            sleep(3)  # Let the next page load before reading its source

            URL_links.extend(extract_title_links(browser.page_source))

            # Block until the next-page link is clickable again. In the recorded run
            # the loop ended here with a TimeoutException - presumably because the
            # last page has no enabled next-page link (TODO confirm).
            WebDriverWait(browser, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR, NEXT_PAGE_SELECTOR)))
        # Any of the exceptions below ends the pagination loop:
        except TimeoutException:
            print(f"TimeoutException! All URL-links were successfully scraped for the following time: {datetime.now() - start}")
            break
        except NoSuchElementException:
            print(f"NoSuchElement! All URL-links were successfully scraped for the following time: {datetime.now() - start}")
            break
        except ElementClickInterceptedException:
            print(f"ElementClickIntercepted! All URL-links were successfully scraped for the following time: {datetime.now() - start}")
            break
finally:
    # The remaining cells use requests, not the browser, so always release the
    # Chrome process and the driver - even when scraping fails half-way.
    browser.quit()

TimeoutException! All URL-links were successfully scraped for the following time: 0:00:51.683316


In [140]:
# Sanity check: how many film links were collected (the chart lists 1000 films)
link_count = len(URL_links)
print(link_count)
# The last expression of the cell is rendered as its output: the first 20 links
URL_links[0:20]

1000


['https://www.boxofficemojo.com/title/tt0499549/?ref_=bo_cso_table_1',
 'https://www.boxofficemojo.com/title/tt4154796/?ref_=bo_cso_table_2',
 'https://www.boxofficemojo.com/title/tt0120338/?ref_=bo_cso_table_3',
 'https://www.boxofficemojo.com/title/tt2488496/?ref_=bo_cso_table_4',
 'https://www.boxofficemojo.com/title/tt4154756/?ref_=bo_cso_table_5',
 'https://www.boxofficemojo.com/title/tt10872600/?ref_=bo_cso_table_6',
 'https://www.boxofficemojo.com/title/tt0369610/?ref_=bo_cso_table_7',
 'https://www.boxofficemojo.com/title/tt6105098/?ref_=bo_cso_table_8',
 'https://www.boxofficemojo.com/title/tt0848228/?ref_=bo_cso_table_9',
 'https://www.boxofficemojo.com/title/tt2820852/?ref_=bo_cso_table_10',
 'https://www.boxofficemojo.com/title/tt4520988/?ref_=bo_cso_table_11',
 'https://www.boxofficemojo.com/title/tt2395427/?ref_=bo_cso_table_12',
 'https://www.boxofficemojo.com/title/tt1825683/?ref_=bo_cso_table_13',
 'https://www.boxofficemojo.com/title/tt1201607/?ref_=bo_cso_table_14',


In [181]:
# Visit every collected film URL and extract the fields listed below.
# Pages are downloaded with requests and parsed with BeautifulSoup.

# Seconds to wait for a film page; without a timeout a single stalled
# connection would block the whole run indefinitely.
REQUEST_TIMEOUT = 30


def format_gross(span):
    """Return the text of a "money" span without the leading "$" and with
    the thousands separators "," replaced by spaces (e.g. "$1,234" -> "1 234")."""
    return span.get_text().strip().lstrip("$").replace(",", " ")


film_titles = []  # Film titles
film_years = []  # Production years of the films
description = []  # Film plots / descriptions
domestic_box_office = []  # Box office within the country of origin
international_box_office = []  # Box office abroad
worldwide_box_office = []  # Total box office (domestic + international)

# One Session reuses the underlying connection instead of opening a new one per film.
with requests.Session() as session:
    for link in tqdm(URL_links):
        response = session.get(link, timeout=REQUEST_TIMEOUT)
        # Fail with a clear HTTP error instead of an obscure IndexError/AttributeError
        # raised later while parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Production year, still wrapped in parentheses at this point, e.g. "(2009)":
        year = soup.find_all("span", class_="a-size-large a-color-secondary")[0].get_text().strip()
        # The <h1> text contains the title followed by the year, so the year is cut out:
        title = soup.h1.get_text().strip().replace(year, "").strip()
        # Plot / description of the film:
        plot = soup.find_all("span", "a-size-medium")[0].get_text().strip()

        # Look up the "money" spans once. The first three are taken as the domestic,
        # international and worldwide grosses, in that order.
        # NOTE(review): a page with fewer than three such spans raises IndexError,
        # exactly as before - confirm that every film page shows all three values.
        money_spans = soup.find_all("span", class_="money")
        dom_gross = format_gross(money_spans[0])
        int_gross = format_gross(money_spans[1])
        world_gross = format_gross(money_spans[2])

        # Append every field to its list container
        film_titles.append(title)
        film_years.append(year.lstrip("(").rstrip(")"))  # Drop the parentheses around the year
        description.append(plot)
        domestic_box_office.append(dom_gross)
        international_box_office.append(int_gross)
        worldwide_box_office.append(world_gross)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [26:10<00:00,  1.57s/it]


In [184]:
# Combine the six parallel lists into a single list of per-film tuples:
# (title, year, plot, domestic, international, worldwide).
# This makes it easy to iterate over all categories of one film at once.
zip_list = [
    film_record
    for film_record in zip(
        film_titles,
        film_years,
        description,
        domestic_box_office,
        international_box_office,
        worldwide_box_office,
    )
]
zip_list[0:5]  # Preview of the first five records

[('Avatar',
  '2009',
  'A paraplegic Marine dispatched to the moon Pandora on a unique mission becomes torn between following his orders and protecting the world he feels is his home.',
  '760 507 625',
  '2 086 889 714',
  '2 847 397 339'),
 ('Avengers: Endgame',
  '2019',
  "After the devastating events of Avengers: Infinity War, the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.",
  '858 373 000',
  '1 939 128 328',
  '2 797 501 328'),
 ('Titanic',
  '1997',
  'A seventeen-year-old aristocrat falls in love with a kind but poor artist aboard the luxurious, ill-fated R.M.S. Titanic.',
  '659 363 944',
  '1 542 283 320',
  '2 201 647 264'),
 ('Star Wars: Episode VII - The Force Awakens',
  '2015',
  'As a new threat to the galaxy rises, Rey, a desert scavenger, and Finn, an ex-stormtrooper, must join Han Solo and Chewbacca to search for the one hope of restoring peace.',


In [193]:
# Final step: store the scraped records in a ";"-separated CSV file and load them
# back into a dataframe.
CSV_PATH = r"Mojo_Dataset - Top 1000 Lifetime Film Grosses.csv"
# Column titles of the dataframe (the file itself is written without a header row):
columns = ["Title", "Year", "Plot", "Domestic_box_office", "International_box_office", "Worldwide_box_office"]

# Write through pandas instead of formatting every line by hand: to_csv quotes a
# field that contains ";", a double quote or a line break, so such a title or plot
# no longer corrupts its row when the file is read back.
pd.DataFrame(zip_list, columns=columns).to_csv(
    CSV_PATH, sep=";", header=False, index=False, encoding="utf-8"
)

df = pd.read_csv(CSV_PATH, delimiter=";", names=columns, encoding="utf-8")
# The dataframe is large, so by default pandas truncates what it displays.
# Lift the display limits:
pd.set_option("display.max_rows", None)  # Display all dataframe rows
pd.set_option("display.max_colwidth", None)  # Display the full content of every cell
pd.set_option("display.max_columns", None)  # Display all dataframe columns
df.head(5)  # The first 5 rows of the created dataframe

Unnamed: 0,Title,Year,Plot,Domestic_box_office,International_box_office,Worldwide_box_office
0,Avatar,2009,A paraplegic Marine dispatched to the moon Pandora on a unique mission becomes torn between following his orders and protecting the world he feels is his home.,760 507 625,2 086 889 714,2 847 397 339
1,Avengers: Endgame,2019,"After the devastating events of Avengers: Infinity War, the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.",858 373 000,1 939 128 328,2 797 501 328
2,Titanic,1997,"A seventeen-year-old aristocrat falls in love with a kind but poor artist aboard the luxurious, ill-fated R.M.S. Titanic.",659 363 944,1 542 283 320,2 201 647 264
3,Star Wars: Episode VII - The Force Awakens,2015,"As a new threat to the galaxy rises, Rey, a desert scavenger, and Finn, an ex-stormtrooper, must join Han Solo and Chewbacca to search for the one hope of restoring peace.",936 662 225,1 132 859 475,2 069 521 700
4,Avengers: Infinity War,2018,The Avengers and their allies must be willing to sacrifice all in an attempt to defeat the powerful Thanos before his blitz of devastation and ruin puts an end to the universe.,678 815 482,1 369 544 272,2 048 359 754
