In [1]:
from bs4 import BeautifulSoup
import requests

page = requests.get("https://en.wikipedia.org/wiki/List_of_highest-grossing_films")
soup = BeautifulSoup(page.content)

print(soup.prettify()[:1000])

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of highest-grossing films - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"a91edc8b-fe31-46b2-82a2-16cbb24a5e94","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_highest-grossing_films","wgTitle":"List of highest-grossing films","wgCurRevisionId":961013473,"wgRevisionId":961013473,"wgArticleId":59892,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 uses Russian-language script (ru)","CS1 Russian-language sources (ru)","CS1 errors: missing period

In [2]:
print(soup.title.string)

List of highest-grossing films - Wikipedia


In [3]:
film_table = soup.find("table", class_="wikitable")

def clean_string(string):
    if string:
        cleaned_string = string.strip().replace('\n', ' ').strip();
        return cleaned_string
    
    return string

all_results = [];

for row in film_table.find_all("tr"):
    columns = row.find_all(["td", "th"])
    if(columns[0].get("scope")):
        continue;
        
    rank = clean_string(columns[0].text)
    title = clean_string(columns[2].text)
    grossing = clean_string(columns[3].text)
    year = clean_string(columns[4].text)
    
    link_anchor = columns[2].find("a")
    if(link_anchor):
        link_detail = clean_string(link_anchor['href'])
    
    result = (rank, title, link_detail, grossing, year)
    all_results.append(result)

print(all_results)


[('1', 'Avengers: Endgame', '/wiki/Avengers:_Endgame', '$2,797,800,564', '2019'), ('2', 'Avatar', '/wiki/Avatar_(2009_film)', '$2,790,439,000', '2009'), ('3', 'Titanic', '/wiki/Titanic_(1997_film)', '$2,194,439,542', '1997'), ('4', 'Star Wars: The Force Awakens', '/wiki/Star_Wars:_The_Force_Awakens', '$2,068,223,624', '2015'), ('5', 'Avengers: Infinity War', '/wiki/Avengers:_Infinity_War', '$2,048,359,754', '2018'), ('6', 'Jurassic World', '/wiki/Jurassic_World', '$1,671,713,208', '2015'), ('7', 'The Lion King', '/wiki/The_Lion_King_(2019_film)', '$1,656,943,394', '2019'), ('8', 'The Avengers', '/wiki/The_Avengers_(2012_film)', '$1,518,812,988', '2012'), ('9', 'Furious 7', '/wiki/Furious_7', '$1,516,045,911', '2015'), ('10', 'Frozen II', '/wiki/Frozen_II', '$1,450,026,933', '2019'), ('11', 'Avengers: Age of Ultron', '/wiki/Avengers:_Age_of_Ultron', '$1,402,805,868', '2015'), ('12', 'Black Panther', '/wiki/Black_Panther_(film)', '$1,346,913,161', '2018'), ('13', 'Harry Potter and the De

In [4]:
import pandas as pd

headers = ['rank', 'Title', 'link_detail', "Worldwide gross ($USD)", "Year"]

df = pd.DataFrame(all_results, columns=headers)

df.head(50)

Unnamed: 0,rank,Title,link_detail,Worldwide gross ($USD),Year
0,1,Avengers: Endgame,/wiki/Avengers:_Endgame,"$2,797,800,564",2019
1,2,Avatar,/wiki/Avatar_(2009_film),"$2,790,439,000",2009
2,3,Titanic,/wiki/Titanic_(1997_film),"$2,194,439,542",1997
3,4,Star Wars: The Force Awakens,/wiki/Star_Wars:_The_Force_Awakens,"$2,068,223,624",2015
4,5,Avengers: Infinity War,/wiki/Avengers:_Infinity_War,"$2,048,359,754",2018
5,6,Jurassic World,/wiki/Jurassic_World,"$1,671,713,208",2015
6,7,The Lion King,/wiki/The_Lion_King_(2019_film),"$1,656,943,394",2019
7,8,The Avengers,/wiki/The_Avengers_(2012_film),"$1,518,812,988",2012
8,9,Furious 7,/wiki/Furious_7,"$1,516,045,911",2015
9,10,Frozen II,/wiki/Frozen_II,"$1,450,026,933",2019


In [5]:
def from_details_get_film_description(soup):

    main_content = soup.find(class_="mw-parser-output")
    description = ""
    for child in main_content.children:
        if(child.name == "h2"):
            break;
        if(child.name == "p"):
            description += "\n" + child.text

    description = description.strip()
#     print(description)
    return description


def from_details_get_film_director(soup):

    main_content = soup.find("table",  class_="infobox")
    for table_row in main_content.find_all("tr"):
        header = table_row.find('th')
        if(header and header.text == "Directed by"):
            column = table_row.find("td");
            director = clean_string(column.text)
            return director




def get_film_details(detail_url):

    page = requests.get("https://en.wikipedia.org"+detail_url)
    soup = BeautifulSoup(page.content)


    description = from_details_get_film_description(soup)
    director = from_details_get_film_director(soup)
    result = (description, director)
    return result




df["link_detail"].sample(1).map(get_film_details)



19    (Iron Man 3 is a 2013 American superhero film ...
Name: link_detail, dtype: object

In [6]:
from time import perf_counter

time1 = perf_counter()

result = df["link_detail"].map(get_film_details)

time2 = perf_counter()

print("time taken:", time2-time1)


time taken: 13.79202658


In [7]:
df['description'], df['director'] = zip(*result)

df.head()

Unnamed: 0,rank,Title,link_detail,Worldwide gross ($USD),Year,description,director
0,1,Avengers: Endgame,/wiki/Avengers:_Endgame,"$2,797,800,564",2019,Avengers: Endgame is a 2019 American superhero...,Anthony RussoJoe Russo
1,2,Avatar,/wiki/Avatar_(2009_film),"$2,790,439,000",2009,Avatar (marketed as James Cameron's Avatar) is...,James Cameron
2,3,Titanic,/wiki/Titanic_(1997_film),"$2,194,439,542",1997,Titanic is a 1997 American epic romance and di...,James Cameron
3,4,Star Wars: The Force Awakens,/wiki/Star_Wars:_The_Force_Awakens,"$2,068,223,624",2015,Star Wars: The Force Awakens (also known as St...,J. J. Abrams
4,5,Avengers: Infinity War,/wiki/Avengers:_Infinity_War,"$2,048,359,754",2018,Avengers: Infinity War is a 2018 American supe...,Anthony RussoJoe Russo


In [8]:
import multiprocessing as mp
time1 = perf_counter()

with mp.Pool(mp.cpu_count()) as pool:
    threaded_result = pool.map(get_film_details, df["link_detail"])

time2 = perf_counter()

print("time taken:", time2-time1)

df['description'], df['director'] = zip(*threaded_result)


time taken: 2.908397775000001


In [9]:
df.head(50)

Unnamed: 0,rank,Title,link_detail,Worldwide gross ($USD),Year,description,director
0,1,Avengers: Endgame,/wiki/Avengers:_Endgame,"$2,797,800,564",2019,Avengers: Endgame is a 2019 American superhero...,Anthony RussoJoe Russo
1,2,Avatar,/wiki/Avatar_(2009_film),"$2,790,439,000",2009,Avatar (marketed as James Cameron's Avatar) is...,James Cameron
2,3,Titanic,/wiki/Titanic_(1997_film),"$2,194,439,542",1997,Titanic is a 1997 American epic romance and di...,James Cameron
3,4,Star Wars: The Force Awakens,/wiki/Star_Wars:_The_Force_Awakens,"$2,068,223,624",2015,Star Wars: The Force Awakens (also known as St...,J. J. Abrams
4,5,Avengers: Infinity War,/wiki/Avengers:_Infinity_War,"$2,048,359,754",2018,Avengers: Infinity War is a 2018 American supe...,Anthony RussoJoe Russo
5,6,Jurassic World,/wiki/Jurassic_World,"$1,671,713,208",2015,Jurassic World is a 2015 American science fict...,Colin Trevorrow
6,7,The Lion King,/wiki/The_Lion_King_(2019_film),"$1,656,943,394",2019,The Lion King is a 2019 American musical drama...,Jon Favreau
7,8,The Avengers,/wiki/The_Avengers_(2012_film),"$1,518,812,988",2012,Marvel's The Avengers[6] (classified under the...,Joss Whedon
8,9,Furious 7,/wiki/Furious_7,"$1,516,045,911",2015,Furious 7 (titled onscreen and other territori...,James Wan
9,10,Frozen II,/wiki/Frozen_II,"$1,450,026,933",2019,"Frozen II, also known as Frozen 2, is a 2019 A...",Chris Buck Jennifer Lee


In [10]:
df.to_csv("Top grossing films.csv", columns=["rank", "Title", "Worldwide gross ($USD)" , "Year","description", "director"])