# Beautifulsoup Working with Sibling Tables

In [3]:
# importing the libraries
from bs4 import BeautifulSoup
import requests
import csv

In [4]:
# Step 1: Sending a HTTP request to a URL
url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"

# Make a GET request to fetch the raw HTML content
html_content = requests.get(url).text

In [5]:
# Step 2: Parse the html content
soup = BeautifulSoup(html_content, "lxml")

In [6]:
# print the parsed data of html with prettify
print(soup.prettify()) 

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of countries by GDP (nominal) - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"0883ce93-2f35-4600-9b29-fb5d0a358b67","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_countries_by_GDP_(nominal)","wgTitle":"List of countries by GDP (nominal)","wgCurRevisionId":965674940,"wgRevisionId":965674940,"wgArticleId":380845,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages using Timeline","Wikipedia indefinitely semi-protected pages","Articles with short 

In [7]:
# Step 3: Analyze the HTML tag, where your content lives
# Create a data dictionary to store the data.
data = {}

In [8]:
# Get the table having the class wikitable
gdp_table = soup.find("table", attrs={"class": "wikitable"})
gdp_table_data = gdp_table.tbody.find_all("tr")  

In [91]:
# contains 2 rows
# one for headings
# one for tables 
# print(gdp_table_data)

In [92]:
# Get all the headings of Lists
print(gdp_table_data[0])

<tr>
<td style="width:33%; text-align:center;"><b>Per the <a href="/wiki/International_Monetary_Fund" title="International Monetary Fund">International Monetary Fund</a> (2019 estimates)</b><sup class="reference" id="cite_ref-GDP_IMF_1-2"><a href="#cite_note-GDP_IMF-1">[1]</a></sup>
</td>
<td style="width:33%; text-align:center;"><b>Per the <a href="/wiki/World_Bank" title="World Bank">World Bank</a> (2019)</b><sup class="reference" id="cite_ref-worldbank_21-0"><a href="#cite_note-worldbank-21">[20]</a></sup>
</td>
<td style="width:33%; text-align:center;"><b>Per the <a href="/wiki/United_Nations" title="United Nations">United Nations</a> (2017)</b><sup class="reference" id="cite_ref-22"><a href="#cite_note-22">[21]</a></sup><sup class="reference" id="cite_ref-23"><a href="#cite_note-23">[22]</a></sup>
</td></tr>


In [93]:
# create empty list
headings = []

for td in gdp_table_data[0].find_all("td"):
    # remove any newlines and extra spaces from left and right
    # append headings to dictionary
    headings.append(td.b.text.replace('\n', ' ').strip())
    
print(headings)

['Per the International Monetary Fund (2019 estimates)', 'Per the World Bank (2019)', 'Per the United Nations (2017)']


In [94]:
# get the table data
gdp_table_data[1]

<tr valign="top">
<td>
<table class="wikitable sortable" style="margin-left:auto; margin-right:auto; margin-top:0;">
<tbody><tr>
<th data-sort-type="number" style="width:2em;">Rank</th>
<th>Country/Territory</th>
<th>GDP<br/>(US$million)
</th></tr>
<tr style="font-weight:bold;background:#eaecf0">
<td data-sort-value="-1"> </td>
<td data-sort-value=""><span class="flagicon" style="padding-left:25px;"> </span><i><a href="/wiki/Gross_world_product" title="Gross world product">World</a></i><sup class="reference" id="cite_ref-IMF_Groups_20-1"><a href="#cite_note-IMF_Groups-20">[19]</a></sup></td>
<td align="right">87,265,226
</td></tr>
<tr>
<td align="right">1</td>
<td><span class="flagicon"><img alt="" class="thumbborder" data-file-height="650" data-file-width="1235" decoding="async" height="12" src="//upload.wikimedia.org/wikipedia/en/thumb/a/a4/Flag_of_the_United_States.svg/23px-Flag_of_the_United_States.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/a/a4/Flag_of_the_United_S

In [95]:
# Get all the 3 tables contained in "gdp_table"
for table, heading in zip(gdp_table_data[1].find_all("table"), headings):
    # Get headers of table i.e., Rank, Country, GDP.
    t_headers = []
    for th in table.find_all("th"):
        # remove any newlines and extra spaces from left and right
        t_headers.append(th.text.replace('\n', '').strip())

    # Get all the rows of table
    table_data = []
    for tr in table.tbody.find_all("tr"):  # find all tr's from table's tbody
        # create dictionaries for rows
        t_row = {}
        
        # Each table row is stored in the form of
        # t_row = {'Rank': '', 'Country/Territory': '', 'GDP(US$million)': ''}
        # find all td's(3) in tr and zip it with t_header
        for td, th in zip(tr.find_all("td"), t_headers):
            t_row[th] = td.text.replace('\n', '').strip()
        table_data.append(t_row)

    # Put the data for the table with his heading.
    data[heading] = table_data
    
    print(table_data)

[{}, {'Rank': '', 'Country/Territory': 'World[19]', 'GDP(US$million)': '87,265,226'}, {'Rank': '1', 'Country/Territory': 'United States', 'GDP(US$million)': '21,439,453'}, {'Rank': '—', 'Country/Territory': 'European Union[23][n 1]', 'GDP(US$million)': '18,705,132'}, {'Rank': '2', 'Country/Territory': 'China[n 2]', 'GDP(US$million)': '14,140,163'}, {'Rank': '3', 'Country/Territory': 'Japan', 'GDP(US$million)': '5,154,475'}, {'Rank': '4', 'Country/Territory': 'Germany', 'GDP(US$million)': '3,863,344'}, {'Rank': '5', 'Country/Territory': 'India', 'GDP(US$million)': '2,935,570'}, {'Rank': '6', 'Country/Territory': 'United Kingdom', 'GDP(US$million)': '2,743,586'}, {'Rank': '7', 'Country/Territory': 'France', 'GDP(US$million)': '2,707,074'}, {'Rank': '8', 'Country/Territory': 'Italy', 'GDP(US$million)': '1,988,636'}, {'Rank': '9', 'Country/Territory': 'Brazil', 'GDP(US$million)': '1,847,020'}, {'Rank': '10', 'Country/Territory': 'Canada', 'GDP(US$million)': '1,730,914'}, {'Rank': '11', 'Co

In [73]:
# Step 4: Export the data to csv
"""
For this example let's create 3 seperate csv for 
3 tables respectively
"""
for topic, table in data.items():
    # Create csv file for each table
    with open(f"{topic}.csv", 'w') as out_file:
        # Each 3 table has headers as following
        headers = [
            "Country/Territory",
            "GDP(US$million)",
            "Rank"
        ]  # == t_headers
        writer = csv.DictWriter(out_file, headers)
        # write the header
        writer.writeheader()
        for row in table:
            if row:
                writer.writerow(row)