In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
import numpy as np
from datetime import datetime

In [119]:
# Code for ETL operations on Country-GDP data

# Importing the required libraries

def extract(url, table_attribs):
    ''' This function extracts the required
    information from the website and saves it to a dataframe. The
    function returns the dataframe for further processing. '''
    
    gdp_dict = {table_attribs[0]:[], table_attribs[1]:[]}
    
    web_page = requests.get(url).text
    html = BeautifulSoup(web_page,'html.parser')
    table = html.find_all('tbody')[2]
    
    rows = table.find_all('tr')
    for row in rows:
    if (len(row.find_all('td')) == 0):
        continue
    elif (len(row.find_all('td')[0].find_all('a')) == 0):
        continue
    elif (row.find_all('td')[2].get_text() == "—"):
        continue
    else:
        for j in gdp_dict.keys():
            if j == table_attribs[0]:
                gdp_dict[j].append(row.find_all('td')[0].find_all('a')[0].get_text())
            if j == table_attribs[1]:
                gdp_dict[j].append(row.find_all('td')[2].get_text())
    df = pd.DataFrame(gdp_dict)
    return df

def transform(df):
    ''' This function converts the GDP information from Currency
    format to float value, transforms the information of GDP from
    USD (Millions) to USD (Billions) rounding to 2 decimal places.
    The function returns the transformed dataframe.'''

    return df

def load_to_csv(df, csv_path):
    ''' This function saves the final dataframe as a `CSV` file 
    in the provided path. Function returns nothing.'''

def load_to_db(df, sql_connection, table_name):
    ''' This function saves the final dataframe as a database table
    with the provided name. Function returns nothing.'''

def run_query(query_statement, sql_connection):
    ''' This function runs the stated query on the database table and
    prints the output on the terminal. Function returns nothing. '''

def log_progress(message):
    ''' This function logs the mentioned message at a given stage of the code execution to a log file. Function returns nothing'''

In [120]:
extract(url,table_attribs)

Unnamed: 0,Country,GDP_USD_millions
0,United States,26854599
1,China,19373586
2,Japan,4409738
3,Germany,4308854
4,India,3736882
...,...,...
187,Palau,262
188,Kiribati,248
189,Nauru,151
190,Tuvalu,65


In [3]:
url = 'https://web.archive.org/web/20230902185326/https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29'
table_attribs = ['Country', 'GDP_USD_millions']
db_name = 'data\\World_Economies.db'
table_name = 'Countries_by_GDP'
csv_path = 'data\\Countries_by_GDP.csv'

In [4]:
web_page = requests.get(url).text

In [5]:
html = BeautifulSoup(web_page,'html.parser')

In [6]:
table = html.find_all('tbody')[2]

In [7]:
rows = table.find_all('tr')
rows

[<tr class="static-row-header" style="text-align:center;vertical-align:bottom;">
 <th rowspan="2">Country/Territory
 </th>
 <th rowspan="2"><a href="/web/20230902185326/https://en.wikipedia.org/wiki/United_Nations_geoscheme" title="United Nations geoscheme">UN region</a>
 </th>
 <th colspan="2"><a href="/web/20230902185326/https://en.wikipedia.org/wiki/International_Monetary_Fund" title="International Monetary Fund">IMF</a><sup class="reference" id="cite_ref-GDP_IMF_2-2"><a href="#cite_note-GDP_IMF-2">[1]</a></sup><sup class="reference" id="cite_ref-15"><a href="#cite_note-15">[13]</a></sup>
 </th>
 <th colspan="2"><a href="/web/20230902185326/https://en.wikipedia.org/wiki/World_Bank" title="World Bank">World Bank</a><sup class="reference" id="cite_ref-16"><a href="#cite_note-16">[14]</a></sup>
 </th>
 <th colspan="2"><a href="/web/20230902185326/https://en.wikipedia.org/wiki/United_Nations" title="United Nations">United Nations</a><sup class="reference" id="cite_ref-UN_17-0"><a href="

In [77]:
len(rows[2].find_all('td')[0].find_all('a'))

0

In [74]:
rows[3].find_all('td')[0].find_all('a')

[<a href="/web/20230902185326/https://en.wikipedia.org/wiki/Economy_of_the_United_States" title="Economy of the United States">United States</a>]

In [54]:
rows[3].find_all('td')[2].get_text()

'26,854,599'

In [48]:
for row in rows[3:]:
    print(row.find_all('td')[0].find_all('a')[0].get_text())

United States
China
Japan
Germany
India
United Kingdom
France
Italy
Canada
Brazil
Russia
South Korea
Australia
Mexico
Spain
Indonesia
Netherlands
Saudi Arabia
Turkey
Switzerland
Taiwan
Poland
Argentina
Belgium
Sweden
Ireland
Thailand
Norway
Israel
Singapore
Austria
Nigeria
United Arab Emirates
Vietnam
Malaysia
Philippines
Bangladesh
Denmark
South Africa
Hong Kong
Egypt
Pakistan
Iran
Chile
Romania
Colombia
Czech Republic
Finland
Peru
Iraq
Portugal
New Zealand
Kazakhstan
Greece
Qatar
Algeria
Hungary
Kuwait
Ethiopia
Ukraine
Morocco
Slovakia
Ecuador
Dominican Republic
Puerto Rico
Kenya
Angola
Cuba
Oman
Guatemala
Bulgaria
Venezuela
Uzbekistan
Luxembourg
Tanzania
Turkmenistan
Croatia
Lithuania
Costa Rica
Uruguay
Panama
Ivory Coast
Sri Lanka
Serbia
Belarus
Azerbaijan
DR Congo
Slovenia
Ghana
Myanmar
Jordan
Tunisia
Uganda
Cameroon
Latvia
Sudan
Libya
Bolivia
Bahrain
Paraguay
Nepal
Estonia
Macau
El Salvador
Honduras
Papua New Guinea
Senegal
Cyprus
Cambodia
Zimbabwe
Zambia
Iceland
Bosnia and Herze

In [82]:
for row in rows[3:]:
    print(row.find_all('td')[2].get_text())

26,854,599
19,373,586
4,409,738
4,308,854
3,736,882
3,158,938
2,923,489
2,169,745
2,089,672
2,081,235
2,062,649
1,721,909
1,707,548
1,663,164
1,492,432
1,391,778
1,080,880
1,061,902
1,029,303
869,601
790,728
748,887
641,102
624,248
599,052
594,095
574,231
554,105
539,223
515,548
515,199
506,601
498,978
449,094
447,026
440,901
420,516
405,626
399,015
382,854
378,110
—
367,970
358,557
348,902
334,689
330,483
301,670
268,235
267,893
267,721
251,969
245,695
239,300
219,570
206,007
188,505
164,713
156,083
148,712
138,781
127,533
121,291
121,289
120,838
118,130
117,877
—
104,902
102,309
100,635
96,628
92,332
86,971
85,421
82,649
78,881
78,346
77,777
77,313
77,257
77,047
—
73,961
73,543
70,030
69,474
68,108
66,622
63,988
52,061
49,815
49,792
48,625
47,398
46,705
46,297
46,097
44,870
42,820
42,097
41,551
35,841
33,752
32,860
31,362
31,221
30,864
30,628
29,931
29,272
28,625
28,488
28,223
27,947
26,580
—
23,725
23,330
21,076
20,776
20,330
20,177
—
19,909
19,206
19,572
19,529
19,405
19,236
17,287

In [87]:
gdp_df = pd.DataFrame(columns=table_attribs)

In [89]:
table_attribs

['Country', 'GDP_USD_millions']

In [92]:
for row in rows:
    if (len(row.find_all('td')) == 0):
        continue
    elif (len(row.find_all('td')[0].find_all('a')) == 0):
        continue
    elif (row.find_all('td')[2].get_text() == "—"):
        continue
    else:
        gdp_country = pd.DataFrame({'Country':row.find_all('td')[0].find_all('a')[0].get_text(), 'GDP_USD_millions': row.find_all('td')[2].get_text()}, index = [0])
    gdp_df = pd.concat([gdp_df, gdp_country], ignore_index = True)

In [93]:
gdp_df

Unnamed: 0,Country,GDP_USD_millions
0,United States,26854599
1,China,19373586
2,Japan,4409738
3,Germany,4308854
4,India,3736882
...,...,...
186,Marshall Islands,291
187,Palau,262
188,Kiribati,248
189,Nauru,151


In [115]:
table_attribs

['Country', 'GDP_USD_millions']

In [116]:
gdp_dict = {'Country':[], 'GDP_USD_millions':[]}

for row in rows:
    if (len(row.find_all('td')) == 0):
        continue
    elif (len(row.find_all('td')[0].find_all('a')) == 0):
        continue
    elif (row.find_all('td')[2].get_text() == "—"):
        continue
    else:
        for j in gdp_dict.keys():
            if j == table_attribs[0]:
                gdp_dict[j].append(row.find_all('td')[0].find_all('a')[0].get_text())
            if j == table_attribs[1]:
                gdp_dict[j].append(row.find_all('td')[2].get_text())

In [117]:
gdp_dict

{'Country': ['United States',
  'China',
  'Japan',
  'Germany',
  'India',
  'United Kingdom',
  'France',
  'Italy',
  'Canada',
  'Brazil',
  'Russia',
  'South Korea',
  'Australia',
  'Mexico',
  'Spain',
  'Indonesia',
  'Netherlands',
  'Saudi Arabia',
  'Turkey',
  'Switzerland',
  'Taiwan',
  'Poland',
  'Argentina',
  'Belgium',
  'Sweden',
  'Ireland',
  'Thailand',
  'Norway',
  'Israel',
  'Singapore',
  'Austria',
  'Nigeria',
  'United Arab Emirates',
  'Vietnam',
  'Malaysia',
  'Philippines',
  'Bangladesh',
  'Denmark',
  'South Africa',
  'Hong Kong',
  'Egypt',
  'Iran',
  'Chile',
  'Romania',
  'Colombia',
  'Czech Republic',
  'Finland',
  'Peru',
  'Iraq',
  'Portugal',
  'New Zealand',
  'Kazakhstan',
  'Greece',
  'Qatar',
  'Algeria',
  'Hungary',
  'Kuwait',
  'Ethiopia',
  'Ukraine',
  'Morocco',
  'Slovakia',
  'Ecuador',
  'Dominican Republic',
  'Puerto Rico',
  'Kenya',
  'Angola',
  'Oman',
  'Guatemala',
  'Bulgaria',
  'Venezuela',
  'Uzbekistan',
  

In [118]:
pd.DataFrame(gdp_dict)

Unnamed: 0,Country,GDP_USD_millions
0,United States,26854599
1,China,19373586
2,Japan,4409738
3,Germany,4308854
4,India,3736882
...,...,...
186,Marshall Islands,291
187,Palau,262
188,Kiribati,248
189,Nauru,151
