In [None]:
#Project Scenario:
#An international firm that is looking to expand its business in different countries across the world has recruited you. 
#You have been hired as a junior Data Engineer and are tasked with creating a script that can extract the list of the 
#top 10 largest economies of the world in descending order of their 
#GDPs in Billion USD (rounded to 2 decimal places), as logged by the International Monetary Fund (IMF).

#The required data is available at the archived web page whose URL is assigned to `URL` in the data-loading cell below.

In [144]:
#Install required packages
!pip install pandas numpy 
!pip install lxml
import numpy as np
import pandas as pd

# Suppress warnings generated by the code in this notebook.
import warnings


def warn(*args, **kwargs):
    """No-op with a catch-all signature; accepts anything and returns None.

    Deliberately NOT assigned to ``warnings.warn``: replacing the stdlib
    function disables warnings for every library in the process and breaks
    code that captures them (e.g. ``warnings.catch_warnings(record=True)``).
    """
    pass


# A blanket "ignore" filter hides all warning output while leaving the
# standard warnings machinery intact.
warnings.filterwarnings('ignore')

Defaulting to user installation because normal site-packages is not writeable
Looking in links: /usr/share/pip-wheels
Defaulting to user installation because normal site-packages is not writeable
Looking in links: /usr/share/pip-wheels


In [313]:
# Archived snapshot (timestamp 20230902185326) of Wikipedia's
# "List of countries by GDP (nominal)" page.
URL = "https://web.archive.org/web/20230902185326/https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29"

# read_html returns a list with one DataFrame per HTML table found on the page.
df = pd.read_html(io=URL)

# The table at position 3 of that list is the one shown below
# (country, UN region, and IMF / World Bank / UN estimates).
table_df = df[3]
table_df


Unnamed: 0_level_0,Country/Territory,UN region,IMF[1][13],IMF[1][13],World Bank[14],World Bank[14],United Nations[15],United Nations[15]
Unnamed: 0_level_1,Country/Territory,UN region,Estimate,Year,Estimate,Year,Estimate,Year
0,World,—,105568776,2023,100562011,2022,96698005,2021
1,United States,Americas,26854599,2023,25462700,2022,23315081,2021
2,China,Asia,19373586,[n 1]2023,17963171,[n 3]2022,17734131,[n 1]2021
3,Japan,Asia,4409738,2023,4231141,2022,4940878,2021
4,Germany,Europe,4308854,2023,4072192,2022,4259935,2021
...,...,...,...,...,...,...,...,...
209,Anguilla,Americas,—,—,—,—,303,2021
210,Kiribati,Oceania,248,2023,223,2022,227,2021
211,Nauru,Oceania,151,2023,151,2022,155,2021
212,Montserrat,Americas,—,—,—,—,72,2021


In [315]:
# Swap the two-level scraped header for plain positional labels 0, 1, 2, ...
table_df.columns = range(len(table_df.columns))
table_df

Unnamed: 0,0,1,2,3,4,5,6,7
0,World,—,105568776,2023,100562011,2022,96698005,2021
1,United States,Americas,26854599,2023,25462700,2022,23315081,2021
2,China,Asia,19373586,[n 1]2023,17963171,[n 3]2022,17734131,[n 1]2021
3,Japan,Asia,4409738,2023,4231141,2022,4940878,2021
4,Germany,Europe,4308854,2023,4072192,2022,4259935,2021
...,...,...,...,...,...,...,...,...
209,Anguilla,Americas,—,—,—,—,303,2021
210,Kiribati,Oceania,248,2023,223,2022,227,2021
211,Nauru,Oceania,151,2023,151,2022,155,2021
212,Montserrat,Americas,—,—,—,—,72,2021


In [317]:
# Keep only column 0 (country name) and column 2 (GDP value quoted by the IMF).
table_df = table_df.loc[:, [0, 2]]
table_df

Unnamed: 0,0,2
0,World,105568776
1,United States,26854599
2,China,19373586
3,Japan,4409738
4,Germany,4308854
...,...,...
209,Anguilla,—
210,Kiribati,248
211,Nauru,151
212,Montserrat,—


In [319]:

# Retain the rows at positions 1 to 10, indicating the top 10 economies of the
# world (position 0 is the "World" aggregate row and is skipped).
# .copy() makes the result an independent frame, so the column assignments in
# later cells modify this frame itself rather than a slice of the full table
# (which pandas flags with SettingWithCopyWarning).
table_df=table_df.iloc[1:11,:].copy()
table_df


Unnamed: 0,0,2
1,United States,26854599
2,China,19373586
3,Japan,4409738
4,Germany,4308854
5,India,3736882
6,United Kingdom,3158938
7,France,2923489
8,Italy,2169745
9,Canada,2089672
10,Brazil,2081235


In [321]:
# Name the two remaining columns "country" and "GDP (Million USD)".
# NOTE(review): the header is assigned in lowercase ("country") and is what ends
# up in the exported CSV; confirm whether a capitalised "Country" was intended.
table_df.columns=["country","GDP (Million USD)"]
table_df


Unnamed: 0,country,GDP (Million USD)
1,United States,26854599
2,China,19373586
3,Japan,4409738
4,Germany,4308854
5,India,3736882
6,United Kingdom,3158938
7,France,2923489
8,Italy,2169745
9,Canada,2089672
10,Brazil,2081235


In [323]:
def clean_GDP(GDP):
    """Convert one raw GDP cell into a float, or NaN when it holds no number.

    Parameters
    ----------
    GDP : str, int, float or None
        Raw cell value. Strings may contain thousands separators (",") and
        surrounding whitespace. An empty string, a lone dash ("-", "–", "—")
        or a lone underscore marks a missing figure.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` for missing or unparseable input.
    """
    if GDP is None:
        return np.nan

    # Go through str() so cells that are already numeric (int/float) are
    # accepted too; they have no .replace() method of their own.
    text = str(GDP).replace(",", "").strip()  # remove commas and whitespace

    # Placeholders used for "no data", including the em dash the source
    # table displays for missing estimates.
    missing_markers = ("", "-", "_", "–", "—")
    if text in missing_markers:
        return np.nan

    try:
        return float(text)
    except ValueError:
        # Anything else that is not a number is treated as missing.
        return np.nan

# Parse every GDP cell to a number with clean_GDP, then cast the column with
# astype() to pandas' nullable integer dtype 'Int64', which can hold missing
# values alongside integers.
table_df['GDP (Million USD)'] = (
    table_df['GDP (Million USD)']
    .apply(clean_GDP)
    .astype('Int64')
)
table_df


Unnamed: 0,country,GDP (Million USD)
1,United States,26854599
2,China,19373586
3,Japan,4409738
4,Germany,4308854
5,India,3736882
6,United Kingdom,3158938
7,France,2923489
8,Italy,2169745
9,Canada,2089672
10,Brazil,2081235


In [325]:
# Rename the column header from 'GDP (Million USD)' to 'GDP (Billion USD)'.
# Only the header changes here: the values stay in millions until the
# following cell divides them by 1000.
# rename() returns a new frame that is bound back to table_df, instead of
# mutating the existing (sliced) frame through inplace=True.
table_df = table_df.rename(columns={'GDP (Million USD)': 'GDP (Billion USD)'})
table_df


Unnamed: 0,country,GDP (Billion USD)
1,United States,26854599
2,China,19373586
3,Japan,4409738
4,Germany,4308854
5,India,3736882
6,United Kingdom,3158938
7,France,2923489
8,Italy,2169745
9,Canada,2089672
10,Brazil,2081235


In [327]:
# Scale the GDP figures from millions to billions of USD
# (1 billion = 1000 million).
table_df['GDP (Billion USD)'] = table_df['GDP (Billion USD)'].div(1000)
table_df

Unnamed: 0,country,GDP (Billion USD)
1,United States,26854.599
2,China,19373.586
3,Japan,4409.738
4,Germany,4308.854
5,India,3736.882
6,United Kingdom,3158.938
7,France,2923.489
8,Italy,2169.745
9,Canada,2089.672
10,Brazil,2081.235


In [329]:
# Round the GDP figures to 2 decimal places using the Series.round method.
table_df['GDP (Billion USD)'] = table_df['GDP (Billion USD)'].round(decimals=2)

table_df

Unnamed: 0,country,GDP (Billion USD)
1,United States,26854.6
2,China,19373.59
3,Japan,4409.74
4,Germany,4308.85
5,India,3736.88
6,United Kingdom,3158.94
7,France,2923.49
8,Italy,2169.74
9,Canada,2089.67
10,Brazil,2081.24


In [334]:
# Save the final table as a CSV file in the working directory. The row index
# is written too, as the first (unnamed) column.
table_df.to_csv(path_or_buf="Largest_economies.csv")
table_df

Unnamed: 0,country,GDP (Billion USD)
1,United States,26854.6
2,China,19373.59
3,Japan,4409.74
4,Germany,4308.85
5,India,3736.88
6,United Kingdom,3158.94
7,France,2923.49
8,Italy,2169.74
9,Canada,2089.67
10,Brazil,2081.24
