In [1]:
#Extract data from a website using web scraping and request APIs, then process it using the Pandas and NumPy libraries

In [69]:
# Address of the page to scrape: an archived snapshot of Wikipedia's
# "List of countries by GDP (nominal)" article.
URL = (
    "https://web.archive.org/web/20230902185326/"
    "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29"
)

In [71]:
#Installing required packages
!pip install pandas numpy 
!pip install lxml



In [73]:
#importing required libraries
import numpy as np
import pandas as pd

# Optional: silence the warnings generated by the code in this notebook.
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments, returns None."""
    return None


# Install a blanket "ignore" filter and replace the stdlib entry point with
# the no-op above, so calls to warnings.warn do nothing.
warnings.filterwarnings('ignore')
warnings.warn = warn

In [75]:
# Parse every HTML table found on the web page into a list of DataFrames
# using pandas.
tables = pd.read_html(io=URL)
tables

[      0     1     2
 0   Aug   SEP   Oct
 1   NaN    02   NaN
 2  2022  2023  2024,
                                                    0
 0  Largest economies in the world by GDP (nominal...,
                                                    0  \
 0  > $20 trillion $10–20 trillion $5–10 trillion ...   
 
                                                    1  \
 0  $750 billion – $1 trillion $500–750 billion $2...   
 
                                                    2  
 0  $50–100 billion $25–50 billion $5–25 billion <...  ,
     Country/Territory UN region IMF[1][13]            World Bank[14]  \
     Country/Territory UN region   Estimate       Year       Estimate   
 0               World         —  105568776       2023      100562011   
 1       United States  Americas   26854599       2023       25462700   
 2               China      Asia   19373586  [n 1]2023       17963171   
 3               Japan      Asia    4409738       2023        4231141   
 4             Germany 

In [77]:
# Keep table number 3 as the working dataframe.
# Take a copy so that the header changes made further down do not alter the
# frame stored inside `tables`; re-running this cell then always shows the
# table exactly as it was scraped.
df=tables[3].copy()
df

Unnamed: 0_level_0,Country/Territory,UN region,IMF[1][13],IMF[1][13],World Bank[14],World Bank[14],United Nations[15],United Nations[15]
Unnamed: 0_level_1,Country/Territory,UN region,Estimate,Year,Estimate,Year,Estimate,Year
0,World,—,105568776,2023,100562011,2022,96698005,2021
1,United States,Americas,26854599,2023,25462700,2022,23315081,2021
2,China,Asia,19373586,[n 1]2023,17963171,[n 3]2022,17734131,[n 1]2021
3,Japan,Asia,4409738,2023,4231141,2022,4940878,2021
4,Germany,Europe,4308854,2023,4072192,2022,4259935,2021
...,...,...,...,...,...,...,...,...
209,Anguilla,Americas,—,—,—,—,303,2021
210,Kiribati,Oceania,248,2023,223,2022,227,2021
211,Nauru,Oceania,151,2023,151,2022,155,2021
212,Montserrat,Americas,—,—,—,—,72,2021


In [79]:
# Swap the scraped column headers for plain positional numbers (0, 1, 2, ...).
df.columns = range(len(df.columns))
df

Unnamed: 0,0,1,2,3,4,5,6,7
0,World,—,105568776,2023,100562011,2022,96698005,2021
1,United States,Americas,26854599,2023,25462700,2022,23315081,2021
2,China,Asia,19373586,[n 1]2023,17963171,[n 3]2022,17734131,[n 1]2021
3,Japan,Asia,4409738,2023,4231141,2022,4940878,2021
4,Germany,Europe,4308854,2023,4072192,2022,4259935,2021
...,...,...,...,...,...,...,...,...
209,Anguilla,Americas,—,—,—,—,303,2021
210,Kiribati,Oceania,248,2023,223,2022,227,2021
211,Nauru,Oceania,151,2023,151,2022,155,2021
212,Montserrat,Americas,—,—,—,—,72,2021


In [81]:
# Keep only the columns labelled 0 and 2:
# the name of the country and the value of GDP quoted by the IMF.
imf_columns = [0, 2]
df = df[imf_columns]
df

Unnamed: 0,0,2
0,World,105568776
1,United States,26854599
2,China,19373586
3,Japan,4409738
4,Germany,4308854
...,...,...
209,Anguilla,—
210,Kiribati,248
211,Nauru,151
212,Montserrat,—


In [83]:
# Keep the rows at positions 1 to 10, i.e. the top 10 economies of the world
# (position 0 holds the "World" aggregate row).
# The explicit copy makes the result an independent dataframe, so the column
# assignments in later cells do not operate on a slice of the wider table
# (which is what triggers pandas' SettingWithCopyWarning).
df=df.iloc[1:11, :].copy()
df

Unnamed: 0,0,2
1,United States,26854599
2,China,19373586
3,Japan,4409738
4,Germany,4308854
5,India,3736882
6,United Kingdom,3158938
7,France,2923489
8,Italy,2169745
9,Canada,2089672
10,Brazil,2081235


In [85]:
# Label the two remaining columns: the country name and its GDP, which at
# this point is expressed in million USD.
df.columns = [
    'Country',
    'GDP (Million USD)',
]
df

Unnamed: 0,Country,GDP (Million USD)
1,United States,26854599
2,China,19373586
3,Japan,4409738
4,Germany,4308854
5,India,3736882
6,United Kingdom,3158938
7,France,2923489
8,Italy,2169745
9,Canada,2089672
10,Brazil,2081235


In [87]:
# Cast the GDP column to an integer dtype with astype(), then store it back.
gdp_as_int = df['GDP (Million USD)'].astype(int)
df['GDP (Million USD)'] = gdp_as_int
df['GDP (Million USD)']

1     26854599
2     19373586
3      4409738
4      4308854
5      3736882
6      3158938
7      2923489
8      2169745
9      2089672
10     2081235
Name: GDP (Million USD), dtype: int64

In [89]:
# Convert the GDP values from million USD to billion USD (divide by 1000).
# The column keeps its 'GDP (Million USD)' label in this cell.
df['GDP (Million USD)'] = df['GDP (Million USD)'].div(1000)
df['GDP (Million USD)']

1     26854.599
2     19373.586
3      4409.738
4      4308.854
5      3736.882
6      3158.938
7      2923.489
8      2169.745
9      2089.672
10     2081.235
Name: GDP (Million USD), dtype: float64

In [91]:
# Use numpy.round() to round the GDP values to 2 decimal places.
# The column is selected with single brackets so that a Series (not a
# one-column DataFrame) is rounded and assigned back to the column.
df['GDP (Million USD)']=np.round(df['GDP (Million USD)'], 2)
df['GDP (Million USD)']

1     26854.60
2     19373.59
3      4409.74
4      4308.85
5      3736.88
6      3158.94
7      2923.49
8      2169.74
9      2089.67
10     2081.24
Name: GDP (Million USD), dtype: float64

In [93]:
# Rename the column header from 'GDP (Million USD)' to 'GDP (Billion USD)'.
# DataFrame.rename returns a new dataframe and leaves the original untouched,
# so the result has to be assigned back; otherwise df (and the CSV written
# from it) would keep the 'Million' header while holding billion values.
df=df.rename(columns={'GDP (Million USD)' : 'GDP (Billion USD)'})
df

Unnamed: 0,Country,GDP (Billion USD)
1,United States,26854.6
2,China,19373.59
3,Japan,4409.74
4,Germany,4308.85
5,India,3736.88
6,United Kingdom,3158.94
7,France,2923.49
8,Italy,2169.74
9,Canada,2089.67
10,Brazil,2081.24


In [99]:
# Write the dataframe to a CSV file named "Largest_economies.csv".
# DataFrame.to_csv returns None when it is given a path, so csv_file holds the
# path of the written file instead of being bound to that None.
# The row index (1-10) is written as the first column, as before.
csv_file='./Largest_economies.csv'
df.to_csv(csv_file)