In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import wbgapi as wb
from bs4 import BeautifulSoup

## World Bank data

### Import GNI per capita and Gini coefficient for each country from the World Bank

In [2]:
# Request GNI per capita (NY.GNP.PCAP.CD) and the Gini index (SI.POV.GINI)
# for the years 2000 through 2022, with one column per series.
df_raw = wb.data.DataFrame(
    ['NY.GNP.PCAP.CD', 'SI.POV.GINI'],
    time=range(2000, 2023),
    labels=True,
    skipBlanks=False,
    columns='series',
    skipAggs=True,
)

In [3]:
# Work on a copy so the downloaded frame in df_raw is left untouched.
df = df_raw.copy()

# Preview the copied frame.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Country,Time,NY.GNP.PCAP.CD,SI.POV.GINI
economy,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ZWE,YR2021,Zimbabwe,2021,1530.0,
ZWE,YR2020,Zimbabwe,2020,1460.0,
ZWE,YR2019,Zimbabwe,2019,1450.0,50.3
ZWE,YR2018,Zimbabwe,2018,1550.0,
ZWE,YR2017,Zimbabwe,2017,1170.0,44.3
...,...,...,...,...,...
AFG,YR2004,Afghanistan,2004,,
AFG,YR2003,Afghanistan,2003,,
AFG,YR2002,Afghanistan,2002,,
AFG,YR2001,Afghanistan,2001,,


### Reset index, rename columns and convert year to datetime

In [4]:
# Build the tidy frame directly from df_raw so this cell can be re-run safely.
# The previous version transformed df in place, so a second run failed when
# dropping the 'time' column that no longer existed.
df = (
    df_raw
    .reset_index()          # 'economy' and 'time' index levels become columns
    .drop(columns='time')   # 'YR2021'-style code; the 'Time' column already carries the year
    .rename(columns={'economy': 'iso3',
                     'Time': 'year',
                     'NY.GNP.PCAP.CD': 'gnipc',
                     'SI.POV.GINI': 'gini',
                     'Country': 'country'})
    # Parse the year label into a timestamp (1 January of that year).
    .assign(year=lambda frame: pd.to_datetime(frame['year'], format='%Y'))
    .set_index(['iso3', 'year'])
)

In [5]:
# Preview the frame, now indexed by (iso3, year).
df

Unnamed: 0_level_0,Unnamed: 1_level_0,country,gnipc,gini
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ZWE,2021-01-01,Zimbabwe,1530.0,
ZWE,2020-01-01,Zimbabwe,1460.0,
ZWE,2019-01-01,Zimbabwe,1450.0,50.3
ZWE,2018-01-01,Zimbabwe,1550.0,
ZWE,2017-01-01,Zimbabwe,1170.0,44.3
...,...,...,...,...
AFG,2004-01-01,Afghanistan,,
AFG,2003-01-01,Afghanistan,,
AFG,2002-01-01,Afghanistan,,
AFG,2001-01-01,Afghanistan,,


In [6]:
# Count missing values per column before any gap filling.
df.isna().sum()

country       0
gnipc       539
gini       3317
dtype: int64

## Import CIA factbook data

### Imports gini data from CIA factbook

In [7]:
# CIA World Factbook country-comparison page for the Gini index field.
url = 'https://www.cia.gov/the-world-factbook/field/gini-index-coefficient-distribution-of-family-income/country-comparison'

In [8]:
# Fetch the page. The timeout stops the notebook hanging on a stalled
# connection, and raise_for_status() stops an HTTP error page from being
# parsed as if it were the data table.
r = requests.get(url, timeout=30)
r.raise_for_status()

# Parse the returned HTML with the lxml parser.
soup = BeautifulSoup(r.text, 'lxml')

### Finds first table on page

In [9]:
# Take the first <table> on the page. find() replaces the deprecated
# findAll(...)[0] spelling and returns None when nothing matches, so a
# missing table is reported explicitly rather than as a bare IndexError.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

### Extracts headers from table

In [10]:
# Collect the text of every header cell except the first one.
# find_all() is the current name for the deprecated findChildren() alias.
headers = [header_cell.text for header_cell in table.find_all('th')[1:]]

### Builds a list of rows: the first row holds the headers, and each subsequent table row becomes its own list of cell texts

In [11]:
# Build one list of cell texts per <tr>, skipping the first <td> of each row.
# The first row's list starts with the column headers. It is built as a new
# list (headers + cells) so the `headers` list itself is never mutated, which
# the previous `row_list = headers` aliasing allowed.
table_list = []
for row_number, row in enumerate(table.find_all('tr')):
    row_list = [cell.text for cell in row.find_all('td')[1:]]
    if row_number == 0:
        row_list = headers + row_list
    table_list.append(row_list)

In [12]:
# One DataFrame row per scraped list; columns are positional at this point.
df_cia = pd.DataFrame(table_list)

### Adds gini label to first row in dataframe before promoting column labels to headers

In [13]:
# Label the second header cell as 'gini'.
df_cia.iat[0, 1] = 'gini'

# Promote the first row to column labels...
header_row = df_cia.iloc[0, :]
df_cia.columns = header_row

# ...and keep only the data rows below it.
df_cia = df_cia.iloc[1:]

### Renames columns, removes est. label from year and sets datatypes

In [14]:
df_cia = (
    df_cia
    .rename(columns={'Country': 'country', 'Date of Information': 'year'})
    # Keep only the first four characters of the date text (the year) and
    # parse them into a timestamp.
    .assign(year=lambda frame: pd.to_datetime(frame['year'].str.slice(stop=4), format='%Y'))
    .astype({'country': 'string', 'gini': 'float64'})
)

In [15]:
# Preview the typed CIA frame.
df_cia

Unnamed: 0,country,gini,year
1,South Africa,63.0,2014-01-01
2,Namibia,59.1,2015-01-01
3,Zambia,57.1,2015-01-01
4,Central African Republic,56.2,2008-01-01
5,Eswatini,54.6,2016-01-01
...,...,...,...
173,Slovakia,25.2,2016-01-01
174,Belarus,24.4,2020-01-01
175,Slovenia,24.2,2017-01-01
176,Faroe Islands,22.7,2013-01-01


### Generates iso3 codes for countries in cia dataset, drops any rows that are NA and sets the index to iso3 code and year

In [16]:
# Look up an iso3 code for each CIA country name.
iso3_codes = wb.economy.coder(df_cia['country'])

df_cia = (
    df_cia
    .assign(iso3=iso3_codes)
    .dropna()                       # drop rows with a missing value in any column
    .set_index(['iso3', 'year'])    # same index layout as the World Bank frame
)

In [17]:
# Preview the CIA frame, now indexed by (iso3, year).
df_cia

Unnamed: 0_level_0,Unnamed: 1_level_0,country,gini
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1
ZAF,2014-01-01,South Africa,63.0
NAM,2015-01-01,Namibia,59.1
ZMB,2015-01-01,Zambia,57.1
CAF,2008-01-01,Central African Republic,56.2
SWZ,2016-01-01,Eswatini,54.6
...,...,...,...
ARM,2020-01-01,Armenia,25.2
SVK,2016-01-01,Slovakia,25.2
BLR,2020-01-01,Belarus,24.4
SVN,2017-01-01,Slovenia,24.2


In [18]:
# Missing-value counts in the World Bank frame before filling from the CIA data.
df.isna().sum()

country       0
gnipc       539
gini       3317
dtype: int64

### Fills in any blanks in the World Bank dataset's Gini column with the corresponding values in the CIA dataset. Values are matched on the (iso3, year) index, so a CIA value only fills a gap for the same country and year

In [19]:
# Fill missing World Bank Gini values from the CIA series. Both frames are
# indexed by (iso3, year), so a gap is only filled by the CIA value with the
# same country code and year; existing World Bank values are kept.
df['gini'] = df['gini'].fillna(df_cia['gini'])

In [20]:
# Order rows by the (iso3, year) index.
df = df.sort_index()

In [21]:
# Preview the sorted, gap-filled frame.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,country,gnipc,gini
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ABW,2000-01-01,Aruba,20390.0,
ABW,2001-01-01,Aruba,20510.0,
ABW,2002-01-01,Aruba,19290.0,
ABW,2003-01-01,Aruba,21120.0,
ABW,2004-01-01,Aruba,24010.0,
...,...,...,...,...
ZWE,2017-01-01,Zimbabwe,1170.0,44.3
ZWE,2018-01-01,Zimbabwe,1550.0,
ZWE,2019-01-01,Zimbabwe,1450.0,50.3
ZWE,2020-01-01,Zimbabwe,1460.0,


In [22]:
# Missing-value counts after filling Gini gaps from the CIA data.
df.isna().sum()

country       0
gnipc       539
gini       3300
dtype: int64

In [23]:
# Check the column dtypes before writing the file.
df.dtypes

country     object
gnipc      float64
gini       float64
dtype: object

In [24]:
# Write the combined dataset. The output folder is created first so that a
# fresh checkout (Restart & Run All) does not fail on a missing directory.
output_path = Path('processed_data/wb_cia_combined_economic.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)