In [191]:
import pandas as pd
import wbgapi as wb
import numpy as np
import requests
from bs4 import BeautifulSoup

## World Bank data

### Import GNI per capita and Gini coefficient for each country for 2010-2022 from the World Bank

In [230]:
# Download GNI per capita (NY.GNP.PCAP.CD) and the Gini index (SI.POV.GINI),
# laid out with one column per series.
# range() excludes its end value, so the last year requested is 2022.
df_raw = wb.data.DataFrame(
    ['NY.GNP.PCAP.CD', 'SI.POV.GINI'],
    time=range(2010, 2023),
    labels=True,
    skipBlanks=True,
    columns='series',
    skipAggs=True,
)

In [231]:
# Work on a copy so the downloaded frame in df_raw stays untouched and the
# cleaning steps below can be re-run without fetching the data again.
df = df_raw.copy()
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Country,Time,NY.GNP.PCAP.CD,SI.POV.GINI
economy,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ZWE,YR2021,Zimbabwe,2021,1530.0,
ZWE,YR2020,Zimbabwe,2020,1460.0,
ZWE,YR2019,Zimbabwe,2019,1450.0,50.3
ZWE,YR2018,Zimbabwe,2018,1550.0,
ZWE,YR2017,Zimbabwe,2017,1170.0,44.3
...,...,...,...,...,...
AFG,YR2011,Afghanistan,2011,550.0,
AFG,YR2010,Afghanistan,2010,520.0,
SSD,YR2016,South Sudan,2016,,44.1
DJI,YR2013,Djibouti,2013,,44.1


### Reset index, rename columns, and convert year to datetime

In [232]:
# Flatten the (economy, time) index into columns, discard the 'YR2021'-style
# 'time' label in favour of the numeric 'Time' column, switch to short
# lower-case column names, and parse the year as a datetime.
df = (
    df.reset_index()
    .drop(columns='time')
    .rename(
        columns={
            'economy': 'iso3',
            'Time': 'year',
            'NY.GNP.PCAP.CD': 'gnipc',
            'SI.POV.GINI': 'gini',
            'Country': 'country',
        }
    )
    .assign(year=lambda frame: pd.to_datetime(frame['year'], format='%Y'))
)

In [233]:
# Inspect the frame after the rename and datetime conversion.
df

Unnamed: 0,iso3,country,year,gnipc,gini
0,ZWE,Zimbabwe,2021-01-01,1530.0,
1,ZWE,Zimbabwe,2020-01-01,1460.0,
2,ZWE,Zimbabwe,2019-01-01,1450.0,50.3
3,ZWE,Zimbabwe,2018-01-01,1550.0,
4,ZWE,Zimbabwe,2017-01-01,1170.0,44.3
...,...,...,...,...,...
2338,AFG,Afghanistan,2011-01-01,550.0,
2339,AFG,Afghanistan,2010-01-01,520.0,
2340,SSD,South Sudan,2016-01-01,,44.1
2341,DJI,Djibouti,2013-01-01,,44.1


In [196]:
# Alternative left disabled: collapse to one row per country by averaging
# gnipc and gini across years.
#df = df.groupby(['iso3', 'country'])[['gnipc','gini']].mean()
#df = df.reset_index()
# Keep the yearly rows and index them by (iso3, year); the CIA frame built
# further down is indexed the same way so the two can be aligned.
df = df.set_index(['iso3', 'year'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,country,gnipc,gini
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ZWE,2021,Zimbabwe,1530.0,
ZWE,2020,Zimbabwe,1460.0,
ZWE,2019,Zimbabwe,1450.0,50.3
ZWE,2018,Zimbabwe,1550.0,
ZWE,2017,Zimbabwe,1170.0,44.3
...,...,...,...,...
AFG,2011,Afghanistan,550.0,
AFG,2010,Afghanistan,520.0,
SSD,2016,South Sudan,,44.1
DJI,2013,Djibouti,,44.1


In [197]:
# Count missing values per column in the World Bank frame.
df.isna().sum()

country       0
gnipc         3
gini       1525
dtype: int64

## Import CIA factbook data

### Imports gini data from CIA factbook

In [198]:
# CIA World Factbook country-comparison page for the Gini index field.
url = 'https://www.cia.gov/the-world-factbook/field/gini-index-coefficient-distribution-of-family-income/country-comparison'

In [199]:
# Fetch the CIA comparison page and parse it with the lxml parser.
# The timeout stops the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() reports an HTTP error here instead of
# letting an error page be parsed and fail later when no table is found.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')

### Finds first table on page

In [200]:
# Take the first <table> element of the parsed page.
table = soup.find_all('table')[0]

### Extracts headers from table

In [201]:
# Text of every <th> in the table except the first one.
headers = [th.text for th in table.find_all('th')[1:]]

### Adds headers to first element in list, then forms a new list per row in table before appending them all into a list of lists

In [202]:
# Build the table as a list of rows: the header labels first, then one list
# of cell texts per <tr>, skipping the first <td> of each row.
# list(headers) copies the labels so `headers` is never mutated, and rows
# without any remaining <td> cells (such as the header row) are left out, so
# header labels can never be merged into the first data row.
body_rows = (
    [cell.text for cell in row.find_all('td')[1:]]
    for row in table.find_all('tr')
)
table_list = [list(headers)] + [cells for cells in body_rows if cells]

In [203]:
# One DataFrame row per scraped list; the first row holds the header labels
# and is promoted to column names in the next step.
df_cia = pd.DataFrame(table_list)

### Adds gini label to first row in dataframe before promoting column labels to headers

In [204]:
# Overwrite the second header label with 'gini', use the first row as the
# column labels, then keep only the data rows below it.
df_cia.iat[0, 1] = 'gini'
df_cia.columns = df_cia.iloc[0]
df_cia = df_cia.iloc[1:]

### Renames columns, removes est. label from year and sets datatypes

In [205]:
# Use the same column names as the World Bank frame, keep only the first four
# characters of the date text (the year) and parse them as a datetime, then
# set explicit dtypes for the country name and the Gini value.
df_cia = (
    df_cia.rename(columns={'Country': 'country', 'Date of Information': 'year'})
    .assign(year=lambda frame: pd.to_datetime(frame['year'].str[:4], format='%Y'))
    .astype({'country': 'string', 'gini': 'float64'})
)

In [206]:
# Inspect the cleaned CIA frame.
df_cia

Unnamed: 0,country,gini,year
1,South Africa,63.0,2014
2,Namibia,59.1,2015
3,Zambia,57.1,2015
4,Central African Republic,56.2,2008
5,Eswatini,54.6,2016
...,...,...,...
173,Slovakia,25.2,2016
174,Belarus,24.4,2020
175,Slovenia,24.2,2017
176,Faroe Islands,22.7,2013


In [207]:
# Map CIA country names to World Bank economy codes so both frames share an
# 'iso3' key.
# NOTE(review): presumably names the coder cannot match come back as missing
# and are removed by the dropna() in the next cell; confirm against wbgapi.
df_cia['iso3'] = wb.economy.coder(df_cia['country'])

In [208]:
# Discard rows with any missing value, then index by (iso3, year) to match
# the index of the World Bank frame.
df_cia = df_cia.dropna().set_index(['iso3', 'year'])
df_cia

Unnamed: 0_level_0,Unnamed: 1_level_0,country,gini
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1
ZAF,2014,South Africa,63.0
NAM,2015,Namibia,59.1
ZMB,2015,Zambia,57.1
CAF,2008,Central African Republic,56.2
SWZ,2016,Eswatini,54.6
...,...,...,...
ARM,2020,Armenia,25.2
SVK,2016,Slovakia,25.2
BLR,2020,Belarus,24.4
SVN,2017,Slovenia,24.2


In [209]:
# Missing-value counts in the World Bank frame before filling from CIA data.
df.isna().sum()

country       0
gnipc         3
gini       1525
dtype: int64

In [210]:
# fillna with a Series aligns on the index, so a missing World Bank gini is
# filled only where the CIA frame has a value for the same (iso3, year).
df['gini'] = df['gini'].fillna(df_cia['gini'])

In [211]:
# Rows still lacking a gini value, averaged per country. The gini mean is NaN
# by construction, because only rows with a missing gini are selected.
df_na = (
    df.loc[df['gini'].isna()]
    .groupby(['iso3', 'country'])[['gnipc', 'gini']]
    .mean()
)
df_na

Unnamed: 0_level_0,Unnamed: 1_level_0,gnipc,gini
iso3,country,Unnamed: 2_level_1,Unnamed: 3_level_1
ABW,Aruba,26340.000000,
AFG,Afghanistan,559.166667,
AGO,Angola,3080.909091,
ALB,Albania,4936.000000,
AND,Andorra,46530.000000,
...,...,...,...
XKX,Kosovo,4627.500000,
YEM,"Yemen, Rep.",1085.000000,
ZAF,South Africa,6827.000000,
ZMB,Zambia,1409.000000,


In [212]:
# value_counts() leaves out rows containing NaN by default; every row here
# has a NaN gini, so the result is empty.
df_na.value_counts()

Series([], dtype: int64)

In [213]:
# Missing-value counts after filling gini from the CIA frame.
df.isna().sum()

country       0
gnipc         3
gini       1516
dtype: int64

In [214]:
# Country/year rows that still have no gini value after the fill.
df_na = df.loc[df['gini'].isna()]

In [215]:
# Preview the first rows that are still missing gini.
df_na.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,country,gnipc,gini
iso3,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ZWE,2021,Zimbabwe,1530.0,
ZWE,2020,Zimbabwe,1460.0,
ZWE,2018,Zimbabwe,1550.0,
ZWE,2016,Zimbabwe,1200.0,
ZWE,2015,Zimbabwe,1220.0,


In [216]:
# Keep only rows where country, gnipc and gini are all present, and make the
# two numeric columns explicitly float64 before checking the dtypes.
df = df.dropna().astype({'gnipc': 'float64', 'gini': 'float64'})
df.dtypes

country     object
gnipc      float64
gini       float64
dtype: object

In [217]:
# Median GNI per capita over all remaining (iso3, year) rows, so a country
# contributes once for every year it still has data.
world_median_gnipc = df['gnipc'].median()
world_median_gnipc

9425.0

In [218]:
# Median Gini value over all remaining (iso3, year) rows, so a country
# contributes once for every year it still has data.
world_median_gini = df['gini'].median()
world_median_gini

35.2

In [219]:
# 'high' when GNI per capita is strictly above the median, otherwise 'low'
# (a value equal to the median is labelled 'low').
df['income_level'] = np.where(df['gnipc'].gt(world_median_gnipc), 'high', 'low')

In [220]:
# 'high' when the Gini value is strictly above the median, otherwise 'low'
# (a value equal to the median is labelled 'low').
df['inequality_level'] = np.where(df['gini'].gt(world_median_gini), 'high', 'low')

In [221]:
# Confirm no missing values remain after adding the two label columns.
df.isna().sum()

country             0
gnipc               0
gini                0
income_level        0
inequality_level    0
dtype: int64