# Analyzing GDP per capita, the share of agricultural employment, and education
Using data from the World Bank's World Development Indicators (WDI) and the ILO's ILOSTAT database.

In [119]:
## Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [120]:
## Import data
## Paths use forward slashes: they work on every OS (Windows included) and
## avoid the invalid escape sequences ('\E', '\P') that the previous
## backslash-separated literals contained.
data_edu = pd.read_csv('Data/EAP_TEAP_SEX_EDU_NB_A-filtered-2024-03-03.csv')
data_WB = pd.read_excel('Data/P_Data_Extract_From_World_Development_Indicators.xlsx')

In [121]:
## Preview the education data loaded above.
data_edu

Unnamed: 0,ref_area.label,classif1.label,time,obs_value
0,Afghanistan,Education (Aggregate levels): Total,2021,8154.806
1,Afghanistan,Education (Aggregate levels): Intermediate,2021,1302.836
2,Afghanistan,Education (Aggregate levels): Advanced,2021,506.250
3,Afghanistan,Education (Aggregate levels): Total,2020,6884.703
4,Afghanistan,Education (Aggregate levels): Intermediate,2020,1003.672
...,...,...,...,...
5310,Zimbabwe,Education (Aggregate levels): Intermediate,2014,632.201
5311,Zimbabwe,Education (Aggregate levels): Advanced,2014,138.825
5312,Zimbabwe,Education (Aggregate levels): Total,2011,5739.105
5313,Zimbabwe,Education (Aggregate levels): Intermediate,2011,115.637


In [122]:
## Preview the World Development Indicators extract loaded above.
data_WB

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,1998 [YR1998],1999 [YR1999],2000 [YR2000],2001 [YR2001],2002 [YR2002],2003 [YR2003],...,2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022]
0,Afghanistan,AFG,Employment in agriculture (% of total employme...,SL.AGR.EMPL.ZS,65.277464,65.354736,65.347961,65.254189,63.931615,63.256091,...,47.697315,44.798594,44.593516,44.337137,43.989031,44.4536,45.01604,45.983408,46.587823,46.58907
1,Afghanistan,AFG,"GDP per capita, PPP (constant 2017 internation...",NY.GDP.PCAP.PP.KD,..,..,..,..,1280.463171,1292.333437,...,2165.340915,2144.449634,2108.714173,2101.422187,2096.093111,2060.698973,2079.921861,1968.341002,1516.273265,..
2,Albania,ALB,Employment in agriculture (% of total employme...,SL.AGR.EMPL.ZS,51.319016,50.692993,49.91959,48.988817,48.143328,47.995341,...,44.198027,42.257063,41.283525,40.040852,38.078346,37.285732,36.416856,36.190744,35.640848,34.926718
3,Albania,ALB,"GDP per capita, PPP (constant 2017 internation...",NY.GDP.PCAP.PP.KD,4819.091331,5474.87661,5892.610924,6441.472108,6753.914123,7154.030284,...,11361.307891,11586.873945,11878.495523,12291.901997,12771.054137,13317.1842,13653.248783,13278.434516,14596.015558,15492.067404
4,Algeria,DZA,Employment in agriculture (% of total employme...,SL.AGR.EMPL.ZS,22.690649,22.475059,22.235421,21.715431,21.707295,21.722026,...,10.753295,9.746314,8.834767,8.535155,10.16129,10.092127,9.798442,10.023791,10.033098,9.74323
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434,,,,,,,,,,,...,,,,,,,,,,
435,,,,,,,,,,,...,,,,,,,,,,
436,,,,,,,,,,,...,,,,,,,,,,
437,Data from database: World Development Indicators,,,,,,,,,,...,,,,,,,,,,


## Cleaning and merging data

In [123]:
### For education data. Compute the share of the labor force with at least an upper secondary education.
## Category labels exactly as they appear in "classif1.label".
EDU_TOTAL_LABEL = 'Education (Aggregate levels): Total'
## NOTE(review): "Intermediate" and "Advanced" are taken to be the levels at or
## above upper secondary in the ILOSTAT aggregate classification - confirm.
EDU_UPPER_SEC_LABELS = [
    'Education (Aggregate levels): Intermediate',
    'Education (Aggregate levels): Advanced',
]

## Denominator: total labor force per country-year.
Total_LF = data_edu[data_edu["classif1.label"] == EDU_TOTAL_LABEL]

## Numerator: Intermediate + Advanced, summed per country-year. The levels are
## selected explicitly (rather than "everything that is not Total") so that any
## other category present in the file cannot leak into the numerator.
Upper_Sec_LF = (
    data_edu[data_edu["classif1.label"].isin(EDU_UPPER_SEC_LABELS)]
    .groupby(['ref_area.label', 'time'])['obs_value']
    .sum()
    .reset_index()
)

## Merge. A left join keeps every country-year that has a total, even when no
## Intermediate/Advanced figure exists (the share is then NaN).
Final_EDU_DF = pd.merge(Total_LF, Upper_Sec_LF, on = ['ref_area.label', 'time'], how = 'left', suffixes = ('_Total', '_Upper_Sec'))

## Share (in percent) of the labor force with at least an upper secondary education.
Final_EDU_DF['Proportion_Upper_Sec'] = (Final_EDU_DF['obs_value_Upper_Sec'] / Final_EDU_DF['obs_value_Total']) * 100


In [124]:
## Inspect the education shares computed above.
Final_EDU_DF

Unnamed: 0,ref_area.label,classif1.label,time,obs_value_Total,obs_value_Upper_Sec,Proportion_Upper_Sec
0,Afghanistan,Education (Aggregate levels): Total,2021,8154.806,1809.086,22.184292
1,Afghanistan,Education (Aggregate levels): Total,2020,6884.703,1464.419,21.270620
2,Afghanistan,Education (Aggregate levels): Total,2017,7201.977,1461.605,20.294497
3,Afghanistan,Education (Aggregate levels): Total,2014,7604.930,1074.775,14.132609
4,Angola,Education (Aggregate levels): Total,2021,13192.842,2574.647,19.515484
...,...,...,...,...,...,...
1777,Zimbabwe,Education (Aggregate levels): Total,2022,6005.217,4056.064,67.542339
1778,Zimbabwe,Education (Aggregate levels): Total,2021,5869.093,3826.679,65.200517
1779,Zimbabwe,Education (Aggregate levels): Total,2019,5330.368,3365.382,63.136016
1780,Zimbabwe,Education (Aggregate levels): Total,2014,6579.326,771.026,11.718921


In [125]:
### Split the WB extract by indicator so that each series can later become its own column.
gdp_series = 'GDP per capita, PPP (constant 2017 international $)'
agri_series = 'Employment in agriculture (% of total employment) (modeled ILO estimate)'

## Rows holding GDP per capita.
GDP_DF = data_WB.loc[data_WB['Series Name'].eq(gdp_series)]

## Rows holding the agricultural employment share.
Agri_Emp_DF = data_WB.loc[data_WB['Series Name'].eq(agri_series)]

In [126]:
## Reshape from wide (one column per year) to long (one row per country-year).
## NOTE: the frames are reassigned in place, so run this cell only once per kernel;
## a second run would melt the already-long frames again.
wb_id_cols = ['Country Name', 'Country Code', 'Series Name', 'Series Code']
GDP_DF = pd.melt(GDP_DF, id_vars=wb_id_cols, var_name='Year', value_name='GDP')
Agri_Emp_DF = pd.melt(Agri_Emp_DF, id_vars=wb_id_cols, var_name='Year', value_name='Agri_Emp')

In [127]:
## Check the long-format agricultural employment frame.
Agri_Emp_DF

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,Year,Agri_Emp
0,Afghanistan,AFG,Employment in agriculture (% of total employme...,SL.AGR.EMPL.ZS,1998 [YR1998],65.277464
1,Albania,ALB,Employment in agriculture (% of total employme...,SL.AGR.EMPL.ZS,1998 [YR1998],51.319016
2,Algeria,DZA,Employment in agriculture (% of total employme...,SL.AGR.EMPL.ZS,1998 [YR1998],22.690649
3,American Samoa,ASM,Employment in agriculture (% of total employme...,SL.AGR.EMPL.ZS,1998 [YR1998],..
4,Andorra,AND,Employment in agriculture (% of total employme...,SL.AGR.EMPL.ZS,1998 [YR1998],..
...,...,...,...,...,...,...
5420,Virgin Islands (U.S.),VIR,Employment in agriculture (% of total employme...,SL.AGR.EMPL.ZS,2022 [YR2022],1.293486
5421,West Bank and Gaza,PSE,Employment in agriculture (% of total employme...,SL.AGR.EMPL.ZS,2022 [YR2022],6.220994
5422,"Yemen, Rep.",YEM,Employment in agriculture (% of total employme...,SL.AGR.EMPL.ZS,2022 [YR2022],29.260987
5423,Zambia,ZMB,Employment in agriculture (% of total employme...,SL.AGR.EMPL.ZS,2022 [YR2022],57.314602


In [128]:
## Reduce labels such as '1998 [YR1998]' to the integer year 1998.
## Casting to str first keeps this cell safe to re-run: once 'Year' is already
## an integer column, using the .str accessor directly would raise AttributeError.
GDP_DF['Year'] = GDP_DF['Year'].astype(str).str[:4].astype(int)
Agri_Emp_DF['Year'] = Agri_Emp_DF['Year'].astype(str).str[:4].astype(int)

In [129]:
## Check the long-format GDP frame after the year conversion.
GDP_DF

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,Year,GDP
0,Afghanistan,AFG,"GDP per capita, PPP (constant 2017 internation...",NY.GDP.PCAP.PP.KD,1998,..
1,Albania,ALB,"GDP per capita, PPP (constant 2017 internation...",NY.GDP.PCAP.PP.KD,1998,4819.091331
2,Algeria,DZA,"GDP per capita, PPP (constant 2017 internation...",NY.GDP.PCAP.PP.KD,1998,8435.035658
3,American Samoa,ASM,"GDP per capita, PPP (constant 2017 internation...",NY.GDP.PCAP.PP.KD,1998,..
4,Andorra,AND,"GDP per capita, PPP (constant 2017 internation...",NY.GDP.PCAP.PP.KD,1998,..
...,...,...,...,...,...,...
5420,Virgin Islands (U.S.),VIR,"GDP per capita, PPP (constant 2017 internation...",NY.GDP.PCAP.PP.KD,2022,..
5421,West Bank and Gaza,PSE,"GDP per capita, PPP (constant 2017 internation...",NY.GDP.PCAP.PP.KD,2022,5722.409175
5422,"Yemen, Rep.",YEM,"GDP per capita, PPP (constant 2017 internation...",NY.GDP.PCAP.PP.KD,2022,..
5423,Zambia,ZMB,"GDP per capita, PPP (constant 2017 internation...",NY.GDP.PCAP.PP.KD,2022,3365.87378


In [130]:
### Combine GDP, education and agricultural employment into one country-year panel.
## NOTE(review): both joins are outer, so education rows without a matching WB
## country/year are kept with a missing 'Country Name'/'Year' - confirm that
## country names line up across the two sources.
gdp_cols = GDP_DF[['Country Name', 'Year', 'GDP']]
edu_cols = Final_EDU_DF[['ref_area.label', 'time', 'Proportion_Upper_Sec']]
agri_cols = Agri_Emp_DF[['Country Name', 'Year', 'Agri_Emp']]

## GDP + education: the key columns are named differently on each side.
ALL_VAR_DF = gdp_cols.merge(
    edu_cols,
    how='outer',
    left_on=['Country Name', 'Year'],
    right_on=['ref_area.label', 'time'],
)

## ... + agricultural employment: same key names on both sides.
ALL_VAR_DF = ALL_VAR_DF.merge(agri_cols, how='outer', on=['Country Name', 'Year'])

In [144]:
## Cast the three measures to numeric; anything unparseable (e.g. the '..'
## placeholder used for missing values in the WB extract) becomes NaN.
for measure in ['GDP', 'Proportion_Upper_Sec', 'Agri_Emp']:
    ALL_VAR_DF[measure] = pd.to_numeric(ALL_VAR_DF[measure], errors='coerce')

In [145]:
## Spot-check the merged data for Thailand.
ALL_VAR_DF[ALL_VAR_DF['Country Name'] == 'Thailand']

Unnamed: 0,Country Name,Year,GDP,ref_area.label,time,Proportion_Upper_Sec,Agri_Emp
192,Thailand,1998.0,9156.214446,,,,54.070747
409,Thailand,1999.0,9467.921239,,,,51.692419
626,Thailand,2000.0,9791.895971,Thailand,2000.0,12.623944,51.663772
843,Thailand,2001.0,10036.329469,,,,47.581161
1060,Thailand,2002.0,10558.467491,,,,46.131496
1277,Thailand,2003.0,11220.683301,,,,44.902624
1494,Thailand,2004.0,11828.839043,,,,44.330282
1711,Thailand,2005.0,12228.660905,,,,43.854896
1928,Thailand,2006.0,12739.737716,,,,43.388974
2145,Thailand,2007.0,13330.208876,Thailand,2007.0,24.186184,42.855782


In [146]:
### Find the countries whose Agri_Emp was similar to Thailand's in 1998.
## Benchmark settings, named once instead of being repeated inline.
REF_COUNTRY = 'Thailand'
REF_YEAR = 1998
AGRI_EMP_TOLERANCE = 5  # half-width of the band, in percentage points of Agri_Emp

## All observations for the reference year.
rows_ref_year = ALL_VAR_DF[ALL_VAR_DF['Year'] == REF_YEAR]

## Benchmark value; fail with a clear message instead of an opaque IndexError
## (no row) or a silently empty result (NaN value).
ref_values = rows_ref_year.loc[rows_ref_year['Country Name'] == REF_COUNTRY, 'Agri_Emp']
if ref_values.empty or pd.isna(ref_values.iloc[0]):
    raise ValueError(f"No Agri_Emp value for {REF_COUNTRY} in {REF_YEAR}; cannot build the comparison band.")

Thai_Agri_Emp_1998 = ref_values.iloc[0]
range_agri = [Thai_Agri_Emp_1998 - AGRI_EMP_TOLERANCE, Thai_Agri_Emp_1998 + AGRI_EMP_TOLERANCE]

## Names of the countries inside the band (bounds inclusive, so the reference country is included).
Similar_Agri_Emp_Countries = rows_ref_year.loc[
    rows_ref_year['Agri_Emp'].between(range_agri[0], range_agri[1]),
    'Country Name',
].unique()

In [147]:
## Countries inside the agricultural-employment band computed above.
Similar_Agri_Emp_Countries

array(['Albania', 'Armenia', 'Benin', 'China', 'Comoros', "Cote d'Ivoire",
       'Ghana', 'Haiti', "Korea, Dem. People's Rep.", 'Liberia',
       'Mongolia', 'Nigeria', 'Senegal', 'Solomon Islands', 'Sudan',
       'Thailand', 'Togo', 'Yemen, Rep.'], dtype=object)