In [1]:
#Imports and files
import pandas as pd
import numpy as np
import seaborn as sb
import statsmodels.api as sm

educationData = pd.read_csv('Data/Processed/Education_Data.csv')
gdpData = pd.read_csv('Data/Processed/GDP_Data.csv')
combinedData = pd.read_csv('Data/Processed/Combined_Data.csv')

In [2]:
#Initial GDP Dataset insights
gdpGrowth = gdpData.sort_values(by=['Percentage growth from 2021 - 2023'], ascending = False)
growthPerCapita = gdpData.sort_values(by=['Per Capita Percentage growth from 2021 - 2023'], ascending = False)

print('The countries experiencing the most nominal GDP growth are: \n')
print(gdpGrowth[['Country', 'Percentage growth from 2021 - 2023']].head())

print('\n\nThe countries experiencing the highest per capita growth are: \n')
print(growthPerCapita[['Country', 'Per Capita Percentage growth from 2021 - 2023']].head())


The countries experiencing the most nominal GDP growth are: 

        Country  Percentage growth from 2021 - 2023
126      Guyana                           50.737951
151     Liberia                           43.754313
111     Armenia                           43.516707
163  Seychelles                           38.273381
105     Georgia                           37.727742


The countries experiencing the highest per capita growth are: 

        Country  Per Capita Percentage growth from 2021 - 2023
126      Guyana                                      51.378556
58     Ethiopia                                      43.991853
163  Seychelles                                      42.149354
151     Liberia                                      41.125000
147    Maldives                                      40.964747


In [3]:
#Statistical Measures

#2023
statsGDPpc = gdpData['2023 GDP per Capita (USD)']
statsGDP = gdpData['2023 Nominal GDP (USD Million)']

minGDPpc = round(min(statsGDPpc), 2)
maxGDPpc = round(max(statsGDPpc), 2)
meanGDPpc = round(sum(statsGDPpc)/len(statsGDPpc), 2)
medianGDPpc = round(np.median(statsGDPpc), 2)
stdevGDPpc = round(np.std(statsGDPpc), 2)

minGDP = round(min(statsGDP), 2)
maxGDP = round(max(statsGDP), 2)
meanGDP = round(sum(statsGDP)/len(statsGDP), 2)
medianGDP = round(np.median(statsGDP), 2)
stdevGDP = round(np.std(statsGDP), 2)

#2022
statsGDPpc2 = gdpData['2022 GDP per Capita (USD)']
statsGDP2 = gdpData['2022 Nominal GDP (USD Million)']

minGDPpc2 = round(min(statsGDPpc2), 2)
maxGDPpc2 = round(max(statsGDPpc2), 2)
meanGDPpc2 = round(sum(statsGDPpc2)/len(statsGDPpc2), 2)
medianGDPpc2 = round(np.median(statsGDPpc2), 2)
stdevGDPpc2 = round(np.std(statsGDPpc2), 2)

minGDP2 = round(min(statsGDP2), 2)
maxGDP2 = round(max(statsGDP2), 2)
meanGDP2 = round(sum(statsGDP2)/len(statsGDP2), 2)
medianGDP2 = round(np.median(statsGDP2), 2)
stdevGDP2= round(np.std(statsGDP2), 2)

#2021

statsGDPpc1 = gdpData['2021 GDP per Capita (USD)']
statsGDP1 = gdpData['2021 Nominal GDP (USD Million)']

minGDPpc1 = round(min(statsGDPpc1), 2)
maxGDPpc1 = round(max(statsGDPpc1), 2)
meanGDPpc1 = round(sum(statsGDPpc1)/len(statsGDPpc1), 2)
medianGDPpc1 = round(np.median(statsGDPpc1), 2)
stdevGDPpc1 = round(np.std(statsGDPpc1), 2)

minGDP1 = round(min(statsGDP1), 2)
maxGDP1 = round(max(statsGDP1), 2)
meanGDP1 = round(sum(statsGDP1)/len(statsGDP1), 2)
medianGDP1 = round(np.median(statsGDP1), 2)
stdevGDP1 = round(np.std(statsGDP1), 2)

summary = pd.DataFrame({'Measure':['Min', 
                     'Max', 
                     'Mean', 
                     'Median',
                     'Standard Deviation'],
           
                     
           '2023 GDP (USD Million)':[minGDP, maxGDP, meanGDP, medianGDP, stdevGDP],
           '2022 GDP (USD Million)':[minGDP2, maxGDP2, meanGDP2, medianGDP2, stdevGDP2],  
           '2021 GDP (USD Million)':[minGDP1, maxGDP1, meanGDP1, medianGDP1, stdevGDP1],
                       
           '2023 GDP per capita (USD)':[minGDPpc, maxGDPpc, meanGDPpc, medianGDPpc, stdevGDPpc],
           '2022 GDP per capita (USD)':[minGDPpc2, maxGDPpc2, meanGDPpc2, medianGDPpc2, stdevGDPpc2],
           '2021 GDP per capita (USD)':[minGDPpc1, maxGDPpc1, meanGDPpc1, medianGDPpc1, stdevGDPpc1]})
summary


Unnamed: 0,Measure,2023 GDP (USD Million),2022 GDP (USD Million),2021 GDP (USD Million),2023 GDP per capita (USD),2022 GDP per capita (USD),2021 GDP per capita (USD)
0,Min,63.0,60.0,60.0,246.0,238.0,311.0
1,Max,26949643.0,25462700.0,23315081.0,135605.0,126426.0,133745.0
2,Mean,574725.52,549899.84,529850.74,17922.42,16902.78,16104.88
3,Median,46732.0,44975.0,40433.0,7220.0,6790.0,6312.5
4,Standard Deviation,2465800.68,2378780.62,2252764.9,23858.63,22845.67,22140.76


In [4]:
meanGDPgrowth = sum(gdpGrowth['Percentage growth from 2021 - 2023'])/len(gdpGrowth['Percentage growth from 2021 - 2023'])
meanPercapitaGrowth = sum(gdpGrowth['Per Capita Percentage growth from 2021 - 2023'])/len(gdpGrowth['Per Capita Percentage growth from 2021 - 2023'])

print(f"The average GDP growth between 2021 and 2023 is: {round(meanGDPgrowth, 2)}%")
print(f"The average per capita growth between 2021 and 2023 is: {round(meanPercapitaGrowth, 2)}%")


The average GDP growth between 2021 and 2023 is: 11.97%
The average per capita growth between 2021 and 2023 is: 10.87%


In [5]:
#Initial Education Dataset insights
primaryEd = educationData.sort_values(by=['Gross_Primary_Education_Enrollment'], ascending = False)
tertiaryEd = educationData.sort_values(by=['Gross_Tertiary_Education_Enrollment'], ascending = False)
unenmploymentEd = educationData.sort_values(by=['Unemployment_Rate'], ascending = True)
unenmploymentEd = unenmploymentEd[unenmploymentEd['Unemployment_Rate'] > 0]

print('The countries with the highest gross elementary enrollment are: \n')
print(primaryEd[['Country', 'Gross_Primary_Education_Enrollment']].head())

print('The countries with the highest gross university / college enrollment are: \n')
print(tertiaryEd[['Country', 'Gross_Tertiary_Education_Enrollment']].head())

print('\n \n The countries with the lowest unemployment rates (Assuming 0 means missing input) are : \n')
print(unenmploymentEd[['Country', 'Unemployment_Rate']].head())

The countries with the highest gross elementary enrollment are: 

        Country  Gross_Primary_Education_Enrollment
106      Malawi                               142.5
105  Madagascar                               142.5
125       Nepal                               142.1
65        Gabon                               139.9
149      Rwanda                               133.0
The countries with the highest gross university / college enrollment are: 

         Country  Gross_Tertiary_Education_Enrollment
70        Greece                                136.6
9      Australia                                113.1
71       Grenada                                104.6
145  South Korea                                 94.3
7      Argentina                                 90.0

 
 The countries with the lowest unemployment rates (Assuming 0 means missing input) are : 

             Country  Unemployment_Rate
144            Qatar               0.09
129            Niger               0.47
164  Sol

In [6]:
#Check for Correlation 
combinedData.corr()

Unnamed: 0,2023 GDP per Capita (USD),2022 Nominal GDP (USD Million),Percentage growth from 2021 - 2023,Per Capita Percentage growth from 2021 - 2023,Birth_Rate,Gross_Primary_Education_Enrollment,Average Youth Literacy Rate,Gross_Tertiary_Education_Enrollment,Unemployment_Rate
2023 GDP per Capita (USD),1.0,0.246438,-0.027962,-0.034159,-0.523487,0.047229,-0.199374,0.522713,-0.107432
2022 Nominal GDP (USD Million),0.246438,1.0,-0.072963,-0.053224,-0.163789,0.027444,0.008594,0.218527,0.064914
Percentage growth from 2021 - 2023,-0.027962,-0.072963,1.0,0.929966,-0.03268,0.060187,0.056221,-0.036328,0.014038
Per Capita Percentage growth from 2021 - 2023,-0.034159,-0.053224,0.929966,1.0,-0.122719,0.019938,0.035464,0.015731,0.054547
Birth_Rate,-0.523487,-0.163789,-0.03268,-0.122719,1.0,0.205772,0.056037,-0.615184,0.002876
Gross_Primary_Education_Enrollment,0.047229,0.027444,0.060187,0.019938,0.205772,1.0,0.130384,0.141937,0.071223
Average Youth Literacy Rate,-0.199374,0.008594,0.056221,0.035464,0.056037,0.130384,1.0,0.025465,0.008774
Gross_Tertiary_Education_Enrollment,0.522713,0.218527,-0.036328,0.015731,-0.615184,0.141937,0.025465,1.0,0.094536
Unemployment_Rate,-0.107432,0.064914,0.014038,0.054547,0.002876,0.071223,0.008774,0.094536,1.0


In [7]:
#Regression to see if variables can predict GDP
yGDP = combinedData['2022 Nominal GDP (USD Million)']
X = combinedData[['Gross_Primary_Education_Enrollment', 
                  'Unemployment_Rate', 
                  'Birth_Rate',
                  'Gross_Tertiary_Education_Enrollment',
                  'Average Youth Literacy Rate']]

model = sm.OLS(yGDP, X).fit()
model.summary()

#They cannot.

0,1,2,3
Dep. Variable:,2022 Nominal GDP (USD Million),R-squared (uncentered):,0.102
Model:,OLS,Adj. R-squared (uncentered):,0.074
Method:,Least Squares,F-statistic:,3.688
Date:,"Fri, 08 Dec 2023",Prob (F-statistic):,0.00346
Time:,13:05:19,Log-Likelihood:,-2690.4
No. Observations:,167,AIC:,5391.0
Df Residuals:,162,BIC:,5406.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Gross_Primary_Education_Enrollment,1361.1446,7167.552,0.190,0.850,-1.28e+04,1.55e+04
Unemployment_Rate,2.22e+04,3.53e+04,0.628,0.531,-4.76e+04,9.2e+04
Birth_Rate,-1.352e+04,2.38e+04,-0.568,0.571,-6.05e+04,3.35e+04
Gross_Tertiary_Education_Enrollment,1.46e+04,8089.501,1.805,0.073,-1374.966,3.06e+04
Average Youth Literacy Rate,304.1683,4162.374,0.073,0.942,-7915.338,8523.675

0,1,2,3
Omnibus:,283.334,Durbin-Watson:,0.337
Prob(Omnibus):,0.0,Jarque-Bera (JB):,36807.343
Skew:,8.061,Prob(JB):,0.0
Kurtosis:,73.921,Cond. No.,22.4


In [8]:
#Regression to see if variables can predict GDP per Capita
yPC = combinedData['2023 GDP per Capita (USD)']


model = sm.OLS(yPC, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,2023 GDP per Capita (USD),R-squared (uncentered):,0.604
Model:,OLS,Adj. R-squared (uncentered):,0.592
Method:,Least Squares,F-statistic:,49.5
Date:,"Fri, 08 Dec 2023",Prob (F-statistic):,6.37e-31
Time:,13:05:19,Log-Likelihood:,-1874.9
No. Observations:,167,AIC:,3760.0
Df Residuals:,162,BIC:,3775.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Gross_Primary_Education_Enrollment,238.5148,54.256,4.396,0.000,131.375,345.655
Unemployment_Rate,-516.0562,267.555,-1.929,0.056,-1044.401,12.288
Birth_Rate,-588.1663,180.262,-3.263,0.001,-944.132,-232.200
Gross_Tertiary_Education_Enrollment,295.8552,61.235,4.831,0.000,174.934,416.776
Average Youth Literacy Rate,-99.3324,31.508,-3.153,0.002,-161.551,-37.114

0,1,2,3
Omnibus:,108.436,Durbin-Watson:,2.08
Prob(Omnibus):,0.0,Jarque-Bera (JB):,843.632
Skew:,2.327,Prob(JB):,6.42e-184
Kurtosis:,12.979,Cond. No.,22.4


In [9]:
#Regression to see if variables can predict GDP growth
yGDPgrowth = combinedData['Percentage growth from 2021 - 2023']


model = sm.OLS(yGDPgrowth, X).fit()
model.summary()

#Although it is a weak model, Gross_Primary_Education_Enrollment is the only statistically significant variable


0,1,2,3
Dep. Variable:,Percentage growth from 2021 - 2023,R-squared (uncentered):,0.394
Model:,OLS,Adj. R-squared (uncentered):,0.375
Method:,Least Squares,F-statistic:,21.06
Date:,"Fri, 08 Dec 2023",Prob (F-statistic):,3.46e-16
Time:,13:05:19,Log-Likelihood:,-685.49
No. Observations:,167,AIC:,1381.0
Df Residuals:,162,BIC:,1397.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Gross_Primary_Education_Enrollment,0.1370,0.044,3.130,0.002,0.051,0.224
Unemployment_Rate,0.1109,0.216,0.513,0.608,-0.316,0.537
Birth_Rate,-0.1101,0.145,-0.757,0.450,-0.397,0.177
Gross_Tertiary_Education_Enrollment,-0.0402,0.049,-0.814,0.417,-0.138,0.057
Average Youth Literacy Rate,0.0186,0.025,0.732,0.466,-0.032,0.069

0,1,2,3
Omnibus:,50.967,Durbin-Watson:,1.962
Prob(Omnibus):,0.0,Jarque-Bera (JB):,177.923
Skew:,-1.139,Prob(JB):,2.3099999999999998e-39
Kurtosis:,7.514,Cond. No.,22.4


In [10]:
#Regression to see if variables can predict per Capita growth
yPCgrowth = combinedData['Per Capita Percentage growth from 2021 - 2023']

model = sm.OLS(yPCgrowth, X).fit()
model.summary()

#Again, it is a weak model, while Gross_Primary_Education_Enrollment is the only 
#statistically significant variable

0,1,2,3
Dep. Variable:,Per Capita Percentage growth from 2021 - 2023,R-squared (uncentered):,0.329
Model:,OLS,Adj. R-squared (uncentered):,0.308
Method:,Least Squares,F-statistic:,15.87
Date:,"Fri, 08 Dec 2023",Prob (F-statistic):,1.05e-12
Time:,13:05:19,Log-Likelihood:,-692.1
No. Observations:,167,AIC:,1394.0
Df Residuals:,162,BIC:,1410.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Gross_Primary_Education_Enrollment,0.1377,0.046,3.023,0.003,0.048,0.228
Unemployment_Rate,0.2474,0.225,1.101,0.272,-0.196,0.691
Birth_Rate,-0.2254,0.151,-1.489,0.138,-0.524,0.073
Gross_Tertiary_Education_Enrollment,-0.0371,0.051,-0.722,0.471,-0.139,0.064
Average Youth Literacy Rate,0.0152,0.026,0.574,0.567,-0.037,0.067

0,1,2,3
Omnibus:,37.967,Durbin-Watson:,1.879
Prob(Omnibus):,0.0,Jarque-Bera (JB):,119.611
Skew:,-0.853,Prob(JB):,1.06e-26
Kurtosis:,6.779,Cond. No.,22.4


In [11]:
#The best model for prediction was on GDP per capita, let's attempt to improve it further.
#We remove the variable with the highest P-value, (Least significant). This method is called backward selection

newX = combinedData[['Gross_Primary_Education_Enrollment', 
                  'Birth_Rate',
                  'Gross_Tertiary_Education_Enrollment',
                  'Average Youth Literacy Rate']]

model = sm.OLS(yPC, newX).fit()
model.summary()

#All variables in this model are statistically significant. This model is able to account for 59.5% of the variation
#in GDP per capita, using the four variables.

0,1,2,3
Dep. Variable:,2023 GDP per Capita (USD),R-squared (uncentered):,0.595
Model:,OLS,Adj. R-squared (uncentered):,0.585
Method:,Least Squares,F-statistic:,59.94
Date:,"Fri, 08 Dec 2023",Prob (F-statistic):,4.73e-31
Time:,13:05:19,Log-Likelihood:,-1876.8
No. Observations:,167,AIC:,3762.0
Df Residuals:,163,BIC:,3774.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Gross_Primary_Education_Enrollment,221.1409,53.947,4.099,0.000,114.615,327.667
Birth_Rate,-629.3092,180.483,-3.487,0.001,-985.695,-272.924
Gross_Tertiary_Education_Enrollment,277.2858,60.976,4.547,0.000,156.882,397.690
Average Youth Literacy Rate,-99.3007,31.770,-3.126,0.002,-162.034,-36.568

0,1,2,3
Omnibus:,107.56,Durbin-Watson:,2.115
Prob(Omnibus):,0.0,Jarque-Bera (JB):,800.965
Skew:,2.323,Prob(JB):,1.1799999999999999e-174
Kurtosis:,12.67,Cond. No.,15.7
