# Classification Models: 
### Looking to Classify the Financial Impact on Countries that take in Refugees
---
Imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics as metrics

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2, so this
# import fails on newer versions -- confirm the pinned scikit-learn version.
from sklearn.metrics import plot_confusion_matrix, mean_squared_error, f1_score, confusion_matrix
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

In [2]:
# Widen pandas' display limits so wide/long frames are shown without truncation.
for _option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_option, 999)

In [3]:
# Load the merged dataset; the path is relative to this notebook's location.
data_path = '../../data/data_final.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

In [4]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Year,Country,Refugees under UNHCR's mandate,Asylum-seekers,IDPs of concern to UNHCR,Stateless persons,Others of concern,Ref and Asyl,SUM REFUGEE,GDP_annual_change,Adjusted savings: net national savings (current US$),Adjusted savings: particulate emission damage (current US$),"Adolescent fertility rate (births per 1,000 women ages 15-19)","Air transport, passengers carried",Current health expenditure (% of GDP),Current health expenditure per capita (current US$),"Death rate, crude (per 1,000 people)",Domestic general government health expenditure per capita (current US$),Domestic private health expenditure per capita (current US$),Ease of doing business score (0 = lowest performance to 100 = best performance),Fixed broadband subscriptions (per 100 people),Fixed telephone subscriptions (per 100 people),GNI growth (annual %),"International tourism, expenditures (current US$)","International tourism, receipts (current US$)",Military expenditure (current USD),Population growth (annual %),Prevalence of undernourishment (% of population),Refugee population by country or territory of asylum,Strength of legal rights index (0=weak to 12=strong),"Unemployment, total (% of total labor force) (modeled ILO estimate)",Net official flows from UN agencies: Total
0,2016,El Salvador,41.0,0.0,0.0,0.0,9800.0,41.0,9841.0,2.545926,496536900.0,70449430.0,70.2702,2603129.0,7.705304,293.258026,6.984,189.06351,97.595212,62.65253,6.219291,14.807471,1.962239,451000000.0,1161000000.0,247600000.0,0.489164,9.9,41.0,9.0,4.42,6561136.0
1,2018,El Salvador,44.0,16.0,71501.0,0.0,4700.0,60.0,76261.0,2.432288,549565600.0,69464900.0,68.6136,2545105.0,7.093071,288.517456,7.04,184.221643,103.707697,64.92118,7.666788,14.375728,2.420301,490000000.0,1370000000.0,294610000.0,0.509273,8.6,44.0,9.0,4.01,6498476.0
2,2016,Mexico,6178.0,2636.0,0.0,13.0,0.0,8814.0,8827.0,2.630532,57003370000.0,2093191000.0,61.486,53313307.0,5.553459,485.624176,5.851,249.026695,236.597456,72.37307,12.911323,16.881134,2.526165,12823000000.0,20619000000.0,5336876000.0,1.20326,5.8,6178.0,10.0,3.86,4939827.0
3,2016,Colombia,241.0,368.0,7410816.0,11.0,0.0,609.0,7411436.0,2.087383,14952740000.0,458137500.0,67.9844,32262658.0,7.53117,442.138245,5.453,310.953585,131.18467,69.1652,12.273921,14.771098,2.422212,4891000000.0,5631000000.0,8675981000.0,1.36765,6.8,241.0,11.0,8.69,12837930.0
4,2018,Colombia,294.0,2851.0,7816473.0,11.0,0.0,3145.0,7819629.0,2.564324,16207040000.0,539219600.0,65.482,33704037.0,7.625414,513.158691,5.562,367.495799,145.662881,69.24335,13.448252,14.04234,1.748331,5531000000.0,6655000000.0,10134720000.0,1.524236,7.9,294.0,11.0,9.11,27781000.0


The continuous `GDP_annual_change` column is converted into a binary classification target: values greater than 0 become class 1, and values of 0 or below become class 0.

In [7]:
# Binary target: 1 when the annual GDP change is strictly greater than 0,
# 0 when it is exactly 0 or negative. A strict comparison is used so that a
# change of exactly 0 lands in class 0.
df['GDP_annual_change_class'] = np.where(df['GDP_annual_change'] > 0, 1, 0)


In [9]:
# Show the first 30 rows, now including the derived class column.
df.head(n=30)

Unnamed: 0,Year,Country,Refugees under UNHCR's mandate,Asylum-seekers,IDPs of concern to UNHCR,Stateless persons,Others of concern,Ref and Asyl,SUM REFUGEE,GDP_annual_change,Adjusted savings: net national savings (current US$),Adjusted savings: particulate emission damage (current US$),"Adolescent fertility rate (births per 1,000 women ages 15-19)","Air transport, passengers carried",Current health expenditure (% of GDP),Current health expenditure per capita (current US$),"Death rate, crude (per 1,000 people)",Domestic general government health expenditure per capita (current US$),Domestic private health expenditure per capita (current US$),Ease of doing business score (0 = lowest performance to 100 = best performance),Fixed broadband subscriptions (per 100 people),Fixed telephone subscriptions (per 100 people),GNI growth (annual %),"International tourism, expenditures (current US$)","International tourism, receipts (current US$)",Military expenditure (current USD),Population growth (annual %),Prevalence of undernourishment (% of population),Refugee population by country or territory of asylum,Strength of legal rights index (0=weak to 12=strong),"Unemployment, total (% of total labor force) (modeled ILO estimate)",Net official flows from UN agencies: Total,GDP_annual_change_class
0,2016,El Salvador,41.0,0.0,0.0,0.0,9800.0,41.0,9841.0,2.545926,496536900.0,70449430.0,70.2702,2603129.0,7.705304,293.258026,6.984,189.06351,97.595212,62.65253,6.219291,14.807471,1.962239,451000000.0,1161000000.0,247600000.0,0.489164,9.9,41.0,9.0,4.42,6561136.0,1
1,2018,El Salvador,44.0,16.0,71501.0,0.0,4700.0,60.0,76261.0,2.432288,549565600.0,69464900.0,68.6136,2545105.0,7.093071,288.517456,7.04,184.221643,103.707697,64.92118,7.666788,14.375728,2.420301,490000000.0,1370000000.0,294610000.0,0.509273,8.6,44.0,9.0,4.01,6498476.0,1
2,2016,Mexico,6178.0,2636.0,0.0,13.0,0.0,8814.0,8827.0,2.630532,57003370000.0,2093191000.0,61.486,53313310.0,5.553459,485.624176,5.851,249.026695,236.597456,72.37307,12.911323,16.881134,2.526165,12823000000.0,20619000000.0,5336876000.0,1.20326,5.8,6178.0,10.0,3.86,4939827.0,1
3,2016,Colombia,241.0,368.0,7410816.0,11.0,0.0,609.0,7411436.0,2.087383,14952740000.0,458137500.0,67.9844,32262660.0,7.53117,442.138245,5.453,310.953585,131.18467,69.1652,12.273921,14.771098,2.422212,4891000000.0,5631000000.0,8675981000.0,1.36765,6.8,241.0,11.0,8.69,12837930.0,1
4,2018,Colombia,294.0,2851.0,7816473.0,11.0,0.0,3145.0,7819629.0,2.564324,16207040000.0,539219600.0,65.482,33704040.0,7.625414,513.158691,5.562,367.495799,145.662881,69.24335,13.448252,14.04234,1.748331,5531000000.0,6655000000.0,10134720000.0,1.524236,7.9,294.0,11.0,9.11,27781000.0,1
5,2016,Costa Rica,4162.0,3626.0,0.0,127.0,0.0,7788.0,7915.0,4.204323,5725282000.0,60474640.0,54.7214,1572605.0,7.317244,878.888916,4.933,644.435555,234.204063,67.67937,13.044336,17.111389,4.232168,1055000000.0,3776000000.0,0.0,1.057366,3.6,4162.0,10.0,8.6,2792769.0,1
6,2018,Costa Rica,4531.0,32595.0,0.0,82.0,40.0,37126.0,37248.0,2.615904,5965686000.0,64304720.0,52.5186,1950049.0,7.295722,909.673096,5.078,658.548246,251.068277,68.83187,16.697547,15.358417,2.460276,1198000000.0,4073000000.0,0.0,0.994802,3.0,4531.0,10.0,9.63,3812588.0,1
7,2017,El Salvador,40.0,5.0,71500.0,0.0,4200.0,45.0,75745.0,2.24667,663978400.0,66641130.0,69.459,2670560.0,7.212803,282.036743,7.011,179.786046,101.852181,64.70983,7.06409,14.53373,1.847512,466000000.0,1227000000.0,260900000.0,0.501984,9.1,40.0,9.0,4.39,2012717.0,1
8,2016,Bangladesh,276203.0,0.0,0.0,0.0,0.0,276203.0,276203.0,7.113478,64032110000.0,1962078000.0,84.0906,3815869.0,2.472912,34.568913,5.539,5.676981,26.188496,41.01625,4.173006,0.484996,6.408403,843000000.0,214300000.0,3239754000.0,1.095288,13.3,276203.0,5.0,4.35,89593210.0,1
9,2017,Colombia,260.0,525.0,7677609.0,11.0,68734.0,785.0,7747139.0,1.359361,15503000000.0,507804500.0,66.651,32504900.0,7.678643,489.644592,5.503,347.848228,141.796375,69.03524,12.944291,14.286806,0.566624,5136000000.0,5899000000.0,10018030000.0,1.513747,7.1,260.0,11.0,8.87,9782999.0,1


## Null Model
---

In [10]:
# Predictor columns, one per line, in the same order as before.
features = [
    # UNHCR population counts
    "Refugees under UNHCR's mandate",
    "Asylum-seekers",
    "IDPs of concern to UNHCR",
    "Stateless persons",
    "Others of concern",
    "Ref and Asyl",
    "SUM REFUGEE",
    # Country-level indicators
    "Adjusted savings: net national savings (current US$)",
    "Adjusted savings: particulate emission damage (current US$)",
    "Adolescent fertility rate (births per 1,000 women ages 15-19)",
    "Air transport, passengers carried",
    "Current health expenditure (% of GDP)",
    "Current health expenditure per capita (current US$)",
    "Death rate, crude (per 1,000 people)",
    "Domestic general government health expenditure per capita (current US$)",
    "Domestic private health expenditure per capita (current US$)",
    "Ease of doing business score (0 = lowest performance to 100 = best performance)",
    "Fixed broadband subscriptions (per 100 people)",
    "Fixed telephone subscriptions (per 100 people)",
    "GNI growth (annual %)",
    "International tourism, expenditures (current US$)",
    "International tourism, receipts (current US$)",
    "Military expenditure (current USD)",
    "Population growth (annual %)",
    "Prevalence of undernourishment (% of population)",
    "Refugee population by country or territory of asylum",
    "Strength of legal rights index (0=weak to 12=strong)",
    "Unemployment, total (% of total labor force) (modeled ILO estimate)",
    "Net official flows from UN agencies: Total",
]

In [11]:
# Split the frame into the feature matrix and the binary target column.
target_column = 'GDP_annual_change_class'
x, y = df[features], df[target_column]

In [13]:
# Class proportions of the target; the majority-class share is the accuracy
# a null model would get by always predicting that class.
y.value_counts(normalize=True)

1    0.932849
0    0.067151
Name: GDP_annual_change_class, dtype: float64

## Classification Pipelines
---

In [None]:

def modelfunc(X, y):
    """Fit several baseline classifiers and report their scores.

    The data is split once into train and test sets (``random_state=42``).
    Each pipeline is then fitted on the training set, and for each one the
    function prints the train score, test score, mean cross-validation score
    on the training set, and the train/test F1 scores, and draws a confusion
    matrix of the test-set predictions titled with the pipeline name.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix passed to ``train_test_split``.
    y : array-like or Series
        Target labels aligned with ``X``. ``f1_score`` is called with its
        default settings, so a binary target is expected.

    Returns
    -------
    None
        Results are printed and plotted rather than returned.
    """
    # Stochastic estimators are seeded (as the logistic regression already
    # was) so repeated runs of the notebook give the same scores.
    pipelines = [
        ('LOGREG', Pipeline([('LR', LogisticRegression(max_iter=1000, random_state=42))])),
        ('DECISION TREE', Pipeline([('TREE', DecisionTreeClassifier(random_state=42))])),
        ('BAGGED TREE', Pipeline([('BAG', BaggingClassifier(random_state=42))])),
        ('RANDOM FOREST', Pipeline([('RAND', RandomForestClassifier(random_state=42))])),
        ('ADABOOST', Pipeline([('ADA', AdaBoostClassifier(random_state=42))])),
        # KNN is distance-based, so it is the only pipeline that scales first.
        ('KNN', Pipeline([('sc', StandardScaler()), ('KNN', KNeighborsClassifier())])),
    ]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    for pipe_name, model in pipelines:
        print(pipe_name)
        # Fit and score on the split produced above (the previous code used
        # undefined Z_train / Z_test names and raised NameError).
        model.fit(X_train, y_train)
        trainscore = model.score(X_train, y_train)
        testscore = model.score(X_test, y_test)
        crossval = cross_val_score(model, X_train, y_train).mean()
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)

        f1_train = metrics.f1_score(y_train, y_pred_train)
        f1_test = metrics.f1_score(y_test, y_pred_test)

        cm = confusion_matrix(y_test, y_pred_test, labels=model.classes_)
        disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm,
                                              display_labels=model.classes_)
        # plot() opens its own figure, so the title must be set on the axes it
        # returns; calling plt.title() beforehand would title a different figure.
        disp.plot()
        disp.ax_.set_title(pipe_name)

        print(f'Train: {trainscore}, Test: {testscore}, CV: {crossval}')
        print(f'f1 - Train = {f1_train}')
        print(f'f1 - Test = {f1_test}')
        print(' ')
        print(' ')

## Stacking
---

In [None]:
# Stacked ensemble: three first-level classifiers whose predictions feed a
# logistic-regression meta-classifier.

# The split is recreated here because the one inside modelfunc is local to
# that function; the same random_state gives the same partition of x / y.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=42)

# StackingClassifier only accepts classifiers, so logistic regression is used
# in place of a linear regressor for both the 'LR' member and the final estimator.
level1_models = [
    ('GRAD', GradientBoostingClassifier(random_state=42)),
    ('ADA', AdaBoostClassifier(random_state=42)),
    ('LR', LogisticRegression(max_iter=1000, random_state=42)),
]

stack = StackingClassifier(
    estimators=level1_models,
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
)

stack.fit(X_train, y_train)
trainscore = stack.score(X_train, y_train)
testscore = stack.score(X_test, y_test)
crossval = cross_val_score(stack, X_train, y_train).mean()
y_pred_train = stack.predict(X_train)
y_pred_test = stack.predict(X_test)

# Both values are now genuine mean squared errors (the test value was
# previously computed with mean_absolute_error). On 0/1 labels this equals
# the misclassification rate.
mse_train = metrics.mean_squared_error(y_train, y_pred_train)
mse_test = metrics.mean_squared_error(y_test, y_pred_test)

# F1 is reported as well, matching the metrics printed by modelfunc.
f1_train = metrics.f1_score(y_train, y_pred_train)
f1_test = metrics.f1_score(y_test, y_pred_test)

print(f'Train: {trainscore}, Test: {testscore}, CV: {crossval}')
print(f'Mean Squared Error - Train = {mse_train}')
print(f'Mean Squared Error - Test = {mse_test}')
print(f'f1 - Train = {f1_train}')
print(f'f1 - Test = {f1_test}')