# Regression Models: 
### Predicting GDP Annual Change.
---
Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA

In [2]:
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

Read in Data

In [3]:
df = pd.read_csv('../../data/data_final.csv')

Set features & X and y

In [5]:
df.columns

Index(['Year', 'Country', 'Refugees under UNHCR's mandate', 'Asylum-seekers',
       'IDPs of concern to UNHCR', 'Stateless persons', 'Others of concern',
       'Ref and Asyl', 'SUM REFUGEE', 'GDP_annual_change',
       'Adjusted savings: net national savings (current US$)',
       'Adjusted savings: particulate emission damage (current US$)',
       'Adolescent fertility rate (births per 1,000 women ages 15-19)',
       'Air transport, passengers carried',
       'Current health expenditure (% of GDP)',
       'Current health expenditure per capita (current US$)',
       'Death rate, crude (per 1,000 people)',
       'Domestic general government health expenditure per capita (current US$)',
       'Domestic private health expenditure per capita (current US$)',
       'Ease of doing business score (0 = lowest performance to 100 = best performance)',
       'Fixed broadband subscriptions (per 100 people)',
       'Fixed telephone subscriptions (per 100 people)',
       'GNI growth (ann

In [44]:
features = ["Refugees under UNHCR's mandate", 
            'Asylum-seekers',
            'IDPs of concern to UNHCR', 
            'Stateless persons', 
            'Others of concern',
            'Ref and Asyl', 
            'SUM REFUGEE',
            'Adjusted savings: net national savings (current US$)',
            'Adjusted savings: particulate emission damage (current US$)',
            'Adolescent fertility rate (births per 1,000 women ages 15-19)',
            'Air transport, passengers carried',
            'Current health expenditure (% of GDP)',
            'Current health expenditure per capita (current US$)',
            'Death rate, crude (per 1,000 people)',
            'Domestic general government health expenditure per capita (current US$)',
            'Domestic private health expenditure per capita (current US$)',
            'Ease of doing business score (0 = lowest performance to 100 = best performance)',
            'Fixed broadband subscriptions (per 100 people)',
            'Fixed telephone subscriptions (per 100 people)',
            'GNI growth (annual %)',
            'International tourism, expenditures (current US$)',
            'International tourism, receipts (current US$)',
            'Military expenditure (current USD)', 'Population growth (annual %)',
            'Prevalence of undernourishment (% of population)',
            'Refugee population by country or territory of asylum',
            'Strength of legal rights index (0=weak to 12=strong)',
            'Unemployment, total (% of total labor force) (modeled ILO estimate)',
            'Net official flows from UN agencies: Total'
           ]

In [45]:
X = df[features]
y = df['GDP_annual_change']

## Null Model
---

In [46]:
df['base'] = df['GDP_annual_change'].mean()
RMSE = np.sqrt(metrics.mean_squared_error(df['GDP_annual_change'], df['base']))
RMSE

2.4666729263695077

The Baseline model has a Root Mean Squared Error of 2.47%. On average the predicted GDP Annual Change is about \\2.47% off from the true sales price.

## Regression Pipelines
---

In [47]:
def modelfunc(X, y):
    pipelines = [
        ('LINEAR REGRESSION', (Pipeline([ ('LR', LinearRegression())]))),
        ('DECISION TREE', (Pipeline([ ('TREE', DecisionTreeRegressor())]))),
        ('BAGGED TREE', (Pipeline([ ('BAG', BaggingRegressor())]))),
        ('RANDOM FOREST', (Pipeline([ ('RAND', RandomForestRegressor())]))),
        ('ADABOOST', (Pipeline([ ('ADA', AdaBoostRegressor())]))),
        ('KNN', (Pipeline([ ('sc', StandardScaler()),('KNN', KNeighborsRegressor())]))),
        ('LASSO',(Pipeline([ ('sc', StandardScaler()),('LASSO', LassoCV())]))),
        ('RIDGE',(Pipeline([ ('sc', StandardScaler()),('RIDGE', RidgeCV())])))
    ]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42) 
    for pipe_name ,model in pipelines:
        print(pipe_name)
        model.fit(X_train, y_train)
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
        trainscore = model.score(X_train, y_train)
        testscore = model.score(X_test, y_test)
        crossval = cross_val_score(model, X_train, y_train).mean()
        rmsetr= np.sqrt(metrics.mean_squared_error(y_train, y_pred_train))
        rmsete = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))
        print (f'Model = {model}')
        print (f'Train Score = {trainscore}')
        print (f'Test Score = {testscore}')
        print (f'Cross Val Score = {crossval}')
        print (f'RMSE Train = {rmsetr}')
        print (f'RMSE Test = {rmsete}')
        print('')
        print('')

In [48]:
modelfunc(X, y)

LINEAR REGRESSION
Model = Pipeline(steps=[('LR', LinearRegression())])
Train Score = 0.5840815091198508
Test Score = 0.5543266541444651
Cross Val Score = 0.48571910010912883
RMSE Train = 1.628465335894157
RMSE Test = 1.5240839913050004


DECISION TREE
Model = Pipeline(steps=[('TREE', DecisionTreeRegressor())])
Train Score = 1.0
Test Score = 0.5465051150834861
Cross Val Score = 0.3489028430391234
RMSE Train = 5.781549060871421e-17
RMSE Test = 1.537399612205618


BAGGED TREE
Model = Pipeline(steps=[('BAG', BaggingRegressor())])
Train Score = 0.9348791664732816
Test Score = 0.6964687650119533
Cross Val Score = 0.5291036113041815
RMSE Train = 0.6443686153727438
RMSE Test = 1.2577729927513726


RANDOM FOREST
Model = Pipeline(steps=[('RAND', RandomForestRegressor())])
Train Score = 0.9462467287320052
Test Score = 0.7250452194189607
Cross Val Score = 0.579519428345489
RMSE Train = 0.5854325341925449
RMSE Test = 1.197102143028582


ADABOOST
Model = Pipeline(steps=[('ADA', AdaBoostRegressor())]

Random Forest performed the best according to the function above. Below I dig into it with some tuning.

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42) 

rand = RandomForestRegressor()
# rand.fit(X_train, y_train)

params = {
    'n_estimators': [75, 100, 125],
    'max_depth': [None, 3, 4, 5], 
    'max_features': ['auto', 'sqrt', 3, 4, 5]
}
gs = GridSearchCV(RandomForestRegressor(), 
                 param_grid = params, 
                 verbose = 1)

gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

# best_params = gridsearch.best_params_
# train_score = gridsearch.score(X_train, y_train)
# test_score = gridsearch.score(X_test, y_test)
# crossval = cross_val_score(gridsearch, X_train, y_train).mean()


# rmsetr= np.sqrt(metrics.mean_squared_error(y_train, y_pred_train))
# rmsete = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))


Fitting 5 folds for each of 60 candidates, totalling 300 fits
0.5764184297065357
{'max_depth': None, 'max_features': 'auto', 'n_estimators': 75}


## Predictions Using Ukranian Asylum Countries

Unnamed: 0,Year,Country,Refugees under UNHCR's mandate,Asylum-seekers,IDPs of concern to UNHCR,Stateless persons,Others of concern,Ref and Asyl,SUM REFUGEE,Adjusted savings: net national savings (current US$),Adjusted savings: particulate emission damage (current US$),"Adolescent fertility rate (births per 1,000 women ages 15-19)","Air transport, passengers carried",Current health expenditure (% of GDP),Current health expenditure per capita (current US$),"Death rate, crude (per 1,000 people)",Domestic general government health expenditure per capita (current US$),Domestic private health expenditure per capita (current US$),Ease of doing business score (0 = lowest performance to 100 = best performance),Fixed broadband subscriptions (per 100 people),Fixed telephone subscriptions (per 100 people),GNI growth (annual %),"International tourism, expenditures (current US$)","International tourism, receipts (current US$)",Military expenditure (current USD),Population growth (annual %),Prevalence of undernourishment (% of population),Refugee population by country or territory of asylum,Strength of legal rights index (0=weak to 12=strong),"Unemployment, total (% of total labor force) (modeled ILO estimate)",Net official flows from UN agencies: Total,GDP_annual_change
0,2022,Poland,2083854,0,0,0,0,2083854,2083854,,,,,,,,,,,,,,,,,,,,,,,
1,2022,Romania,535461,0,0,0,0,535461,535461,,,,,,,,,,,,,,,,,,,,,,,
2,2022,Republic of Moldova,365197,0,0,0,0,365197,365197,,,,,,,,,,,,,,,,,,,,,,,
3,2022,Hungary,312120,0,0,0,0,312120,312120,,,,,,,,,,,,,,,,,,,,,,,
4,2022,Slovakia,250036,0,0,0,0,250036,250036,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
predictions = model.predict(ref['Refugee Number'])

## PCA
---

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    # stratify=y,
                                                    random_state = 42)

In [25]:
# Instantiate PCA with 20 components.
pca = PCA(n_components=20)

# Fit PCA to training data.
pca.fit(X_train)

PCA(n_components=20)

In [29]:
# Instantiate linear regression model.
lr = LinearRegression()

# Transform Z_train and Z_test.
Z_train = pca.transform(X_train)
Z_test = pca.transform(X_test)

# Fit on Z_train.

lr.fit(Z_train, y_train)

# Score on training and testing sets.
print(f'Training Score: {round(lr.score(Z_train, y_train),4)}')
print(f'Testing Score: {round(lr.score(Z_test, y_test),4)}')
print(f'Train Cross Val; : {round(cross_val_score(lr,Z_train, y_train).mean(),4  )}')
print(f'Test Cross Val; : {round(cross_val_score(lr,Z_test, y_test).mean(),4  )}')

Training Score: 0.1403
Testing Score: 0.089
Train Cross Val; : -0.0071
Test Cross Val; : -0.8063


## Stacking
---

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state=42)

In [None]:
level1_models = [
    ('GRAD', GradientBoostingClassifier()),
    ('ADA', AdaBoostClassifier()),
    ('LR', LinearRegression())
]


stack = StackingClassifier(estimators = level1_models, final_estimator = LinearRegression())


stack.fit(X_train, y_train)
trainscore = stack.score(X_train, y_train)
testscore = stack.score(X_test, y_test)
crossval = cross_val_score(stack, X_train, y_train).mean()
y_pred_train = stack.predict(X_train)
y_pred_test = stack.predict(X_test)
mse_train = metrics.mean_squared_error(y_train, y_pred_train)
mse_test = metrics.mean_absolute_error(y_test, y_pred_test)


print(f'Train: {trainscore}, Test: {testscore}, CV: {crossval}')
print (f'Mean Squared Error - Train = {mse_train}')
print (f'Mean Squared Error - Test = {mse_test}')