# Regression Models
### Looking for numeric predictors of countries that take in refugees
---
Imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.ensemble import (
    AdaBoostRegressor,
    BaggingRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
)
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA

In [4]:
# Raise pandas' display limits so wide/long frames are shown in full
# instead of being truncated in notebook output.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)

Read in Data

In [7]:
# Load the final dataset; the relative path is resolved against the
# current working directory.
data_path = '../../data/data_final.csv'
df = pd.read_csv(data_path)

Set features, X, and y

In [8]:
# Names of the columns of `df` to use as predictors.
# TODO: populate this list; while it is empty, `df[features]` selects no columns.
features = []

In [10]:
# Feature matrix: the columns of `df` listed in `features`.
X = df[features]
# TODO: define the regression target here, e.g. y = df['<target_column>'].
# Cells that reference `y` cannot run until it is set.
#y = df[]

## Regression Pipelines
---

In [None]:
# TODO: enable this split once the target `y` is defined.
# The whole call is commented out as a unit: commenting only its first line
# leaves indented continuation lines behind, and the cell then fails with an
# IndentationError.
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Candidate regressors, keyed by the short name that `modelfunc` looks up.
# Each estimator is created with its default constructor arguments.
_model_classes = (
    ('lr', LinearRegression),
    ('tree', DecisionTreeRegressor),
    ('rand', RandomForestRegressor),
    ('Bag', BaggingRegressor),
    ('ada', AdaBoostRegressor),
)
models = {key: estimator_cls() for key, estimator_cls in _model_classes}

def modelfunc(X, y, model):
    """Fit one of the estimators in `models` and print its evaluation metrics.

    The data is split with `train_test_split(X, y, random_state=42)`, the
    estimator stored under `models[model]` is fitted on the training part,
    and the following are printed: the estimator itself, its `.score` on the
    train and test parts, the mean `cross_val_score` on the training part,
    and the RMSE on the train and test parts.

    Parameters
    ----------
    X : feature data accepted by `train_test_split` and the estimator.
    y : target values aligned with `X`.
    model : key into the module-level `models` dict (e.g. 'lr', 'tree').

    Returns
    -------
    None. Results are printed only.

    Raises
    ------
    KeyError
        If `model` is not a key of `models`.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=42)

    # Note: this is the shared instance stored in `models`, fitted in place.
    estimator = models[model]
    estimator.fit(X_tr, y_tr)

    fitted_preds = estimator.predict(X_tr)
    holdout_preds = estimator.predict(X_te)

    score_tr = estimator.score(X_tr, y_tr)
    score_te = estimator.score(X_te, y_te)
    cv_mean = cross_val_score(estimator, X_tr, y_tr).mean()

    # RMSE is the square root of the mean squared error.
    mse_tr = metrics.mean_squared_error(y_tr, fitted_preds)
    mse_te = metrics.mean_squared_error(y_te, holdout_preds)
    rmse_tr = np.sqrt(mse_tr)
    rmse_te = np.sqrt(mse_te)

    print(f'Model = {estimator}')
    print(f'Train Score = {score_tr}')
    print(f'Test Score = {score_te}')
    print(f'Cross Val Score = {cv_mean}')
    print(f'RMSE Train = {rmse_tr}')
    print(f'RMSE Test = {rmse_te}')

## PCA
---

In [None]:
# TODO: enable this split once the target `y` is defined; the PCA cells that
# follow read X_train / X_test / y_train / y_test.
# The whole call is commented out as a unit: commenting only its first line
# leaves indented continuation lines behind, and the cell then fails with an
# IndentationError.
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Number of principal components to keep.
N_COMPONENTS = 20
pca = PCA(n_components=N_COMPONENTS)

# Learn the components from the training features only; the test features
# are transformed later with this same fitted object.
pca.fit(X_train)

In [None]:
# Instantiate the linear regression model.
# (LogisticRegression is a classifier and is not imported in this notebook;
# LinearRegression is the imported estimator that matches the regression task.)
lr = LinearRegression()

# Project the train and test features onto the fitted principal components.
Z_train = pca.transform(X_train)
Z_test = pca.transform(X_test)

# Fit on the PCA-transformed training data.
lr.fit(Z_train, y_train)

# Report the estimator's score on the training and testing sets, followed by
# the mean cross-validated score computed on each set separately.
print(f'Training Score: {round(lr.score(Z_train, y_train),4)}')
print(f'Testing Score: {round(lr.score(Z_test, y_test),4)}')
print(f'Train Cross Val; : {round(cross_val_score(lr,Z_train, y_train).mean(),4  )}')
print(f'Test Cross Val; : {round(cross_val_score(lr,Z_test, y_test).mean(),4  )}')

## Stacking
---

In [None]:
# TODO: enable this split once the target `y` is defined.
# NOTE(review): `stratify=y` is meant for class labels; it is omitted here on
# the assumption that the regression target is continuous — confirm.
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Base (level-1) regressors; their predictions become the inputs of the
# final estimator. Regressor variants are used because this notebook fits
# regression models and the final estimator is a LinearRegression.
level1_models = [
    ('GRAD', GradientBoostingRegressor()),
    ('ADA', AdaBoostRegressor()),
    ('LR', LinearRegression()),
]

# Stack the base regressors under a linear-regression meta-model.
stack = StackingRegressor(estimators=level1_models, final_estimator=LinearRegression())

# Fit, then collect the estimator's score on each split and the mean
# cross-validated score on the training split.
stack.fit(X_train, y_train)
trainscore = stack.score(X_train, y_train)
testscore = stack.score(X_test, y_test)
crossval = cross_val_score(stack, X_train, y_train).mean()

# Mean squared error on both splits. The test value previously used
# mean_absolute_error while being reported as "Mean Squared Error".
y_pred_train = stack.predict(X_train)
y_pred_test = stack.predict(X_test)
mse_train = metrics.mean_squared_error(y_train, y_pred_train)
mse_test = metrics.mean_squared_error(y_test, y_pred_test)

print(f'Train: {trainscore}, Test: {testscore}, CV: {crossval}')
print(f'Mean Squared Error - Train = {mse_train}')
print(f'Mean Squared Error - Test = {mse_test}')