In [161]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTENC
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.under_sampling import TomekLinks
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score, accuracy_score

In [162]:
df = pd.read_csv('../data/interim/model_baseline/baseline_hispanic_blk.csv', index_col=0)

In [163]:
# Split df into X and y and drop columns with nulls
y = df.EOWN_ST
X = (df
     .dropna(axis='columns')
     .drop('EOWN_ST', axis='columns')
    )

# Change cat features to cat dtype
X[X.filter(regex='^E+.*').columns] = (X
                                      .filter(regex='^E+.*')
                                      .astype('category')
                                     )

# Drop recode columns
drop_cols = X.filter(regex='^R+.*').columns
X = X.drop(drop_cols, axis='columns')

# Rearrange columns order, group by type
cat_cols = list(X.select_dtypes('category').columns)
int_cols = list(X.select_dtypes(int).columns)
flt_cols = list(X.select_dtypes(float).columns)
num_cols = int_cols + flt_cols
cols_order = num_cols + cat_cols
X = X[cols_order]

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8)

In [164]:
ohe = OneHotEncoder()
ss = StandardScaler()

col_xformer = make_column_transformer(
    (ohe, cat_cols),
    (ss, num_cols),
    remainder='passthrough')

In [165]:
X_train = col_xformer.fit_transform(X_train)

In [166]:
# Resample with SMOTE
cat_cols_index = np.arange(18, 35, 1)
smt = SMOTENC(random_state=23, categorical_features=cat_cols_index)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# Resample with SMOTE+Tomek
smt_tomek = SMOTETomek(smote=SMOTENC(random_state=23, categorical_features=cat_cols_index),
                       tomek=TomekLinks(sampling_strategy='majority')
                      )
X_smt_tomek, y_smt_tomek = smt_tomek.fit_resample(X_train, y_train)

# Resample with SMOTE+ENN
smt_enn = SMOTEENN(smote=SMOTENC(random_state=23, categorical_features=cat_cols_index))
X_smt_enn, y_smt_enn = smt_enn.fit_resample(X_train, y_train)


## Logistic Regression

In [167]:
log_reg = LogisticRegression(max_iter=1000)

# cross-validation smote/log reg
smt_scores = cross_val_score(log_reg, X_smote, y_smote, cv=10)
smt_roc = cross_val_score(log_reg, X_smote, y_smote, cv=10, scoring='roc_auc')
print(f'Smote & Logistic Regression: \n---Accuracy: {smt_scores.mean()}\n---ROC/AUC: {smt_roc.mean()}\n')

# cross-validation smote tomek/log reg
smt_tomek_scores = cross_val_score(log_reg, X_smt_tomek, y_smt_tomek, cv=10)
smt_tomek_roc = cross_val_score(log_reg, X_smt_tomek, y_smt_tomek, cv=10, scoring='roc_auc')
print(f'Smote + Tomek & Logistic Regression: \n---Accuracy: {smt_tomek_scores.mean()} \n---ROC/AUC: {smt_tomek_roc.mean()}\n')

# cross-validation smote enn/log reg
smt_enn_scores = cross_val_score(log_reg, X_smt_enn, y_smt_enn, cv=10)
smt_enn_roc = cross_val_score(log_reg, X_smt_enn, y_smt_enn, cv=10, scoring='roc_auc')
print(f'Smote + ENN & Logistic Regression: \n---Accuracy: {smt_enn_scores.mean()} \n---ROC/AUC: {smt_enn_roc.mean()}\n')

Smote & Logistic Regression: 
---Accuracy: 0.865704442228869
---ROC/AUC: 0.940229727616934

Smote + Tomek & Logistic Regression: 
---Accuracy: 0.8663788905053622 
---ROC/AUC: 0.940708427011127

Smote + ENN & Logistic Regression: 
---Accuracy: 0.8981940144478845 
---ROC/AUC: 0.9632758273869306



In [168]:
smt_enn_logreg = log_reg.fit(X_smt_enn, y_smt_enn)

In [169]:
smt_enn_logreg.score(col_xformer.fit_transform(X_test), y_test)

0.8237436270939549

## SVM

In [133]:
svm = SVC()

# cross-validation smote/svm
smt_svm_scores = cross_val_score(svm, X_smote, y_smote, cv=10)
smt_svm_roc = cross_val_score(svm, X_smote, y_smote, cv=10, scoring='roc_auc')
print('SMOTE & SVM:')
print(f'Accuracy: {smt_svm_scores.mean()}')
print(f'ROC/AUC: {smt_svm_roc.mean()}\n')

# cross-validation smote + tomek/svm
smttomek_svm_scores = cross_val_score(svm, X_smt_tomek, y_smt_tomek, cv=10)
smttomek_svm_roc = cross_val_score(svm, X_smt_tomek, y_smt_tomek, cv=10, scoring='roc_auc')
print('SMOTE + Tomek & SVM:')
print(f'Accuracy: {smttomek_svm_scores.mean()}')
print(f'ROC/AUC: {smttomek_svm_roc.mean()}\n')

# cross-validation smote + enn/svm
smtenn_svm_scores = cross_val_score(svm, X_smt_enn, y_smt_enn, cv=10)
smtenn_svm_roc = cross_val_score(svm, X_smt_enn, y_smt_enn, cv=10, scoring='roc_auc')
print('SMOTE + ENN & SVM:')
print(f'Accuracy: {smtenn_svm_scores.mean()}')
print(f'ROC/AUC: {smtenn_svm_roc.mean()}\n')

SMOTE & SVM:
Accuracy: 0.9087807325639738
ROC/AUC: 0.9663922490362653

SMOTE + Tomek & SVM:
Accuracy: 0.9094790765271028
ROC/AUC: 0.9666761821600465

SMOTE + ENN & SVM:
Accuracy: 0.9345324988450516
ROC/AUC: 0.9829660433825607



In [171]:
smt_enn_svm = svm.fit(X_train, y_train)
print(smt_enn_svm.score(col_xformer.fit_transform(X_test), y_test))
svm_predictions = smt_enn_svm.predict(col_xformer.fit_transform(X_test))
print(roc_auc_score(y_test, svm_predictions))

0.9690458849235252
0.5


## XG Boost

In [141]:
from xgboost import XGBClassifier

In [142]:
xgb_scores = cross_val_score(XGBClassifier(), X_smt_enn, y_smt_enn, cv=10)
xgb_roc = cross_val_score(XGBClassifier(), X_smt_enn, y_smt_enn, cv=10, scoring='roc_auc')
xgb_scores.mean(), xgb_roc.mean()

















































































(0.9841046194277551, 0.9982528824559329)

In [151]:
# Hyperparameter tuning for XGBoost Using Grid/random search

scoring = {'AUC':'roc_auc', 'Accuracy':make_scorer(accuracy_score)}
params = {
    'min_child_weight': range(1, 6, 1),
    'max_depth': range(3, 11, 1),
    'gamma': [i/10.0 for i in range(0,5)],
    'subsample': [i/10 for i in range(5, 11, 1)],
    'colsample_bytree': [i/10 for i in range(5, 11, 1)],
    'verbosity': [0],
}

rscv = RandomizedSearchCV(XGBClassifier(), param_distributions=params, cv=10, scoring='roc_auc', n_iter=50)
search_results = rscv.fit(X_smt_enn, y_smt_enn)
search_results.best_score_



0.9987967004354481

In [152]:
search_results.best_params_

{'verbosity': 0,
 'subsample': 0.9,
 'min_child_weight': 1,
 'max_depth': 9,
 'gamma': 0.0,
 'colsample_bytree': 0.8}

In [154]:
final_xgb_scores = cross_val_score(XGBClassifier(subsample=0.9, min_child_weight=1,
                                                max_depth=9, gamma=0, colsample_bytree=0.8),
                                   X_smt_enn, y_smt_enn, cv=10, verbose=0
                                  )
final_xgb_roc = cross_val_score(XGBClassifier(subsample=0.9, min_child_weight=1,
                                                max_depth=9, gamma=0, colsample_bytree=0.8),
                                              X_smt_enn, y_smt_enn, cv=10, scoring='roc_auc', verbose=0)
final_xgb_scores.mean(), final_xgb_roc.mean()



(0.9858707152634807, 0.9987967004354481)

Process:
import
drop nulls
split into features/labels
drop recore features
set datatypes
rearrange columns by dtype

transform
split test/train
normalize
one-hot encode

Resample

cross-validate



In [158]:
predicted_y = search_results.predict(col_xformer.fit_transform(X_test))

In [159]:
from sklearn.metrics import roc_auc_score, accuracy_score
roc_auc_score(y_test, predicted_y)

0.6685860220170681

In [160]:
accuracy_score(y_test, predicted_y)

0.9242424242424242