## Import Libs & Data

In [6]:
import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC
from sklearn.compose import make_column_transformer
from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier

In [2]:
# Load the interim baseline dataset; the first CSV column becomes the index
baseline_csv = '../data/interim/model_baseline/baseline_hispanic_blk.csv'
df = pd.read_csv(baseline_csv, index_col=0)

## Original DF Transformations

In [3]:
# Build the feature matrix X and the target y from the raw frame, then split.

# Target: the EOWN_ST column
y = df.EOWN_ST

# Features: drop every column that contains a null, then remove the target
X = (df
     .dropna(axis='columns')
     .drop('EOWN_ST', axis='columns')
    )

# Cast the columns whose names start with "E" to the category dtype.
# astype with a column->dtype mapping returns a new frame instead of
# assigning a whole sub-frame back into X.
e_cols = X.filter(regex='^E+.*').columns
X = X.astype({col: 'category' for col in e_cols})

# Drop recode columns (names starting with "R")
drop_cols = X.filter(regex='^R+.*').columns
X = X.drop(drop_cols, axis='columns')

# Rearrange columns order, group by type: ints, then floats, then categoricals.
# Columns of any other dtype are not part of cols_order and are left out of X.
cat_cols = list(X.select_dtypes('category').columns)
int_cols = list(X.select_dtypes(int).columns)
flt_cols = list(X.select_dtypes(float).columns)
num_cols = int_cols + flt_cols
cols_order = num_cols + cat_cols
X = X[cols_order]

# Train/Test Split (80/20).
# Seeded so that a fresh "Restart & Run All" reproduces the same partition;
# 23 matches the seed already used for SMOTENC in the resampling cell.
# NOTE(review): consider stratify=y if the target classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=.8, random_state=23)

In [4]:
# Sanity-check the feature frame and the train/test split built in the
# previous cell.
# This cell used to repeat the previous cell line for line; its only effect
# was to draw a second random split that silently replaced the first one.

# Every row of X landed in exactly one of the two partitions
assert len(X_train) + len(X_test) == len(X)

# Features and targets are still aligned row-for-row
assert X_train.index.equals(y_train.index)
assert X_test.index.equals(y_test.index)

# Column layout is numeric columns first, categorical columns last
assert list(X.columns) == num_cols + cat_cols

(X_train.shape, X_test.shape)

## Feature Transformation

In [7]:
# One-hot encode the categorical columns and standardise the numeric ones.
# handle_unknown='ignore': a category that is absent from the training split
# is encoded as all zeros at transform time instead of raising, so the fitted
# transformer can also be applied to held-out data. Fit-time output is the
# same as with the default setting.
ohe = OneHotEncoder(handle_unknown='ignore')
ss = StandardScaler()

# The output columns follow the order of the transformers listed here:
# one-hot columns first, then the scaled numeric columns.
col_xformer = make_column_transformer(
    (ohe, cat_cols),
    (ss, num_cols),
    remainder='passthrough')

## Resampling

In [8]:
# Learn the encoder categories and the scaler statistics from the training
# split only, then replace X_train with its transformed version.
# NOTE(review): X_train is rebound here, so re-running this cell without
# re-running the split feeds the already-transformed matrix back in.
col_xformer.fit(X_train)
X_train = col_xformer.transform(X_train)

In [12]:
# Locate the categorical columns inside the transformed training matrix.
# col_xformer lists the OneHotEncoder first, so the one-hot columns occupy the
# leading positions of X_train and the scaled numeric columns follow them.
# The count is read from the fitted encoder rather than hard-coded, because it
# depends on how many distinct categories the training split contains.
fitted_ohe = col_xformer.named_transformers_['onehotencoder']
n_ohe_cols = sum(len(categories) for categories in fitted_ohe.categories_)
cat_cols_index = np.arange(n_ohe_cols)

# Resample with SMOTE+ENN.
# SMOTENC does not interpolate the columns listed in categorical_features, so
# the one-hot columns stay 0/1 in the synthetic rows.
# NOTE(review): each one-hot column is treated as its own two-level feature
# here; resampling before one-hot encoding would keep every original
# categorical variable as a single feature — confirm which is intended.
smote_enn = SMOTEENN(smote=SMOTENC(random_state=23, categorical_features=cat_cols_index))
X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train, y_train)

## XGBoost Model Fitting

In [19]:
# Hyperparameters for the classifier, kept in a single mapping
params = dict(
    verbosity=0,
    subsample=0.9,
    min_child_weight=1,
    max_depth=9,
    gamma=0.0,
    colsample_bytree=0.8,
)

# Build the classifier from the mapping above
model = XGBClassifier(**params)

# Train on the resampled training data; the fitted estimator is the cell output
model.fit(X_train_resampled, y_train_resampled)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0.0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=9,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.9,
              tree_method='exact', validate_parameters=1, verbosity=0)

In [20]:
# Persist the fitted booster in XGBoost's own JSON format
model_json_path = '../models/model.json'
model.save_model(model_json_path)

## Export to Pickle

In [26]:
import pickle

In [30]:
# Serialise the fitted model object with pickle.
filename = '../models/model_xgb_arthur.sav'

# The context manager flushes and closes the file handle even if pickling
# raises; a bare open() passed straight to pickle.dump is never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)