# XGBClassifier

In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer

from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from imblearn.over_sampling import SMOTE

import re
import gc
import joblib
import warnings
warnings.filterwarnings("ignore")

In [2]:
def read_and_prepare_data(use_sample_data = True, max_empty_allowed=0.5):
    if use_sample_data:
        print ('Using sample data')
        df = pd.read_csv('testdata/hdcr10k.csv')
    else:
        print ('Using FULL data')
        # df = pd.read_csv('../data/cleaned/hcdr_FULL.csv',nrows=10000)
        df = pd.read_csv('../data/cleaned/hcdr_FULL.csv')

    df = df.drop(columns=['SK_ID_CURR'])

    df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
    df = df[~df.TARGET.isna()]
    df.TARGET.value_counts(dropna=False)

    num_rows = df.shape[0]

    # ELIMINATE COLUMNS IF PERCENTAGE EMPTY OVER n%
    lst_cols_to_delete = []
    num_rows = df.shape[0]
    num_columns = df.shape[1]
    for column in df.columns:
        if df[column].isna().sum()/num_rows > max_empty_allowed:
            lst_cols_to_delete.append(column)
    num_cols_to_drop = len(lst_cols_to_delete)
    num_cols_to_keep = num_columns - num_cols_to_drop
    print ('COLS TO DROP:',num_cols_to_drop)
    print ('COLS TO KEEP:',num_cols_to_keep)
    df = df.drop(columns=lst_cols_to_delete)

    lst_empty_cols = []
    for column in df.columns:
        if df[column].isnull().sum() == num_rows:
            lst_empty_cols.append(column)
    df = df.drop(columns = lst_empty_cols)

    if df.isna().sum().sum() > 0:
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        imputer = SimpleImputer(missing_values=np.nan, strategy = 'median')
        df=pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)

        # Scale the features
        scaler = MinMaxScaler(feature_range = (0, 1))
        #df=pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

    return df
    
def get_Xy(df,target_column):
    X= df.drop(target_column,axis=1)
    y= df[target_column]
    return X, y

## With all columns

In [3]:
df = read_and_prepare_data(use_sample_data = True, max_empty_allowed=0.5)
X, y = get_Xy(df,'TARGET')
df.shape

Using sample data
COLS TO DROP: 560
COLS TO KEEP: 206


(10000, 206)

In [8]:
import pickle 

with open('top_features_shap.pkl', 'rb') as fp:
    top_features_shap = pickle.load(fp)

df = read_and_prepare_data(use_sample_data = True, max_empty_allowed=0.5)
df.shape

Using sample data
COLS TO DROP: 560
COLS TO KEEP: 206


(10000, 206)

In [9]:
model = XGBClassifier()
X, y = get_Xy(df,'TARGET')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1234)
params = {}
folds = KFold(n_splits= 5, shuffle=True, random_state=1001)
grid = GridSearchCV(model, params, scoring='roc_auc', cv=folds, verbose=3)
grid.fit(X_train, y_train)
best_model = grid.best_estimator_
best_params = grid.best_params_
grid_score = grid.best_score_
best_model.fit(X_train, y_train)
y_predict = best_model.predict(X_test)
auc_score = roc_auc_score(y_test,y_predict)
print('AUC:',auc_score)
print(pd.crosstab(pd.Series(y_predict, name='Predicted'), pd.Series(y_test, name='Actual')))


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ..................................., score=0.749 total time=   2.1s
[CV 2/5] END ..................................., score=0.705 total time=   1.7s
[CV 3/5] END ..................................., score=0.717 total time=   2.0s
[CV 4/5] END ..................................., score=0.692 total time=   1.6s
[CV 5/5] END ..................................., score=0.684 total time=   1.6s
AUC: 0.5224297504250984
Actual     0.0  1.0
Predicted          
0          841   65
1           10    2


## With only top 30 features from SHAP

In [10]:
df = read_and_prepare_data(use_sample_data = True, max_empty_allowed=0.5)
df = df[top_features_shap]
print (df.shape)

Using sample data
COLS TO DROP: 560
COLS TO KEEP: 206
(10000, 51)


In [11]:
model = XGBClassifier()
X, y = get_Xy(df,'TARGET')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1234)
params = {}
folds = KFold(n_splits= 5, shuffle=True, random_state=1001)
grid = GridSearchCV(model, params, scoring='roc_auc', cv=folds, verbose=3)
grid.fit(X_train, y_train)
best_model = grid.best_estimator_
best_params = grid.best_params_
grid_score = grid.best_score_
best_model.fit(X_train, y_train)
y_predict = best_model.predict(X_test)
auc_score = roc_auc_score(y_test,y_predict)
print('AUC:',auc_score)
print(pd.crosstab(pd.Series(y_predict, name='Predicted'), pd.Series(y_test, name='Actual')))


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ..................................., score=0.754 total time=   0.7s
[CV 2/5] END ..................................., score=0.714 total time=   0.6s
[CV 3/5] END ..................................., score=0.695 total time=   0.6s
[CV 4/5] END ..................................., score=0.714 total time=   0.6s
[CV 5/5] END ..................................., score=0.697 total time=   0.6s
AUC: 0.5153382112616794
Actual     0.0  1.0
Predicted          
0          840   66
1           11    1
