In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler # data normalization
from sklearn.preprocessing import LabelEncoder # label encoding
from sklearn.model_selection import train_test_split # data split
from xgboost import XGBClassifier # XGBoost algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt # visualization
import itertools # advanced tools

import os
import random
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix # evaluation metric
from sklearn.metrics import accuracy_score # evaluation metric
from sklearn.metrics import f1_score # evaluation metric
from sklearn.metrics import precision_score # evaluation metric
from sklearn.metrics import recall_score # evaluation metric
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE 
from sklearn.decomposition import PCA, FastICA
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

In [2]:
os.chdir(r"C:\Users\Wilmer\Desktop\marlette-ds-challenge")

# Read Data

In [3]:
df = pd.read_csv("training_data.csv/training_data.csv")

# Get an idea of what the data looks like

In [4]:
df.head()

Unnamed: 0,ID,var1,var2,var3,var4,var5,var6,var7,var8,var9,...,var189,var190,var191,var192,cat1,cat2,cat3,cat4,cat5,target
0,44686,86.52893,80.79771,75.25887,74.02016,69.01476,65.61648,63.23896,59.07834,56.80397,...,85.133333,84.45,85.2,85.9,S,H,C,B,C,0
1,44687,68.56225,72.05599,69.52573,68.79211,65.48515,63.00976,61.19186,57.85757,55.94791,...,90.533333,86.55,87.24,87.3,S,I,C,B,C,0
2,44688,77.88821,76.6227,73.11046,72.20956,68.26166,65.34046,63.19467,59.25676,57.01834,...,93.933333,90.2,89.84,88.6,S,I,C,B,C,0
3,44689,81.11949,78.43038,74.59578,73.63714,69.4554,66.35951,64.07976,59.88543,57.50303,...,93.2,88.15,88.48,87.766667,S,I,C,B,C,0
4,44690,62.18698,68.60618,67.86709,67.44987,65.15601,63.13671,61.52867,58.35072,56.4246,...,92.733333,88.15,88.0,88.566667,S,I,C,B,C,0


# Duplicates?

In [5]:
df = df.drop_duplicates()

We see that there are no duplicates in this dataset. 

# Null values?


In [6]:
df.isnull().sum().max()

0

No Null values either. 

# Encode Labels (cat 1- cat5)

In order to use our categorical variables, we need to encode them into numeric values. We can use the Label Encoder library from sklearn.

In [7]:
# Encode labels to numeric
labelencoder = LabelEncoder()
df["cat1"] = labelencoder.fit_transform(df["cat1"])
df["cat2"] = labelencoder.fit_transform(df["cat2"])
df["cat3"] = labelencoder.fit_transform(df["cat3"])
df["cat4"] = labelencoder.fit_transform(df["cat4"])
df["cat5"] = labelencoder.fit_transform(df["cat5"])

# Create holdout set to prevent data leakage

In order to make sure that our datasets are equally balanced, we can split our training and test dataset by stratifying on our target variable.  

In [8]:
trainDF, testDF = train_test_split(df, test_size=0.2, random_state=1234, stratify=df[["target"]])
tr_value_counts = trainDF["target"].value_counts()
print("Fraudulent transactions are %.2f%% of the training set." % (tr_value_counts[1] * 100 / len(trainDF)))
tst_value_counts = testDF["target"].value_counts()
print("Fraudulent transactions are %.2f%% of the test set." % (tst_value_counts[1] * 100 / len(testDF)))

Fraudulent transactions are 0.94% of the training set.
Fraudulent transactions are 0.95% of the test set.


# Create Test and Train sets

In [9]:
X_train = trainDF.iloc[:, trainDF.columns != "target"]
y_train = trainDF.iloc[:, trainDF.columns == "target"]
X_test = testDF.iloc[:, testDF.columns != "target"]
y_test = testDF.iloc[:, testDF.columns == "target"]
X_train.head()

Unnamed: 0,ID,var1,var2,var3,var4,var5,var6,var7,var8,var9,...,var188,var189,var190,var191,var192,cat1,cat2,cat3,cat4,cat5
8765,64732,63.524,62.02852,60.63958,60.24064,58.16411,56.24291,54.64911,51.62154,50.10868,...,146.0,140.4,138.95,151.24,141.0,7,7,1,1,1
3020,50821,75.19215,66.57365,61.85938,61.01287,58.09891,56.51396,55.57799,54.20943,53.46899,...,14.8,13.2,12.7,14.52,17.533333,11,4,1,1,1
3291,52027,80.86206,68.5162,65.04445,64.48007,62.36521,60.72348,59.2497,55.63899,53.26023,...,84.5,86.333333,82.6,82.76,83.333333,6,6,1,1,1
3212,51948,18.49919,25.97824,29.54745,30.22504,32.75381,34.41779,35.66354,38.34208,40.20942,...,117.3,113.933333,112.1,111.88,115.6,6,0,1,0,0
247,44933,36.65,29.32237,29.21855,29.41497,30.90176,32.66956,34.40505,38.68626,41.54717,...,82.2,76.333333,71.25,73.76,75.166667,17,0,2,1,0


# SMOTE

We use SMOTE to over sample our dataset in order to fix the problems that come with an extrememly imbalanced dataset.

In [10]:
X_train_smote, y_train_smote = SMOTE(random_state=1234).fit_resample(X_train, y_train)
smote_value_counts = y_train_smote["target"].value_counts()
print("Fraudulent transactions are %.2f%% of the test set." % (smote_value_counts[0] * 100 / len(y_train_smote)))

Fraudulent transactions are 50.00% of the test set.


We see here that after we over sample, we have half of our data as our target variable.

# Logistic Regression Training

First we fit our logistic regression model and we find the best performing parameters using grid search. 

In [11]:
def lr_search(X, y, search_verbose=1):
    space = dict()
    space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
    space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
    space['C'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
    lr = LogisticRegression()
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1234)
    grid_search = GridSearchCV(estimator=lr, param_grid=space, scoring="roc_auc", n_jobs=-1, cv=skf.split(X,y), verbose=search_verbose)
    grid_search.fit(X, y)
    print("Best estimator: ")
    print(grid_search.best_estimator_)
    print("Parameters: ", grid_search.best_params_)
    print("Highest AUC: %.2f" % grid_search.best_score_)
    return grid_search.best_params_

In [12]:
rows = random.sample(np.arange(0,len(X_train_smote.index)).tolist(), 10000)
model_params =  lr_search(X_train_smote.iloc[rows,], y_train_smote.iloc[rows,])

Fitting 3 folds for each of 96 candidates, totalling 288 fits
Best estimator: 
LogisticRegression(C=100, solver='newton-cg')
Parameters:  {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
Highest AUC: 0.96


In [13]:
lr_model = LogisticRegression()
lr_model.set_params(**model_params)
lr_model.fit(X_train_smote, y_train_smote)

LogisticRegression(C=100, solver='newton-cg')

# CatBoost Training

In [14]:

def cb_search(X, y, search_verbose=0):
    params = {
        "max_depth":[3,4,5,6],
        "learning_rate": [1, 0.1, 0.01, 0.001]
    }
    cb = CatBoostClassifier(eval_metric="AUC")
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1234)
    grid_search = GridSearchCV(estimator=cb, param_grid=params, scoring="roc_auc", n_jobs=-1, cv=skf.split(X,y), verbose=False)
    grid_search.fit(X, y, verbose = False)
    print("Best estimator: ")
    print(grid_search.best_estimator_)
    print("Parameters: ", grid_search.best_params_)
    print("Highest AUC: %.2f" % grid_search.best_score_)
    return grid_search.best_params_

In [15]:
rows = random.sample(np.arange(0,len(X_train_smote.index)).tolist(), 10000)
model_params = cb_search(X_train_smote.iloc[rows,], y_train_smote.iloc[rows,])

Best estimator: 
<catboost.core.CatBoostClassifier object at 0x00000227069E5AF0>
Parameters:  {'learning_rate': 0.1, 'max_depth': 6}
Highest AUC: 1.00


In [16]:
cb_model = CatBoostClassifier(eval_metric="AUC")
cb_model.set_params(**model_params)
cb_model.fit(X_train_smote, y_train_smote, verbose = False)

<catboost.core.CatBoostClassifier at 0x2270b562130>

# XG Boost Training

In [17]:
def xgboost_search(X, y, search_verbose=1):
    params = {
        "gamma":[0.5, 1, 2, 5],
        "max_depth":[3,4,5,6],
        "min_child_weight": [100],
        "subsample": [0.6, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0],
        "learning_rate": [1, 0.1, 0.01, 0.001]
    }
    xgb = XGBClassifier(objective="binary:logistic", eval_metric="auc", use_label_encoder=False)
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1234)
    grid_search = GridSearchCV(estimator=xgb, param_grid=params, scoring="roc_auc", n_jobs=-1, cv=skf.split(X,y), verbose=search_verbose)
    grid_search.fit(X, y)
    print("Best estimator: ")
    print(grid_search.best_estimator_)
    print("Parameters: ", grid_search.best_params_)
    print("Highest AUC: %.2f" % grid_search.best_score_)
    return grid_search.best_params_

We use grid search to find the best paramters for our model

In [18]:
rows = random.sample(np.arange(0,len(X_train_smote.index)).tolist(), 10000)
model_params = xgboost_search(X_train_smote.iloc[rows,], y_train_smote.iloc[rows,])

Fitting 3 folds for each of 384 candidates, totalling 1152 fits
Best estimator: 
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6,
              enable_categorical=False, eval_metric='auc', gamma=0.5, gpu_id=-1,
              importance_type=None, interaction_constraints='', learning_rate=1,
              max_delta_step=0, max_depth=3, min_child_weight=100, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1.0,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)
Parameters:  {'colsample_bytree': 0.6, 'gamma': 0.5, 'learning_rate': 1, 'max_depth': 3, 'min_child_weight': 100, 'subsample': 1.0}
Highest AUC: 0.99


In [19]:
model = XGBClassifier(objective="binary:logistic", eval_metric="auc", use_label_encoder=False)
model.set_params(**model_params)
model.fit(X_train_smote, y_train_smote)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6,
              enable_categorical=False, eval_metric='auc', gamma=0.5, gpu_id=-1,
              importance_type=None, interaction_constraints='', learning_rate=1,
              max_delta_step=0, max_depth=3, min_child_weight=100, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1.0,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

# Results

### Logistic Regression AUC

In [22]:
y_pred = lr_model.predict_proba(X_test)[:,1]
fp_r, tp_r, t = metrics.roc_curve(y_test, y_pred)
auc = metrics.auc(fp_r, tp_r)
print("AUC:",auc)

AUC: 0.9114772667404245


### XG Boost AUC

In [23]:
y_pred = model.predict_proba(X_test)[:,1]
fp_r, tp_r, t = metrics.roc_curve(y_test, y_pred)
auc = metrics.auc(fp_r, tp_r)
print("AUC:",auc)

AUC: 0.9235419630156472


### CatBoost AUC

In [24]:
y_pred = cb_model.predict_proba(X_test)[:,1]
fp_r, tp_r, t = metrics.roc_curve(y_test, y_pred)
auc = metrics.auc(fp_r, tp_r)
print("AUC:",auc)

AUC: 0.9235551340814498


If we look at all of our models, we see that CatBoost performs the best. 