# Fraud Detection using XGBoost

In this notebook we will study Fraud Detection using a special decision tree: XGBoost, a gradient boosted tree.

The implementation is very easy and we'll see that it's waaay better than an autoencoder for this case.

In [3]:
import numpy as np
import pandas as pd
import xgboost

DATASET_PATH = ''

trainID = pd.read_csv(DATASET_PATH+'/train_identity.csv', encoding='utf-8')
trainID.head()

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


In [4]:
trainTR = pd.read_csv(DATASET_PATH+'/train_transaction.csv', encoding='utf-8')
FraudData = trainTR[trainTR['isFraud'] == 1]
trainTR = trainTR.iloc[:len(trainTR)//5,:]
trainTR = trainTR.set_index('TransactionID').combine_first(FraudData.set_index('TransactionID')).reset_index()
trainTR.head()
del FraudData

In [5]:
def make_hour_feature(f):
    #Creates an hour of the day feature, encoded as 0-23.  
    hours = f / (3600)        
    encoded_hours = np.floor(hours) % 24
    return encoded_hours

def preprocessData(dataTR, dataID, training=True):
    train = pd.merge(dataTR, dataID, on='TransactionID', how='left')
    del dataTR; del dataID
    
    train['hour'] = make_hour_feature(train['TransactionDT'])

    exclude = ['TransactionID', 'TransactionDT', 'isFraud']
        
    cat_features = [f for f in train.columns if((f not in exclude) and (train[f].dtype=='object'))]
    print(cat_features)
    num_features = [f for f in train.columns if (f not in cat_features) & (f not in exclude)]
    
    # fill numeric NAs with median
    for tab in num_features:
        train.loc[train[tab].isnull(), tab] = train[tab].median()

    # fill categorical NAs with "m" for missing and "nm" for not missing
    for tab in cat_features:
        train.loc[train[tab].isnull(), tab] = "m"
    
    if(training): train.drop(columns=['TransactionID', 'TransactionDT'], axis=1, inplace=True)
    else: train.drop(columns=['TransactionDT'], axis=1, inplace=True)
    
    return train, num_features, cat_features

In [6]:
train, num_features, cat_features = preprocessData(trainTR, trainID)
FraudData = train[train['isFraud'] == 1]
del trainTR; del trainID

['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15', 'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']


In [7]:
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(train[num_features+list(cat_features)], train['isFraud'], test_size=0.1)
fraudData_X = FraudData[num_features+list(cat_features)]
fraudData_Y = FraudData['isFraud']
print(len(train_X), 'train examples')
print(len(val_X), 'validation examples')
print(len(fraudData_X), 'fraud data')

del train; del FraudData

122350 train examples
13595 validation examples
20663 fraud data


*Note: Decisions trees don't need normalized inputs in order to work well so we skip this part*

In [8]:
from sklearn.preprocessing import LabelEncoder

for f in train_X.columns:
    if train_X[f].dtype=='object' or val_X[f].dtype=='object': 
        lbl = LabelEncoder()
        lbl.fit(list(train_X[f].values) + list(val_X[f].values))
        train_X[f] = lbl.transform(list(train_X[f].values))
        val_X[f] = lbl.transform(list(val_X[f].values))
        fraudData_X[f] = lbl.transform(list(fraudData_X[f].values))

del num_features; del cat_features

print(train_X.shape, val_X.shape, fraudData_X.shape)

(122350, 432) (13595, 432) (20663, 432)


In [9]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

clf = xgboost.XGBClassifier(
        n_estimators=500,
        max_depth=9,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        missing=-999,
        random_state=2019,
        tree_method='gpu_hist',
        n_jobs = -1)

clf.fit(train_X, train_y)
y_pred_train = clf.predict_proba(val_X)[:,1]
y_pred_train_flat = clf.predict(val_X)
y_pred_fraud = clf.predict(fraudData_X)
print('Accuracy {}'.format(accuracy_score(val_y.values, y_pred_train_flat)))
print('ROC AUC {}'.format(roc_auc_score(val_y.values, y_pred_train)))
print('Fraud Accuracy {}'.format(accuracy_score(fraudData_Y.values, y_pred_fraud)))

Accuracy 0.9693269584406031
ROC AUC 0.983770821311129
Fraud Accuracy 0.893142331704012


The results are very good !

We now have 89% True negative and an accuracy of 97%.