# Importing the necessary libraries

In [234]:
import pandas as pd
import numpy as np
np.random.seed(0)

import xgboost as xgb

from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix

from scipy.stats.stats import pearsonr   
from scipy.stats import uniform, randint

from tqdm import tqdm


import warnings
warnings.filterwarnings("ignore")

# Import the data

In [2]:
full_df = pd.read_parquet("/Users/omar/Desktop/dataset/fraud_dataset.parquet")
full_df

Unnamed: 0,acquirerid,agriculture_list,amount,bankid,batchamount,card,construction_list,cup,datetime,emv,...,rollingsum_card_24h,rollingsum_card_merchant_24h,rollingsum_merchant_24h,rollingsum_merchant_2h,rollingsum_merchant_900s,size,super_market_list,terminal,travel_list,trc
0,9,0,106.57,6,106.57,250,0,0,2016-01-01 00:03:42,0,...,106.57,106.57,106.57,106.57,106.57,-2,0,507,0,0
1,6,0,45.35,4,45.35,7389,0,0,2016-01-01 00:10:42,0,...,45.35,45.35,45.35,45.35,45.35,0,0,688,0,0
2,7,0,145.26,102,145.26,1615,1,0,2016-01-01 00:10:42,0,...,145.26,145.26,145.26,145.26,145.26,-1,0,100,0,1
3,3,0,803.08,2,803.08,5896,0,0,2016-01-01 00:11:30,1,...,803.08,803.08,803.08,803.08,803.08,2,0,126,0,0
4,9,0,1329.56,100,1329.56,5245,0,0,2016-01-01 00:13:18,1,...,1329.56,1329.56,1329.56,1329.56,1329.56,2,0,309,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5407393,5,0,456.78,102,3484.75,1185,0,0,2017-01-31 23:59:06,0,...,626.39,456.78,37631.35,3484.75,917.71,-1,0,585,0,0
5407394,7,0,69.66,10,17549.58,4196,0,0,2017-01-31 23:59:24,0,...,213.78,69.66,179499.72,17549.58,69.66,1,0,562,0,0
5407395,6,0,57.32,7,10827.32,6883,0,0,2017-01-31 23:59:36,1,...,939.82,57.32,122672.32,10827.32,57.32,2,0,490,1,0
5407396,2,0,371.20,6,11396.82,8963,0,0,2017-01-31 23:59:48,0,...,797.39,371.20,137895.18,11396.82,972.36,1,0,692,0,0


## Reducing the memory usage

In [3]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

## Under sampling the dataset

In [4]:
n = 100000
n_non_fraud = int(0.999*n)   # Keep 0.01% of frauds
n_fraud = n - n_non_fraud

In [5]:
df_fraud = full_df.loc[full_df.fraud==1].sample(n_fraud)
df_non_fraud = full_df.loc[full_df.fraud==0].sample(n_non_fraud)

In [6]:
df = pd.concat([df_fraud, df_non_fraud]).sample(frac=1).reset_index()

df.loc[:, 'day'] = pd.to_datetime(df.datetime).dt.day
df.loc[:, 'time'] = pd.to_datetime(df.datetime).dt.second + pd.to_datetime(df.datetime).dt.minute*60 + pd.to_datetime(df.datetime).dt.hour*3600

df = df.drop(columns="datetime")

df = reduce_mem_usage(df)

Memory usage of dataframe is 30.52 MB
Memory usage after optimization is: 6.68 MB
Decreased by 78.1%


In [7]:
df.fraud.value_counts()

0    99900
1      100
Name: fraud, dtype: int64

## Splitting the data into train/valid/test

In [8]:
train_df = df[:int(n/2)].reset_index(drop=True).drop(columns="level_0")
valid_df = df[int(n/2):int(3*n/4)].drop(columns="level_0").reset_index(drop=True)
test_df = df[int(3*n/4):].drop(columns="level_0").reset_index(drop=True)

## Importing previously made datasets using the presented method

In [312]:
train_df = pd.read_csv("/Users/omar/Desktop/dataset/train_dataset.csv").drop(columns=["Unnamed: 0", "level_0"])
valid_df = pd.read_csv("/Users/omar/Desktop/dataset/valid_dataset.csv").drop(columns=["Unnamed: 0", "level_0"])
test_df = pd.read_csv("/Users/omar/Desktop/dataset/test_dataset.csv").drop(columns=["Unnamed: 0", "level_0"])

train_df = pd.concat([train_df, valid_df]).reset_index(drop=True)

In [298]:
train_df

Unnamed: 0,acquirerid,agriculture_list,amount,bankid,batchamount,card,construction_list,cup,emv,foreignbin,...,rollingsum_merchant_24h,rollingsum_merchant_2h,rollingsum_merchant_900s,size,super_market_list,terminal,travel_list,trc,day,time
0,3,0,112.70,2,3869.06,7105,0,0,0,0,...,18174.56,1675.25,112.7,-2,0,584,0,0,13,11466
1,6,0,496.00,100,8186.49,1145,0,0,0,1,...,43159.51,3423.56,858.0,-1,0,528,0,0,27,6732
2,7,0,1429.00,4,274697.60,8968,0,0,0,0,...,316197.90,31508.84,4972.0,2,0,652,0,0,25,67782
3,1,0,303.20,5,54637.88,3515,1,0,1,0,...,98676.34,9541.70,2040.0,1,0,466,0,0,30,38346
4,6,0,22.23,101,104361.73,2915,0,0,0,1,...,134728.16,14108.41,2572.0,0,0,627,0,0,20,55014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74995,4,0,66.70,100,4842.94,1210,0,0,0,1,...,52342.99,1722.09,738.5,0,0,697,0,0,17,4842
74996,6,0,152.10,4,71388.33,9093,0,0,0,0,...,79160.42,6956.55,860.5,0,0,645,0,0,23,70350
74997,10,0,503.50,1,38013.46,5906,0,0,1,0,...,96166.36,9242.08,1058.0,1,0,78,0,0,24,23910
74998,7,0,508.50,6,99111.64,2166,0,0,0,0,...,375530.56,42443.96,8504.0,2,0,639,0,0,4,12510


# XGBoost classifier on normal dataset

In [290]:
y_train = train_df.fraud
X_train = train_df.drop(columns='fraud')

y_test = test_df.fraud
X_test = test_df.drop(columns='fraud')

In [47]:
xgboost_clf = xgb.XGBClassifier(n_jobs = -1)

params = { 'max_depth' : [3,6,10],
           'learning_rate' : [0.01, 0.05, 0.1],
           'n_estimators' : [100, 200, 500],
           'colsample_bytree' : [0.3, 0.6, 0.9],
           'alpha' : [0, 1, 2]}

clf = GridSearchCV(estimator=xgboost_clf,
                   scoring='f1',
                   param_grid=params,
                   verbose=2)

clf.fit(X_train, y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END alpha=0, colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END alpha=0, colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END alpha=0, colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END alpha=0, colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END alpha=0, colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END alpha=0, colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   1.0s
[CV] END alpha=0, colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.8s
[CV] END alpha=0, colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.8s
[CV] END alpha=0, colsample_bytree=0.3, learning_rate=0.

[CV] END alpha=0, colsample_bytree=0.3, learning_rate=0.05, max_depth=6, n_estimators=500; total time=   4.4s
[CV] END alpha=0, colsample_bytree=0.3, learning_rate=0.05, max_depth=10, n_estimators=100; total time=   0.8s
[CV] END alpha=0, colsample_bytree=0.3, learning_rate=0.05, max_depth=10, n_estimators=100; total time=   0.9s
[CV] END alpha=0, colsample_bytree=0.3, learning_rate=0.05, max_depth=10, n_estimators=100; total time=   1.0s
[CV] END alpha=0, colsample_bytree=0.3, learning_rate=0.05, max_depth=10, n_estimators=100; total time=   0.8s
[CV] END alpha=0, colsample_bytree=0.3, learning_rate=0.05, max_depth=10, n_estimators=100; total time=   0.9s
[CV] END alpha=0, colsample_bytree=0.3, learning_rate=0.05, max_depth=10, n_estimators=200; total time=   2.1s
[CV] END alpha=0, colsample_bytree=0.3, learning_rate=0.05, max_depth=10, n_estimators=200; total time=   2.6s
[CV] END alpha=0, colsample_bytree=0.3, learning_rate=0.05, max_depth=10, n_estimators=200; total time=   2.0s
[C

[CV] END alpha=0, colsample_bytree=0.6, learning_rate=0.01, max_depth=3, n_estimators=500; total time=   3.6s
[CV] END alpha=0, colsample_bytree=0.6, learning_rate=0.01, max_depth=6, n_estimators=100; total time=   0.4s
[CV] END alpha=0, colsample_bytree=0.6, learning_rate=0.01, max_depth=6, n_estimators=100; total time=   0.4s
[CV] END alpha=0, colsample_bytree=0.6, learning_rate=0.01, max_depth=6, n_estimators=100; total time=   0.4s
[CV] END alpha=0, colsample_bytree=0.6, learning_rate=0.01, max_depth=6, n_estimators=100; total time=   0.5s
[CV] END alpha=0, colsample_bytree=0.6, learning_rate=0.01, max_depth=6, n_estimators=100; total time=   0.5s
[CV] END alpha=0, colsample_bytree=0.6, learning_rate=0.01, max_depth=6, n_estimators=200; total time=   0.9s
[CV] END alpha=0, colsample_bytree=0.6, learning_rate=0.01, max_depth=6, n_estimators=200; total time=   1.0s
[CV] END alpha=0, colsample_bytree=0.6, learning_rate=0.01, max_depth=6, n_estimators=200; total time=   0.9s
[CV] END a

[CV] END alpha=0, colsample_bytree=0.6, learning_rate=0.05, max_depth=10, n_estimators=500; total time=   4.7s
[CV] END alpha=0, colsample_bytree=0.6, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.9s
[CV] END alpha=0, colsample_bytree=0.6, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.9s
[CV] END alpha=0, colsample_bytree=0.6, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.9s
[CV] END alpha=0, colsample_bytree=0.6, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.9s
[CV] END alpha=0, colsample_bytree=0.6, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.8s
[CV] END alpha=0, colsample_bytree=0.6, learning_rate=0.1, max_depth=3, n_estimators=200; total time=   1.8s
[CV] END alpha=0, colsample_bytree=0.6, learning_rate=0.1, max_depth=3, n_estimators=200; total time=   1.7s
[CV] END alpha=0, colsample_bytree=0.6, learning_rate=0.1, max_depth=3, n_estimators=200; total time=   1.6s
[CV] END alpha=0,

[CV] END alpha=0, colsample_bytree=0.9, learning_rate=0.01, max_depth=6, n_estimators=500; total time=   5.2s
[CV] END alpha=0, colsample_bytree=0.9, learning_rate=0.01, max_depth=10, n_estimators=100; total time=   0.4s
[CV] END alpha=0, colsample_bytree=0.9, learning_rate=0.01, max_depth=10, n_estimators=100; total time=   0.5s
[CV] END alpha=0, colsample_bytree=0.9, learning_rate=0.01, max_depth=10, n_estimators=100; total time=   0.6s
[CV] END alpha=0, colsample_bytree=0.9, learning_rate=0.01, max_depth=10, n_estimators=100; total time=   0.5s
[CV] END alpha=0, colsample_bytree=0.9, learning_rate=0.01, max_depth=10, n_estimators=100; total time=   0.5s
[CV] END alpha=0, colsample_bytree=0.9, learning_rate=0.01, max_depth=10, n_estimators=200; total time=   1.0s
[CV] END alpha=0, colsample_bytree=0.9, learning_rate=0.01, max_depth=10, n_estimators=200; total time=   1.0s
[CV] END alpha=0, colsample_bytree=0.9, learning_rate=0.01, max_depth=10, n_estimators=200; total time=   1.0s
[C

[CV] END alpha=0, colsample_bytree=0.9, learning_rate=0.1, max_depth=3, n_estimators=500; total time=   4.1s
[CV] END alpha=0, colsample_bytree=0.9, learning_rate=0.1, max_depth=6, n_estimators=100; total time=   1.2s
[CV] END alpha=0, colsample_bytree=0.9, learning_rate=0.1, max_depth=6, n_estimators=100; total time=   1.3s
[CV] END alpha=0, colsample_bytree=0.9, learning_rate=0.1, max_depth=6, n_estimators=100; total time=   1.2s
[CV] END alpha=0, colsample_bytree=0.9, learning_rate=0.1, max_depth=6, n_estimators=100; total time=   1.3s
[CV] END alpha=0, colsample_bytree=0.9, learning_rate=0.1, max_depth=6, n_estimators=100; total time=   1.2s
[CV] END alpha=0, colsample_bytree=0.9, learning_rate=0.1, max_depth=6, n_estimators=200; total time=   2.2s
[CV] END alpha=0, colsample_bytree=0.9, learning_rate=0.1, max_depth=6, n_estimators=200; total time=   2.3s
[CV] END alpha=0, colsample_bytree=0.9, learning_rate=0.1, max_depth=6, n_estimators=200; total time=   2.3s
[CV] END alpha=0, c

[CV] END alpha=1, colsample_bytree=0.3, learning_rate=0.01, max_depth=10, n_estimators=500; total time=   3.0s
[CV] END alpha=1, colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=100; total time=   0.5s
[CV] END alpha=1, colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=100; total time=   0.5s
[CV] END alpha=1, colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=100; total time=   0.5s
[CV] END alpha=1, colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=100; total time=   0.5s
[CV] END alpha=1, colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=100; total time=   0.5s
[CV] END alpha=1, colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=200; total time=   1.3s
[CV] END alpha=1, colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=200; total time=   1.3s
[CV] END alpha=1, colsample_bytree=0.3, learning_rate=0.05, max_depth=3, n_estimators=200; total time=   1.3s
[CV] END 

[CV] END alpha=1, colsample_bytree=0.3, learning_rate=0.1, max_depth=6, n_estimators=500; total time=   3.2s
[CV] END alpha=1, colsample_bytree=0.3, learning_rate=0.1, max_depth=10, n_estimators=100; total time=   1.1s
[CV] END alpha=1, colsample_bytree=0.3, learning_rate=0.1, max_depth=10, n_estimators=100; total time=   0.9s
[CV] END alpha=1, colsample_bytree=0.3, learning_rate=0.1, max_depth=10, n_estimators=100; total time=   1.0s
[CV] END alpha=1, colsample_bytree=0.3, learning_rate=0.1, max_depth=10, n_estimators=100; total time=   1.0s
[CV] END alpha=1, colsample_bytree=0.3, learning_rate=0.1, max_depth=10, n_estimators=100; total time=   1.6s
[CV] END alpha=1, colsample_bytree=0.3, learning_rate=0.1, max_depth=10, n_estimators=200; total time=   3.0s
[CV] END alpha=1, colsample_bytree=0.3, learning_rate=0.1, max_depth=10, n_estimators=200; total time=   1.8s
[CV] END alpha=1, colsample_bytree=0.3, learning_rate=0.1, max_depth=10, n_estimators=200; total time=   2.1s
[CV] END al

[CV] END alpha=1, colsample_bytree=0.6, learning_rate=0.05, max_depth=3, n_estimators=500; total time=   3.9s
[CV] END alpha=1, colsample_bytree=0.6, learning_rate=0.05, max_depth=6, n_estimators=100; total time=   0.8s
[CV] END alpha=1, colsample_bytree=0.6, learning_rate=0.05, max_depth=6, n_estimators=100; total time=   0.8s
[CV] END alpha=1, colsample_bytree=0.6, learning_rate=0.05, max_depth=6, n_estimators=100; total time=   0.7s
[CV] END alpha=1, colsample_bytree=0.6, learning_rate=0.05, max_depth=6, n_estimators=100; total time=   0.7s
[CV] END alpha=1, colsample_bytree=0.6, learning_rate=0.05, max_depth=6, n_estimators=100; total time=   0.7s
[CV] END alpha=1, colsample_bytree=0.6, learning_rate=0.05, max_depth=6, n_estimators=200; total time=   2.1s
[CV] END alpha=1, colsample_bytree=0.6, learning_rate=0.05, max_depth=6, n_estimators=200; total time=   2.1s
[CV] END alpha=1, colsample_bytree=0.6, learning_rate=0.05, max_depth=6, n_estimators=200; total time=   2.1s
[CV] END a

[CV] END alpha=1, colsample_bytree=0.6, learning_rate=0.1, max_depth=10, n_estimators=500; total time=   3.5s
[CV] END alpha=1, colsample_bytree=0.9, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.5s
[CV] END alpha=1, colsample_bytree=0.9, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END alpha=1, colsample_bytree=0.9, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.5s
[CV] END alpha=1, colsample_bytree=0.9, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.5s
[CV] END alpha=1, colsample_bytree=0.9, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.5s
[CV] END alpha=1, colsample_bytree=0.9, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.9s
[CV] END alpha=1, colsample_bytree=0.9, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   1.0s
[CV] END alpha=1, colsample_bytree=0.9, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.9s
[CV] END a

[CV] END alpha=1, colsample_bytree=0.9, learning_rate=0.05, max_depth=6, n_estimators=500; total time=   4.7s
[CV] END alpha=1, colsample_bytree=0.9, learning_rate=0.05, max_depth=10, n_estimators=100; total time=   0.8s
[CV] END alpha=1, colsample_bytree=0.9, learning_rate=0.05, max_depth=10, n_estimators=100; total time=   0.9s
[CV] END alpha=1, colsample_bytree=0.9, learning_rate=0.05, max_depth=10, n_estimators=100; total time=   1.0s
[CV] END alpha=1, colsample_bytree=0.9, learning_rate=0.05, max_depth=10, n_estimators=100; total time=   0.8s
[CV] END alpha=1, colsample_bytree=0.9, learning_rate=0.05, max_depth=10, n_estimators=100; total time=   0.9s
[CV] END alpha=1, colsample_bytree=0.9, learning_rate=0.05, max_depth=10, n_estimators=200; total time=   2.2s
[CV] END alpha=1, colsample_bytree=0.9, learning_rate=0.05, max_depth=10, n_estimators=200; total time=   2.5s
[CV] END alpha=1, colsample_bytree=0.9, learning_rate=0.05, max_depth=10, n_estimators=200; total time=   2.3s
[C

[CV] END alpha=2, colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=500; total time=   3.1s
[CV] END alpha=2, colsample_bytree=0.3, learning_rate=0.01, max_depth=6, n_estimators=100; total time=   0.4s
[CV] END alpha=2, colsample_bytree=0.3, learning_rate=0.01, max_depth=6, n_estimators=100; total time=   0.5s
[CV] END alpha=2, colsample_bytree=0.3, learning_rate=0.01, max_depth=6, n_estimators=100; total time=   0.4s
[CV] END alpha=2, colsample_bytree=0.3, learning_rate=0.01, max_depth=6, n_estimators=100; total time=   0.4s
[CV] END alpha=2, colsample_bytree=0.3, learning_rate=0.01, max_depth=6, n_estimators=100; total time=   0.4s
[CV] END alpha=2, colsample_bytree=0.3, learning_rate=0.01, max_depth=6, n_estimators=200; total time=   0.8s
[CV] END alpha=2, colsample_bytree=0.3, learning_rate=0.01, max_depth=6, n_estimators=200; total time=   0.8s
[CV] END alpha=2, colsample_bytree=0.3, learning_rate=0.01, max_depth=6, n_estimators=200; total time=   0.8s
[CV] END a

[CV] END alpha=2, colsample_bytree=0.3, learning_rate=0.05, max_depth=10, n_estimators=500; total time=   3.6s
[CV] END alpha=2, colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.8s
[CV] END alpha=2, colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.8s
[CV] END alpha=2, colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.9s
[CV] END alpha=2, colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.7s
[CV] END alpha=2, colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100; total time=   0.8s
[CV] END alpha=2, colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=200; total time=   1.9s
[CV] END alpha=2, colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=200; total time=   1.5s
[CV] END alpha=2, colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=200; total time=   1.9s
[CV] END alpha=2,

[CV] END alpha=2, colsample_bytree=0.6, learning_rate=0.01, max_depth=6, n_estimators=500; total time=   3.8s
[CV] END alpha=2, colsample_bytree=0.6, learning_rate=0.01, max_depth=10, n_estimators=100; total time=   0.5s
[CV] END alpha=2, colsample_bytree=0.6, learning_rate=0.01, max_depth=10, n_estimators=100; total time=   0.4s
[CV] END alpha=2, colsample_bytree=0.6, learning_rate=0.01, max_depth=10, n_estimators=100; total time=   0.4s
[CV] END alpha=2, colsample_bytree=0.6, learning_rate=0.01, max_depth=10, n_estimators=100; total time=   0.5s
[CV] END alpha=2, colsample_bytree=0.6, learning_rate=0.01, max_depth=10, n_estimators=100; total time=   0.4s
[CV] END alpha=2, colsample_bytree=0.6, learning_rate=0.01, max_depth=10, n_estimators=200; total time=   0.8s
[CV] END alpha=2, colsample_bytree=0.6, learning_rate=0.01, max_depth=10, n_estimators=200; total time=   0.9s
[CV] END alpha=2, colsample_bytree=0.6, learning_rate=0.01, max_depth=10, n_estimators=200; total time=   1.1s
[C

[CV] END alpha=2, colsample_bytree=0.6, learning_rate=0.1, max_depth=3, n_estimators=500; total time=   3.1s
[CV] END alpha=2, colsample_bytree=0.6, learning_rate=0.1, max_depth=6, n_estimators=100; total time=   1.2s
[CV] END alpha=2, colsample_bytree=0.6, learning_rate=0.1, max_depth=6, n_estimators=100; total time=   1.4s
[CV] END alpha=2, colsample_bytree=0.6, learning_rate=0.1, max_depth=6, n_estimators=100; total time=   1.1s
[CV] END alpha=2, colsample_bytree=0.6, learning_rate=0.1, max_depth=6, n_estimators=100; total time=   1.0s
[CV] END alpha=2, colsample_bytree=0.6, learning_rate=0.1, max_depth=6, n_estimators=100; total time=   1.0s
[CV] END alpha=2, colsample_bytree=0.6, learning_rate=0.1, max_depth=6, n_estimators=200; total time=   2.1s
[CV] END alpha=2, colsample_bytree=0.6, learning_rate=0.1, max_depth=6, n_estimators=200; total time=   2.1s
[CV] END alpha=2, colsample_bytree=0.6, learning_rate=0.1, max_depth=6, n_estimators=200; total time=   2.2s
[CV] END alpha=2, c

[CV] END alpha=2, colsample_bytree=0.9, learning_rate=0.01, max_depth=10, n_estimators=500; total time=   4.4s
[CV] END alpha=2, colsample_bytree=0.9, learning_rate=0.05, max_depth=3, n_estimators=100; total time=   0.9s
[CV] END alpha=2, colsample_bytree=0.9, learning_rate=0.05, max_depth=3, n_estimators=100; total time=   0.8s
[CV] END alpha=2, colsample_bytree=0.9, learning_rate=0.05, max_depth=3, n_estimators=100; total time=   0.7s
[CV] END alpha=2, colsample_bytree=0.9, learning_rate=0.05, max_depth=3, n_estimators=100; total time=   0.9s
[CV] END alpha=2, colsample_bytree=0.9, learning_rate=0.05, max_depth=3, n_estimators=100; total time=   0.8s
[CV] END alpha=2, colsample_bytree=0.9, learning_rate=0.05, max_depth=3, n_estimators=200; total time=   2.2s
[CV] END alpha=2, colsample_bytree=0.9, learning_rate=0.05, max_depth=3, n_estimators=200; total time=   2.0s
[CV] END alpha=2, colsample_bytree=0.9, learning_rate=0.05, max_depth=3, n_estimators=200; total time=   1.8s
[CV] END 

[CV] END alpha=2, colsample_bytree=0.9, learning_rate=0.1, max_depth=6, n_estimators=500; total time=   3.3s
[CV] END alpha=2, colsample_bytree=0.9, learning_rate=0.1, max_depth=10, n_estimators=100; total time=   1.1s
[CV] END alpha=2, colsample_bytree=0.9, learning_rate=0.1, max_depth=10, n_estimators=100; total time=   1.4s
[CV] END alpha=2, colsample_bytree=0.9, learning_rate=0.1, max_depth=10, n_estimators=100; total time=   1.1s
[CV] END alpha=2, colsample_bytree=0.9, learning_rate=0.1, max_depth=10, n_estimators=100; total time=   1.1s
[CV] END alpha=2, colsample_bytree=0.9, learning_rate=0.1, max_depth=10, n_estimators=100; total time=   1.2s
[CV] END alpha=2, colsample_bytree=0.9, learning_rate=0.1, max_depth=10, n_estimators=200; total time=   2.5s
[CV] END alpha=2, colsample_bytree=0.9, learning_rate=0.1, max_depth=10, n_estimators=200; total time=   2.2s
[CV] END alpha=2, colsample_bytree=0.9, learning_rate=0.1, max_depth=10, n_estimators=200; total time=   2.4s
[CV] END al

In [276]:
clf.best_score_

0.8257894736842106

Le meilleur score obtenu avec le XGBoost classique est : F1-Score = 0.82578

Avec :
max_depth = 6,
alpha = 0,
colsample_bytree = 0.9,
gamma = 0,
learning_rate = 0.01,
n_estimators = 500

In [50]:
clf.best_estimator_

### Scores sur les données de validation

In [279]:
valid_pred_proba = clf.predict_proba(X_valid)[:, 1]
valid_pred = (valid_pred_proba > 1/2).astype('int')

roc_auc_score(y_valid, valid_pred)

0.8846153846153846

Pour l'Out-Of-Sample F1-Score : on a obtenu 

F1-score = 0.8695

## Auto-encoder predictions

In [313]:
errors_ae_test_df = pd.read_csv("/Users/omar/Desktop/dataset/drive-download-20220916T133805Z-001/errors_test_AE.csv").drop(columns="Unnamed: 0").rename(columns={"0": "ae_error"})
errors_ae_valid_df = pd.read_csv("/Users/omar/Desktop/dataset/drive-download-20220916T133805Z-001/errors_valid_AE.csv").drop(columns="Unnamed: 0").rename(columns={"0": "ae_error"})
errors_ae_train_df = pd.read_csv("/Users/omar/Desktop/dataset/drive-download-20220916T133805Z-001/errors_train_AE.csv").drop(columns="Unnamed: 0").rename(columns={"0": "ae_error"})

errors_ae_train_df = pd.concat([errors_ae_train_df, errors_ae_valid_df]).reset_index(drop=True)

## One Class SVM predictions

In [314]:
svm_test_df = pd.read_csv("/Users/omar/Desktop/dataset/One Class SVM/testSVM.csv")
svm_valid_df = pd.read_csv("/Users/omar/Desktop/dataset/One Class SVM/validSVM.csv")
svm_train_df = pd.read_csv("/Users/omar/Desktop/dataset/One Class SVM/trainSVM.csv")

svm_train_df = pd.concat([svm_train_df, svm_valid_df]).reset_index(drop=True)

## Deep SVDD predictions

In [315]:
svdd_df = pd.read_csv("/Users/omar/Desktop/dataset/results_SVDD.csv")
svdd_test_df = svdd_df.test_data[:25000]
svdd_valid_df = svdd_df.valid_data[:25000]
svdd_train_df = svdd_df.train_data

svdd_train_df = pd.concat([svdd_train_df, svdd_valid_df]).reset_index(drop=True)

## Isolation forest predictions

In [316]:
if_df = pd.read_csv("/Users/omar/Desktop/dataset/results_IsolationForest.csv")

if_test_df = if_df.test_data[:25000]
if_valid_df = if_df.valid_data[:25000]
if_train_df = if_df.train_data

if_train_df = pd.concat([if_train_df, if_valid_df]).reset_index(drop=True)

## Sélection des features à rajouter

In [317]:
def psi(features, feature, y_true):
    acc = roc_auc_score(y_true, features[feature])
    
    sum_corr = 0
    
    for feature_name in features:
        sum_corr += abs(pearsonr(features[feature], features[feature_name]).statistic)
    
    return acc / sum_corr

In [318]:
def feature_selection_XGBOD(features, y_true, p):
    """
    This function implements the Balance Selection algorithm
    of (Zhao, Et al.) "XGBOD: Improving Supervised Outlier Detection with Unsupervised Representation Learning".
    features : dictionary of numpy arrays
    y_true : numpy.array containg the true values
    p : number of features to be selected (if p>=len(features), we select all the features)
    
    return : a dictionary containing the relevant features
    """
    
    assert(len(y_true) == len(features[list(features.keys())[0]]))
    
    if p >= len(features):
        print("\033[1;40;41m The number of features to be selected in greater or equal to the number of features. No selection will be performed !\n")
        return features
    
    # On sélectionne la feature avec le plus grand ROC
    max_roc = 0
    max_roc_feature = ""
    for feature in features.keys():
        feature_values = features[feature]
        roc = roc_auc_score(y_true, feature_values)
        if roc > max_roc:
            max_roc = roc
            max_roc_feature = feature
    
    result_dict = {}
    result_dict[max_roc_feature] = features[max_roc_feature]
    features.pop(max_roc_feature, None)
    
    while len(result_dict) < p:
        max_psi = 0
        max_psi_feature = ""
        for feature in features.keys():
            psi_score = psi(features, feature, y_true)
            if psi_score > max_psi:
                max_psi = psi_score
                max_psi_feature = feature
                
        result_dict[max_psi_feature] = features[max_psi_feature]
        features.pop(max_psi_feature, None)
    
    return result_dict

## Adding TOS

### Which TOS to add ?

In [343]:
features = {"ae" : errors_ae_train_df.ae_error, "svm" : svm_train_df.predfraud, "svdd" : svdd_train_df, 'if' : if_train_df}
p = 4

features_to_add = feature_selection_XGBOD(features, y_train, p)

features_to_add = list(features_to_add.keys())

print("Features to select :")
features_to_add

[1;40;41m The number of features to be selected in greater or equal to the number of features. No selection will be performed !

Features to select :


['ae', 'svm', 'svdd', 'if']

In [344]:
if "ae" in features_to_add:
    train_df.loc[:, 'ae_error'] = errors_ae_train_df.ae_error
    valid_df.loc[:, 'ae_error'] = errors_ae_valid_df.ae_error
    test_df.loc[:, 'ae_error'] = errors_ae_test_df.ae_error

if "svm" in features_to_add:
    train_df.loc[:, 'svm'] = svm_train_df.predfraud
    valid_df.loc[:, 'svm'] = svm_valid_df.predfraud
    test_df.loc[:, 'svm'] = svm_test_df.predfraud
    
if "svdd" in features_to_add:
    train_df.loc[:, 'svdd'] = svdd_train_df
    valid_df.loc[:, 'svdd'] = svdd_valid_df
    test_df.loc[:, 'svdd'] = svdd_test_df

if "if" in features_to_add:
    train_df.loc[:, 'if'] = if_train_df
    valid_df.loc[:, 'if'] = if_valid_df
    test_df.loc[:, 'if'] = if_test_df

In [345]:
train_df.columns

Index(['acquirerid', 'agriculture_list', 'amount', 'bankid', 'batchamount',
       'card', 'construction_list', 'cup', 'emv', 'foreignbin', 'fraud',
       'fuel_list', 'index', 'list_bankid_b', 'list_bankid_ci', 'mcc',
       'mean_merchant_amount', 'medic_list', 'merchant', 'pospayenvcode',
       'ref', 'resp_code', 'riskmerchant', 'rollingcount_card_merchant_600s',
       'rollingcount_merchant_24h', 'rollingcount_merchant_2h',
       'rollingcount_merchant_900s', 'rollingsum_card_24h',
       'rollingsum_card_merchant_24h', 'rollingsum_merchant_24h',
       'rollingsum_merchant_2h', 'rollingsum_merchant_900s', 'size',
       'super_market_list', 'terminal', 'travel_list', 'trc', 'day', 'time',
       'ae_error', 'svm', 'svdd', 'if'],
      dtype='object')

# Classifier

In [346]:
y_train = train_df.fraud
X_train = train_df.drop(columns=['fraud', 'ref'])

y_valid = valid_df.fraud
X_valid = valid_df.drop(columns=['fraud', 'ref'])

y_test = test_df.fraud
X_test = test_df.drop(columns=['fraud', 'ref'])

In [347]:
xgboost_clf_TOS = xgb.XGBClassifier(n_jobs = -1)

params =  {'max_depth' : randint(6, 10),
           'learning_rate' : uniform(0.05, 0.09),
           'n_estimators' : randint(250, 351),
           'colsample_bytree' : uniform(0.8, 0.2),
           'alpha' : uniform(0, 0.2)}

clf_TOS = RandomizedSearchCV(estimator=xgboost_clf_TOS,
                             scoring='f1',
                             param_distributions=params,
                             verbose=2,
                             n_iter = 50,
                             cv=5,
                            )

clf_TOS.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END alpha=0.06518478796429983, colsample_bytree=0.9416720534073194, learning_rate=0.08534830513140255, max_depth=7, n_estimators=298; total time=   5.1s
[CV] END alpha=0.06518478796429983, colsample_bytree=0.9416720534073194, learning_rate=0.08534830513140255, max_depth=7, n_estimators=298; total time=   5.2s
[CV] END alpha=0.06518478796429983, colsample_bytree=0.9416720534073194, learning_rate=0.08534830513140255, max_depth=7, n_estimators=298; total time=   5.2s
[CV] END alpha=0.06518478796429983, colsample_bytree=0.9416720534073194, learning_rate=0.08534830513140255, max_depth=7, n_estimators=298; total time=   5.0s
[CV] END alpha=0.06518478796429983, colsample_bytree=0.9416720534073194, learning_rate=0.08534830513140255, max_depth=7, n_estimators=298; total time=   5.0s
[CV] END alpha=0.055905988675762845, colsample_bytree=0.8124449139967882, learning_rate=0.13902091748939754, max_depth=6, n_estimators=258; total ti

[CV] END alpha=0.035090969977383216, colsample_bytree=0.8943931277527626, learning_rate=0.10464841381630031, max_depth=8, n_estimators=291; total time=   4.7s
[CV] END alpha=0.035090969977383216, colsample_bytree=0.8943931277527626, learning_rate=0.10464841381630031, max_depth=8, n_estimators=291; total time=   4.5s
[CV] END alpha=0.035090969977383216, colsample_bytree=0.8943931277527626, learning_rate=0.10464841381630031, max_depth=8, n_estimators=291; total time=   5.1s
[CV] END alpha=0.16603931596701038, colsample_bytree=0.9685869718602564, learning_rate=0.08731797342573988, max_depth=7, n_estimators=301; total time=   5.4s
[CV] END alpha=0.16603931596701038, colsample_bytree=0.9685869718602564, learning_rate=0.08731797342573988, max_depth=7, n_estimators=301; total time=   5.6s
[CV] END alpha=0.16603931596701038, colsample_bytree=0.9685869718602564, learning_rate=0.08731797342573988, max_depth=7, n_estimators=301; total time=   5.3s
[CV] END alpha=0.16603931596701038, colsample_byt

[CV] END alpha=0.09953907062346892, colsample_bytree=0.9915019611196795, learning_rate=0.06991159517083712, max_depth=7, n_estimators=293; total time=   5.8s
[CV] END alpha=0.09953907062346892, colsample_bytree=0.9915019611196795, learning_rate=0.06991159517083712, max_depth=7, n_estimators=293; total time=   5.9s
[CV] END alpha=0.09953907062346892, colsample_bytree=0.9915019611196795, learning_rate=0.06991159517083712, max_depth=7, n_estimators=293; total time=   5.8s
[CV] END alpha=0.09953907062346892, colsample_bytree=0.9915019611196795, learning_rate=0.06991159517083712, max_depth=7, n_estimators=293; total time=   6.2s
[CV] END alpha=0.09953907062346892, colsample_bytree=0.9915019611196795, learning_rate=0.06991159517083712, max_depth=7, n_estimators=293; total time=   5.8s
[CV] END alpha=0.08375747891421244, colsample_bytree=0.9173112810669182, learning_rate=0.11154046346023248, max_depth=8, n_estimators=254; total time=   5.5s
[CV] END alpha=0.08375747891421244, colsample_bytree

[CV] END alpha=0.13393896635882505, colsample_bytree=0.9608420371517622, learning_rate=0.08468451071441951, max_depth=6, n_estimators=306; total time=   5.4s
[CV] END alpha=0.13393896635882505, colsample_bytree=0.9608420371517622, learning_rate=0.08468451071441951, max_depth=6, n_estimators=306; total time=   5.1s
[CV] END alpha=0.13393896635882505, colsample_bytree=0.9608420371517622, learning_rate=0.08468451071441951, max_depth=6, n_estimators=306; total time=   5.2s
[CV] END alpha=0.11755624524603275, colsample_bytree=0.9466451743784392, learning_rate=0.12700818692912708, max_depth=9, n_estimators=304; total time=   4.7s
[CV] END alpha=0.11755624524603275, colsample_bytree=0.9466451743784392, learning_rate=0.12700818692912708, max_depth=9, n_estimators=304; total time=   5.1s
[CV] END alpha=0.11755624524603275, colsample_bytree=0.9466451743784392, learning_rate=0.12700818692912708, max_depth=9, n_estimators=304; total time=   5.1s
[CV] END alpha=0.11755624524603275, colsample_bytree

[CV] END alpha=0.06294481166065787, colsample_bytree=0.881292508408623, learning_rate=0.1366288116432643, max_depth=8, n_estimators=254; total time=   3.6s
[CV] END alpha=0.1360167568098041, colsample_bytree=0.8541724737351143, learning_rate=0.07317394758730364, max_depth=8, n_estimators=265; total time=   4.9s
[CV] END alpha=0.1360167568098041, colsample_bytree=0.8541724737351143, learning_rate=0.07317394758730364, max_depth=8, n_estimators=265; total time=   4.9s
[CV] END alpha=0.1360167568098041, colsample_bytree=0.8541724737351143, learning_rate=0.07317394758730364, max_depth=8, n_estimators=265; total time=   4.9s
[CV] END alpha=0.1360167568098041, colsample_bytree=0.8541724737351143, learning_rate=0.07317394758730364, max_depth=8, n_estimators=265; total time=   4.7s
[CV] END alpha=0.1360167568098041, colsample_bytree=0.8541724737351143, learning_rate=0.07317394758730364, max_depth=8, n_estimators=265; total time=   4.7s
[CV] END alpha=0.06532704592520155, colsample_bytree=0.9643

In [348]:
clf_TOS.best_score_

0.8802150537634409

F1-score = 0.89

In [334]:
clf_TOS.best_estimator_

Avec : max_depth = 8, alpha = 0.117, colsample_bytree = 0.83, learning_rate = 0.053, n_estimators = 261



### Scores sur les données de test

In [349]:
test_pred_proba = clf_TOS.predict_proba(X_test)[:, 1]
test_pred = (test_pred_proba > 1/2).astype('int')

print("ROC AUC :", roc_auc_score(y_test, test_pred))
print("F1 Score :", f1_score(y_test, test_pred))

ROC AUC : 0.925
F1 Score : 0.9189189189189189
