In [13]:
import gc
import os
import warnings
import numpy as np
import pandas as pd

import xgboost as xgb

from sklearn import preprocessing
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, confusion_matrix, precision_score

import h2o
from h2o.automl import H2OAutoML

# Initialise H2O once, up front; the H2OFrame conversion and the AutoML run
# further down both go through this `h2o` module.
h2o.init()

Load data as a dataframe

In [2]:
# Load the training CSV into a single DataFrame.
train_csv_path = "../input/frauddatawithoutaggregations/train_set_without_id_features.csv"
X = pd.read_csv(train_csv_path)

Remove specific identifier columns and categorical columns

In [3]:
# Drop every object-dtype column and the leftover CSV index column, order the
# rows by transaction time, then drop the ID / timestamp columns themselves.
object_cols = [col for col, dtype in X.dtypes.items() if dtype == "object"]
X = (
    X.drop(columns=object_cols)
     .drop(columns='Unnamed: 0')
     .sort_values(by=['TransactionDT'])
     .drop(columns=['TransactionID', 'TransactionDT'])
)

In [4]:
# Separate the target (kept as a one-column DataFrame) from the predictors,
# then replace missing predictor values with the -999 sentinel.
y = X[['isFraud']].copy()
X = X.drop(columns='isFraud').fillna(-999)

In [5]:
# Row position of the 80/20 cut between the train and test portions.
splitpoint = int(len(X) * 0.8)
print(splitpoint)

In [6]:
# Positional split at `splitpoint` (no shuffling), applied identically to the
# features and the target.
X_train, y_train = X.iloc[:splitpoint], y.iloc[:splitpoint]
X_test, y_test = X.iloc[splitpoint:], y.iloc[splitpoint:]

In [7]:
# The full frames are no longer referenced by name; only the four splits are
# used until the data is re-read later on.
del X
del y

In [8]:
# Sanity check: shapes in the order train X, train y, test X, test y.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

### Adversarial Validation

In [9]:
# Label every row with its origin for adversarial validation:
# 0 = training portion, 1 = test portion.
# `X_train` / `X_test` are slices of a larger frame, so assigning a column onto
# them directly is the pattern that triggers pandas' SettingWithCopyWarning.
# `assign` returns a new frame instead, which also keeps this cell safe to
# re-run.
X_train = X_train.assign(dataset=0)
X_test = X_test.assign(dataset=1)

In [10]:
# Display both shapes; each frame now also carries the `dataset` column.
X_train.shape, X_test.shape

In [11]:
# Stack the labelled train and test rows into one frame; `dataset` is the
# target the adversarial model is later asked to predict.
df = pd.concat([X_train, X_test])

In [15]:
# Only `df` is needed for the adversarial model, so release the split frames
# and run a garbage-collection pass (its return value is kept in `x`).
del X_train, X_test
del y_train, y_test
x = gc.collect()

In [23]:
def automl_fit(df, ycol, max_models=3, max_runtime_secs=3600, seed=None):
    """Train an H2O AutoML classifier predicting ``ycol`` from all other columns.

    Args:
        df: Tabular data accepted by ``h2o.H2OFrame`` (this notebook passes a
            pandas DataFrame).
        ycol: Name of the target column. It is converted to a factor so the
            task is treated as classification.
        max_models: Forwarded to ``H2OAutoML(max_models=...)``.
        max_runtime_secs: Forwarded to ``H2OAutoML(max_runtime_secs=...)``.
        seed: Optional seed forwarded to ``H2OAutoML``. When ``None`` (the
            default) no seed argument is passed at all.

    Returns:
        The ``H2OAutoML`` object after ``train`` has been called on it.

    Raises:
        ValueError: If ``ycol`` is not one of the frame's columns.
    """
    print("Converting dataset to H2O frame...", end="", flush=True)
    hf = h2o.H2OFrame(df)
    print("Done.")

    print("Selecting x and y columns...", end="", flush=True)
    all_cols = list(hf.columns)
    if ycol not in all_cols:
        # Fail with a message that names the column, instead of the bare
        # "list.remove(x): x not in list" the caller would otherwise see.
        raise ValueError(f"Target column {ycol!r} not found in frame columns")
    xcols = [col for col in all_cols if col != ycol]
    print("Done.")

    print("Converting y column to categorical...", end="", flush=True)
    hf[ycol] = hf[ycol].asfactor()
    print("Done.")

    print("Training...", end="", flush=True)
    automl_kwargs = {"max_models": max_models, "max_runtime_secs": max_runtime_secs}
    if seed is not None:
        # Only forward the seed when one was requested, so the default call
        # builds H2OAutoML with the same arguments as before.
        automl_kwargs["seed"] = seed
    aml = H2OAutoML(**automl_kwargs)
    aml.train(x=xcols, y=ycol, training_frame=hf)
    print("Done.")

    return aml

In [24]:
# Run a garbage-collection pass before training; the return value is discarded.
_ = gc.collect()

In [25]:
# Adversarial validation: fit a classifier that tries to predict the `dataset`
# column of `df` from all remaining columns.
aml = automl_fit(df, "dataset")

In [39]:
# Fetch one model from the AutoML leaderboard and read its variable importances
# as a pandas DataFrame.
# NOTE(review): row 3 is a hard-coded leaderboard position; which model sits
# there depends on the AutoML run - confirm this is the intended model.
adversarial_model_id = aml.leaderboard[3, "model_id"]
m = h2o.get_model(adversarial_model_id)
feat_imps = m.varimp(use_pandas=True)

In [40]:
# Persist the importances to a CSV file in the working directory.
feat_imps.to_csv("feature_importances_adversarial.csv")

In [48]:
# Show the first 30 rows of the importance table.
feat_imps.head(30)

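`P_emaildomain_fe` is quite important for the adversarial model, i.e. it helps a lot in telling the training rows apart from the test rows. We will remove it, because a feature whose distribution differs this much between train and test is likely to make the fraud model overfit to the training period. At the same time, we will remove all other variables whose relative importance is greater than 10000.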

In [51]:
# Features to exclude from the fraud model, chosen from the adversarial
# importance table above.
remove_cols = [
    'P_emaildomain_fe',
    'D15',
    'DeviceInfo_fe',
    'R_emaildomain_fe',
]

In [52]:
# Rebuild the modeling table from the CSV, this time also dropping the columns
# listed in `remove_cols`.
X = pd.read_csv("../input/frauddatawithoutaggregations/train_set_without_id_features.csv")

# Same cleanup as before: drop object-dtype columns (plus `remove_cols`) and the
# leftover CSV index, order rows by transaction time, drop ID / timestamp.
object_cols = [col for col, dtype in X.dtypes.items() if dtype == "object"]
X = (
    X.drop(columns=object_cols + remove_cols)
     .drop(columns='Unnamed: 0')
     .sort_values(by=['TransactionDT'])
     .drop(columns=['TransactionID', 'TransactionDT'])
)

# Target as a one-column DataFrame; predictors with NaN replaced by -999.
y = X[['isFraud']].copy()
X = X.drop(columns='isFraud').fillna(-999)

In [53]:
# Same positional 80/20 cut as before, now on the reduced feature set.
splitpoint = int(len(X) * 0.8)
print(splitpoint)

X_train, y_train = X.iloc[:splitpoint], y.iloc[:splitpoint]
X_test, y_test = X.iloc[splitpoint:], y.iloc[splitpoint:]

### Modeling

In [54]:
# Early stopping needs a validation set to monitor. Without one,
# `early_stopping_rounds` is either ignored or rejected, depending on the
# xgboost version. The rows are ordered by transaction time, so hold out the
# most recent part of the training portion for that purpose; the test portion
# is left untouched for the final evaluation.
VALID_FRACTION = 0.1
valid_start = int((1 - VALID_FRACTION) * len(X_train))
X_fit, y_fit = X_train.iloc[:valid_start], y_train.iloc[:valid_start]
X_valid, y_valid = X_train.iloc[valid_start:], y_train.iloc[valid_start:]

# `xgb_model` was dropped from the constructor: it is an argument of `fit()`
# (a model to continue training from), not an estimator parameter.
clf = xgb.XGBClassifier(n_estimators=100,
                        n_jobs=-1,
                        max_depth=7,
                        learning_rate=0.05,
                        subsample=0.9,
                        colsample_bytree=0.9,
                        missing=-999,
                        verbosity=3,
                        early_stopping_rounds=10)

# The targets are one-column DataFrames; flatten them to 1-D label arrays.
result = clf.fit(X_fit,
                 y_fit.values.ravel(),
                 eval_set=[(X_valid, y_valid.values.ravel())])

In [55]:
# Score the test portion: hard class labels feed F1 and precision, the
# positive-class probabilities feed ROC AUC.
test_y = clf.predict(X_test)
test_preds = clf.predict_proba(X_test)[:, 1]

print("Performance - XGBoost")
for heading, metric, scores in (
    ('F1 score:', f1_score, test_y),
    ('Precision score:', precision_score, test_y),
    # class imbalance
    ('Area Under the Receiver Operating Characteristic Curve:', roc_auc_score, test_preds),
):
    print(heading)
    print(metric(y_test, scores))

In [56]:
# Confusion matrix of the true labels against the hard predictions on the test portion.
confusion_matrix(y_test, test_y)