In [14]:
import pandas as pd
from imblearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [2]:
df = pd.read_csv('./fraude_tc.csv',
                 dtype={
                     'repeat_retailer':int,
                     'used_chip':int,
                     'used_pin_number':int,
                     'online_order':int,
                     'fraud':int
                 }
)

In [3]:
target = df.fraud
df = df.drop('fraud', axis=1)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.2, random_state=420)

In [5]:
xgbc = XGBClassifier(random_state=420)
xgbc.fit(X_train, y_train)
y_preds_xgbc = xgbc.predict(X_test)

In [6]:
accuracy_score(y_test, y_preds_xgbc)

0.999985

In [7]:
confusion_matrix(y_test, y_preds_xgbc)

array([[182574,      0],
       [     3,  17423]], dtype=int64)

In [16]:
# SMOTE and RandomUnderSampler
smote = SMOTE(sampling_strategy=0.1, random_state=420)
under_sampler = RandomUnderSampler(sampling_strategy=0.5, random_state=420)
resampler_pipeline = make_pipeline(smote, under_sampler)

In [17]:
X_train_res, y_train_res = resampler_pipeline.fit_resample(X_train, y_train)

In [18]:
xgbc_smote = XGBClassifier(random_state=420)
xgbc_smote.fit(X_train_res, y_train_res)
y_preds_xgbc_smote = xgbc_smote.predict(X_test)

In [19]:
accuracy_score(y_test, y_preds_xgbc_smote)

0.99995

In [20]:
confusion_matrix(y_test, y_preds_xgbc_smote)

array([[182564,     10],
       [     0,  17426]], dtype=int64)

0 FN, all fraudulent transactions have been spotted.