In [45]:
#importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from category_encoders import TargetEncoder, WOEEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline

from xgboost import XGBClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from mlxtend.evaluate import feature_importance_permutation
from mlxtend.feature_selection import ColumnSelector
from sklearn.preprocessing import KBinsDiscretizer, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import SelectFromModel

import gc

import warnings
warnings.filterwarnings('ignore')

In [18]:
df = pd.read_csv('train_transaction.csv')

In [19]:
# I am replacing the missing values with the mode
for c in df.columns.tolist():
    df[c].fillna(df[c].mode()[0], inplace=True)

In [20]:
# Creating a list of all the column names and defining the X and y variables
used_cols = [c for c in df.columns.tolist() if c not in 'isFraud']
X = df[used_cols]
y = df['isFraud']

In [21]:
def stringify(data):
    df = pd.DataFrame(data)
    for c in df.columns.tolist():
        df[c] = df[c].astype(str)
    return df

binner = KBinsDiscretizer(n_bins=10, encode='ordinal')
objectify = FunctionTransformer(func=stringify, validate=False, check_inverse=False)
encoder = TargetEncoder(drop_invariant=True)

In [22]:
# In the following cells I am going to define the transformation strategy for categorical and numerical features
categorical = X.select_dtypes('object').columns.tolist()
non_numeric_transformer = Pipeline(steps=[#('imputer', imputer),#
                                          ('objectify', objectify),
                                            ('encoder', encoder)])


In [23]:
numeric = [c for c in used_cols if c not in categorical]

In [24]:
numeric_transformer = Pipeline(steps=[
                                        ('binner', binner),
                                       ('objectify', objectify),
                                       ('encoder', encoder)])

In [25]:
clf = XGBClassifier(n_jobs=-1, max_depth=15)

In [27]:
# This preprocessor, applies the transformation to the numeric and non_numeric columns.
preprocessor = ColumnTransformer(transformers=[('non_numeric', non_numeric_transformer, categorical),
                                              ('numeric', numeric_transformer, numeric)])

In [34]:
scorecard = make_pipeline(preprocessor, clf)

In [36]:
scorecard.fit(X,y)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('non_numeric',
                                                  Pipeline(memory=None,
                                                           steps=[('objectify',
                                                                   FunctionTransformer(accept_sparse=False,
                                                                                       check_inverse=False,
                                                                                       func=<function stringify at 0x000001D208059AE8>,
                                                                                       inv_kw_args=None,
                                                                                  

In [80]:
scores = cross_val_score(scorecard, X, y, cv=3, scoring='roc_auc', n_jobs=-1)
print(scores.mean(), "+/-", scores.std())

0.8723958333333334 +/- 0.14855167392657978


In [102]:
# These last few steps I am importing the test dataset
# using the trained model to make the predictions
# Preparing a csv file to import it into Kaggle

df_test = pd.read_csv('test_transaction.csv')



In [108]:
for c in df_test.columns.tolist():
    df_test[c].fillna(df_test[c].mode()[0], inplace=True)
y_pred_test = scorecard.predict(df_test)

In [109]:
df_sub = pd.DataFrame()
df_sub['TransactionID'] = df_test['TransactionID']
df_sub['isFraud'] = pd.Series(y_pred_test)
df_sub.to_csv('kaggle_test.csv', index=False)

In [61]:
## Feature selection

df = df.sample(2000)

In [62]:
for c in df.columns.tolist():
    df[c].fillna(df[c].mode()[0], inplace=True)

In [64]:
# Creating a list of all the column names and defining the X and y variables
used_cols = [c for c in df.columns.tolist() if c not in 'isFraud']
X = df[used_cols]
y = df['isFraud']

In [65]:
encoder = TargetEncoder()
X = encoder.fit_transform(X,y)

In [66]:
rf = RandomForestClassifier()

In [67]:
feat_sel = SelectFromModel(rf, threshold='mean')
feat_sel.fit_transform(X, y)

array([[3.37888300e+06, 9.83601300e+06, 1.00000000e+01, ...,
        6.00000000e+01, 0.00000000e+00, 1.20000000e+02],
       [3.01917700e+06, 7.86674000e+05, 7.39500000e+01, ...,
        0.00000000e+00, 1.55949997e+02, 0.00000000e+00],
       [3.35258100e+06, 9.07172800e+06, 5.79500000e+01, ...,
        0.00000000e+00, 0.00000000e+00, 5.79500008e+01],
       ...,
       [3.38804100e+06, 1.01082920e+07, 3.67960000e+01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.56425800e+06, 1.53583450e+07, 4.90000000e+01, ...,
        2.71000000e+02, 0.00000000e+00, 0.00000000e+00],
       [3.43506200e+06, 1.14061190e+07, 2.59500000e+01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [73]:
print(np.array(df.columns.tolist()[:-1])[feat_sel.get_support()])

['TransactionID' 'isFraud' 'TransactionDT' 'TransactionAmt' 'ProductCD'
 'card1' 'card2' 'card4' 'card5' 'card6' 'addr2' 'dist2' 'R_emaildomain'
 'C1' 'C5' 'C6' 'C7' 'C9' 'C10' 'C11' 'C12' 'C13' 'C14' 'D1' 'D2' 'D3'
 'D4' 'D6' 'D7' 'D8' 'D11' 'D14' 'V6' 'V11' 'V12' 'V16' 'V22' 'V23' 'V24'
 'V32' 'V36' 'V38' 'V43' 'V44' 'V51' 'V52' 'V53' 'V59' 'V61' 'V65' 'V68'
 'V72' 'V74' 'V76' 'V79' 'V81' 'V85' 'V86' 'V95' 'V96' 'V123' 'V126'
 'V128' 'V129' 'V131' 'V133' 'V139' 'V152' 'V156' 'V168' 'V169' 'V185'
 'V198' 'V199' 'V200' 'V209' 'V211' 'V228' 'V238' 'V242' 'V245' 'V247'
 'V248' 'V252' 'V254' 'V256' 'V257' 'V264' 'V279' 'V281' 'V282' 'V288'
 'V292' 'V294' 'V301' 'V306' 'V307' 'V311' 'V312' 'V319']


In [74]:
len(np.array(df.columns.tolist()[:-1])[feat_sel.get_support()])

100