# IEEE-CIS FRAUD DETECTION

IEEE is the world's largest technical professional organization dedicated to advancing technology for the benefit of humanity. They work across several areas in AI and machine learning, including deep neural networks, fuzzy systems, evolutionary computation and swarm intelligence.

IEEE has partnered with Vesta Corporation, a leading payment service company, to seek the best solutions for fraud prevention in the industry. Vesta has invited the public to participate.

The goal of this project is to improve the efficacy of fraudulent transaction alerts for millions of people around the world, help businesses reduce their fraud losses and increase their revenues, and last but not least, improve the customer experience by reducing the insult rate (false positives).

In [2]:
#importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from category_encoders import TargetEncoder, WOEEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline

from xgboost import XGBClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from mlxtend.evaluate import feature_importance_permutation
from mlxtend.feature_selection import ColumnSelector
from sklearn.preprocessing import KBinsDiscretizer, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier
from imxgboost.imbalance_xgb import imbalance_xgboost as imb_xgb

from sklearn.feature_selection import SelectFromModel

import gc

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the training transactions table (CSV expected in the working directory).
df = pd.read_csv(filepath_or_buffer='train_transaction.csv')

In [4]:
# Columns retained for modelling. According to the author, the feature columns
# were obtained from SelectFromModel using a random forest classifier.
# The list also carries 'isFraud' (used later as the target) and
# 'TransactionID' (used later as the submission key).
to_keep = ['TransactionID','isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD',
 'card1', 'card2', 'card4', 'card5', 'card6', 'addr2', 'dist2', 'R_emaildomain',
 'C1', 'C5', 'C6', 'C7', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3',
 'D4', 'D6', 'D7', 'D8', 'D11', 'D14', 'V6', 'V11', 'V12', 'V16', 'V22', 'V23', 'V24',
 'V32', 'V36', 'V38', 'V43', 'V44', 'V51', 'V52', 'V53', 'V59', 'V61', 'V65', 'V68',
 'V72', 'V74', 'V76', 'V79', 'V81', 'V85', 'V86', 'V95', 'V96', 'V123', 'V126',
 'V128', 'V129', 'V131', 'V133', 'V139', 'V152', 'V156', 'V168', 'V169', 'V185',
 'V198', 'V199', 'V200', 'V209', 'V211', 'V228', 'V238', 'V242', 'V245', 'V247',
 'V248', 'V252', 'V254', 'V256', 'V257', 'V264', 'V279', 'V281', 'V282', 'V288',
 'V292', 'V294', 'V301', 'V306', 'V307', 'V311', 'V312', 'V319']

In [5]:
# Drop every column that was not selected by SelectFromModel.
# The unwanted columns are collected first and removed with a single in-place
# drop, instead of issuing one drop call per column.
df.drop(columns=[c for c in df.columns if c not in to_keep], inplace=True)

In [6]:
# Replace missing values with each column's mode (most frequent value).
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on the df[c] selection is chained assignment, which
# pandas warns about and which does not update df under copy-on-write.
for c in df.columns:
    modes = df[c].mode()
    # mode() is empty when every value in the column is missing;
    # such a column is left untouched instead of raising on modes[0].
    if not modes.empty:
        df[c] = df[c].fillna(modes.iloc[0])

In [7]:
# Feature columns are every remaining column except the target.
# The comparison is an exact name match; a substring test against 'isFraud'
# would also exclude any column whose name happens to be a fragment of it.
# NOTE(review): 'TransactionID' stays in the feature set - confirm this is intended.
used_cols = [c for c in df.columns.tolist() if c != 'isFraud']
X = df[used_cols]  # feature matrix
y = df['isFraud']  # target column

In [8]:
def stringify(data):
    """Return ``data`` as a DataFrame with every column cast to ``str``.

    Parameters
    ----------
    data : anything accepted by the ``pd.DataFrame`` constructor
        For example a DataFrame or a 2-D array.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same shape and labels as ``pd.DataFrame(data)``,
        whose values are the string form of the originals.

    Notes
    -----
    The conversion goes through ``astype``, which builds a new frame, so the
    caller's object is never modified. Assigning converted columns onto
    ``pd.DataFrame(data)`` could, depending on the pandas version, write
    through to the frame that was passed in.
    """
    return pd.DataFrame(data).astype(str)

# Building blocks reused by the preprocessing pipelines.
# binner: discretises a numeric column into 10 bins, emitted as ordinal codes.
binner = KBinsDiscretizer(encode='ordinal', n_bins=10)
# objectify: wraps stringify so it can be used as a pipeline step.
objectify = FunctionTransformer(
    func=stringify,
    check_inverse=False,
    validate=False,
)
# encoder: target encoding of the (stringified) values.
encoder = TargetEncoder(drop_invariant=True)

In [9]:
# Transformation strategy for the categorical features: the object-dtype
# columns are cast to strings and then target-encoded. No imputer step is
# part of this pipeline.
categorical = list(X.select_dtypes(include='object').columns)
non_numeric_transformer = Pipeline([
    ('objectify', objectify),
    ('encoder', encoder),
])

In [10]:
# Every feature column that is not object-typed is handled as numeric.
numeric = [name for name in used_cols if name not in categorical]

In [11]:
# Transformation strategy for the numeric features: discretise into bins,
# cast the bin codes to strings, then target-encode them.
numeric_transformer = Pipeline([
    ('binner', binner),
    ('objectify', objectify),
    ('encoder', encoder),
])

In [12]:
# Imbalance-aware XGBoost classifier, configured with its 'weighted'
# objective and imbalance_alpha set to 2.
clf = imb_xgb(imbalance_alpha=2, special_objective='weighted')

In [14]:
# Route each column group through its own transformer: the categorical
# columns go to non_numeric_transformer, the numeric ones to numeric_transformer.
preprocessor = ColumnTransformer([
    ('non_numeric', non_numeric_transformer, categorical),
    ('numeric', numeric_transformer, numeric),
])

In [15]:
# Chain the column preprocessing and the classifier into one estimator, so a
# single fit/predict call runs both stages.
scorecard = make_pipeline(preprocessor, clf)

In [18]:
# Fit the preprocessing + classifier pipeline on all of X and y
# (no hold-out split is made in the cells shown here).
scorecard.fit(X,y)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('non_numeric',
                                                  Pipeline(memory=None,
                                                           steps=[('objectify',
                                                                   FunctionTransformer(accept_sparse=False,
                                                                                       check_inverse=False,
                                                                                       func=<function stringify at 0x00000154429C8048>,
                                                                                       inv_kw_args=None,
                                                                                  

In [20]:
# Load the test transactions table (CSV expected in the working directory).
df_test = pd.read_csv(filepath_or_buffer='test_transaction.csv')

In [21]:
# Fill missing test values with each column's mode, mirroring the training
# preparation. The result is assigned back to the frame: fillna(inplace=True)
# on the df_test[c] selection is chained assignment, which pandas warns about
# and which does not update df_test under copy-on-write.
for c in df_test.columns:
    modes = df_test[c].mode()
    # mode() is empty for a column with no observed values; leave it as-is
    # instead of raising on modes[0].
    if not modes.empty:
        df_test[c] = df_test[c].fillna(modes.iloc[0])
# Score the test set with the fitted pipeline.
y_pred_test = scorecard.predict(df_test)

In [22]:
# Assemble the submission: one row per test transaction, holding its ID and
# the model output, then write it to CSV without the index column.
df_sub = pd.DataFrame({'TransactionID': df_test['TransactionID']})
df_sub['isFraud'] = pd.Series(y_pred_test)
df_sub.to_csv('kaggle_test.csv', index=False)

In [None]:
# Using these features gives me the same results as just the first few features that I had initially selected
# Using the imbalanced XGB bumped the score from 71 to 90% on the leaderboard