# IEEE-CIS Fraud Detection competition

This is a starter notebook to help you with the competition submissions. Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target.

## Loading libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from xgboost import XGBClassifier, plot_importance
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import time
import matplotlib.patches as mpatches
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import warnings
import xgboost as xgb

# explicitly require this experimental feature
# from sklearn.experimental import enable_halving_search_cv # noqa

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import StratifiedKFold

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN

In [None]:
warnings.simplefilter("ignore")

In [None]:
def reduce_mem_usage(df, verbose=True, allow_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column is left untouched. The frame is
    modified in place and also returned.

    Args:
        df: DataFrame whose columns are downcast in place.
        verbose: When true, print the resulting memory usage and the
            percentage saved.
        allow_float16: When true (the default, matching the historical
            behaviour) float columns may be stored as float16. Pass False to
            use float32 as the narrowest float type and keep more precision.

    Returns:
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column spanning exactly -128..127 fits in int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (this includes a NaN min/max, for
                # which every comparison above is False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


def draw_feature_importance(clf, data, figsize=(6, 7), n_feat=30,
                            xlabel="Random Forest Feature Importance"):
    """Draw a horizontal bar chart of the ``n_feat`` most important features.

    Args:
        clf: Fitted estimator exposing ``feature_importances_``.
        data: DataFrame whose columns line up positionally with
            ``clf.feature_importances_``; its column names label the bars.
        figsize: Matplotlib figure size.
        n_feat: Maximum number of features to show. When it exceeds the
            number of available features, all of them are shown.
        xlabel: Label of the x axis.
    """
    importances = clf.feature_importances_
    sorted_idx = importances.argsort()
    # Clamp at zero: a negative start would be read as an offset from the end
    # of the array and silently drop features.
    first = max(len(sorted_idx) - n_feat, 0)
    top_idx = sorted_idx[first:]
    features = [str(f) for f in data.columns[top_idx]]
    plt.figure(figsize=figsize)
    plt.barh(features, importances[top_idx])
    plt.xlabel(xlabel)
    plt.show()

## Loading data

### Load train data

In [None]:
%%time 
train_transactions=pd.read_csv('../input/train_transaction.csv')
train_identity=pd.read_csv('../input/train_identity.csv')
print('Train data set is loaded !')

In [None]:
train_transactions.head()

In [None]:
train_transactions.info()

In [None]:
train_identity.head()

In [None]:
train_identity.info()

In [None]:
sns.countplot(x=train_transactions["isFraud"]);

- There is clearly a class imbalance problem.

### Merging training data

In [None]:
train_df = train_transactions.merge(train_identity, how="left", on="TransactionID")

print('Train shape',train_df.shape)

print("Data set merged ")

del train_transactions, train_identity

### Reducing training memory usage

In [None]:
%%time
train_df = reduce_mem_usage(train_df)

In [None]:
train_df['R_emaildomain'].value_counts()

### Load test data

In [None]:
%%time 
test_transaction=pd.read_csv('../input/test_transaction.csv')
test_identity=pd.read_csv('../input/test_identity.csv')
sample_submission=pd.read_csv('../input/sample_submission.csv')
print('Test data set is loaded !')
                              

### Merging testing data

In [None]:
%%time
# Left-join the identity table onto the test transactions by TransactionID
test_df = test_transaction.merge(test_identity, how="left", on="TransactionID")

# Report the shape of the frame that was just built (the test frame)
print('Test shape',test_df.shape)

print("Data set merged ")

# The source tables are no longer needed once merged
del test_transaction, test_identity

### Reducing testing memory usage

In [None]:
%%time
test_df = reduce_mem_usage(test_df)

In [None]:
# fix columns names
test_df= test_df.rename(columns=lambda x:"_".join(x.split("-")))

# set TransactionID as index
train_df.set_index('TransactionID', inplace=True)
test_df.set_index('TransactionID', inplace=True)

In [None]:
# Save data to pickle files for faster loading
train_df.to_pickle('train_df.pkl')
test_df.to_pickle('test_df.pkl')

In [None]:
%%time
# load pickled data
train_df = pd.read_pickle('train_df.pkl')
test_df = pd.read_pickle('test_df.pkl')

### Replace missing values and use label encoder for categorical variables

In [None]:
%%time
# Replace numeric missing values with the training-set mean of the column.
# The target column and object (categorical) columns are skipped.
for f in train_df.columns:
    if train_df[f].dtype == 'object' or f == 'isFraud':
        continue
    n_train_missing = train_df[f].isna().sum()
    n_test_missing = test_df[f].isna().sum()
    if n_train_missing == 0 and n_test_missing == 0:
        continue
    # Mean over the non-missing training values only. The original author
    # noted that the plain pandas column mean produced NaNs here, hence
    # np.mean on the raw values.
    mean = np.mean(train_df[f].dropna().values)
    # Only impute with a usable number. A mean of exactly 0 is a valid fill
    # value, whereas NaN/inf (e.g. an all-missing column) is not; such columns
    # are left for the later constant fill.
    if np.isfinite(mean):
        print(f'{f: >10} mean: {mean: >10.3f}, n_train_missing = {n_train_missing: >10,}, n_test_missing = {n_test_missing: >10,}')
        train_df[f] = np.where(train_df[f].isna(), mean, train_df[f])
        test_df[f] = np.where(test_df[f].isna(), mean, test_df[f])

In [None]:
%%time

# Replace remaining missing values with -999
train_df = train_df.fillna(-999)
test_df = test_df.fillna(-999)

In [None]:
train_df['ProductCD'].value_counts()

In [None]:
train_df['id_38'].value_counts()

In [None]:
# All categorical (object dtype) columns
object_cols = [col for col in train_df.columns if train_df[col].dtype == "object"]

# Columns that can be safely label encoded: train and test contain exactly
# the same set of values
good_label_cols = [col for col in object_cols if
                   set(train_df[col]) == set(test_df[col])]

# Problematic columns that will be dropped from the dataset.
# Built in column order so the printed list is the same on every run
# (a set difference has no defined order).
bad_label_cols = [col for col in object_cols if col not in good_label_cols]

print('Categorical columns that will be label encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

In [None]:
# Number of unique entries in each categorical column
object_nunique = [train_df[col].nunique() for col in object_cols]
d = dict(zip(object_cols, object_nunique))

# Show the number of unique entries by column, in descending order
sorted(d.items(), key=lambda x: -x[1])

In [None]:
train_df.drop(bad_label_cols, axis=1, inplace=True)
test_df.drop(bad_label_cols, axis=1, inplace=True)

In [None]:
# Columns that will be one-hot encoded: fewer than 5 distinct training values
low_cardinality_cols = [col for col in good_label_cols if train_df[col].nunique() < 5]

# Columns that will be label encoded: every other encodable column.
# Kept in column order so the list is the same on every run
# (a set difference has no defined order).
high_cardinality_cols = [col for col in good_label_cols if col not in low_cardinality_cols]

print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
print('\nCategorical columns that will be label encoded:', high_cardinality_cols)

In [None]:
%%time

# Integer-encode each high-cardinality categorical column, using an encoder
# fitted on the training values and applied to both train and test
for f in high_cardinality_cols:
    train_values = list(train_df[f].values)
    test_values = list(test_df[f].values)
    lbl_enc = LabelEncoder()
    lbl_enc.fit(train_values)
    print(f'{f}: {lbl_enc.classes_}')
    train_df[f] = lbl_enc.transform(train_values)
    test_df[f] = lbl_enc.transform(test_values)

In [None]:
train_df[low_cardinality_cols].head()

In [None]:
# One-hot encode the low-cardinality columns, dropping the first category of
# each. The encoder's default sparse output is densified with .toarray()
# rather than passing `sparse=False`, a keyword newer scikit-learn releases
# no longer accept.
OH_encoder = OneHotEncoder(drop='first')
OH_train_matrix = OH_encoder.fit_transform(train_df[low_cardinality_cols].astype('str')).toarray()
OH_test_matrix = OH_encoder.transform(test_df[low_cardinality_cols].astype('str')).toarray()

# Give every encoded column a "<column>_<category>" name instead of the
# default integer labels, so the final frame has string-only column names and
# importance plots stay readable. With drop='first' the first category of
# each column has no output column, hence cats[1:].
OH_col_names = [
    f'{col}_{cat}'
    for col, cats in zip(low_cardinality_cols, OH_encoder.categories_)
    for cat in cats[1:]
]

# One-hot encoding removed the index; put it back
OH_cols_train = pd.DataFrame(OH_train_matrix, index=train_df.index, columns=OH_col_names)
OH_cols_valid = pd.DataFrame(OH_test_matrix, index=test_df.index, columns=OH_col_names)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = train_df.drop(low_cardinality_cols, axis=1)
num_X_valid = test_df.drop(low_cardinality_cols, axis=1)

del train_df, test_df, OH_train_matrix, OH_test_matrix

# Add one-hot encoded columns to numerical features
OH_train_df = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_test_df = pd.concat([num_X_valid, OH_cols_valid], axis=1)

In [None]:
del num_X_train, OH_cols_train, num_X_valid, OH_cols_valid

### Try binning (discretization)

In [None]:
# Create bins with equal or similar number of records (values)
transf = KBinsDiscretizer(n_bins = 10, encode = 'ordinal', strategy = 'quantile')
OH_train_df['TransactionAmt_bin'] = transf.fit_transform(OH_train_df[['TransactionAmt']])
OH_test_df['TransactionAmt_bin'] = transf.transform(OH_test_df[['TransactionAmt']])
# np.squeeze(binned_data.reshape(-1,1))

- We will now split the train dataset into train and validation sets (10% of data for validation)

In [None]:
# Stratified hold-out split: 10% of the labelled rows become the validation
# set (X_test / y_test), with the fraud ratio preserved in both parts
fraud_labels = OH_train_df['isFraud'].astype('uint8')
X_train, X_test, y_train, y_test = train_test_split(
    OH_train_df.drop('isFraud', axis=1),
    fraud_labels,
    test_size=.1,
    random_state=42,
    stratify=fraud_labels,
)

In [None]:
# Downsample the majority class to three times the minority class size,
# capped at the number of majority rows actually available (sampling without
# replacement cannot draw more rows than exist)
n_minority = int((y_train == 1).sum())
n_majority_available = int((y_train == 0).sum())
df_majority_downsampled, y_majority_downsampled = resample(
    X_train[y_train == 0],
    y_train[y_train == 0],
    replace=False,    # sample without replacement
    n_samples=min(3 * n_minority, n_majority_available),
    random_state=42)  # reproducible results

X_down_train = pd.concat([X_train[y_train == 1], df_majority_downsampled])
# print(X_down_train.shape)

y_down_train = pd.concat([y_train[y_train == 1], y_majority_downsampled])
# y_down_train.value_counts()

sns.countplot(x = y_down_train);

In [None]:
X_train_sm, y_train_sm = X_down_train, y_down_train

# # # Upsample minority class with Smote variant
# smt = SMOTE(random_state=42)
# # smt = SMOTETomek(random_state=42)
# # smt = SMOTEENN(random_state=42)
# X_train_sm, y_train_sm = smt.fit_resample(X_down_train, y_down_train)

sns.countplot(x = y_train_sm);

## Fit the model

In [None]:
# Baseline model: a 50-tree random forest trained on the downsampled data
clf_rf_down = RandomForestClassifier(n_estimators=50, random_state=42)
model_rf_down = clf_rf_down.fit(X_train_sm, y_train_sm)

# Score the held-out validation rows with the predicted fraud probability
y_prob = model_rf_down.predict_proba(X_test)[:, 1]
validation_auc = roc_auc_score(y_test, y_prob)
print(f'ROC-AUC score: {validation_auc:.3f}')

In [None]:
draw_feature_importance(model_rf_down, X_train_sm, figsize=(6, 9), n_feat=50)

## Prepare submission file

In [None]:
# predictions_rf = clf_rf_down.predict_proba(OH_test_df)[:,1]
# submission = pd.DataFrame({'TransactionID':OH_test_df.index,'isFraud':predictions_rf})
# submission["TransactionID"]=submission["TransactionID"].astype(int)
# # submission.head()

# filename = 'rf_model_submission2.csv'
# submission.to_csv(filename, index=False)
# print(f'Saved file: {filename}')

## Use Grid Search to find optimal xgboost parameters

In [None]:
%%time

# Use scikit-learn's built-in ROC-AUC scorer by name instead of building one
# with make_scorer(..., needs_threshold=True), a keyword that newer
# scikit-learn releases deprecate.
roc_auc_scorer = 'roc_auc'

cross_validation = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# One thread per model: parallelism comes from GridSearchCV(n_jobs=-1) below.
# `n_jobs` / `verbosity` replace the deprecated `nthread` / `silent`.
clf = xgb.XGBClassifier(n_jobs=1, verbosity=0, random_state=42)

# `min_samples_split` was removed from the grid: it is a scikit-learn tree
# parameter, not an XGBoost one.
param_grid = {
    'max_depth': [15], #[7, 10, 15]
    'learning_rate': [0.1], #so called `eta` value
    'min_child_weight': [5],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8],
    'n_estimators': [500], #number of trees, change it to 1000 for better results
    'missing':[-999],
}

search = GridSearchCV(
    clf,
    param_grid,
    cv=cross_validation,
    scoring=roc_auc_scorer,
    n_jobs=-1,
).fit(X_train_sm, y_train_sm)

print(search.best_params_)

# Score the held-out validation rows with the refitted best model
y_prob = search.predict_proba(X_test)[:, 1]
print(f'ROC-AUC score: {roc_auc_score(y_test, y_prob):.3f}')

In [None]:
# GridSearchCV itself has no feature_importances_; use the refitted best model
draw_feature_importance(search.best_estimator_, X_train_sm, figsize=(6, 9), n_feat=50)

In [None]:
# GridSearchCV itself has no feature_importances_; read them from the
# refitted best model and list the features from most to least important
xgb_importances = search.best_estimator_.feature_importances_
sorted_idx = np.argsort(xgb_importances)[::-1]

for index in sorted_idx:
    print([X_train_sm.columns[index], xgb_importances[index]])

In [None]:
# plot_importance needs the fitted XGBoost model, not the GridSearchCV wrapper
plot_importance(search.best_estimator_, max_num_features = 20)
plt.show()

In [None]:
# Predict the fraud probability for every test transaction with the tuned
# model and write the TransactionID / isFraud pairs to a CSV file
filename = 'cv_xgb_model_submission.csv'
predictions_cv_xgb = search.predict_proba(OH_test_df)[:,1]
submission = pd.DataFrame({
    'TransactionID': OH_test_df.index,
    'isFraud': predictions_cv_xgb,
}).astype({'TransactionID': int})
submission.to_csv(filename, index=False)
print(f'Saved file: {filename}')

## Make Submission

Once you've finished your kernel and titled it, press [Save Version] > [Save & Run All (Commit)] in the top right corner of the editor screen. When your code has finished running, you can go to the viewer, then select the data tab, where the saved files should be located. Select the relevant submission file (the csv file which you saved) and press the [Submit] button. Once the submission file is scored you can check the results under [My Submissions] and you will be able to see how well you did relative to the other people on the [Leaderboard].

Your [Private Score] should be better than the score of this starter notebook, which is 0.854279.