In [236]:
import pandas as pd
import numpy as np
from os.path import exists
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [237]:
# Import files

#Training data
training_file = 'b765dc3d8076-trainset+(1).xlsx'
if exists(training_file):
    training_data = pd.read_excel('b765dc3d8076-trainset+(1).xlsx') 
else:
    training_data = pd.read_excel('https://d18qs7yq39787j.cloudfront.net/uploads/contestfile/479/b765dc3d8076-trainset+%281%29.xlsx')

# Testing data
testing_file = 'b765dc3d8076-testset_for_participants.xlsx'   
if exists(testing_file):
    testing_data = pd.read_excel('b765dc3d8076-testset_for_participants.xlsx')
else:
    testing_data = pd.read_excel('https://d18qs7yq39787j.cloudfront.net/uploads/contestfile/479/b765dc3d8076-testset_for_participants.xlsx')
data = [training_data, testing_data]

# I. Data Processing

## 1. Missing value imputation

In [238]:
for df in data:
    # CARR_NAME，RGN_NAME，STATE_PRVNC_TXT
    for i in ['CARR_NAME','RGN_NAME','STATE_PRVNC_TXT']:
        df[i].fillna('unknown', inplace = True)
    
    # DVC_TYPE_TXT & AUTHC_SCNDRY_STAT_TXT
    authc_data = df[['ALERT_TRGR_CD','DVC_TYPE_TXT','AUTHC_PRIM_TYPE_CD','AUTHC_SCNDRY_STAT_TXT']]
    cat_feature = list(authc_data.columns)
    enc_authc_data = authc_data.copy()
    for i in cat_feature:
        enc = LabelEncoder()
        enc.fit(list(enc_authc_data[i].values));
        enc_authc_data[i] = enc.transform(list(enc_authc_data[i].values))
    enc_authc_data['DVC_TYPE_TXT'].loc[enc_authc_data['DVC_TYPE_TXT'] == 4] = np.nan
    enc_authc_data['AUTHC_SCNDRY_STAT_TXT'].loc[enc_authc_data['AUTHC_SCNDRY_STAT_TXT'] == 3] = np.nan
    #imput enc_authc_data
    imputer = KNNImputer(n_neighbors=2)
    imputed = pd.DataFrame(imputer.fit_transform(enc_authc_data))
    imputed = imputed.rename(columns = {0:'ALERT_TRGR_CD',
                                        1:'DVC_TYPE_TXT',
                                        2:'AUTHC_PRIM_TYPE_CD',
                                        3:'AUTHC_SCNDRY_STAT_TXT'})
    #after imputation, DVC_TYPE_TXT has new value 0.5&1.5, whic isn't in original data, repalce 0.5&1.5with 1&2 
    imputed['DVC_TYPE_TXT'].loc[imputed['DVC_TYPE_TXT'] == 0.5] = 1
    imputed['DVC_TYPE_TXT'].loc[imputed['DVC_TYPE_TXT'] == 1.5] = 2

    imputed = imputed.astype(int)
    #mapping back to real info
    imputed['DVC_TYPE_TXT'].loc[imputed['DVC_TYPE_TXT'] == 0] = 'DESKTOP'
    imputed['DVC_TYPE_TXT'].loc[imputed['DVC_TYPE_TXT'] == 1] = 'MOBILE'
    imputed['DVC_TYPE_TXT'].loc[imputed['DVC_TYPE_TXT'] == 2] = 'PHONE'
    imputed['DVC_TYPE_TXT'].loc[imputed['DVC_TYPE_TXT'] == 3] = 'TABLET'
    imputed['AUTHC_SCNDRY_STAT_TXT'].loc[imputed['AUTHC_SCNDRY_STAT_TXT'] == 0] = 'ALLOW'
    imputed['AUTHC_SCNDRY_STAT_TXT'].loc[imputed['AUTHC_SCNDRY_STAT_TXT'] == 1] = 'CHALLENGE_ISSUED'
    imputed['AUTHC_SCNDRY_STAT_TXT'].loc[imputed['AUTHC_SCNDRY_STAT_TXT'] == 2] = 'CHALLENGE_SUCCESS'
    df['DVC_TYPE_TXT'] = imputed['DVC_TYPE_TXT']
    df['AUTHC_SCNDRY_STAT_TXT'] = imputed['AUTHC_SCNDRY_STAT_TXT']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


## 2. Handle Datetime

In [239]:
for df in data:
    # change CUST_SINCE_DT to string
    df['CUST_SINCE_DT'] = df['CUST_SINCE_DT'].astype(str)
    # split year, month, day for TRAN_DT, ACTVY_DT
    for i in ['TRAN_DT', 'ACTVY_DT']:
        df[[i+'_month',i+'_day', i+'_year']] = df[i].str.split('/', expand = True)
    # there are some TRAN_DT & ACTVY_DT with different format:2021-8-2 12:00:00 AM, transfer to correct format
    wrong_format_list = df[df['ACTVY_DT_month'].isnull()].index.tolist()
    for i in wrong_format_list:
        df['TRAN_DT'][i] = df['TRAN_DT'][i].strftime('%m/%d/%Y')
        df['ACTVY_DT'][i] = df['ACTVY_DT'][i].strftime('%m/%d/%Y')
        df['TRAN_DT_month'][i] = df['TRAN_DT'][i].split('/')[0]
        df['TRAN_DT_day'][i] = df['TRAN_DT'][i].split('/')[1]
        df['TRAN_DT_year'][i] = df['TRAN_DT'][i].split('/')[2]
        df['ACTVY_DT_month'][i] = df['ACTVY_DT'][i].split('/')[0]
        df['ACTVY_DT_day'][i] = df['ACTVY_DT'][i].split('/')[1]
        df['ACTVY_DT_year'][i] = df['ACTVY_DT'][i].split('/')[2]
    # split year, month, day for CUST_SINCE_DT
    df[['CUST_SINCE_DT'+'_year', 
        'CUST_SINCE_DT'+'_month', 
        'CUST_SINCE_DT'+'_day']] = df['CUST_SINCE_DT'].str.split('-', expand = True)

    # split date and time of PWD_UPDT_TS, TRAN_TS
    for i in ['PWD_UPDT_TS', 'TRAN_TS']:
        df[[i+'_date', i+'_time']] = df[i].str.split(' ', expand = True)
    # split year, month date for PWD_UPDT_TS, TRAN_TS
    for i in ['PWD_UPDT_TS_date', 'TRAN_TS_date']:
        df[[i+'_month', i+'_day', i+'_year']] = df[i].str.split('/', expand = True)
    # split hour, minute, and second for PWD_UPDT_TS, TRAN_TS
    for i in ['PWD_UPDT_TS_time', 'TRAN_TS_time']:
        df[[i+'_hr', i+'_min', i+'_sec']] = df[i].str.split(':', expand = True)
    # there are some TRAN_TS with different format:2021-8-2 12:00:00 AM, transfer to correct format
    wrong_format_list2 = df[df['TRAN_TS_date_month'].isnull()].index.tolist()
    for i in wrong_format_list2:
        df['TRAN_TS'][i] = df['TRAN_TS'][i].strftime('%m/%d/%Y %H:%M:%S')
        df['TRAN_TS_date'][i] = df['TRAN_TS'][i].split(' ')[0]
        df['TRAN_TS_time'][i] = df['TRAN_TS'][i].split(' ')[1]
        df['TRAN_TS_date_month'][i] = df['TRAN_TS_date'][i].split('/')[0]
        df['TRAN_TS_date_day'][i] = df['TRAN_TS_date'][i].split('/')[1]
        df['TRAN_TS_date_year'][i] = df['TRAN_TS_date'][i].split('/')[2]
        df['TRAN_TS_time_hr'][i] = df['TRAN_TS_time'][i].split(':')[0]
        df['TRAN_TS_time_min'][i] = df['TRAN_TS_time'][i].split(':')[1]
        df['TRAN_TS_time_sec'][i] = df['TRAN_TS_time'][i].split(':')[2]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['TRAN_DT'][i] = df['TRAN_DT'][i].strftime('%m/%d/%Y')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ACTVY_DT'][i] = df['ACTVY_DT'][i].strftime('%m/%d/%Y')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['TRAN_DT_month'][i] = df['TRAN_DT'][i].split('/')[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-cop

In [240]:
# Rename some colums for easier understanding
rename_dic = {'PWD_UPDT_TS_date_year':'PWD_UPDT_TS_year',
              'PWD_UPDT_TS_date_month':'PWD_UPDT_TS_month',
              'PWD_UPDT_TS_date_day':'PWD_UPDT_TS_day',
              'TRAN_TS_date_year':'TRAN_TS_year',
              'TRAN_TS_date_month':'TRAN_TS_month',
              'TRAN_TS_date_day':'TRAN_TS_day',
              'PWD_UPDT_TS_time_hr':'PWD_UPDT_TS_hr',
              'PWD_UPDT_TS_time_min':'PWD_UPDT_TS_min',
              'PWD_UPDT_TS_time_sec':'PWD_UPDT_TS_sec',
              'TRAN_TS_time_hr':'TRAN_TS_hr',
              'TRAN_TS_time_min':'TRAN_TS_min',
              'TRAN_TS_time_sec':'TRAN_TS_sec'}
    
testing_data = testing_data.rename(columns = rename_dic)
training_data = training_data.rename(columns = rename_dic)

#deleted PWD_UPDT_TS_date & TRAN_TS_date first slipt columns
del testing_data['PWD_UPDT_TS_date']
del testing_data['PWD_UPDT_TS_time']
del testing_data['TRAN_TS_date']
del testing_data['TRAN_TS_time']

del training_data['PWD_UPDT_TS_date']
del training_data['PWD_UPDT_TS_time']
del training_data['TRAN_TS_date']
del training_data['TRAN_TS_time']

## 3. Drop Useless Features

In [241]:
delete_col = ['ACTVY_DT_year', 
              'ACTVY_DT_month',
              'ACTVY_DT_day',
              'TRAN_TS_year',
              'CUST_STATE',
              'PH_NUM_UPDT_TS',
              'PWD_UPDT_TS',
              'PWD_UPDT_TS_month',
              'PWD_UPDT_TS_day',
              'PWD_UPDT_TS_year',
              'PWD_UPDT_TS_hr',
              'PWD_UPDT_TS_min',
              'PWD_UPDT_TS_sec',
              'TRAN_DT',
              'TRAN_DT_year', 
              'TRAN_DT_month',
              'TRAN_DT_day',
              'CUST_SINCE_DT',
              'TRAN_TS',
              'ACTVY_DT',
              'CUST_ZIP',
              'ACTN_CD',
              'ACTN_INTNL_TXT',
              'TRAN_TYPE_CD']
for col in delete_col:
    del training_data[col]
    del testing_data[col]

## 3. Change year, month, day, hour, minute, second to numerical

In [242]:
astype_col = ['CUST_SINCE_DT_year', 'CUST_SINCE_DT_month', 'CUST_SINCE_DT_day',
              'TRAN_TS_month', 'TRAN_TS_day', 
              'TRAN_TS_hr', 'TRAN_TS_min', 'TRAN_TS_sec']

testing_data[astype_col] = testing_data[astype_col].astype('int64')
training_data[astype_col] = training_data[astype_col].astype('int64')

## 4. Set dataset_id as index for testing data



In [243]:
testing_data = testing_data.set_index('dataset_id')

## 5. Change target value to binary

In [244]:
# change target value to binary
training_data['FRAUD_NONFRAUD'] = np.where(training_data['FRAUD_NONFRAUD']=='Non-Fraud', 1, 0)

## 6. Encoding

1. CARR_NAME has 554 distinct categories, and STATE_PRVNC_TXT has 127 distinct categories. If one-hot encoding, will created a highly sparsed dataset. Therefore, we will binary encoding these two featurs.

2. For the rest of categorical features:RGN_NAME, ALERT_TRGR_CD, DVC_TYPE_TXT, AUTHC_PRIM_TYPE_CD, and AUTHC_SCNDRY_STAT_TXT, we will use one hot encoding technique. 

### (1). Binary Encoding

In [245]:
binary_enc = ce.BinaryEncoder(cols=['CARR_NAME', 'STATE_PRVNC_TXT'],return_df=True)
features = training_data.drop(['FRAUD_NONFRAUD'], axis=1)
encoded_train_data = binary_enc.fit_transform(features)
encoded_test_data = binary_enc.transform(testing_data)

  elif pd.api.types.is_categorical(cols):


### (2). One-Hot-Encoding

In [246]:
ohe_enc = ce.OneHotEncoder(cols=['RGN_NAME', 
                                 'ALERT_TRGR_CD', 
                                 'DVC_TYPE_TXT', 
                                 'AUTHC_PRIM_TYPE_CD', 
                                 'AUTHC_SCNDRY_STAT_TXT'], 
                           return_df=True)

encoded_train_data = ohe_enc.fit_transform(encoded_train_data)
encoded_test_data = ohe_enc.transform(encoded_test_data)

# II. Modeling

## 1. Training the Models

In [247]:
X = encoded_train_data
y = training_data['FRAUD_NONFRAUD'].copy()

In [248]:
# Split traning and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)

In [249]:
# Standardize the dataset
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### (1). Ridge Regression

In [250]:
linridge = Ridge(alpha=1).fit(X_train_scaled, y_train)
print('R-squared score (training): {:.3f}'
     .format(linridge.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linridge.score(X_test_scaled, y_test)))

R-squared score (training): 0.408
R-squared score (test): 0.438


### (2). Lasso Regression

In [251]:
linlasso = Lasso(alpha=1.0).fit(X_train_scaled, y_train)
print('R-squared score (training): {:.3f}'
     .format(linlasso.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}\n'
     .format(linlasso.score(X_test_scaled, y_test)))

R-squared score (training): 0.000
R-squared score (test): -0.000



### (3). Logistic Regression

In [252]:
clf = LogisticRegression().fit(X_train, y_train)
clf_predict = clf.predict(X_test)

print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

print(classification_report(y_test, clf_predict, target_names=['Fraud', 'NonFraud']))

Accuracy of Logistic regression classifier on training set: 0.74
Accuracy of Logistic regression classifier on test set: 0.74
              precision    recall  f1-score   support

       Fraud       0.63      0.30      0.40      1054
    NonFraud       0.75      0.93      0.83      2446

    accuracy                           0.74      3500
   macro avg       0.69      0.61      0.62      3500
weighted avg       0.72      0.74      0.70      3500



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### (4). Dummy Classifier

In [253]:
clf_dummy = DummyClassifier(strategy='stratified').fit(X_train, y_train)
clf_dummy_pred = clf_dummy.predict(X_test)

print('Accuracy of Dummy classifier on training set: {:.2f}'
     .format(clf_dummy.score(X_train, y_train)))
print('Accuracy of Dummy classifier on test set: {:.2f}'
     .format(clf_dummy.score(X_test, y_test)))

print(classification_report(y_test, clf_dummy_pred, target_names=['Fraud', 'NonFraud']))

Accuracy of Dummy classifier on training set: 0.58
Accuracy of Dummy classifier on test set: 0.60
              precision    recall  f1-score   support

       Fraud       0.32      0.32      0.32      1054
    NonFraud       0.71      0.71      0.71      2446

    accuracy                           0.59      3500
   macro avg       0.52      0.52      0.52      3500
weighted avg       0.59      0.59      0.59      3500



### (5). KNN

In [254]:
clf_knn = KNeighborsClassifier().fit(X_train, y_train)
clf_knn_pred = clf_knn.predict(X_test)


print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(clf_knn.score(X_train, y_train)))
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(clf_knn.score(X_test, y_test)))

print(classification_report(y_test, clf_knn_pred, target_names=['Fraud', 'NonFraud']))

Accuracy of KNN classifier on training set: 0.90
Accuracy of KNN classifier on test set: 0.87
              precision    recall  f1-score   support

       Fraud       0.80      0.77      0.78      1054
    NonFraud       0.90      0.92      0.91      2446

    accuracy                           0.87      3500
   macro avg       0.85      0.84      0.85      3500
weighted avg       0.87      0.87      0.87      3500



### (6). SVM

In [255]:
clf_svm = SVC().fit(X_train_scaled, y_train)
clf_svm_pred = clf_svm.predict(X_test_scaled)

print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(clf_svm.score(X_train_scaled, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(clf_svm.score(X_test_scaled, y_test)))

print(classification_report(y_test, clf_svm_pred, target_names=['Fraud', 'NonFraud']))

Accuracy of SVM classifier on training set: 0.91
Accuracy of SVM classifier on test set: 0.87
              precision    recall  f1-score   support

       Fraud       0.84      0.72      0.78      1054
    NonFraud       0.89      0.94      0.91      2446

    accuracy                           0.87      3500
   macro avg       0.86      0.83      0.84      3500
weighted avg       0.87      0.87      0.87      3500



### (7). Decision Tree

In [256]:
clf_DT = DecisionTreeClassifier().fit(X_train, y_train)
clf_DT_pred = clf_DT.predict(X_test)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf_DT.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf_DT.score(X_test, y_test)))
print(classification_report(y_test, clf_DT_pred, target_names=['Fraud', 'NonFraud']))

Accuracy of Decision Tree classifier on training set: 1.00
Accuracy of Decision Tree classifier on test set: 0.92
              precision    recall  f1-score   support

       Fraud       0.87      0.87      0.87      1054
    NonFraud       0.95      0.95      0.95      2446

    accuracy                           0.92      3500
   macro avg       0.91      0.91      0.91      3500
weighted avg       0.92      0.92      0.92      3500



### (8). Random Forest

In [257]:
clf_RF = RandomForestClassifier().fit(X_train, y_train)
clf_RF_pred = clf_RF.predict(X_test)

print('Accuracy of Random Forest classifier on training set: {:.2f}'
     .format(clf_RF.score(X_train, y_train)))
print('Accuracy of Random Forest classifier on test set: {:.2f}'
     .format(clf_RF.score(X_test, y_test)))
print(classification_report(y_test, clf_RF_pred, target_names=['Fraud', 'NonFraud']))

Accuracy of Random Forest classifier on training set: 1.00
Accuracy of Random Forest classifier on test set: 0.95
              precision    recall  f1-score   support

       Fraud       0.94      0.88      0.91      1054
    NonFraud       0.95      0.98      0.96      2446

    accuracy                           0.95      3500
   macro avg       0.95      0.93      0.94      3500
weighted avg       0.95      0.95      0.95      3500



### (9). Gradient Boosting

In [258]:
clf_GB = GradientBoostingClassifier().fit(X_train, y_train)
clf_GB_pred = clf_GB.predict(X_test)

print('Accuracy of Gradient Boosting classifier on training set: {:.2f}'
     .format(clf_GB.score(X_train, y_train)))
print('Accuracy of Gradient Boosting classifier on test set: {:.2f}'
     .format(clf_GB.score(X_test, y_test)))
print(classification_report(y_test, clf_GB_pred, target_names=['Fraud', 'NonFraud']))

Accuracy of Gradient Boosting classifier on training set: 0.96
Accuracy of Gradient Boosting classifier on test set: 0.96
              precision    recall  f1-score   support

       Fraud       0.95      0.91      0.93      1054
    NonFraud       0.96      0.98      0.97      2446

    accuracy                           0.96      3500
   macro avg       0.96      0.94      0.95      3500
weighted avg       0.96      0.96      0.96      3500



### (10). AdaBoosting

In [259]:
clf_ADB = AdaBoostClassifier().fit(X_train, y_train)
clf_ADB_pred = clf_ADB.predict(X_test)

print('Accuracy of AdaBoosting classifier on training set: {:.2f}'
     .format(clf_ADB.score(X_train, y_train)))
print('Accuracy of AdaBoosting classifier on test set: {:.2f}'
     .format(clf_ADB.score(X_test, y_test)))
print(classification_report(y_test, clf_XGB_pred, target_names=['Fraud', 'NonFraud']))

Accuracy of AdaBoosting classifier on training set: 0.95
Accuracy of AdaBoosting classifier on test set: 0.96
              precision    recall  f1-score   support

       Fraud       0.96      0.94      0.95      1054
    NonFraud       0.97      0.98      0.98      2446

    accuracy                           0.97      3500
   macro avg       0.97      0.96      0.96      3500
weighted avg       0.97      0.97      0.97      3500



### (11). XGBoosting

In [260]:
clf_XGB = xgb.XGBClassifier(use_label_encoder=False, max_depth = 3).fit(X_train, y_train)
clf_XGB_pred = clf_XGB.predict(X_test)

print('Accuracy of XGBoosting classifier on training set: {:.2f}'
     .format(clf_XGB.score(X_train, y_train)))
print('Accuracy of XGBoosting classifier on test set: {:.2f}'
     .format(clf_XGB.score(X_test, y_test)))
print(classification_report(y_test, clf_XGB_pred, target_names=['Fraud', 'NonFraud']))

Accuracy of XGBoosting classifier on training set: 0.98
Accuracy of XGBoosting classifier on test set: 0.97
              precision    recall  f1-score   support

       Fraud       0.96      0.94      0.95      1054
    NonFraud       0.97      0.98      0.98      2446

    accuracy                           0.97      3500
   macro avg       0.97      0.96      0.96      3500
weighted avg       0.97      0.97      0.97      3500



## Parameter Tuning

In [261]:
parameters = {'learning_rate': [0.01, 0.02, 0.03, 0.04],
              'max_depth': [3, 5, 7, 9],
              'min_child_weight': [3, 5, 7, 9, 11, 20],
              'subsample': [0.5, 0.7, 1.0],
              'colsample_bytree': [0.5],
              'n_estimators' : [100, 140, 165, 200, 220, 250]
             }
def param_tunning():
    model = xgb.XGBClassifier()
    clf = GridSearchCV(estimator=model, 
                       param_grid=parameters,
                       cv=5,
                       n_jobs=-1,
                       verbose=1,
                       use_label_encoder=False)
    best_model = clf.fit(X_train, y_train)
    return best_model.best_params_

In [262]:
best_paramter = param_tunning()

In [263]:
print(best_paramter)

{'colsample_bytree': 0.5, 'learning_rate': 0.04, 'max_depth': 9, 'min_child_weight': 5, 'n_estimators': 250, 'subsample': 1.0}


## Tuned Model Re-training

In [270]:
tuned_xgb = xgb.XGBClassifier(colsample_bytree=0.5,
                              learning_rate=0.04,
                              max_depth=9,
                              min_child_weight=5,
                              n_estimators=250,
                              subsample=1,
                              use_label_encoder=False).fit(X_train, y_train)

tuned_xgb_pred = tuned_xgb.predict(X_test)

print('Accuracy of XGBoosting classifier on training set: {:.2f}'
     .format(tuned_xgb.score(X_train, y_train)))
print('Accuracy of XGBoosting classifier on test set: {:.2f}'
     .format(tuned_xgb.score(X_test, y_test)))
print(classification_report(y_test, tuned_xgb_pred, target_names=['Fraud', 'NonFraud']))


Accuracy of XGBoosting classifier on training set: 0.99
Accuracy of XGBoosting classifier on test set: 0.97
              precision    recall  f1-score   support

       Fraud       0.97      0.93      0.95      1054
    NonFraud       0.97      0.99      0.98      2446

    accuracy                           0.97      3500
   macro avg       0.97      0.96      0.96      3500
weighted avg       0.97      0.97      0.97      3500



# III. Predicting

In [265]:
# Make prediction using tunned XGB Model
FRAUD_NONFRAUD = tuned_xgb.predict(encoded_test_data)

In [266]:
# Add the prediction to current dataset
testing_data['FRAUD_NONFRAUD'] = FRAUD_NONFRAUD

In [267]:
# Export result
result = testing_data['FRAUD_NONFRAUD'].copy()
result.to_csv("Result.csv")