# Setting Up Environment

In [None]:
# libraries importing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

In [2]:
# model packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report, roc_auc_score
from xgboost import XGBClassifier

# Load Data

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')
# url = '/content/drive/MyDrive/BT4012/ohe_encoded_word2vec.csv'

# import from github repo
url = 'https://raw.githubusercontent.com/LordZhiHao/BT4012_Fraud_Analytics_Project/main/ohe_encoded_word2vec.csv'

In [5]:
df = pd.read_csv(url, sep='\t', encoding='utf-8')

In [5]:
X = df.drop('fraudulent', axis=1)
y = df['fraudulent']

In [6]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

## Model

In [8]:
# import the models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

# Set the random seed for reproducibility
np.random.seed(42)

# perform model training and model comparisons
list_of_models = [LogisticRegression() , KNeighborsClassifier() ,
                  DecisionTreeClassifier() ,ExtraTreeClassifier(), RandomForestClassifier(), ExtraTreesClassifier(),
                  SVC(kernel='rbf',probability=True), AdaBoostClassifier(), XGBClassifier()]
classification_report = pd.DataFrame(columns=['Accuracy','F1_score','Precision','Recall'])

for model in list_of_models :
    model = model.fit(xtrain, ytrain) # TO FILL IN
    for i in range(2) :
        if i == 0 :
            to_pred = xtrain # TO FILL IN
            pred = ytrain # TO FILL IN
            title = 'Train'
        else :
            to_pred = xtest # TO FILL IN
            pred = ytest
            title = 'Test'
        model_name = str(type(model)).split(".")[-1][:-2]
        y_pred = model.predict(to_pred)
        y_probs = model.predict_proba(to_pred)[:, 1]
        acc = round(accuracy_score(pred, y_pred)*100)
        f1 = round(f1_score(pred, y_pred)*100)
        prec = round(precision_score(pred, y_pred)*100)
        recall = round(recall_score(pred, y_pred)*100)
        rocauc =  round(roc_auc_score(pred, y_probs)*100)
        d = pd.DataFrame(data=np.array([acc,f1,prec,recall,rocauc]).reshape(1,5)
                     , columns=['Accuracy' , 'F1_score' , 'Precision' , 'Recall', 'Roc Auc'])
        classification_report = pd.concat([classification_report , d])
        classification_report.rename( index= { 0 :'{} _ {} Details'.format(model_name , title) } ,inplace=True )
pd.options.display.max_rows = None
classification_report

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Accuracy,F1_score,Precision,Recall,Roc Auc
LogisticRegression _ Train Details,98,72,82,64,98.0
LogisticRegression _ Test Details,97,70,84,60,95.0
KNeighborsClassifier _ Train Details,98,73,89,62,99.0
KNeighborsClassifier _ Test Details,98,73,91,61,89.0
DecisionTreeClassifier _ Train Details,100,100,100,100,100.0
DecisionTreeClassifier _ Test Details,97,71,69,73,86.0
ExtraTreeClassifier _ Train Details,100,100,100,100,100.0
ExtraTreeClassifier _ Test Details,97,64,64,64,81.0
RandomForestClassifier _ Train Details,100,100,100,100,100.0
RandomForestClassifier _ Test Details,97,62,100,45,99.0


## Use SMOTE for oversampling

In [9]:
from imblearn.over_sampling import SMOTENC # smotenc is used as it can handle categorical variable
from imblearn.over_sampling import SMOTE

# oversampling
smote = SMOTE(random_state=42)
xtrain_resampled, ytrain_resampled = smote.fit_resample(xtrain, ytrain)

In [10]:
# Set the random seed for reproducibility
np.random.seed(42)

# perform model training and model comparisons
list_of_models = [LogisticRegression() , KNeighborsClassifier() ,
                  DecisionTreeClassifier() ,ExtraTreeClassifier(), RandomForestClassifier(), ExtraTreesClassifier(),
                  SVC(kernel='rbf',probability=True), AdaBoostClassifier(), XGBClassifier()]
classification_report = pd.DataFrame(columns=['Accuracy','F1_score','Precision','Recall'])

for model in list_of_models :
    model = model.fit(xtrain_resampled, ytrain_resampled) # TO FILL IN
    for i in range(2) :
        if i == 0 :
            to_pred = xtrain_resampled # TO FILL IN
            pred = ytrain_resampled # TO FILL IN
            title = 'Train'
        else :
            to_pred = xtest # TO FILL IN
            pred = ytest
            title = 'Test'
        model_name = str(type(model)).split(".")[-1][:-2]
        y_pred = model.predict(to_pred)
        y_probs = model.predict_proba(to_pred)[:, 1]
        acc = round(accuracy_score(pred, y_pred)*100)
        f1 = round(f1_score(pred, y_pred)*100)
        prec = round(precision_score(pred, y_pred)*100)
        recall = round(recall_score(pred, y_pred)*100)
        rocauc =  round(roc_auc_score(pred, y_probs)*100)
        d = pd.DataFrame(data=np.array([acc,f1,prec,recall,rocauc]).reshape(1,5)
                     , columns=['Accuracy' , 'F1_score' , 'Precision' , 'Recall', 'Roc Auc'])
        classification_report = pd.concat([classification_report , d])
        classification_report.rename( index= { 0 :'{} _ {} Details'.format(model_name , title) } ,inplace=True )
pd.options.display.max_rows = None
classification_report

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Accuracy,F1_score,Precision,Recall,Roc Auc
LogisticRegression _ Train Details,95,95,94,97,99.0
LogisticRegression _ Test Details,94,56,42,84,94.0
KNeighborsClassifier _ Train Details,98,99,97,100,100.0
KNeighborsClassifier _ Test Details,95,61,49,80,90.0
DecisionTreeClassifier _ Train Details,100,100,100,100,100.0
DecisionTreeClassifier _ Test Details,96,65,58,73,85.0
ExtraTreeClassifier _ Train Details,100,100,100,100,100.0
ExtraTreeClassifier _ Test Details,94,54,44,68,82.0
RandomForestClassifier _ Train Details,100,100,100,100,100.0
RandomForestClassifier _ Test Details,98,81,97,70,99.0


# LSTM (Before SMOTE)

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.python.ops.math_ops import reduce_prod
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding,Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM
from tensorflow.keras import Input

numeric_cols = X_train.columns

metrics_list = [
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.AUC(name='auc')]

def build_bilstm():

  numeric_inputs = Input(shape=(len(numeric_cols),), name='numeric_inputs')
  num_inp=layers.Dense(128, activation='relu', name='dense_num_1',
                kernel_initializer=tf.keras.initializers.he_normal(seed=42))(numeric_inputs)
  num_inp=layers.Dropout(0.1)(num_inp)
  num_inp=layers.Dense(128, activation='relu', name='dense_num_2',
                kernel_initializer=tf.keras.initializers.he_normal(seed=42))(num_inp)
  inputs = keras.layers.concatenate([num_inp])
  x = layers.Dense(32, activation='relu', name='dense_1')(inputs)
  output=layers.Dense(1, activation='sigmoid', name='sigmoid_output')(x)

  bilstm_model = Model(inputs=[numeric_inputs], outputs=[output])
  return bilstm_model

bilstm_model = build_bilstm()
print(bilstm_model.summary())

bilstm_model.compile(loss = 'binary_crossentropy', optimizer = 'adam',metrics = ['accuracy','AUC'])

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 numeric_inputs (InputLayer  [(None, 560)]             0         
 )                                                               
                                                                 
 dense_num_1 (Dense)         (None, 128)               71808     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_num_2 (Dense)         (None, 128)               16512     
                                                                 
 concatenate (Concatenate)   (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 32)                4128      
                                                             

In [None]:
history = bilstm_model.fit([X_train],y_train, epochs=25, batch_size = 24, verbose = 1)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [None]:
y_pred = bilstm_model.predict([X_test])
y_pred=np.where(y_pred>0.5,1,0)



In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def getMetrics(y_test,y_pred):
    accuracy = np.round(accuracy_score(y_test, y_pred),4)
    f1 = np.round(f1_score(y_test, y_pred),4)
    precision = np.round(precision_score(y_test, y_pred),4)
    recall = np.round(recall_score(y_test, y_pred),4)
    auroc = np.round(roc_auc_score(y_test, y_pred),4)
    return accuracy,f1,precision,recall,auroc

df_metric_list = pd.DataFrame([getMetrics(y_test,y_pred)])
df_metric_list.columns = ['accuracy','f1_score','precision','recall','auroc']
df_metric_list

Unnamed: 0,accuracy,f1_score,precision,recall,auroc
0,0.9869,0.8563,0.9091,0.8092,0.9026


# LSTM (After SMOTE)

In [None]:
history = bilstm_model.fit([X_train],y_train, epochs=25, batch_size = 24, verbose = 1)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [None]:
y_pred = bilstm_model.predict([X_test])
y_pred=np.where(y_pred>0.5,1,0)



In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def getMetrics(y_test,y_pred):
    accuracy = np.round(accuracy_score(y_test, y_pred),4)
    f1 = np.round(f1_score(y_test, y_pred),4)
    precision = np.round(precision_score(y_test, y_pred),4)
    recall = np.round(recall_score(y_test, y_pred),4)
    auroc = np.round(roc_auc_score(y_test, y_pred),4)
    return accuracy,f1,precision,recall,auroc

df_metric_list = pd.DataFrame([getMetrics(y_test,y_pred)])
df_metric_list.columns = ['accuracy','f1_score','precision','recall','auroc']
df_metric_list

Unnamed: 0,accuracy,f1_score,precision,recall,auroc
0,0.9849,0.8344,0.8889,0.7861,0.8906
