# BT4012 Without Text Analysis

### Setting up the environment

In [1]:
# libraries importing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import tensorflow as tf

In [2]:
# setting up
# import from github repo
url = 'https://raw.githubusercontent.com/LordZhiHao/BT4012_Fraud_Analytics_Project/main/fake_job_postings.csv'

# read data
data = pd.read_csv(url)
df = data.copy()
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


### Handling Null Values

In [3]:
# keep track of what columns to keep and drop
cols_to_keep = []
cols_to_drop = []

In [4]:
# check for NA values
df.isna().any()

job_id                 False
title                  False
location                True
department              True
salary_range            True
company_profile         True
description             True
requirements            True
benefits                True
telecommuting          False
has_company_logo       False
has_questions          False
employment_type         True
required_experience     True
required_education      True
industry                True
function                True
fraudulent             False
dtype: bool

In [5]:
# check for num of na values
df.isna().sum()

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2695
benefits                7210
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [6]:
# handle location and description first since easier
# handle location - split into nation and city and fillna with unknown
df['location'] = df['location'].fillna('NA, Unknown')
df['country'] = df['location'].apply(lambda x : x.strip()[:2])
df['city'] = df['location'].apply(lambda x : x.split(',')[-1])

# a lot of sparse values noted for countries, may lead to unexpected results - to handle - keep countries with counts >10 only and put others for the rest
ls_of_countries = [country if df[df['country']==country]['country'].count() >= 10 else 'Others' for country in df['country'].unique()]
df['country'] = df['country'].apply(lambda x : x if x in ls_of_countries else 'Others')

ls_of_cities = [city if df[df['city']==city]['city'].count() >= 10 else 'Others' for city in df['city'].unique()]
df['city'] = df['city'].apply(lambda x : x if x in ls_of_cities else 'Others')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('country')
cols_to_keep.append('city')
cols_to_drop.append('location')

In [7]:
# handle description - convert to binary - with or without
df['has_description'] = df['description'].apply(lambda x: 0 if pd.isna(x) else 1)

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('has_description')
cols_to_drop.append('description')

In [8]:
# handle department - q a lot of departments - keep only those with high count and take everything else as others
ls_of_dept = [dept if df[df['department']==dept]['department'].count() >= 10 else 'Others' for dept in df['department'].unique()]
df['has_department'] = df['department'].apply(lambda x : x if x in ls_of_dept else 'Others')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('has_department')
cols_to_drop.append('department')

In [9]:
# handle salary_range - q a lot of ranges - keep as binary - has_salary or not
df['has_salary'] = df['salary_range'].apply(lambda x : 0 if pd.isna(x) else 1)

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('has_salary')
cols_to_drop.append('salary_range')

In [10]:
cols_to_drop # handled these colummns

['location', 'description', 'department', 'salary_range']

In [11]:
# handle company_profile, requirements and benefits next - for simplicity - keep as binary - has or not
df['has_company_profile'] = df['company_profile'].apply(lambda x : 0 if pd.isna(x) else 1)
df['has_requirements'] = df['requirements'].apply(lambda x : 0 if pd.isna(x) else 1)
df['has_benefits'] = df['benefits'].apply(lambda x : 0 if pd.isna(x) else 1)

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('has_company_profile')
cols_to_keep.append('has_requirements')
cols_to_keep.append('has_benefits')
cols_to_drop.append('company_profile')
cols_to_drop.append('requirements')
cols_to_drop.append('benefits')

In [12]:
# handle employment_type - keep na values as unknown
df['employment_type'] = df['employment_type'].fillna('Unknown')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('employment_type')

In [13]:
df['required_experience'].unique()

array(['Internship', 'Not Applicable', nan, 'Mid-Senior level',
       'Associate', 'Entry level', 'Executive', 'Director'], dtype=object)

In [14]:
# handle required_experience - keep na values as unknown
df['required_experience'] = df['required_experience'].fillna('Unknown')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('required_experience')

In [15]:
df['required_education'].unique()

array([nan, "Bachelor's Degree", "Master's Degree",
       'High School or equivalent', 'Unspecified',
       'Some College Coursework Completed', 'Vocational', 'Certification',
       'Associate Degree', 'Professional', 'Doctorate',
       'Some High School Coursework', 'Vocational - Degree',
       'Vocational - HS Diploma'], dtype=object)

In [16]:
# handle required_education - keep na values as unknown
df['required_education'] = df['required_education'].fillna('Unknown')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('required_education')

In [17]:
cols_to_keep

['country',
 'city',
 'has_description',
 'has_department',
 'has_salary',
 'has_company_profile',
 'has_requirements',
 'has_benefits',
 'employment_type',
 'required_experience',
 'required_education']

In [18]:
# handle industry - keep the industries - fillna with unknown
df['industry'] = df['industry'].fillna('Unknown')

# a lot of sparse values noted for industries, may lead to unexpected results - to handle - keep countries with counts >10 only and put unknown for the rest
ls_of_industries = [industry if df[df['industry']==industry]['industry'].count() >= 10 else 'Others' for industry in df['industry'].unique()]
df['industry'] = df['industry'].apply(lambda x : x if x in ls_of_industries else 'Others')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('industry')

In [19]:
# handle function - keep the functions - fillna with unknown
df['function'] = df['function'].fillna('Unknown')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('function')

In [20]:
cols_to_keep

['country',
 'city',
 'has_description',
 'has_department',
 'has_salary',
 'has_company_profile',
 'has_requirements',
 'has_benefits',
 'employment_type',
 'required_experience',
 'required_education',
 'industry',
 'function']

In [21]:
cols_to_drop

['location',
 'description',
 'department',
 'salary_range',
 'company_profile',
 'requirements',
 'benefits']

## Next we look into the non null columns to extract what we wanna keep

In [22]:
# drop the cols inside cols_to_drop as it is not useful anymore
df = df.drop(cols_to_drop, axis=1)

In [23]:
cols_to_drop = []

In [24]:
df.isna().sum()

job_id                 0
title                  0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
country                0
city                   0
has_description        0
has_department         0
has_salary             0
has_company_profile    0
has_requirements       0
has_benefits           0
dtype: int64

In [25]:
# handle job_id - it is unique for all - so not much value - remove the col
df = df.drop('job_id', axis=1)

In [26]:
# handle title - a bit too many distinct roles - may lead to overfitting issues - would remove the cols - flexible to choose to ohe or use as text feature
df = df.drop('title', axis=1)

In [27]:
# handle telecommuting, has_company_logo, has_questions - all are binary - keep as features
# keep track in cols_to_keep
cols_to_keep.append('telecommuting')
cols_to_keep.append('has_company_logo')
cols_to_keep.append('has_questions')

And with that all the columns are processed accordingly

In [28]:
cols_to_keep

['country',
 'city',
 'has_description',
 'has_department',
 'has_salary',
 'has_company_profile',
 'has_requirements',
 'has_benefits',
 'employment_type',
 'required_experience',
 'required_education',
 'industry',
 'function',
 'telecommuting',
 'has_company_logo',
 'has_questions']

## Text Processing - including the text information into the model

In [29]:
import warnings

# Ignore FutureWarnings and DeprecationWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [30]:
# import relevant packages
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [31]:
# Defining a Function to clean up the text information
def text_preprocess(ds: pd.Series) -> pd.Series:
    for m in range(len(ds)):

        main_words = re.sub('[^a-zA-Z]', ' ', ds[m])                                      # Retain only alphabets
        main_words = (main_words.lower()).split()
        main_words = [w for w in main_words if not w in set(stopwords.words('english'))]  # Remove stopwords

        lem = WordNetLemmatizer()
        main_words = [lem.lemmatize(w) for w in main_words if len(w) > 1]                 # Group different forms of the same word

        main_words = ' '.join(main_words)
        ds[m] = main_words

    return ds

In [32]:
# extract text features only - title, company_profile, description, requirements, benefits
df['full_text'] = data['title'] + data['company_profile'] + data['description'] + data['requirements'] + data['benefits']
df['full_text'] = df['full_text'].fillna(' ')

# preprocess the text feature
df['full_text'] = text_preprocess(df['full_text'])
text_feature = df['full_text']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds[m] = main_words


## Train-test split and preprocessing before inputting into model

In [33]:
# seperate out the correctly formatted cols and those which still needs processing through ohe
binary_cols = ['has_description', 'has_salary', 'has_company_profile', 'has_requirements', 'has_benefits', 'telecommuting', 'has_company_logo', 'has_questions']
ohe_needed_cols = []

for cols in cols_to_keep:
  if cols not in binary_cols:
    ohe_needed_cols.append(cols)

In [34]:
# select the fraudulent column as target, rest as features
features = df.drop(['fraudulent', 'full_text'], axis=1)
target_var = df['fraudulent']

In [35]:
# do ohe for ohe_needed_cols
features_encoded = pd.get_dummies(features, columns=ohe_needed_cols).astype(int)

In [36]:
features_encoded.head() # check the colummns if its in correct format

Unnamed: 0,telecommuting,has_company_logo,has_questions,has_description,has_salary,has_company_profile,has_requirements,has_benefits,country_AE,country_AT,...,function_Purchasing,function_Quality Assurance,function_Research,function_Sales,function_Science,function_Strategy/Planning,function_Supply Chain,function_Training,function_Unknown,function_Writing/Editing
0,0,1,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,1,0,1,0,1,1,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,1,1,1,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# train test split v2 - with text features included
from sklearn.model_selection import train_test_split

# concat text features and ohe encoded features
text_df = text_feature
features_concated = pd.concat([features_encoded, text_df], axis=1)

# train test split
xtrain, xtest, ytrain, ytest = train_test_split(features_concated, target_var, random_state=42, test_size=0.2)

In [38]:
# Building a TF IDF matrix out of the text information
train_text_feature = xtrain.reset_index(drop=True)['full_text']
test_text_feature = xtest.reset_index(drop=True)['full_text']
xtrain = xtrain.reset_index(drop=True).drop('full_text', axis=1)
xtest = xtest.reset_index(drop=True).drop('full_text', axis=1)

td = TfidfVectorizer(max_features = 1000)
train_text_matrix = td.fit_transform(train_text_feature).toarray()
test_text_matrix = td.transform(test_text_feature).toarray()
feature_names = td.get_feature_names_out()

# concatenate text and encoded features
train_text_matrix = pd.DataFrame(train_text_matrix, columns=feature_names).reset_index(drop=True)
test_text_matrix = pd.DataFrame(test_text_matrix, columns=feature_names).reset_index(drop=True)
xtrain = pd.concat([xtrain, train_text_matrix], axis=1, ignore_index=True)
xtest = pd.concat([xtest, test_text_matrix], axis=1, ignore_index=True)

In [39]:
print(xtrain.shape)

(14304, 1546)


## Model Evaluation

In [40]:
# model packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report, roc_auc_score
from xgboost import XGBClassifier

In [41]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Ignore FutureWarnings and DeprecationWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=ConvergenceWarning)

In [42]:
# import the models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

# Set the random seed for reproducibility
np.random.seed(42)

# perform model training and model comparisons
list_of_models = [LogisticRegression() , KNeighborsClassifier() ,
                  DecisionTreeClassifier() ,ExtraTreeClassifier(), RandomForestClassifier(), ExtraTreesClassifier(),
                  SVC(kernel='rbf',probability=True), AdaBoostClassifier(), XGBClassifier()]
classification_report = pd.DataFrame(columns=['Accuracy','F1_score','Precision','Recall'])

for model in list_of_models :
    model = model.fit(xtrain, ytrain)
    for i in range(2) :
        if i == 0 :
            to_pred = xtrain
            pred = ytrain
            title = 'Train'
        else :
            to_pred = xtest
            pred = ytest
            title = 'Test'
        model_name = str(type(model)).split(".")[-1][:-2]
        y_pred = model.predict(to_pred)
        y_probs = model.predict_proba(to_pred)[:, 1]
        acc = round(accuracy_score(pred, y_pred)*100)
        f1 = round(f1_score(pred, y_pred)*100)
        prec = round(precision_score(pred, y_pred)*100)
        recall = round(recall_score(pred, y_pred)*100)
        rocauc =  round(roc_auc_score(pred, y_probs)*100)
        d = pd.DataFrame(data=np.array([acc,f1,prec,recall,rocauc]).reshape(1,5)
                     , columns=['Accuracy' , 'F1_score' , 'Precision' , 'Recall', 'Roc Auc'])
        classification_report = pd.concat([classification_report , d])
        classification_report.rename( index= { 0 :'{} _ {} Details'.format(model_name , title) } ,inplace=True )
pd.options.display.max_rows = None
classification_report

Unnamed: 0,Accuracy,F1_score,Precision,Recall,Roc Auc
LogisticRegression _ Train Details,97,67,90,54,98.0
LogisticRegression _ Test Details,97,63,87,49,97.0
KNeighborsClassifier _ Train Details,98,82,87,78,99.0
KNeighborsClassifier _ Test Details,98,76,79,74,96.0
DecisionTreeClassifier _ Train Details,100,97,97,96,100.0
DecisionTreeClassifier _ Test Details,97,72,74,71,87.0
ExtraTreeClassifier _ Train Details,100,97,97,96,100.0
ExtraTreeClassifier _ Test Details,97,69,71,68,85.0
RandomForestClassifier _ Train Details,100,96,97,96,100.0
RandomForestClassifier _ Test Details,98,76,92,65,98.0


## Use SMOTE for oversampling

In [43]:
from imblearn.over_sampling import SMOTENC # smotenc is used as it can handle categorical variable
from imblearn.over_sampling import SMOTE

# oversampling
smote = SMOTE(random_state=42)
xtrain_resampled, ytrain_resampled = smote.fit_resample(xtrain, ytrain)

In [44]:
# Set the random seed for reproducibility
np.random.seed(42)

# perform model training and model comparisons
list_of_models = [LogisticRegression() , KNeighborsClassifier() ,
                  DecisionTreeClassifier() ,ExtraTreeClassifier(), RandomForestClassifier(), ExtraTreesClassifier(),
                  SVC(kernel='rbf',probability=True), AdaBoostClassifier(), XGBClassifier()]
classification_report = pd.DataFrame(columns=['Accuracy','F1_score','Precision','Recall'])

for model in list_of_models :
    model = model.fit(xtrain_resampled, ytrain_resampled)
    for i in range(2) :
        if i == 0 :
            to_pred = xtrain_resampled
            pred = ytrain_resampled
            title = 'Train'
        else :
            to_pred = xtest
            pred = ytest
            title = 'Test'
        model_name = str(type(model)).split(".")[-1][:-2]
        y_pred = model.predict(to_pred)
        y_probs = model.predict_proba(to_pred)[:, 1]
        acc = round(accuracy_score(pred, y_pred)*100)
        f1 = round(f1_score(pred, y_pred)*100)
        prec = round(precision_score(pred, y_pred)*100)
        recall = round(recall_score(pred, y_pred)*100)
        rocauc =  round(roc_auc_score(pred, y_probs)*100)
        d = pd.DataFrame(data=np.array([acc,f1,prec,recall,rocauc]).reshape(1,5)
                     , columns=['Accuracy' , 'F1_score' , 'Precision' , 'Recall', 'Roc Auc'])
        classification_report = pd.concat([classification_report , d])
        classification_report.rename( index= { 0 :'{} _ {} Details'.format(model_name , title) } ,inplace=True )
pd.options.display.max_rows = None
classification_report

Unnamed: 0,Accuracy,F1_score,Precision,Recall,Roc Auc
LogisticRegression _ Train Details,97,97,96,98,100.0
LogisticRegression _ Test Details,95,62,49,86,96.0
KNeighborsClassifier _ Train Details,95,95,93,97,99.0
KNeighborsClassifier _ Test Details,89,46,31,92,95.0
DecisionTreeClassifier _ Train Details,100,100,100,100,100.0
DecisionTreeClassifier _ Test Details,96,64,54,77,87.0
ExtraTreeClassifier _ Train Details,100,100,100,100,100.0
ExtraTreeClassifier _ Test Details,94,58,47,75,85.0
RandomForestClassifier _ Train Details,100,100,100,100,100.0
RandomForestClassifier _ Test Details,97,74,67,83,98.0


# LSTM Model (Before SMOTE)

In [45]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.python.ops.math_ops import reduce_prod
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding,Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM
from tensorflow.keras import Input

numeric_cols = xtrain.columns

metrics_list = [
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.AUC(name='auc')]

def build_bilstm():

  numeric_inputs = Input(shape=(len(numeric_cols),), name='numeric_inputs')
  num_inp=layers.Dense(128, activation='relu', name='dense_num_1',
                kernel_initializer=tf.keras.initializers.he_normal(seed=42))(numeric_inputs)
  num_inp=layers.Dropout(0.1)(num_inp)
  num_inp=layers.Dense(128, activation='relu', name='dense_num_2',
                kernel_initializer=tf.keras.initializers.he_normal(seed=42))(num_inp)
  inputs = keras.layers.concatenate([num_inp])
  x = layers.Dense(32, activation='relu', name='dense_1')(inputs)
  output=layers.Dense(1, activation='sigmoid', name='sigmoid_output')(x)

  bilstm_model = Model(inputs=[numeric_inputs], outputs=[output])
  return bilstm_model

bilstm_model = build_bilstm()
print(bilstm_model.summary())

bilstm_model.compile(loss = 'binary_crossentropy', optimizer = 'adam',metrics = ['accuracy','AUC'])

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 numeric_inputs (InputLayer  [(None, 1546)]            0         
 )                                                               
                                                                 
 dense_num_1 (Dense)         (None, 128)               198016    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_num_2 (Dense)         (None, 128)               16512     
                                                                 
 concatenate (Concatenate)   (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 32)                4128      
                                                             

In [46]:
history = bilstm_model.fit([xtrain],ytrain, epochs=25, batch_size = 24, verbose = 1)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [47]:
y_pred = bilstm_model.predict([xtest])
y_pred=np.where(y_pred>0.5,1,0)



In [48]:
from sklearn.metrics import confusion_matrix
confusion_matrix(ytest,y_pred)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def getMetrics(y_test,y_pred):
    accuracy = np.round(accuracy_score(ytest, y_pred),4)
    f1 = np.round(f1_score(ytest, y_pred),4)
    precision = np.round(precision_score(ytest, y_pred),4)
    recall = np.round(recall_score(ytest, y_pred),4)
    auroc = np.round(roc_auc_score(ytest, y_pred),4)
    return accuracy,f1,precision,recall,auroc

df_metric_list = pd.DataFrame([getMetrics(ytest,y_pred)])
df_metric_list.columns = ['accuracy','f1_score','precision','recall','auroc']
df_metric_list

Unnamed: 0,accuracy,f1_score,precision,recall,auroc
0,0.9801,0.7893,0.8526,0.7348,0.864


# LSTM (After SMOTE)

In [49]:
history = bilstm_model.fit([xtrain_resampled],ytrain_resampled, epochs=25, batch_size = 24, verbose = 1)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [50]:
y_pred = bilstm_model.predict([xtest])
y_pred=np.where(y_pred>0.5,1,0)



In [51]:
from sklearn.metrics import confusion_matrix
confusion_matrix(ytest,y_pred)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def getMetrics(y_test,y_pred):
    accuracy = np.round(accuracy_score(ytest, y_pred),4)
    f1 = np.round(f1_score(ytest, y_pred),4)
    precision = np.round(precision_score(ytest, y_pred),4)
    recall = np.round(recall_score(ytest, y_pred),4)
    auroc = np.round(roc_auc_score(ytest, y_pred),4)
    return accuracy,f1,precision,recall,auroc

df_metric_list = pd.DataFrame([getMetrics(ytest,y_pred)])
df_metric_list.columns = ['accuracy','f1_score','precision','recall','auroc']
df_metric_list

Unnamed: 0,accuracy,f1_score,precision,recall,auroc
0,0.9782,0.7869,0.7784,0.7956,0.8918
