# SMiShing attack detection using a machine-learning approach (_Random Forest_) and deep-learning approaches (_LSTM_ & _Bidirectional LSTM_)


Import _packages_:

In [None]:
from IPython.display import display

# import libraries for reading data, exploring and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# library for train test split
from sklearn.model_selection import train_test_split
# deep learning libraries for text pre-processing
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Modeling 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout, LSTM, Bidirectional
import tensorflow.keras.metrics 
from keras.wrappers.scikit_learn import KerasClassifier

from keras import backend as K

Declare _variables_:

In [None]:
# Hyper-parameters shared by the preprocessing and model cells.

# --- text preprocessing ---
vocab_size = 2082      # tokenizer `num_words` and embedding input size
max_len = 1            # length every token sequence is padded/truncated to
padding_type = "post"  # `padding` mode passed to pad_sequences
trunc_type = "post"    # `truncating` mode passed to pad_sequences

# --- network architecture ---
embeding_dim = 16      # size of each embedding vector
n_lstm = 20            # number of units in the LSTM layers
drop_lstm = 0.2        # dropout rate passed to the LSTM layers
n_dense = 24           # not referenced by the cells visible in this notebook
drop_value = 0.2       # not referenced by the cells visible in this notebook

# --- training ---
e = 10000              # number of epochs passed to `fit`

Now we transform the messages into vectors that will be used to train the models:

In [None]:
# Load the tab-separated corpus; the file has no header row, so the two
# columns are named explicitly.
messages = pd.read_csv('data.txt', sep='\t', names=["label", "message"])
# Fully duplicated rows, kept for inspection only (they are not removed below).
duplicatedRow = messages[messages.duplicated()]
# NOTE: this is not the last expression of the cell, so its result is discarded.
messages.groupby('label').describe().T

# Split the corpus by class label.
ham_msg = messages[messages.label == 'LEGI']
spam_msg = messages[messages.label == 'SPAM']
smis_msg = messages[messages.label == 'SMIS']

# One long string per class (all messages joined by a space).
ham_msg_text = " ".join(ham_msg.message.to_numpy().tolist())
spam_msg_text = " ".join(spam_msg.message.to_numpy().tolist())
smis_msg_text = " ".join(smis_msg.message.to_numpy().tolist())

# Balance the classes: draw as many legitimate messages as there are smishing
# messages (fixed seed so the sample is reproducible).
ham_msg_df = ham_msg.sample(n=len(smis_msg), random_state=44)
smis_msg_df = smis_msg

# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# yields the same frame with a fresh 0..n-1 index.
msg_df = pd.concat([ham_msg_df, smis_msg_df], ignore_index=True)
msg_df['text_length'] = msg_df['message'].apply(len)
# numeric_only=True keeps the mean restricted to numeric columns; without it
# pandas 2.0+ raises a TypeError on the text column.
labels = msg_df.groupby('label').mean(numeric_only=True)

# Binary target: 1 for smishing, 0 for everything else.
msg_df['msg_type'] = msg_df['label'].map({'LEGI': 0, 'SPAM': 0, 'SMIS': 1})
msg_label = msg_df['msg_type'].values
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    msg_df['message'], msg_label, test_size=0.2, random_state=434)


In [None]:
# Build the word-level vocabulary from the training messages only.
tokenizer = Tokenizer(num_words=vocab_size, char_level=False)
tokenizer.fit_on_texts(X_train)

# Both splits are padded/truncated with exactly the same settings.
_pad_options = dict(maxlen=max_len, padding=padding_type, truncating=trunc_type)

# Messages -> lists of integer token ids.
training_sequences = tokenizer.texts_to_sequences(X_train)
testing_sequences = tokenizer.texts_to_sequences(X_test)

# Token-id lists -> fixed-width integer matrices.
training_padded = pad_sequences(training_sequences, **_pad_options)
testing_padded = pad_sequences(testing_sequences, **_pad_options)

Now we can create and fit ML models for smishing detection.

 ## LSTM 

The first type of deep-learning model is [LSTM](https://en.wikipedia.org/wiki/Long_short-term_memory).

In [None]:
#==========================================================LSTM==========================================================

# Embedding -> LSTM -> single sigmoid unit: one smishing probability per message.
model = Sequential()
model.add(Embedding(vocab_size, embeding_dim, input_length=max_len))
# Keep only the final LSTM output so the head emits exactly one value per
# message whatever max_len is. The earlier per-timestep output followed by
# Dense(max_len) only lined up with the labels while max_len == 1.
model.add(LSTM(n_lstm, dropout=drop_lstm))
model.add(Dense(1, activation='sigmoid'))
#model.summary()

# The metrics are created with explicit names. String identifiers such as
# 'FalsePositives' receive an auto-incremented name ('false_positives_1', ...)
# as soon as another instance exists in the session (second model, re-run of
# the cell), which would break the history look-ups below.
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=[tf.keras.metrics.FalsePositives(name='false_positives'),
                       tf.keras.metrics.FalseNegatives(name='false_negatives'),
                       tf.keras.metrics.TruePositives(name='true_positives'),
                       tf.keras.metrics.TrueNegatives(name='true_negatives')])

display('='* 40 + 'LSTM' + '=' * 40)

# The test split doubles as validation data; per-epoch metrics land in `history`.
history = model.fit(training_padded, y_train, epochs=e, validation_data=(testing_padded, y_test), verbose=0)

# FAR = FP / (FP + TN), FRR = FN / (FN + TP), computed on the validation data.
# NOTE: the best FP count and the best FN count are each taken over all epochs,
# so the reported FAR and FRR may come from two different epochs.
best_fp_score = min(history.history['val_false_positives'])
best_fn_score = min(history.history['val_false_negatives'])
FAR = best_fp_score / (best_fp_score + max(history.history['val_true_negatives']))
FRR = best_fn_score / (best_fn_score + max(history.history['val_true_positives']))
print('FAR: ', FAR * 100,'%')
print('FRR: ', FRR * 100, '%')

## Bidirectional LSTM

A Bidirectional LSTM differs from a plain LSTM in that it examines the context both before and after a given token in the message. Because of this, its computation time is higher than that of the LSTM. However, in most cases the B-LSTM gives the best accuracy.

In [None]:
#==========================================================Bidirectional LSTM==========================================================

# Embedding -> Bidirectional LSTM -> single sigmoid unit: one smishing
# probability per message.
model1 = Sequential()
model1.add(Embedding(vocab_size, embeding_dim, input_length=max_len))
# Keep only the final output of each direction so the head emits exactly one
# value per message whatever max_len is. The earlier per-timestep output
# followed by Dense(max_len) only lined up with the labels while max_len == 1.
model1.add(Bidirectional(LSTM(n_lstm, dropout=drop_lstm)))
model1.add(Dense(1, activation='sigmoid'))

# The metrics are created with explicit names. String identifiers such as
# 'FalsePositives' receive an auto-incremented name ('false_positives_1', ...)
# once another model in the session has already created the same metrics,
# which would make the 'val_false_positives' look-ups below raise KeyError.
model1.compile(loss = 'binary_crossentropy',
               optimizer = 'nadam',
               metrics=[tf.keras.metrics.FalsePositives(name='false_positives'),
                        tf.keras.metrics.FalseNegatives(name='false_negatives'),
                        tf.keras.metrics.TruePositives(name='true_positives'),
                        tf.keras.metrics.TrueNegatives(name='true_negatives')])

display('=' * 40 + 'Bidirectional LSTM' + '=' * 40)

# The test split doubles as validation data; per-epoch metrics land in `history1`.
history1 = model1.fit(training_padded, y_train, epochs=e, validation_data=(testing_padded, y_test), verbose=0)

# FAR = FP / (FP + TN), FRR = FN / (FN + TP), computed on the validation data.
# NOTE: the best FP count and the best FN count are each taken over all epochs,
# so the reported FAR and FRR may come from two different epochs.
best_fp_score1 = min(history1.history['val_false_positives'])
best_fn_score1 = min(history1.history['val_false_negatives'])
FAR1 = best_fp_score1 / (best_fp_score1 + max(history1.history['val_true_negatives']))
FRR1 = best_fn_score1 / (best_fn_score1 + max(history1.history['val_true_positives']))
print('FAR: ', FAR1 * 100,'%')
print('FRR: ', FRR1 * 100, '%')

## Random Forest Classifier

One of the machine-learning algorithms provided by scikit-learn is [Random Forest](https://habr.com/ru/post/320726/).

In [None]:
#==========================================================RFT==========================================================

display('='* 40 + 'RFC' + '=' * 40)

# Random forest trained directly on the padded token-id matrix.
m = RandomForestClassifier(n_estimators = 2082)
h = m.fit(training_padded,y_train)
y_test_pred = m.predict(testing_padded)

# labels=[0, 1] pins the confusion matrix to 2x2, so the four-way unpacking
# still works when the test labels and predictions contain a single class.
tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred, labels=[0, 1]).ravel()

# FAR = FP / (FP + TN): share of non-smishing messages flagged as smishing.
# FRR = FN / (FN + TP): share of smishing messages that were missed.
far = fp / (fp + tn)
frr = fn / (fn + tp)
print('FAR: ', far*100, '%')
print('FRR: ', frr*100, '%')