# noFraud ML Model

### Install dependencies

In [None]:
%pip install tensorflow-text

In [None]:
# Pinned BERT reference package; presumably what supplies the `bert.tokenization` import in the imports cell - confirm.
%pip install bert-tensorflow==1.0.1

In [None]:
# Quiet install of the pinned TF Model Garden release; presumably the source of `official.nlp.optimization`
# imported in the imports cell - confirm.
%pip install -q tf-models-official==2.7.0

In [None]:
import nltk
# Download the 'punkt' tokenizer data; presumably needed by the `word_tokenize` call in the cleaning step - confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Imports

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import tensorflow as tf
tf.gfile = tf.io.gfile
import tensorflow_hub as hub
from bert import tokenization
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tensorflow import keras
from official.nlp import optimization
import os
import tensorflow_text
import re
from nltk.tokenize import word_tokenize

# Load, preprocess, and split the data

In [None]:
# Load the three source corpora (the paths presumably rely on Google Drive being mounted at /content/drive).
fraud = pd.read_csv("/content/drive/MyDrive/fraud_email_.csv")
phis = pd.read_csv("/content/drive/MyDrive/curated_set.csv")
spam = pd.read_csv("/content/drive/MyDrive/spam_ham_dataset.csv")

# Each source marks its positive rows with 1. Remap them so every source gets its own
# label in the merged frame: phishing 1 -> 2, spam 1 -> 3; the fraud "Class" column is
# used as-is. Any other value (e.g. 0) is left untouched.
# The relabelling is done with boolean masks instead of a Python loop over every row.

# Phishing data
phisC = phis.copy()
phisC.loc[phisC["is_phishing"] == 1, "is_phishing"] = 2
phisC = phisC.drop(columns=["Unnamed: 0", "source"])

# Spam data
spamC = spam.copy()
spamC.loc[spamC["label_num"] == 1, "label_num"] = 3
spamC = spamC.drop(columns=["Unnamed: 0", "label"])

# Fraud data
fraudC = fraud.copy()

# Stack the corpora in the order fraud -> spam -> phishing.
texts = [*fraudC["Text"], *spamC["text"], *phisC["text"]]
classes = [*fraudC["Class"], *spamC["label_num"], *phisC["is_phishing"]]

fulldf = pd.DataFrame({'text': texts, 'class': classes})

def clean_data(line, sep=','):
    """Normalise one raw message into a string of lowercase alphabetic words.

    Steps: stringify and lowercase the input, strip punctuation characters,
    tokenize with ``word_tokenize``, keep only purely alphabetic tokens, and
    join the survivors with ``sep``.

    Args:
        line: The raw message. Non-string values are converted with ``str``;
            ``None`` and float NaN (missing text) yield an empty string
            instead of the literal words "none" / "nan".
        sep: Separator placed between the kept words. Defaults to ``','`` to
            stay compatible with existing callers.
            NOTE(review): a comma separator presumably reaches the BERT
            preprocessor as extra punctuation tokens; consider ``' '``.

    Returns:
        The cleaned words joined by ``sep`` (empty string if none survive).
    """
    # Missing text must not be turned into a spurious word by str().
    if line is None or (isinstance(line, float) and line != line):
        return ''

    line = str(line).lower()

    # Strip punctuation. The hyphen is escaped: written as "=-\\" inside the
    # class it formed a range from "=" to "\" and a literal "-" was never
    # removed. "[" (previously matched only through that range) and "]" are
    # now listed explicitly.
    line = re.sub(r"[,.\"'!@#$%^&*(){}?/;`~:<>+=\-\\\[\]]", "", line)

    tokens = word_tokenize(line)

    # Drop anything that still contains digits or leftover symbols.
    words = [word for word in tokens if word.isalpha()]

    return sep.join(words)

# Clean every message in one vectorised pass instead of an index loop.
fulldf["text"] = fulldf["text"].map(clean_data)

# Fixed seed so the train/test partition, and therefore the reported metrics, can be reproduced.
SPLIT_SEED = 42

# 80/20 split. NOTE(review): stratify is left at None as before; passing
# stratify=fulldf["class"] would keep class proportions equal across the two splits.
X_train, X_test, y_train, y_test = train_test_split(
    fulldf["text"], fulldf["class"],
    test_size=0.2, random_state=SPLIT_SEED, stratify=None)

# One-hot encode the integer labels over the four classes.
train_labels = keras.utils.to_categorical(y_train.values, num_classes=4)
test_labels = keras.utils.to_categorical(y_test.values, num_classes=4)



## Define a new model

In [None]:
def model(metrics, num_classes=4, learning_rate=1e-5):
    """Build and compile the BERT-based text classifier.

    The network takes raw strings, runs them through the TF-Hub BERT
    preprocessing layer and a trainable BERT encoder, slices position 0 of
    the encoder's sequence output, and feeds it through
    Dense(32, relu) -> Dropout(0.2) -> Dense(num_classes, softmax).

    Args:
        metrics: Keras metrics passed straight to ``compile``.
        num_classes: Width of the softmax output layer. Defaults to 4.
        learning_rate: Adam learning rate. Defaults to 1e-5.

    Returns:
        A compiled ``tf.keras.Model`` using categorical cross-entropy loss,
        so targets are expected to be one-hot encoded.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
    preprocessor = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
    encoder_inputs = preprocessor(text_input)

    # trainable=True: the encoder weights are fine-tuned together with the head.
    encoder = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
        trainable=True)
    outputs = encoder(encoder_inputs)

    # Use the first position of the per-token output as the sentence
    # representation (presumably the [CLS] slot - confirm against the encoder docs).
    clf_output = outputs["sequence_output"][:, 0, :]

    net = tf.keras.layers.Dense(32, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(num_classes, activation='softmax')(net)

    # Named `classifier` so the local does not shadow this function's own name.
    classifier = tf.keras.models.Model(inputs=text_input, outputs=out)
    classifier.compile(
        tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=metrics)

    return classifier

## Generate the model with metrics

In [None]:
METRICS = [
    # The targets are one-hot vectors over four classes, so accuracy has to compare
    # argmax(prediction) with argmax(label). BinaryAccuracy would instead score each of
    # the four outputs separately against a 0.5 threshold and overstate the result.
    tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall')
]

noFraud_model = model(METRICS)
noFraud_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 keras_layer_4 (KerasLayer)     {'input_type_ids':   0           ['input_3[0][0]']                
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                    

# Train the model

In [None]:
# Fine-tune for three epochs, holding back 20% of the training split for validation.
fit_options = dict(validation_split=0.2, epochs=3, verbose=1)
train_history = noFraud_model.fit(X_train, train_labels, **fit_options)

Epoch 1/3
Epoch 2/3
Epoch 3/3


# Evaluate on the test set - accuracy of 99.07% (as reported by the `BinaryAccuracy` metric)

In [None]:
# Score the trained model on the held-out split; the result lists the loss followed by the compiled metrics.
noFraud_model.evaluate(x=X_test, y=test_labels)



[0.05506602302193642,
 0.990676999092102,
 0.9816303253173828,
 0.9810671210289001]

## Saving

In [None]:
# Save under the same Drive root the datasets are read from ("/content/drive/MyDrive").
# The previous "./content/drive/Mydrive/..." was relative to the working directory and
# spelled the folder differently, so the model was not written to that Drive location.
noFraud_model.save("/content/drive/MyDrive/10th Grade/noFraud_model")



# Sample Prediction

In [None]:
# Run inference inputs through the same clean_data() normalisation applied to the
# training text, so the model sees text in the format it was trained on.
sample_texts = ["gnaru"]
noFraud_model.predict([clean_data(text) for text in sample_texts])



array([[9.7531110e-01, 2.3250505e-02, 8.7237137e-04, 5.6591589e-04]],
      dtype=float32)