# NoFraud Educational Scanner

## AI Analysis Section

Imports 

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import tensorflow as tf
tf.gfile = tf.io.gfile
import tensorflow_hub as hub
from bert import tokenization
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tensorflow import keras
from official.nlp import optimization
import os
import tensorflow_text
import re
from nltk.tokenize import word_tokenize

load in data and concatenate it

In [6]:
fraud = pd.read_csv("./fraud_email_.csv")
phis = pd.read_csv("./curated_set.csv")
spam = pd.read_csv("./spam_ham_dataset.csv")

Preview the data

In [7]:
fraud.head()

Unnamed: 0,Text,Class
0,Supply Quality China's EXCLUSIVE dimensions at...,1
1,over. SidLet me know. Thx.,0
2,"Dear Friend,Greetings to you.I wish to accost ...",1
3,MR. CHEUNG PUIHANG SENG BANK LTD.DES VOEUX RD....,1
4,Not a surprising assessment from Embassy.,0


In [8]:
phis.head()

Unnamed: 0.1,Unnamed: 0,text,source,is_phishing
0,0,Subject: ena offsite\nmy suggestions :\n1 ) mo...,ENRON,0
1,1,Subject: allegheny energy s - 3\ni received wo...,ENRON,0
2,2,The University of Washington System is sharing...,https://ciso.uw.edu/education/more-phishing-ex...,1
3,3,"Dear user@stanford.edu,\n\nA private document ...",https://uit.stanford.edu/phishing,1
4,4,Subject: james valverde - interview schedule\n...,ENRON,0


In [9]:
spam.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


change label numbers for each dataset to make prediction in one set

<ul>
    <li>0 - benign</li>
    <li>1 - fraud</li>
    <li>2 - phishing</li>
    <li>3 - spam</li>
</ul>

In [26]:
# phishing stuff

phisC = phis.copy()
for i in range(len(phisC["is_phishing"])):
    if phisC.at[i,"is_phishing"] == 1:
        phisC.at[i,"is_phishing"] =2

phisC = phisC.drop(columns=["Unnamed: 0","source"])
phisC.head()

Unnamed: 0,text,is_phishing
0,Subject: ena offsite\nmy suggestions :\n1 ) mo...,0
1,Subject: allegheny energy s - 3\ni received wo...,0
2,The University of Washington System is sharing...,2
3,"Dear user@stanford.edu,\n\nA private document ...",2
4,Subject: james valverde - interview schedule\n...,0


In [27]:
# spam stuff

spamC = spam.copy()
for i in range(len(spam["label_num"])):
    if spamC.at[i,"label_num"] == 1:
        spamC.at[i,"label_num"] =3

spamC = spamC.drop(columns=["Unnamed: 0","label"])
spamC.head()

Unnamed: 0,text,label_num
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",3
4,Subject: re : indian springs\r\nthis deal is t...,0


In [29]:
fraudC = fraud.copy()
fraudC.head()

Unnamed: 0,Text,Class
0,Supply Quality China's EXCLUSIVE dimensions at...,1
1,over. SidLet me know. Thx.,0
2,"Dear Friend,Greetings to you.I wish to accost ...",1
3,MR. CHEUNG PUIHANG SENG BANK LTD.DES VOEUX RD....,1
4,Not a surprising assessment from Embassy.,0


In [41]:
texts = []
classes = []

for i in range(len(fraudC)):
    texts.append(fraudC.at[i,"Text"])
    classes.append(fraudC.at[i,"Class"])

for i in range(len(spamC)):
    texts.append(spamC.at[i,"text"])
    classes.append(spamC.at[i,"label_num"])

for i in range(len(phisC)):
    texts.append(phisC.at[i,"text"])
    classes.append(phisC.at[i,"is_phishing"])


In [42]:
#concatenate all the datasets

fulldf = pd.DataFrame({'text':texts,'class':classes}) 

In [44]:
fulldf.head()

Unnamed: 0,text,class
0,Supply Quality China's EXCLUSIVE dimensions at...,1
1,over. SidLet me know. Thx.,0
2,"Dear Friend,Greetings to you.I wish to accost ...",1
3,MR. CHEUNG PUIHANG SENG BANK LTD.DES VOEUX RD....,1
4,Not a surprising assessment from Embassy.,0


### Now that the data is concatenated, we can preprocess

In [2]:

def clean_data(line):
    line = line.lower() #makes it lowercase

    line = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", line) #takes out any symbols

    tokens = word_tokenize(line)

    words = [word for word in tokens if word.isalpha()] #check if only letters (no special chars/symbols)

 
    return words

for i in range(len(fulldf["text"])):
    fulldf.at[i,"text"] = clean_data(fulldf.at[i,"text"])

In [48]:
#split that data into train and test
X_train, X_test, y_train, y_test = train_test_split(fulldf["text"],fulldf["class"], test_size=0.2,stratify=None)
train_labels = keras.utils.to_categorical(y_train.values, num_classes=4)
test_labels = keras.utils.to_categorical(y_test.values, num_classes=4)

# Make a model using BERT transforms to understand the data further

In [51]:
def model(metrics):

    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
    preprocessor = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
    
    encoder_inputs = preprocessor(text_input)
    encoder = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
        trainable=True)
    
    outputs = encoder(encoder_inputs)
    pooled_output = outputs["pooled_output"] 
    sequence_output = outputs["sequence_output"]

    clf_output = sequence_output[:, 0, :]
    net = tf.keras.layers.Dense(32, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(4, activation='softmax')(net)
    
    model = tf.keras.models.Model(inputs=text_input, outputs=out)
    model.compile(tf.keras.optimizers.Adam(learning_rate=1e-5), loss='categorical_crossentropy', metrics=metrics)
    
    return model

In [52]:
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

noFraud_model = model(METRICS)
noFraud_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 keras_layer_2 (KerasLayer)     {'input_mask': (Non  0           ['input_2[0][0]']                
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128)}                                                

In [53]:
train_history = noFraud_model.fit(
    X_train, train_labels, 
    validation_split=0.2,
    epochs=3,
    verbose=1
)

Epoch 1/3
  3/349 [..............................] - ETA: 5:01:32 - loss: 1.4823 - accuracy: 0.6745 - precision: 0.3117 - recall: 0.2500