<a href="https://colab.research.google.com/github/Tony9801/2021-repo/blob/main/TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is the first attempt at our model. The run recorded below uses 2826 training samples and 719 testing samples.


1. Import required modules

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from email.parser import BytesParser
from email.policy import default
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import confusion_matrix
from tensorflow.keras import regularizers

2. Import data

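From the header part, only the "Subject", "Content-Type" and "To" fields are used; they are combined with the message body into a single text sample.
The data is read from two folders: `/content/train` -> training data,
`/content/test` -> testing data.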

In [None]:
def _decode_part(part):
    """Decode one non-multipart MIME part to text.

    Uses the charset declared on the part and falls back to UTF-8 when none is
    declared or the declared name is unknown to Python. Undecodable bytes are
    dropped (``errors='ignore'``).
    """
    payload = part.get_payload(decode=True)
    if not isinstance(payload, bytes):
        return ''
    charset = part.get_content_charset() or 'utf-8'
    try:
        return payload.decode(charset, errors='ignore')
    except LookupError:
        # The declared charset is not a codec Python knows about.
        return payload.decode('utf-8', errors='ignore')


def _extract_body_text(msg):
    """Return the body text of ``msg`` as a single string.

    A non-multipart message is decoded as-is. For a multipart message,
    ``get_payload(decode=True)`` returns ``None``, so the ``text/*`` leaf parts
    are decoded individually and joined with spaces instead.
    """
    if not msg.is_multipart():
        return _decode_part(msg)
    return ' '.join(
        _decode_part(part)
        for part in msg.walk()
        if not part.is_multipart() and part.get_content_maintype() == 'text'
    )


def load_and_preprocess_data(directory):
    """Load every ``.eml`` file under ``directory`` as a text sample with a label.

    Each sample is the space-separated concatenation of the "Subject",
    "Content-Type" and "To" headers followed by the decoded message body.
    Missing headers contribute an empty string.

    Args:
        directory: Root folder that is walked recursively with ``os.walk``.

    Returns:
        A tuple ``(emails, labels)`` of two equally long lists. ``labels[i]``
        is 1 when the path of the folder containing the file includes the
        substring ``'phishing'`` and 0 otherwise.
    """
    emails = []
    labels = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith('.eml'):
                continue

            with open(os.path.join(root, file), 'rb') as f:
                msg = BytesParser(policy=default).parse(f)

            subject = msg.get('Subject', '')
            content_type = msg.get('Content-Type', '')
            # Default to '' so an absent header does not inject the text "None".
            to_ = msg.get('To', '')

            email_content = _extract_body_text(msg)

            # Separate the fields with spaces so the last token of one field is
            # not fused with the first token of the next.
            combined_content = f"{subject} {content_type} {to_} {email_content}"
            emails.append(combined_content)
            # The label comes from the folder path, not from the message itself.
            labels.append(1 if 'phishing' in root else 0)
    return emails, labels


# Read the training folder and report how many samples it produced.
emails, labels = load_and_preprocess_data('/content/train')
print(f'total number of training samples is {len(emails)}')

# Read the testing folder the same way.
test_emails, test_labels = load_and_preprocess_data('/content/test')
print(f'total number of testing samples is {len(test_emails)}')

total number of training samples is 2826
total number of testing samples is 719


3. Feature Extraction

TF-IDF, which stands for Term Frequency-Inverse Document Frequency, is a numerical statistic used in information retrieval, natural language processing, and text mining to quantify the importance of a term in a document within a collection or corpus of documents. The TF-IDF value increases proportionally to the number of times a term appears in a document and is offset by the frequency of the term in the corpus.

In [None]:
# Build the TF-IDF vocabulary (at most 8000 terms, English stop words removed)
# from the training emails only.
vectorizer = TfidfVectorizer(stop_words='english', max_features=8000)
email_features = vectorizer.fit_transform(emails).toarray()
# Project the test emails onto the vocabulary and IDF weights learned from the
# training set. Calling fit_transform here would re-fit the vectorizer on the
# test data, so the test columns would no longer match the training columns.
test_email_features = vectorizer.transform(test_emails).toarray()

In [None]:
# Targets as NumPy arrays, in the same order as the email lists.
y_train, y_test = np.array(labels), np.array(test_labels)

# Fit the vectorizer on the training emails, then reuse that fitted state to
# transform the test emails; both are converted to dense arrays for Keras.
train_matrix = vectorizer.fit_transform(emails)
test_matrix = vectorizer.transform(test_emails)
X_train = train_matrix.toarray()
X_test = test_matrix.toarray()

5. Define the model 1

In [None]:
# Feed-forward classifier over the TF-IDF features: three ReLU layers with
# dropout after each, ending in a single sigmoid unit for the binary label.
# The input width is read from X_train rather than hard-coded: max_features on
# the vectorizer is only an upper bound, so the matrix can have fewer columns.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.4)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.2)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

6. Train the model 1

In [None]:
# Keras' validation_split takes the LAST 20% of the rows without shuffling.
# The samples are stored folder by folder (and the label is derived from the
# folder), so that tail would not be representative of both classes. Hold out
# a shuffled, stratified 20% instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)
model.fit(X_fit, y_fit, epochs=20, batch_size=64, validation_data=(X_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fc07f3ee280>

7. Evaluate the model

In [None]:
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
y_pred = (model.predict(X_test) > 0.5).astype(int)
accuracy_1 = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_1:.4f}")

# Pin the label order so the matrix is always 2x2; without it, a test set or
# prediction vector containing a single class yields a 1x1 matrix and the
# four-way unpacking below raises ValueError.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

TN, FP, FN, TP = cm.ravel()

# A rate is undefined when its class is absent from y_test; report NaN
# instead of dividing by zero.
FPR = FP / (FP + TN) if (FP + TN) else float('nan')
FNR = FN / (FN + TP) if (FN + TP) else float('nan')

print(f"False Positive Rate: {FPR:.4f}")
print(f"False Negative Rate: {FNR:.4f}")

Accuracy: 0.9930
False Positive Rate: 0.0075
False Negative Rate: 0.0063


8. Use recurrent layers

In [None]:
max_sequence_length = 100
vocab_size = 10000

# Fit the word index on the training emails only; words outside the
# vocab_size most frequent ones map to the '<OOV>' token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(emails)
train_sequences = tokenizer.texts_to_sequences(emails)
test_sequences = tokenizer.texts_to_sequences(test_emails)

# Keep the first max_sequence_length tokens of each email and zero-pad shorter
# ones at the end.
X_train_RNN = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post', truncating='post')
X_test_RNN = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post', truncating='post')

embedding_size = 200

# Embedding -> LSTM (per-timestep outputs) -> max over time -> dense head with
# a single sigmoid unit for the binary label.
model_RNN = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_size, input_length=max_sequence_length),
    tf.keras.layers.LSTM(32, return_sequences=True),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.1)),
    tf.keras.layers.Dropout(0.8),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_RNN.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Keras' validation_split takes the LAST 20% of the rows without shuffling.
# The samples are stored folder by folder (and the label is derived from the
# folder), so that tail would not be representative of both classes. Hold out
# a shuffled, stratified 20% instead.
X_fit_RNN, X_val_RNN, y_fit_RNN, y_val_RNN = train_test_split(
    X_train_RNN, y_train, test_size=0.2, stratify=y_train, random_state=42
)
model_RNN.fit(X_fit_RNN, y_fit_RNN, epochs=13, batch_size=32, validation_data=(X_val_RNN, y_val_RNN))



Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13


<keras.callbacks.History at 0x7fc07f0766a0>

In [None]:
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
y_pred_RNN = (model_RNN.predict(X_test_RNN) > 0.5).astype(int)
accuracy2 = accuracy_score(y_test, y_pred_RNN)
print(f"Accuracy: {accuracy2:.4f}")

# Pin the label order so the matrix is always 2x2; without it, a test set or
# prediction vector containing a single class yields a 1x1 matrix and the
# four-way unpacking below raises ValueError.
cm = confusion_matrix(y_test, y_pred_RNN, labels=[0, 1])

TN, FP, FN, TP = cm.ravel()

# A rate is undefined when its class is absent from y_test; report NaN
# instead of dividing by zero.
FPR = FP / (FP + TN) if (FP + TN) else float('nan')
FNR = FN / (FN + TP) if (FN + TP) else float('nan')

print(f"False Positive Rate: {FPR:.4f}")
print(f"False Negative Rate: {FNR:.4f}")

Accuracy: 0.9930
False Positive Rate: 0.0050
False Negative Rate: 0.0094
