In [None]:
import os
import pandas as pd
import re

In [None]:
# Repository root used to build every data path in this notebook.
# Set the ECE597_PROJECT_ROOT environment variable to run on another machine;
# the original developer path is kept as the fallback so existing setups still work.
project_root = os.environ.get(
    "ECE597_PROJECT_ROOT",
    "/Users/feiyixie/Projects/Summer-2024-ECE-597-Group8",
)
# Maximum number of rows read from the normal-email CSV (passed to `nrows`).
# NOTE(review): presumably 2571 is the phishing row count and 9 the
# normal-to-phishing ratio — confirm against the phishing CSV.
normal_sample_numbers = 2571 * 9

In [None]:
def normal_extract_subject_body(text):
    """Split a raw e-mail message into its subject and header-stripped body.

    Parameters
    ----------
    text : str
        Raw message text containing a header block that runs from a
        ``Message-ID:`` line through an ``X-FileName:`` line.

    Returns
    -------
    tuple[str, str]
        ``(subject, cleaned_text)``. ``subject`` is the remainder of the first
        line that starts with ``Subject:`` or ``"Subject Not Found"`` when no
        such line exists. ``cleaned_text`` is ``text`` with the first
        ``Message-ID: ... X-FileName: ...`` header block removed; text without
        such a block is returned unchanged. Non-string input (e.g. a NaN cell
        read from a CSV) yields ``("Subject Not Found", "")``.
    """
    # `re` raises TypeError on non-strings such as the float NaN pandas uses
    # for empty CSV cells; treat those as a message with no subject and no body.
    if not isinstance(text, str):
        return "Subject Not Found", ""

    # Anchor to the start of a line so headers that merely contain the word
    # (e.g. "X-Subject: ...") are not mistaken for the real Subject header.
    subject_match = re.search(r"^Subject:[ \t]?(.*)", text, flags=re.M)
    subject = subject_match.group(1) if subject_match else "Subject Not Found"

    # Remove only the first header block (count=1) so quoted or forwarded
    # header-like text further down the body is preserved. The X-FileName line
    # may also be the last line of the message with no trailing newline.
    cleaned_text = re.sub(
        r"Message-ID:.*?X-FileName:[^\n]*(?:\n|\Z)",
        "",
        text,
        count=1,
        flags=re.S,
    )

    return subject, cleaned_text

In [None]:
# Locations of the two raw CSV files under <project_root>/data/raw.
raw_dir = os.path.join(project_root, "data", "raw")
phishing_data_path = os.path.join(raw_dir, "CaptstoneProjectData_2024.csv")
normal_data_path = os.path.join(raw_dir, "EnronEmailDataset.csv")

# Phishing e-mails: drop the two unnamed columns, give missing subjects a
# placeholder, discard rows without a body, renumber the rows, and label
# every remaining row as 1.
phishing_data = (
    pd.read_csv(phishing_data_path)
    .drop(columns=["Unnamed: 2", "Unnamed: 3"])
    .assign(Subject=lambda frame: frame["Subject"].fillna("Subject Not Found"))
    .dropna(subset=["Body"])
    .reset_index(drop=True)
    .assign(Label=1)
)

# Normal e-mails: read at most `normal_sample_numbers` rows and split each raw
# message into a subject and a header-stripped body.
normal_data = pd.read_csv(normal_data_path, nrows=normal_sample_numbers)

# Parse every message once and assign plain lists; building a pd.Series per
# row inside .apply() is far slower for the same resulting columns.
subject_body_pairs = [
    normal_extract_subject_body(message) for message in normal_data["message"]
]
normal_data["Subject"] = [subject for subject, _ in subject_body_pairs]
normal_data["Body"] = [body for _, body in subject_body_pairs]

# Label every normal e-mail as 0.
normal_data["Label"] = 0

In [None]:
# Stack the phishing rows on top of the normal rows and renumber from 0.
labelled_frames = [phishing_data, normal_data]
df_data = pd.concat(labelled_frames, ignore_index=True)
df_data.info()

# Keep the label column on its own so it can be attached to each feature set.
df_labels = df_data["Label"]
df_labels.info()

In [None]:
from homoglyphs import feature_homoglyphs

# Hand the extractor a copy of df_data rather than the shared frame itself.
homoglyph_input = df_data.copy()
df_homoglyphs = feature_homoglyphs(homoglyph_input)
df_homoglyphs.info()
# Last expression: rendered as the cell's output.
df_homoglyphs

In [None]:
from abnormal_number_extract import extract_abnormal_number

# Hand the extractor a copy of df_data rather than the shared frame itself.
abnormal_input = df_data.copy()
df_abnormal = extract_abnormal_number(abnormal_input)
df_abnormal.info()


In [None]:
from html_JS import extract_features_from_message


def _special_features_for_row(row):
    """Run the HTML/JS extractor on one row's subject and body joined by a space."""
    message_text = str(row['Subject']) + ' ' + str(row['Body'])
    return extract_features_from_message(message_text)


# One extractor result per e-mail, computed row by row on a copy of df_data.
special_feature_rows = df_data.copy().apply(_special_features_for_row, axis=1)

# Expand the per-row results into two named columns.
df_specials = pd.DataFrame(
    special_feature_rows.tolist(),
    columns=['html_tags', 'js_code'],
)
df_specials.info()

In [None]:
from phishing_bow import feature_extract_bow

# Hand the extractor a copy of df_data rather than the shared frame itself.
bow_input = df_data.copy()
df_bow = feature_extract_bow(bow_input)
df_bow.info()

In [None]:
from phishing_tfidf import feature_extract_tfidf

# Hand the extractor a copy of df_data rather than the shared frame itself.
tfidf_input = df_data.copy()
df_tfidf = feature_extract_tfidf(tfidf_input)
df_tfidf.info()

In [None]:
# Column-wise join of the three hand-crafted feature frames.
# NOTE(review): pd.concat(axis=1) aligns on the index, so this assumes every
# feature frame shares df_data's 0..n-1 index — confirm for each extractor.
handcrafted_frames = [df_homoglyphs, df_abnormal, df_specials]
df_features = pd.concat(handcrafted_frames, axis=1)


def _with_labels(*feature_frames):
    """Join the given frames column-wise and append the label column last."""
    return pd.concat([*feature_frames, df_labels], axis=1)


# Five datasets: each feature family alone, then hand-crafted features
# combined with bag-of-words and with TF-IDF.
df_features_labels = _with_labels(df_features)
df_bow_labels = _with_labels(df_bow)
df_tfidf_labels = _with_labels(df_tfidf)
df_features_bow_labels = _with_labels(df_features, df_bow)
df_features_tfidf_labels = _with_labels(df_features, df_tfidf)

In [None]:
def _shuffled(frame):
    """Return all rows of `frame` in a seeded random order with a fresh 0..n-1 index."""
    reordered = frame.sample(frac=1, random_state=42)
    return reordered.reset_index(drop=True)


# Every dataset is shuffled with the same fixed seed (42).
df_features_labels = _shuffled(df_features_labels)
df_bow_labels = _shuffled(df_bow_labels)
df_tfidf_labels = _shuffled(df_tfidf_labels)
df_features_bow_labels = _shuffled(df_features_bow_labels)
df_features_tfidf_labels = _shuffled(df_features_tfidf_labels)

In [None]:
# save the data
processed_dir = os.path.join(project_root, "data", "processed")
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs(processed_dir, exist_ok=True)

# Write each dataset to its own CSV without the index column.
for csv_name, dataset in (
    ("features_labels.csv", df_features_labels),
    ("bow_labels.csv", df_bow_labels),
    ("tfidf_labels.csv", df_tfidf_labels),
    ("features_bow_labels.csv", df_features_bow_labels),
    ("features_tfidf_labels.csv", df_features_tfidf_labels),
):
    dataset.to_csv(os.path.join(processed_dir, csv_name), index=False)