In [1]:
import os
import pandas as pd
import re

In [2]:
# Repository root. Defaults to the original author's checkout; set the
# PHISHING_PROJECT_ROOT environment variable to run the notebook from another
# machine without editing this cell.
project_root = (
    os.environ.get("PHISHING_PROJECT_ROOT")
    or "/Users/feiyixie/Projects/Summer-2024-ECE-597-Group8"
)

# Number of rows read from the normal (Enron) CSV via `nrows`.
# NOTE(review): the saved outputs in this notebook report 25710 rows in total,
# of which 23139 (= 2571 * 9) have a non-null `file` column, so they appear to
# come from a run with a different value here — confirm the intended
# normal:phishing ratio and re-run all cells.
normal_sample_numbers = 2571 * 10

In [3]:
def normal_extract_subject_body(text):
    """Split a raw e-mail message into its subject and header-stripped text.

    Parameters
    ----------
    text : str
        Raw message text (header lines followed by the body).

    Returns
    -------
    tuple of (str, str)
        ``(subject, cleaned_text)``.

        ``subject`` is the value of the first line that *starts* with
        ``Subject:``, with surrounding whitespace removed. When there is no
        such line, or its value is empty, the placeholder
        ``"Subject Not Found"`` is returned, which is the same placeholder the
        notebook uses for phishing rows with a missing subject.

        ``cleaned_text`` is ``text`` with every span running from
        ``Message-ID:`` through the end of the ``X-FileName:`` line removed.
        Text without such a span is returned unchanged.

    Raises
    ------
    TypeError
        If ``text`` is not a string (for example a NaN from an empty CSV cell).
    """
    # Anchor to the start of a line so header names that merely end in
    # "Subject:" (e.g. "X-Subject:") are not mistaken for the subject line.
    # [ \t]* tolerates a missing space after the colon without ever running
    # onto the next line.
    subject_match = re.search(r"^Subject:[ \t]*(.*)$", text, flags=re.M)

    # strip() also drops a trailing carriage return from CRLF-terminated lines.
    subject = subject_match.group(1).strip() if subject_match else ""
    if not subject:
        subject = "Subject Not Found"

    # Remove everything from "Message-ID:" up to the end of the "X-FileName:" line.
    cleaned_text = re.sub(r"Message-ID:.*?X-FileName:.*?\n", "", text, flags=re.S)

    return subject, cleaned_text

In [4]:
# Locations of the two raw corpora under <project_root>/data/raw.
phishing_data_path = os.path.join(
    project_root, "data", "raw", "CaptstoneProjectData_2024.csv"
)
normal_data_path = os.path.join(
    project_root, "data", "raw", "EnronEmailDataset.csv"
)

# Phishing corpus: drop the two unnamed columns, discard rows without a body,
# renumber the rows, then give missing subjects the shared placeholder.
# Written as one chain (no inplace=True) so the frame is built in a single step.
phishing_data = (
    pd.read_csv(phishing_data_path)
    .drop(columns=["Unnamed: 2", "Unnamed: 3"])
    .dropna(subset=["Body"])
    .reset_index(drop=True)
)
phishing_data["Subject"] = phishing_data["Subject"].fillna("Subject Not Found")

# Normal corpus: only the first `normal_sample_numbers` rows are read.
normal_data = pd.read_csv(
    normal_data_path, nrows=normal_sample_numbers
)

# Parse every raw message once and build the two new columns from the list of
# (subject, body) results. This avoids constructing one pd.Series per row,
# which is what made the original `.apply(lambda x: pd.Series(...))` slow.
normal_data[["Subject", "Body"]] = pd.DataFrame(
    normal_data["message"].map(normal_extract_subject_body).tolist(),
    columns=["Subject", "Body"],
    index=normal_data.index,
)

# Class labels: 1 = phishing, 0 = normal.
phishing_data["Label"] = 1
normal_data["Label"] = 0

In [5]:
# Stack the phishing rows on top of the normal rows and renumber them 0..n-1.
# Columns present in only one source (`file`, `message`) are NaN for the other.
df_data = pd.concat((phishing_data, normal_data), axis=0, ignore_index=True)

# The label column on its own; it is appended to every feature table later on.
df_labels = df_data["Label"]

df_data.info()
df_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25710 entries, 0 to 25709
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Subject  25710 non-null  object
 1   Body     25710 non-null  object
 2   Label    25710 non-null  int64 
 3   file     23139 non-null  object
 4   message  23139 non-null  object
dtypes: int64(1), object(4)
memory usage: 1004.4+ KB
<class 'pandas.core.series.Series'>
RangeIndex: 25710 entries, 0 to 25709
Series name: Label
Non-Null Count  Dtype
--------------  -----
25710 non-null  int64
dtypes: int64(1)
memory usage: 201.0 KB


In [6]:
from homoglyphs import feature_homoglyphs

# The helper is handed its own deep copy, so df_data itself is never modified
# here. Per the saved output it returns a single int64 `Homoglyphs` column.
df_homoglyphs = feature_homoglyphs(df_data.copy(deep=True))
df_homoglyphs.info()
df_homoglyphs

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25710 entries, 0 to 25709
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Homoglyphs  25710 non-null  int64
dtypes: int64(1)
memory usage: 201.0 KB


Unnamed: 0,Homoglyphs
0,0
1,91
2,0
3,0
4,0
...,...
25705,0
25706,0
25707,0
25708,0


In [7]:
from abnormal_number_extract import extract_abnormal_number

# The helper works on a deep copy of df_data. Per the saved output it returns
# five int64 columns: Amount, Tracking, Postal, Domestic_Phone and
# International_Phone.
df_abnormal = extract_abnormal_number(df_data.copy(deep=True))
df_abnormal.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25710 entries, 0 to 25709
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Amount               25710 non-null  int64
 1   Tracking             25710 non-null  int64
 2   Postal               25710 non-null  int64
 3   Domestic_Phone       25710 non-null  int64
 4   International_Phone  25710 non-null  int64
dtypes: int64(5)
memory usage: 1004.4 KB


In [8]:
from html_JS import extract_features_from_message

# One (html_tags, js_code) result per message. Each message is the subject and
# the body joined by a single space, both passed through str().
# Iterating over the two columns directly avoids copying the whole frame and
# the per-row Series that DataFrame.apply(axis=1) builds; the text handed to
# the extractor is exactly the same.
df_specials = pd.DataFrame(
    [
        extract_features_from_message(str(subject) + ' ' + str(body))
        for subject, body in zip(df_data['Subject'], df_data['Body'])
    ],
    columns=['html_tags', 'js_code'],
)
df_specials.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25710 entries, 0 to 25709
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   html_tags  25710 non-null  bool 
 1   js_code    25710 non-null  bool 
dtypes: bool(2)
memory usage: 50.3 KB


In [9]:
from phishing_bow import feature_extract_bow

# Bag-of-words features, computed from a deep copy of df_data. Per the saved
# output the result has 192 int64 columns prefixed with `bow_`.
df_bow = feature_extract_bow(df_data.copy(deep=True))
df_bow.info()

[nltk_data] Downloading package punkt to /Users/feiyixie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/feiyixie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/feiyixie/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/feiyixie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25710 entries, 0 to 25709
Columns: 192 entries, bow_able to bow_year
dtypes: int64(192)
memory usage: 37.7 MB


In [10]:
from phishing_tfidf import feature_extract_tfidf

# TF-IDF features, computed from a deep copy of df_data. Per the saved output
# the result has 192 float64 columns prefixed with `tfidf_`.
df_tfidf = feature_extract_tfidf(df_data.copy(deep=True))
df_tfidf.info()

[nltk_data] Downloading package punkt to /Users/feiyixie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/feiyixie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/feiyixie/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/feiyixie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25710 entries, 0 to 25709
Columns: 192 entries, tfidf_able to tfidf_year
dtypes: float64(192)
memory usage: 37.7 MB


In [11]:
# Hand-crafted features placed side by side: homoglyph count, abnormal-number
# counts and the HTML/JS flags.
df_features = pd.concat(
    (df_homoglyphs, df_abnormal, df_specials), axis="columns"
)

# One modelling table per feature combination, always with Label as the last
# column. Frames are aligned on their row index.
df_features_labels = pd.concat((df_features, df_labels), axis="columns")
df_bow_labels = pd.concat((df_bow, df_labels), axis="columns")
df_tfidf_labels = pd.concat((df_tfidf, df_labels), axis="columns")
df_features_bow_labels = pd.concat(
    (df_features, df_bow, df_labels), axis="columns"
)
df_features_tfidf_labels = pd.concat(
    (df_features, df_tfidf, df_labels), axis="columns"
)

In [12]:
# Shuffle every table with the same seed, then renumber the rows.
# NOTE: each name is rebound to its own shuffled version, so running this cell
# a second time shuffles the already-shuffled tables again.
(
    df_features_labels,
    df_bow_labels,
    df_tfidf_labels,
    df_features_bow_labels,
    df_features_tfidf_labels,
) = (
    table.sample(frac=1, random_state=42).reset_index(drop=True)
    for table in (
        df_features_labels,
        df_bow_labels,
        df_tfidf_labels,
        df_features_bow_labels,
        df_features_tfidf_labels,
    )
)


In [13]:
# save the data
# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists before writing (a fresh checkout may not have it).
processed_dir = os.path.join(project_root, "data", "processed")
os.makedirs(processed_dir, exist_ok=True)

# One CSV per feature combination, written without the row index.
for file_name, table in (
    ("features_labels.csv", df_features_labels),
    ("bow_labels.csv", df_bow_labels),
    ("tfidf_labels.csv", df_tfidf_labels),
    ("features_bow_labels.csv", df_features_bow_labels),
    ("features_tfidf_labels.csv", df_features_tfidf_labels),
):
    table.to_csv(os.path.join(processed_dir, file_name), index=False)