In [1]:
import os
import pandas as pd
import re

In [2]:
project_root = "/Users/feiyixie/Projects/Summer-2024-ECE-597-Group8"

In [3]:
def normal_extract_subject_body(text):
    # Extract the subject using a regular expression
    subject_match = re.search(r"Subject: (.*)", text)
    subject = subject_match.group(1) if subject_match else "Subject Not Found"

    # Clean the text by removing everything from "Message-ID:" up to "X-FileName:"
    cleaned_text = re.sub(r"Message-ID:.*?X-FileName:.*?\n", "", text, flags=re.S)

    return subject, cleaned_text

In [4]:
phishing_data_path = os.path.join(
    project_root, "data", "raw", "CaptstoneProjectData_2024.csv"
)
normal_data_path = os.path.join(
    project_root, "data", "raw", "EnronEmailDataset.csv"
)

phishing_data = pd.read_csv(phishing_data_path)
normal_data = pd.read_csv(
    normal_data_path, nrows=2500
)  # Read only 5000 rows of normal data for now

phishing_data.drop(columns=["Unnamed: 2", "Unnamed: 3"], inplace=True)
phishing_data.dropna(
    subset=["Subject", "Body"], inplace=True
)  # Remove rows with missing subject or body
normal_data[["Subject", "Body"]] = normal_data["message"].apply(
    lambda x: pd.Series(normal_extract_subject_body(x))
)

phishing_data.head()

Unnamed: 0,Subject,Body
0,®Review your shipment details / Shipment Notif...,Notice: This message was sent from outside the...
1,Υоur ассоunt іѕ оn hоld,\r\nVotre réponse a bien été prise en compte.\...
2,Completed: Invoice # KZ89TYS2564 from-Bestbuy....,Notice: This message was sent from outside the...
3,UVic IMPORTANT NOTICE!,Your UVIC account has been filed under the lis...
4,You have (6) Suspended incoming messages,\r\n\r\nMessage generated from uvic.ca source...


In [5]:
from homoglyphs import feature_homoglyphs

df_phishing_data_homoglyphs = feature_homoglyphs(phishing_data)
df_normal_data_homoglyphs = feature_homoglyphs(normal_data)

print("Phishing Data")
print(df_phishing_data_homoglyphs.head())
print("Normal Data")
print(df_phishing_data_homoglyphs.head())

Phishing Data
   Homoglyphs
0           0
1          91
2           0
3           0
4           0
Normal Data
   Homoglyphs
0           0
1          91
2           0
3           0
4           0


In [6]:
from abnormal_number_extract import extract_abnormal_number

df_phishing_data_abnormal_number = extract_abnormal_number(phishing_data)
df_normal_data_abnormal_number = extract_abnormal_number(normal_data)

print("Phishing Data")
print(df_phishing_data_abnormal_number.head())
print("Normal Data")
print(df_normal_data_abnormal_number.head())

Phishing Data
   Amount  Tracking  Postal  Domestic_Phone  International_Phone
0       0         0       0               0                    0
1       0         0       0               0                    0
2       1         0       0               0                    0
3       0         0       0               0                    0
4       0         0       0               0                    0
Normal Data
   Amount  Tracking  Postal  Domestic_Phone  International_Phone
0       0         0       0               0                    0
1       0         0       0               0                    0
2       0         0       0               0                    0
3       0         0       0               0                    0
4       0         0       0               0                    0


In [7]:
from ip_JS_address_link import extract_features_from_message

# Example application of new feature extraction
normal_data['features'] = normal_data['message'].apply(extract_features_from_message)
phishing_data['features'] = phishing_data.apply(
    lambda row: extract_features_from_message(
        str(row['Subject']) + ' ' + str(row['Body'])
    ), axis=1
)

# Convert the extracted features to DataFrame
columns = ['length', 'num_links', 'num_special_chars', 'num_keywords', 'num_ip_addresses', 'html_tags', 'js_code']
df_normal_specials = pd.DataFrame(normal_data['features'].tolist(), columns=columns)
df_phishing_specials = pd.DataFrame(phishing_data['features'].tolist(), columns=columns)
print("Phishing Data")
print(df_phishing_specials.head())
print("Normal Data")
print(df_normal_specials.head())

Phishing Data
   length  num_links  num_special_chars  num_keywords  num_ip_addresses  \
0    1059          2                 40             0                 0   
1    1324          3                 50             1                 0   
2    3243          9                128             1                 0   
3     550          1                 11             8                 0   
4    1255          2                 34             1                 0   

   html_tags  js_code  
0       True    False  
1       True    False  
2       True    False  
3       True    False  
4       True    False  
Normal Data
   length  num_links  num_special_chars  num_keywords  num_ip_addresses  \
0      24          0                  0             0                 0   
1     787          0                 11             0                 0   
2      31          0                  4             0                 0   
3     188          0                  5             0                 0   
4   

In [8]:
from phishing_bow import feature_extract_bow

df_phishing_data_bow = feature_extract_bow(phishing_data)
df_normal_data_bow = feature_extract_bow(normal_data)

print("Phishing Data")
print(df_phishing_data_bow.head())
print("Normal Data")
print(df_normal_data_bow.head())

[nltk_data] Downloading package punkt to /Users/feiyixie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/feiyixie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/feiyixie/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/feiyixie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Phishing Data
   bow_access  bow_account  bow_action  bow_address  bow_attachment  \
0           0            0           1            2               0   
1           0            0           0            0               0   
2           1            0           0            0               0   
3           0            5           0            0               0   
4           0            0           2            1               0   

   bow_automatically  bow_bank  bow_best  bow_business  bow_change  ...  \
0                  0         0         0             0           0  ...   
1                  0         0         0             0           0  ...   
2                  0         0         3             1           0  ...   
3                  0         0         0             0           0  ...   
4                  0         0         0             0           0  ...   

   bow_uvic  bow_verify  bow_victoria  bow_view  bow_visit  bow_want  bow_web  \
0         0           0    

In [9]:
from phishing_tfidf import feature_extract_tfidf

df_phishing_data_tfidf = feature_extract_tfidf(phishing_data)
df_normal_data_tfidf = feature_extract_tfidf(normal_data)

print("Phishing Data")
print(df_phishing_data_tfidf.head())
print("Normal Data")
print(df_phishing_data_tfidf.head())

[nltk_data] Downloading package punkt to /Users/feiyixie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/feiyixie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/feiyixie/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/feiyixie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Phishing Data
   tfidf_access  tfidf_account  tfidf_action  tfidf_address  tfidf_attachment  \
0       0.00000       0.000000      0.225402       0.327261               0.0   
1       0.00000       0.000000      0.000000       0.000000               0.0   
2       0.07028       0.000000      0.000000       0.000000               0.0   
3       0.00000       0.430048      0.000000       0.000000               0.0   
4       0.00000       0.000000      0.317442       0.115223               0.0   

   tfidf_automatically  tfidf_bank  tfidf_best  tfidf_business  tfidf_change  \
0                  0.0         0.0    0.000000        0.000000           0.0   
1                  0.0         0.0    0.000000        0.000000           0.0   
2                  0.0         0.0    0.194923        0.068991           0.0   
3                  0.0         0.0    0.000000        0.000000           0.0   
4                  0.0         0.0    0.000000        0.000000           0.0   

   ...  tfidf_uvic

In [12]:
df_phishing_features_tfidf = pd.concat([df_phishing_data_homoglyphs, df_phishing_data_abnormal_number, df_phishing_specials, df_phishing_data_tfidf], axis=1)
df_normal_features_tfidf = pd.concat([df_normal_data_homoglyphs, df_normal_data_abnormal_number, df_normal_specials, df_normal_data_tfidf], axis=1)
df_phishing_features_bow = pd.concat([df_phishing_data_homoglyphs, df_phishing_data_abnormal_number, df_phishing_specials, df_phishing_data_bow], axis=1)
df_normal_features_bow = pd.concat([df_normal_data_homoglyphs, df_normal_data_abnormal_number, df_normal_specials, df_normal_data_bow], axis=1)