In [1]:
from confusable_homoglyphs import confusables
import os
import pandas as pd
import re

# Root directory of the project checkout. Set the ECE597_PROJECT_ROOT environment
# variable to run the notebook on another machine; when it is unset the original
# hard-coded location is used, so existing behaviour is unchanged.
project_root = os.environ.get(
    "ECE597_PROJECT_ROOT",
    "/Users/feiyixie/Projects/Summer-2024-ECE-597-Group8",
)

In [2]:
phishing_data_path = os.path.join(project_root, 'data', 'raw', 'CaptstoneProjectData_2024.csv')
normal_data_path = os.path.join(project_root, 'data', 'raw', 'EnronEmailDataset.csv')

# Only the first NORMAL_DATA_ROWS rows of the normal data are read for now.
NORMAL_DATA_ROWS = 10000

phishing_data = pd.read_csv(phishing_data_path)
normal_data = pd.read_csv(normal_data_path, nrows=NORMAL_DATA_ROWS)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
print("Phishing Dataset Preview:")
print(phishing_data.head())
print("\nPhishing Dataset Info:")
phishing_data.info()
print("\nNormal Dataset Preview:")
print(normal_data.head())
print("\nNormal Dataset Info:")
normal_data.info()

Phishing Dataset Preview:
                                             Subject  \
0  ®Review your shipment details / Shipment Notif...   
1                            Υоur ассоunt іѕ оn hоld   
2  Completed: Invoice # KZ89TYS2564 from-Bestbuy....   
3                             UVic IMPORTANT NOTICE!   
4          You have (6) Suspended incoming messages    

                                                Body  Unnamed: 2  Unnamed: 3  
0  Notice: This message was sent from outside the...         NaN         NaN  
1  \r\nVotre réponse a bien été prise en compte.\...         NaN         NaN  
2  Notice: This message was sent from outside the...         NaN         NaN  
3  Your UVIC account has been filed under the lis...         NaN         NaN  
4  \r\n\r\nMessage generated from  uvic.ca source...         NaN         NaN  

Phishing Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2576 entries, 0 to 2575
Data columns (total 4 columns):
 #   Column      Non-Null Count  

In [3]:
# Remove unnecessary columns, then remove rows with a missing subject or body.
# The frame is rebound instead of mutated with inplace=True, and errors='ignore'
# is passed to drop(), so re-running this cell after the 'Unnamed' columns are
# already gone no longer raises a KeyError.
phishing_data = (
    phishing_data
    .drop(columns=['Unnamed: 2', 'Unnamed: 3'], errors='ignore')
    .dropna(subset=['Subject', 'Body'])
)

def clean_homoglyphs(text):
    """Normalise the whitespace of ``text``.

    Each run of whitespace characters (spaces, tabs, newlines, ...) is
    collapsed into a single space, and leading/trailing whitespace is removed.
    Note that, despite its name, this function does not replace any
    characters other than whitespace.

    Args:
        text: The string to normalise.

    Returns:
        The whitespace-normalised string.
    """
    return re.sub(r'\s+', ' ', text).strip()

# Normalise whitespace in both text columns.
for column in ('Subject', 'Body'):
    phishing_data[column] = phishing_data[column].apply(clean_homoglyphs)

# Preview the cleaned dataset
print("Cleaned Phishing Dataset Preview:")
print(phishing_data.head())
print("\nCleaned Phishing Dataset Info:")
# DataFrame.info() prints its report itself and returns None; calling it
# directly avoids the stray "None" line that print(...) would append.
phishing_data.info()

Cleaned Phishing Dataset Preview:
                                             Subject  \
0  ®Review your shipment details / Shipment Notif...   
1                            Υоur ассоunt іѕ оn hоld   
2  Completed: Invoice # KZ89TYS2564 from-Bestbuy....   
3                             UVic IMPORTANT NOTICE!   
4           You have (6) Suspended incoming messages   

                                                Body  
0  Notice: This message was sent from outside the...  
1  Votre réponse a bien été prise en compte. [htt...  
2  Notice: This message was sent from outside the...  
3  Your UVIC account has been filed under the lis...  
4  Message generated from uvic.ca source. Sender ...  

Cleaned Phishing Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 2463 entries, 0 to 2575
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Subject  2463 non-null   object
 1   Body     2463 non-null   object
dtypes: object(2)


In [4]:
def clean_homoglyphs_normal(text):
    """Split a raw e-mail message into its subject and its header-free text.

    Args:
        text: The raw message string, including its header lines.

    Returns:
        A ``(subject, cleaned_text)`` tuple. ``subject`` is the remainder of
        the first line that starts with ``Subject:``, or the literal
        ``"Subject Not Found"`` when no such line exists. ``cleaned_text`` is
        ``text`` with the first span running from ``Message-ID:`` through the
        end of the ``X-FileName:`` line removed; ``text`` is returned
        unchanged when that span is absent.
    """
    # Anchor the search to the start of a line (re.M) so that "Subject: "
    # appearing in the middle of another line is not mistaken for the header.
    # The space after the colon is optional so an empty header still matches.
    subject_match = re.search(r"^Subject: ?(.*)", text, flags=re.M)
    subject = subject_match.group(1) if subject_match else "Subject Not Found"

    # Remove everything from "Message-ID:" up to and including the
    # "X-FileName:" line. count=1 limits this to the first such span so that
    # a similar-looking span quoted later in the body is kept as body text.
    cleaned_text = re.sub(
        r"Message-ID:.*?X-FileName:.*?\n", "", text, count=1, flags=re.S
    )

    return subject, cleaned_text

# Parse every raw message once and build a single two-column frame from the
# resulting (subject, body) tuples. This avoids constructing a pd.Series per
# row inside apply(), which is very slow on thousands of messages.
parsed_messages = pd.DataFrame(
    [clean_homoglyphs_normal(message) for message in normal_data['message']],
    columns=['Subject', 'Body'],
    index=normal_data.index,
)
for column in ('Subject', 'Body'):
    normal_data[column] = parsed_messages[column].apply(clean_homoglyphs)

# Display the cleaned data
# .loc addresses the row label and the column in one step instead of chained indexing.
print(normal_data.loc[500, 'Subject'])
print(normal_data.loc[500, 'Body'])

Re: Draft of Opposition to ORA/TURN petition
---------------------- Forwarded by Phillip K Allen/HOU/ECT on 01/25/2001 08:17 AM --------------------------- From: Leslie Lawner@ENRON on 01/24/2001 08:17 PM CST To: MBD <MDay@GMSSR.com> cc: Harry Kingerski/NA/Enron@Enron, Jeff Dasovich/NA/Enron@Enron, James D Steffes/NA/Enron@Enron, Phillip K Allen/HOU/ECT@ECT, Don Black/HOU/EES@EES, James Shirley/HOU/EES@EES, Frank Ermis/HOU/ECT@ECT, Paul Kaufman/PDX/ECT@ECT Subject: Re: Draft of Opposition to ORA/TURN petition Everything is short and sweet except the caption! One comment. The very last sentence reads : PG&E can continue to physically divert gas if necessary . . . " SInce they haven't actually begun to divert yet, let's change that sentence to read "PG&E has the continuing right to physically divert gas if necessary..." I will send this around for comment. Thanks for your promptness. Any comments, anyone? MBD <MDay@GMSSR.com> 01/24/2001 03:47 PM To: "'llawner@enron.com'" <llawner@enron.c

In [5]:
def count_homoglyphs_scripts(text):
    """Count the words of ``text`` that ``confusables.is_dangerous`` flags.

    ``text`` is split on whitespace and every resulting word is passed to
    ``confusables.is_dangerous``; the results are added up.

    Args:
        text: The string to inspect.

    Returns:
        The sum of the ``is_dangerous`` results over all words (0 for text
        without any words).
    """
    flagged_total = 0
    for token in text.split():
        flagged_total += confusables.is_dangerous(token)
    return flagged_total

# Score every e-mail on its 'Subject' and 'Body' joined by a single space.
# The columns are concatenated as whole Series and the counter is applied once
# per e-mail, replacing the duplicated row-wise apply(axis=1) lambda.
for frame in (phishing_data, normal_data):
    combined_text = frame['Subject'] + " " + frame['Body']
    frame['Homoglyphs Scripts Count'] = combined_text.apply(count_homoglyphs_scripts)

# Display the DataFrame with the new count of dangerous words
print("Phishing Data:")
print(phishing_data[['Homoglyphs Scripts Count']])
print("Normal Data:")
print(normal_data[['Homoglyphs Scripts Count']])

Phishing Data:
      Homoglyphs Scripts Count
0                            0
1                           91
2                            0
3                            0
4                            0
...                        ...
2571                         0
2572                         0
2573                         0
2574                         0
2575                         0

[2463 rows x 1 columns]
Normal Data:
      Homoglyphs Scripts Count
0                            0
1                            0
2                            0
3                            0
4                            0
...                        ...
9995                         0
9996                         0
9997                         0
9998                         0
9999                         0

[10000 rows x 1 columns]


In [6]:
# Drop the raw text columns so that only the computed feature remains.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable after the columns have already been removed.
phishing_data = phishing_data.drop(columns=['Subject', 'Body'], errors='ignore')
normal_data = normal_data.drop(columns=['file', 'message', 'Subject', 'Body'], errors='ignore')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; print(df.info()) would append a stray "None" line.
print("Phishing Data:")
phishing_data.info()
print(phishing_data.describe())
print("Normal Data:")
normal_data.info()
print(normal_data.describe())

Phishing Data:
<class 'pandas.core.frame.DataFrame'>
Index: 2463 entries, 0 to 2575
Data columns (total 1 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Homoglyphs Scripts Count  2463 non-null   int64
dtypes: int64(1)
memory usage: 38.5 KB
None
       Homoglyphs Scripts Count
count               2463.000000
mean                   0.362972
std                    4.047921
min                    0.000000
25%                    0.000000
50%                    0.000000
75%                    0.000000
max                   91.000000
Normal Data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 1 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Homoglyphs Scripts Count  10000 non-null  int64
dtypes: int64(1)
memory usage: 78.3 KB
None
       Homoglyphs Scripts Count
count                   10000.0
mean  

In [7]:
# Write the feature tables to data/processed. The directory is created first
# because to_csv() does not create missing parent directories.
processed_dir = os.path.join(project_root, 'data', 'processed')
os.makedirs(processed_dir, exist_ok=True)

phishing_data.to_csv(os.path.join(processed_dir, 'homoglyphs_phishing.csv'), index=False)
normal_data.to_csv(os.path.join(processed_dir, 'homoglyphs_normal.csv'), index=False)