In [7]:
import numpy as np
import os
import pandas as pd
import re

# Root directory of the project checkout. Set the PHISHING_PROJECT_ROOT
# environment variable to run the notebook on another machine; when it is
# unset, the original author's local path is used as the default.
project_root = os.environ.get(
    "PHISHING_PROJECT_ROOT",
    "/Users/feiyixie/Projects/Summer-2024-ECE-597-Group8",
)

In [8]:
# Path of the raw phishing e-mail CSV inside the project tree.
phishing_data_path = os.path.join(project_root, 'data', 'raw', 'CaptstoneProjectData_2024.csv')

# Load the raw CSV into a DataFrame.
phishing_data = pd.read_csv(phishing_data_path)

print("Phishing Dataset Preview:")
print(phishing_data.head())
print("\nPhishing Dataset Info:")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
phishing_data.info()

Phishing Dataset Preview:
                                             Subject  \
0  ®Review your shipment details / Shipment Notif...   
1                            Υоur ассоunt іѕ оn hоld   
2  Completed: Invoice # KZ89TYS2564 from-Bestbuy....   
3                             UVic IMPORTANT NOTICE!   
4          You have (6) Suspended incoming messages    

                                                Body  Unnamed: 2  Unnamed: 3  
0  Notice: This message was sent from outside the...         NaN         NaN  
1  \r\nVotre réponse a bien été prise en compte.\...         NaN         NaN  
2  Notice: This message was sent from outside the...         NaN         NaN  
3  Your UVIC account has been filed under the lis...         NaN         NaN  
4  \r\n\r\nMessage generated from  uvic.ca source...         NaN         NaN  

Phishing Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2576 entries, 0 to 2575
Data columns (total 4 columns):
 #   Column      Non-Null Count  

In [9]:
# Remove unnecessary columns.
# The result is assigned back (no inplace=True) and errors='ignore' is passed
# so that re-running this cell after the columns are already gone does not
# raise a KeyError.
phishing_data = phishing_data.drop(columns=['Unnamed: 2', 'Unnamed: 3'], errors='ignore')

# Handle missing values.
# fillna(inplace=True) on a column selection is chained assignment: it emits a
# FutureWarning in pandas 2.x and does not update the frame under
# Copy-on-Write, so the filled column is assigned back explicitly.
phishing_data['Subject'] = phishing_data['Subject'].fillna('No Subject')  # Replace missing subjects with 'No Subject'
phishing_data['Body'] = phishing_data['Body'].fillna('No Body')  # Replace missing bodies with 'No Body'

# Text cleaning
def clean_text(text):
    """Trim surrounding whitespace and flatten line breaks to single spaces.

    Args:
        text: The string to clean.

    Returns:
        The stripped string with every '\\r\\n', '\\n' and lone '\\r' replaced
        by one space. Runs of spaces are not collapsed.
    """
    text = text.strip()
    # '\r\n' must be handled first so a Windows line break becomes one space,
    # not two; the lone '\r' case covers bare carriage returns inside the text.
    for line_break in ('\r\n', '\n', '\r'):
        text = text.replace(line_break, ' ')
    # Optional: Add more cleaning steps here (e.g., regex to remove non-ASCII characters if they are not needed)
    return text

# Clean both text columns with clean_text.
phishing_data['Subject'] = phishing_data['Subject'].apply(clean_text)
phishing_data['Body'] = phishing_data['Body'].apply(clean_text)

# Preview the cleaned dataset
print("Cleaned Phishing Dataset Preview:")
print(phishing_data.head())
print("\nCleaned Phishing Dataset Info:")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
phishing_data.info()

Cleaned Phishing Dataset Preview:
                                             Subject  \
0  ®Review your shipment details / Shipment Notif...   
1                            Υоur ассоunt іѕ оn hоld   
2  Completed: Invoice # KZ89TYS2564 from-Bestbuy....   
3                             UVic IMPORTANT NOTICE!   
4           You have (6) Suspended incoming messages   

                                                Body  
0  Notice: This message was sent from outside the...  
1  Votre réponse a bien été prise en compte. [htt...  
2  Notice: This message was sent from outside the...  
3  Your UVIC account has been filed under the lis...  
4  Message generated from  uvic.ca source.   Send...  

Cleaned Phishing Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2576 entries, 0 to 2575
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Subject  2576 non-null   object
 1   Body     2576 non-null   object
dtypes: objec

In [10]:
# Define functions to extract emails and URLs and apply them to the DataFrame
def extract_emails(text):
    """Return every e-mail-address-like substring found in ``text``.

    Args:
        text: The string to search.

    Returns:
        A list of matched strings in order of appearance (duplicates kept);
        empty when nothing matches.
    """
    # The TLD class is [A-Za-z]; writing it as [A-Z|a-z] would also accept a
    # literal '|' character, since '|' is not alternation inside a class.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    emails = re.findall(email_pattern, text)
    return emails

def extract_urls(text):
    """Return every URL-like substring found in ``text``.

    A match is either an http/https/ftp URL or a bare ``www.`` host, and runs
    to the next whitespace character.

    Args:
        text: The string to search.

    Returns:
        A list of matched strings in order of appearance (duplicates kept),
        with trailing punctuation and closing brackets removed; empty when
        nothing matches.
    """
    url_pattern = r'\b(?:https?|ftp):\/\/\S+|\bwww\.\S+\.\S+'
    # \S+ runs up to the next whitespace, so a URL written as "[https://...]"
    # or at the end of a sentence drags the closing bracket / full stop along.
    # Strip those characters so the stored URL is usable as-is.
    trailing_punctuation = '.,;:!?)]}>\'"'
    urls = [url.rstrip(trailing_punctuation) for url in re.findall(url_pattern, text)]
    return urls

# Pull the address and link lists out of each message body
phishing_data['Emails'] = phishing_data['Body'].map(extract_emails)
phishing_data['URLs'] = phishing_data['Body'].map(extract_urls)

# Per-message counts are just the lengths of those lists
phishing_data['Num_Emails'] = phishing_data['Emails'].map(len)
phishing_data['Num_URLs'] = phishing_data['URLs'].map(len)

# Show the first rows of the body alongside the derived feature columns
print(phishing_data.loc[:, ['Body', 'Emails', 'Num_Emails', 'URLs', 'Num_URLs']].head())

                                                Body  \
0  Notice: This message was sent from outside the...   
1  Votre réponse a bien été prise en compte. [htt...   
2  Notice: This message was sent from outside the...   
3  Your UVIC account has been filed under the lis...   
4  Message generated from  uvic.ca source.   Send...   

                                              Emails  Num_Emails  \
0        [amuench@uvic.ca, hudsonesajoyce@gmail.com]           2   
1                                    [foipp@uvic.ca]           1   
2  [auwaluu.ma.r.bu.ba@googlemail.com, icon-Downl...           2   
3                                                 []           0   
4               [helpdesk@uvic.ca, helpdesk@uvic.ca]           2   

                                                URLs  Num_URLs  
0  [https://www.canadapost-postescanada.ca/cpc/as...         2  
1  [https://www.edigitalagency.com.au/wp-content/...         3  
2  [https://NA4.docusign.net/member/Images/email/...       

In [11]:
# Print a summary of the frame (columns, non-null counts, dtypes, memory usage)
# to confirm the feature columns were added.
phishing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2576 entries, 0 to 2575
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Subject     2576 non-null   object
 1   Body        2576 non-null   object
 2   Emails      2576 non-null   object
 3   URLs        2576 non-null   object
 4   Num_Emails  2576 non-null   int64 
 5   Num_URLs    2576 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 120.9+ KB
