In [1]:
import nltk
from nltk.corpus import stopwords
import numpy as np
import os
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer

project_root = "/Users/feiyixie/Projects/Summer-2024-ECE-597-Group8"

In [2]:
phishing_data_path = os.path.join(project_root, 'data', 'raw', 'CaptstoneProjectData_2024.csv')

phishing_data = pd.read_csv(phishing_data_path)

print("Phishing Dataset Preview:")
print(phishing_data.head())
print("\nPhishing Dataset Info:")
print(phishing_data.info())

Phishing Dataset Preview:
                                             Subject  \
0  ®Review your shipment details / Shipment Notif...   
1                            Υоur ассоunt іѕ оn hоld   
2  Completed: Invoice # KZ89TYS2564 from-Bestbuy....   
3                             UVic IMPORTANT NOTICE!   
4          You have (6) Suspended incoming messages    

                                                Body  Unnamed: 2  Unnamed: 3  
0  Notice: This message was sent from outside the...         NaN         NaN  
1  \r\nVotre réponse a bien été prise en compte.\...         NaN         NaN  
2  Notice: This message was sent from outside the...         NaN         NaN  
3  Your UVIC account has been filed under the lis...         NaN         NaN  
4  \r\n\r\nMessage generated from  uvic.ca source...         NaN         NaN  

Phishing Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2576 entries, 0 to 2575
Data columns (total 4 columns):
 #   Column      Non-Null Count  

In [3]:
# Remove unnecessary columns
phishing_data.drop(columns=['Unnamed: 2', 'Unnamed: 3'], inplace=True)

# Handle missing values
phishing_data['Subject'].fillna('No Subject', inplace=True)  # Replace missing subjects with 'No Subject'
phishing_data['Body'].fillna('No Body', inplace=True)  # Replace missing bodies with 'No Body'

# Text cleaning
def clean_whitespace(text):
    # Remove leading/trailing whitespace, replace line breaks and other typical cleaning tasks
    text = text.strip().replace('\r\n', ' ').replace('\n', ' ')
    return text

phishing_data['Subject'] = phishing_data['Subject'].apply(clean_whitespace)
phishing_data['Body'] = phishing_data['Body'].apply(clean_whitespace)

# Preview the cleaned dataset
print("Cleaned Phishing Dataset Preview:")
print(phishing_data.head())
print("\nCleaned Phishing Dataset Info:")
print(phishing_data.info())

Cleaned Phishing Dataset Preview:
                                             Subject  \
0  ®Review your shipment details / Shipment Notif...   
1                            Υоur ассоunt іѕ оn hоld   
2  Completed: Invoice # KZ89TYS2564 from-Bestbuy....   
3                             UVic IMPORTANT NOTICE!   
4           You have (6) Suspended incoming messages   

                                                Body  
0  Notice: This message was sent from outside the...  
1  Votre réponse a bien été prise en compte. [htt...  
2  Notice: This message was sent from outside the...  
3  Your UVIC account has been filed under the lis...  
4  Message generated from  uvic.ca source.   Send...  

Cleaned Phishing Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2576 entries, 0 to 2575
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Subject  2576 non-null   object
 1   Body     2576 non-null   object
dtypes: objec

In [4]:
# Define functions to extract emails and URLs and apply them to the DataFrame
def extract_emails(text):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    emails = re.findall(email_pattern, text)
    return emails

def extract_urls(text):
    url_pattern = r'\b(?:https?|ftp):\/\/\S+|\bwww\.\S+\.\S+'
    urls = re.findall(url_pattern, text)
    return urls

# Apply the functions to the dataset
phishing_data['Emails'] = phishing_data['Body'].apply(extract_emails)
phishing_data['URLs'] = phishing_data['Body'].apply(extract_urls)

# Calculate the number of emails and URLs
phishing_data['Num_Emails'] = phishing_data['Emails'].apply(len)
phishing_data['Num_URLs'] = phishing_data['URLs'].apply(len)

# Display the new features in the dataset
print(phishing_data[['Body', 'Emails', 'Num_Emails', 'URLs', 'Num_URLs']].head())

                                                Body  \
0  Notice: This message was sent from outside the...   
1  Votre réponse a bien été prise en compte. [htt...   
2  Notice: This message was sent from outside the...   
3  Your UVIC account has been filed under the lis...   
4  Message generated from  uvic.ca source.   Send...   

                                              Emails  Num_Emails  \
0        [amuench@uvic.ca, hudsonesajoyce@gmail.com]           2   
1                                    [foipp@uvic.ca]           1   
2  [auwaluu.ma.r.bu.ba@googlemail.com, icon-Downl...           2   
3                                                 []           0   
4               [helpdesk@uvic.ca, helpdesk@uvic.ca]           2   

                                                URLs  Num_URLs  
0  [https://www.canadapost-postescanada.ca/cpc/as...         2  
1  [https://www.edigitalagency.com.au/wp-content/...         3  
2  [https://NA4.docusign.net/member/Images/email/...       

In [5]:
phishing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2576 entries, 0 to 2575
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Subject     2576 non-null   object
 1   Body        2576 non-null   object
 2   Emails      2576 non-null   object
 3   URLs        2576 non-null   object
 4   Num_Emails  2576 non-null   int64 
 5   Num_URLs    2576 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 120.9+ KB


In [8]:
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    
    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)

    # Remove non-text characters and numbers
    text = re.sub(r'\W+', ' ', text)  # Replace any non-word character (includes underscores) with space

    # Remove digits
    text = re.sub(r'\d+', ' ', text)  # Replace all digits with space

    # Remove any long repeated characters (e.g., "__________", "-----", etc.)
    text = re.sub(r'(.)\1{2,}', r'\1', text)  # Collapse extended repetitions of the same character

    # Convert to lower case
    text = text.lower().strip()

    # Remove stop words and short words
    text = ' '.join([word for word in text.split() if len(word) > 2])

    return text

# Clean your data
phishing_data['Body'] = phishing_data['Body'].apply(clean_text)

In [9]:
# Initialize the TF-IDF Vectorizer without including stop words (since we've already removed them)
tfidf_vectorizer = TfidfVectorizer(stop_words=None)  # Stop words are already removed

# Fit and transform the 'Body' text
tfidf_features = tfidf_vectorizer.fit_transform(phishing_data['Body'])

# Convert to a DataFrame (optional, for better integration with Pandas)
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Preview the TF-IDF features DataFrame
print(tfidf_df.head())

   __alfywrlczlyoc_m  _assets_logo_metamask  _base  _blackfridaybanner  \
0                0.0                    0.0    0.0                 0.0   
1                0.0                    0.0    0.0                 0.0   
2                0.0                    0.0    0.0                 0.0   
3                0.0                    0.0    0.0                 0.0   
4                0.0                    0.0    0.0                 0.0   

   _blankview  _bt  _ca  _capellauniversity_edu  _cb  _class_hdr_  ...  \
0         0.0  0.0  0.0                     0.0  0.0          0.0  ...   
1         0.0  0.0  0.0                     0.0  0.0          0.0  ...   
2         0.0  0.0  0.0                     0.0  0.0          0.0  ...   
3         0.0  0.0  0.0                     0.0  0.0          0.0  ...   
4         0.0  0.0  0.0                     0.0  0.0          0.0  ...   

   日赢得了价值  确认我的帐户确认我的帐户确认我的帐户确认我的帐户  美元的捐款  送信者に直ちにその旨返信し  通过以下方式直接联系我进行索赔  \
0     0.0                       

In [10]:
# Define the root directory of your project
project_root = "/Users/feiyixie/Projects/Summer-2024-ECE-597-Group8"

# Define the path for saving the CSV within your project structure
output_path = os.path.join(project_root, 'data', 'processed', 'tfidf_feature_names.csv')

# Export the column names to a CSV file
columns_df = pd.DataFrame(tfidf_df.columns, columns=['Feature Names'])
columns_df.to_csv(output_path, index=False)

print("Feature names exported successfully to:", output_path)

Feature names exported successfully to: /Users/feiyixie/Projects/Summer-2024-ECE-597-Group8/data/processed/tfidf_feature_names.csv
