In [10]:
import os
import re
from email import message_from_binary_file, message_from_file
from email.errors import HeaderParseError
from email.header import decode_header, make_header
from email.utils import parseaddr,getaddresses
from urllib.parse import urlparse

import pandas as pd
from bs4 import BeautifulSoup

In [11]:
def extract_urls(text):
    """Return every URL-like token found in ``text``, in order of appearance.

    A token starts at a word boundary with ``http://``, ``https://`` or
    ``www.`` and extends over the following non-whitespace characters up to
    the last word boundary, so trailing punctuation such as ``,`` or ``.``
    is not part of the match.
    """
    matcher = re.compile(r'\b(?:https?://|www\.)\S+\b')
    # The pattern has no capturing groups, so group(0) is the full match.
    return [hit.group(0) for hit in matcher.finditer(text)]

In [12]:
def safe_decode(payload, charset):
    """
    Safely turn an email payload into text.

    Parameters
    ----------
    payload : bytes | str | None
        Usually the result of ``Message.get_payload(decode=True)``.
    charset : str | None
        Charset declared for the payload; ``None`` or an empty string
        means UTF-8.

    Returns
    -------
    str
        The decoded text. Undecodable bytes are dropped. ``None`` yields
        an empty string and a ``str`` payload is returned unchanged.
    """
    # get_payload(decode=True) returns None for multipart containers;
    # treat "no payload" as empty text instead of raising AttributeError.
    if payload is None:
        return ""
    # Already text: nothing to decode.
    if isinstance(payload, str):
        return payload
    try:
        return payload.decode(charset or "utf-8", errors="ignore")
    except (LookupError, TypeError):
        # Unknown codec name (LookupError) or a non-string charset
        # (TypeError): fall back to UTF-8.
        return payload.decode("utf-8", errors="ignore")


In [13]:
def _decode_subject(raw_subject):
    """Return the Subject header as text, decoding RFC 2047 encoded words."""
    try:
        return str(make_header(decode_header(raw_subject)))
    except (HeaderParseError, LookupError, UnicodeError, ValueError):
        # Malformed encoded word or unknown charset: keep the raw header
        # rather than losing the whole message.
        return str(raw_subject)


def _extract_body(msg):
    """Return the text body of ``msg``.

    For multipart messages the first inline ``text/plain`` part wins; if
    none exists, the first ``text/html`` part is used with its markup
    stripped.
    """
    if not msg.is_multipart():
        return safe_decode(msg.get_payload(decode=True) or b"", msg.get_content_charset())

    body = ""
    for part in msg.walk():
        c_type = part.get_content_type()

        if c_type == "text/plain" and part.get_content_disposition() is None:
            # An inline plain-text part is preferred over anything seen so far.
            return safe_decode(part.get_payload(decode=True) or b"", part.get_content_charset())

        if c_type == "text/html" and not body:
            html = safe_decode(part.get_payload(decode=True) or b"", part.get_content_charset())
            body = BeautifulSoup(html, "html.parser").get_text(" ", strip=True)

    return body


def extract_data(path):
    """Parse one raw email file and return its message-level features.

    Parameters
    ----------
    path : str
        Path of a file containing a single RFC 822 style message.

    Returns
    -------
    dict
        Keys: ``From`` (sender address), ``Subject``, ``Body``, ``URLs``
        (list of URL strings found in the body), ``SenderDomain``,
        ``HasURL`` (0/1), ``NumURLs``, ``NumCC``, ``NumBCC``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Parse the raw bytes. Reading the file as UTF-8 text drops bytes of
    # messages in other charsets and makes get_payload(decode=True)
    # re-encode non-ASCII characters incorrectly.
    with open(path, "rb") as f:
        msg = message_from_binary_file(f)

    # Headers holding raw non-ASCII bytes come back as Header objects, so
    # coerce to str before handing them to the address parsers.
    name, addr = parseaddr(str(msg.get("From", "")))
    subject = _decode_subject(msg.get("Subject", ""))

    body = _extract_body(msg)

    urls = extract_urls(body)
    num_urls = len(urls)
    has_url = 1 if num_urls > 0 else 0

    cc_list = getaddresses([str(v) for v in msg.get_all("Cc", [])])
    bcc_list = getaddresses([str(v) for v in msg.get_all("Bcc", [])])

    sender_domain = addr.split("@")[-1] if "@" in addr else ""

    return {
        "From": addr,
        "Subject": subject,
        "Body": body,
        "URLs": urls,
        "SenderDomain": sender_domain,
        "HasURL": has_url,
        "NumURLs": num_urls,
        "NumCC": len(cc_list),
        "NumBCC": len(bcc_list)
    }


In [14]:
import tldextract

def _text_after_host(url, registered_domain):
    """Return whatever follows ``registered_domain`` in the host part of ``url``."""
    # Strip a leading scheme only; a "://" further along belongs to the query.
    scheme, sep, remainder = url.partition("://")
    rest = remainder if sep and scheme.isalpha() else url

    # The authority (host, optional port) ends at the first '/', '?' or '#'.
    cut_points = [i for i in (rest.find(c) for c in "/?#") if i != -1]
    host_end = min(cut_points) if cut_points else len(rest)
    authority, tail = rest[:host_end], rest[host_end:]

    pos = authority.rfind(registered_domain)
    if pos == -1:
        # Marker not found in the host: keep the previous best-effort result.
        return url.split(registered_domain)[-1]
    return authority[pos + len(registered_domain):] + tail


def url_data(url_list):
    """Build one feature dict per URL.

    Parameters
    ----------
    url_list : iterable of str
        URLs to describe.

    Returns
    -------
    list of dict
        Each dict has the keys ``URL``, ``Domain``, ``Subdomain``, ``TLD``,
        ``Path`` (everything after ``domain.tld`` in the host, including
        any port, query and fragment; empty when domain or suffix is
        missing), ``Length`` and ``NumSpecialChars`` (count of
        non-alphanumeric characters).
    """
    url_dicts = []

    for url in url_list:
        extracted = tldextract.extract(url)
        domain = extracted.domain
        sub_domain = extracted.subdomain
        tld = extracted.suffix

        # Search for "domain.tld" inside the host only. Splitting the whole
        # URL picked the last occurrence, which is wrong for redirect URLs
        # that repeat their own domain in the query string.
        path = _text_after_host(url, f"{domain}.{tld}") if domain and tld else ""

        url_dicts.append({
            "URL": url,
            "Domain": domain,
            "Subdomain": sub_domain,
            "TLD": tld,
            "Path": path,
            "Length": len(url),
            "NumSpecialChars": sum(1 for c in url if not c.isalnum())
        })

    return url_dicts

In [15]:
# Smoke test: run both extraction helpers on a single sample message.
# NOTE(review): absolute, machine-specific path; this cell only runs where
# the corpus is stored at exactly this location.
path="F:/college/bca_college/sixth sem/project/code/data_files/easy_ham/0010.4996141de3f21e858c22f88231a9f463"

# `email` is the message-level feature dict returned by extract_data;
# `url` is the list of per-URL feature dicts built from its "URLs" entry.
email=extract_data(path)
url=url_data(email["URLs"])

In [16]:
# Root folder of the corpus; each labelled sub-folder lives directly under it.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
DATA_DIR = "F:/college/bca_college/sixth sem/project/code/data_files"

# (folder, label) pairs. Both ham folders share the "ham" label.
folders = [
    (f"{DATA_DIR}/spam_2", "spam"),
    (f"{DATA_DIR}/easy_ham", "ham"),
    (f"{DATA_DIR}/hard_ham", "ham")
]

email_data_list = []
url_data_list = []

for folder_path, label in folders:
    # os.listdir returns entries in arbitrary order; sort so the row order
    # of the generated CSVs is reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        filepath = os.path.join(folder_path, filename)
        if not os.path.isfile(filepath):
            continue
        try:
            # Email-level row: everything except the raw URL list.
            email_dict = extract_data(filepath)
            email_row = {k: v for k, v in email_dict.items() if k != "URLs"}
            email_row["Label"] = label
            email_data_list.append(email_row)

            # URL-level rows: one per URL, tagged with the message label.
            url_dicts = url_data(email_dict.get("URLs", []))
            for u in url_dicts:
                u["Label"] = label
            url_data_list.extend(url_dicts)

        except Exception as e:
            # Best effort: report the file and carry on with the rest.
            print(f"Error processing {filename}: {e}")

# Create DataFrames
df_emails = pd.DataFrame(email_data_list)
df_urls = pd.DataFrame(url_data_list)

# Save CSVs
df_emails.to_csv("email_dataset.csv", index=False)
df_urls.to_csv("url_dataset.csv", index=False)

print("Email DataFrame shape:", df_emails.shape)
print("URL DataFrame shape:", df_urls.shape)


Email DataFrame shape: (4189, 9)
URL DataFrame shape: (30596, 8)
