# In [None]:
import os
import re
import pandas as pd
from urllib.parse import urlparse

# --- CONFIGURATION OF HEURISTICS ---
# Tune the heuristics by editing the keyword and TLD lists below.

# Lower-case phrases counted by the urgency-keyword feature.
URGENCY_KEYWORDS = [
    'action required',
    'urgent',
    'immediate',
    'verify',
    'confirm',
    'account',
    'suspended',
    'locked',
    'restricted',
    'security',
    'unusual',
    'sign-in',
    'password',
    'update',
    'dispute',
    'limited',
]

# Top-level domains regarded as suspicious.
# NOTE(review): not referenced by any code visible in this part of the file.
SUSPICIOUS_TLDS = [
    '.xyz',
    '.top',
    '.club',
    '.buzz',
    '.monster',
    '.online',
    '.info',
]

# --- HELPER FUNCTIONS FOR PARSING ---

def _extract_field(label, text):
    """Return the stripped value that follows ``label:`` in *text*, or ''."""
    pattern = re.escape(label) + r':[ \t]*(.*)'
    # Prefer a label that starts a line, so that a value such as
    # "Subject: Reply to sender: Bob" is not mistaken for the Sender field.
    # Fall back to a match anywhere for files whose labels carry a prefix.
    match = (
        re.search(r'^[ \t]*' + pattern, text, re.IGNORECASE | re.MULTILINE)
        or re.search(pattern, text, re.IGNORECASE)
    )
    return match.group(1).strip() if match else ''


def parse_email_file(file_path):
    """
    Parse one raw email text file into a dictionary of fields.

    The file is split into sections on marker lines of the form
    ``=== SECTION NAME ===`` (upper-case letters and whitespace). Sections or
    fields that are absent produce empty values instead of raising.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A dict with the keys:
            'subject', 'sender', 'sender_email': stripped strings, '' if absent.
            'attachments': list of stripped attachment names, [] if none.
            'body': stripped text of the EMAIL BODY section, '' if absent.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # errors='replace' keeps a file with a few undecodable bytes usable
    # (they become U+FFFD) instead of failing the whole file.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()

    # Splitting on a capturing group interleaves the captured headers with the
    # text between them: [preamble, header1, text1, header2, text2, ...].
    pieces = re.split(r'=== ([A-Z\s]+) ===', content)
    section_map = {
        header.strip(): text.strip()
        for header, text in zip(pieces[1::2], pieces[2::2])
    }

    # --- Extracting Basic Information ---
    basic_info = section_map.get('BASIC INFORMATION', '')
    email_data = {
        'subject': _extract_field('Subject', basic_info),
        'sender': _extract_field('Sender', basic_info),
        'sender_email': _extract_field('Sender Email', basic_info),
    }

    # --- Extracting Attachments ---
    attachment_info = section_map.get('ATTACHMENTS ANALYSIS', '')
    # Strip names like the other fields so trailing whitespace cannot hide a
    # file extension from later checks; blank names are dropped.
    email_data['attachments'] = [
        name.strip()
        for name in re.findall(r'Name: (.*)', attachment_info)
        if name.strip()
    ]

    # --- Extracting Body ---
    email_data['body'] = section_map.get('EMAIL BODY', '')

    return email_data


# --- HEURISTIC FUNCTIONS (FEATURE EXTRACTORS) ---
# Each function returns a 1 (suspicious) or 0 (not suspicious), or a count.

def feature_sender_name_mismatch(sender, sender_email):
    """
    Flag a display name that shares no word with the sender's email domain.

    Args:
        sender: Display name of the sender (e.g. "PayPal Support").
        sender_email: Sender's email address.

    Returns:
        0 if either argument is empty, or if any word of the name longer than
        two characters occurs in the domain (case-insensitive).
        1 if no such word is found, or if the address contains no '@'.

    Examples:
        "PayPal" with "service@paypal.com"        -> 0
        "PayPal" with "no-reply@secure-login.com" -> 1
        "PayPa1" with "no-reply@paypa1.com"       -> 0 (a look-alike name that
            matches its own look-alike domain is NOT flagged by this check)
    """
    if not sender or not sender_email:
        return 0

    # The domain is whatever follows the LAST '@'; an address such as
    # '"x@paypal.com"@evil.example' must be judged on 'evil.example'.
    _, at_sign, domain = sender_email.lower().rpartition('@')
    if not at_sign:
        return 1  # Malformed email address

    for token in sender.lower().split():
        # Drop punctuation around the word so "PayPal," or "Inc." can still
        # be found inside the domain.
        word = re.sub(r'^\W+|\W+$', '', token)
        if len(word) > 2 and word in domain:
            return 0  # Found a match, likely not a mismatch
    return 1  # No part of the sender name was in the domain


def feature_suspicious_sender_domain(sender_email, lookalike_tokens=('paypa1', 'g00gle')):
    """
    Flag sender addresses containing known look-alike brand spellings.

    The comparison is case-insensitive and scans the whole address (local part
    and domain), e.g. 'paypa1.com' standing in for 'paypal.com'.

    Args:
        sender_email: Sender's email address; may be empty or None.
        lookalike_tokens: Substrings that mark an address as suspicious.
            Defaults to the two spellings this check has always used.

    Returns:
        1 if any token occurs in the address, otherwise 0 (also 0 for an
        empty address).
    """
    if not sender_email:
        return 0
    address = sender_email.lower()
    return int(any(token.lower() in address for token in lookalike_tokens))


def feature_urgency_keyword_count(subject, body, keywords=None):
    """
    Count how many distinct urgency keywords occur in the subject or body.

    Each keyword counts at most once, however often it appears. Matching is a
    case-insensitive substring test, so 'limited' also matches 'unlimited'.

    Args:
        subject: Email subject; None is treated as empty.
        body: Email body; None is treated as empty.
        keywords: Keywords to look for. Defaults to URGENCY_KEYWORDS.

    Returns:
        The number of keywords found (0 when there is no text).
    """
    if keywords is None:
        keywords = URGENCY_KEYWORDS
    # Join with a space so a phrase cannot be formed accidentally by gluing
    # the end of the subject to the start of the body.
    text = f"{subject or ''} {body or ''}".lower()
    return sum(1 for keyword in keywords if keyword.lower() in text)


def feature_attachment_is_executable(attachments):
    """
    Flag emails carrying an attachment with a dangerous file extension.

    Args:
        attachments: Iterable of attachment file names; may be empty or None.

    Returns:
        1 if any name ends in .exe, .zip, .scr, .msi or .bat
        (case-insensitive), otherwise 0.
    """
    if not attachments:
        return 0
    risky_suffixes = ('.exe', '.zip', '.scr', '.msi', '.bat')
    # Names may arrive padded with whitespace or wrapped in quotes; remove
    # those first so they cannot hide the real extension.
    return int(any(
        name.strip(' \t\r\n"\'').lower().endswith(risky_suffixes)
        for name in attachments
    ))


def feature_body_has_no_text(body):
    """
    Flag emails whose body holds almost no text (e.g. image-only messages).

    Returns 1 when the body is non-empty and shorter than 50 characters after
    stripping surrounding whitespace; otherwise 0. An empty or missing body
    yields 0.
    """
    # Simplified check: the HTML is not analysed, only the text length.
    if not body:
        return 0
    stripped_length = len(body.strip())
    return 1 if stripped_length < 50 else 0  # 50 is an arbitrary cut-off


# --- MAIN PROCESSING SCRIPT ---

def _prompt_for_path(prompt):
    """Ask the user for a path and return it without stray whitespace or quotes."""
    # Paths pasted from a file manager often arrive wrapped in quotes or with
    # a trailing space, which would make the existence check fail.
    return input(prompt).strip().strip('"\'')


def process_emails_to_csv():
    """
    Run the whole pipeline: prompt for paths, extract features, write a CSV.

    Every ``.txt`` file in the chosen directory is parsed with
    parse_email_file() and turned into one row of heuristic features. Files
    that fail to parse are reported and skipped. The resulting table has a
    'filename' column followed by one column per feature, and is written to
    the chosen CSV path.

    Returns:
        None. Progress, warnings and a preview of the data are printed.
    """
    print("--- Email Feature Extraction Script ---")

    # 1. Get user input for paths
    input_dir = _prompt_for_path("➡️ Enter the full path to the directory containing your email text files: ")
    output_csv_path = _prompt_for_path("➡️ Enter the full path where you want to save the output CSV file (e.g., /path/to/features.csv): ")

    if not os.path.isdir(input_dir):
        print(f"❌ Error: The directory '{input_dir}' does not exist.")
        return

    rows = []

    # 2. Iterate through all files in the directory.
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the CSV reproducible from run to run.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".txt"):  # Assuming files are .txt
            continue
        file_path = os.path.join(input_dir, filename)
        try:
            email_data = parse_email_file(file_path)

            # 3. Apply all heuristic functions to get the feature vector.
            # 'filename' goes first so it becomes the first CSV column.
            rows.append({
                'filename': filename,
                'sender_name_mismatch': feature_sender_name_mismatch(email_data['sender'], email_data['sender_email']),
                'suspicious_sender_domain': feature_suspicious_sender_domain(email_data['sender_email']),
                'urgency_keyword_count': feature_urgency_keyword_count(email_data['subject'], email_data['body']),
                'attachment_is_executable': feature_attachment_is_executable(email_data['attachments']),
                'body_has_no_text': feature_body_has_no_text(email_data['body']),
                # Add more feature function calls here as you create them
            })
        except Exception as e:
            # Deliberately broad: one unreadable or malformed file must not
            # abort the whole batch, so report it and move on.
            print(f"⚠️ Warning: Could not process file '{filename}'. Error: {e}")

    if not rows:
        print("❌ Error: No email files were processed. Please check the directory and file format.")
        return

    # 4. Create a Pandas DataFrame and save to CSV
    df = pd.DataFrame(rows)
    df.to_csv(output_csv_path, index=False)

    print(f"\n✅ Success! Processed {len(rows)} emails.")
    print(f"Feature data has been saved to: {output_csv_path}")
    print("\nPreview of the data:")
    print(df.head())

# --- RUN THE SCRIPT ---
# To execute, run this cell in your Jupyter Notebook or run the file as a script.
# The guard keeps the interactive prompts from firing when the file is imported.
if __name__ == "__main__":
    process_emails_to_csv()
