# In [None]:
# notebooks/1_preprocessing.ipynb

import pandas as pd
import re

# Load dataset (relative path, resolved against the current working directory)
df = pd.read_csv('../data/raw_enron.csv')

# Inspect columns (usually: 'subject', 'message', ...)
print(df.columns)

# Fail fast with a readable message when the export does not have the
# expected layout, rather than an opaque KeyError from the concatenation below.
missing_columns = [col for col in ('subject', 'message') if col not in df.columns]
if missing_columns:
    raise KeyError(
        "raw_enron.csv is missing required column(s) {}; found {}".format(
            missing_columns, list(df.columns)
        )
    )

# Combine subject + message as email_text. Missing values are replaced with ''
# first, because NaN + str would make the whole combined value NaN.
df['email_text'] = df['subject'].fillna('') + ' ' + df['message'].fillna('')

# Basic cleaning function
def clean_text(text: str) -> str:
    """Return *text* with every run of whitespace collapsed to one space.

    Newlines, tabs and repeated spaces are all whitespace, so a single
    substitution handles them; no separate newline pass is needed.
    Leading and trailing whitespace is stripped from the result.
    """
    return re.sub(r'\s+', ' ', text).strip()

# Normalise whitespace in every combined email.
df['email_text'] = df['email_text'].apply(clean_text)

# For demo, let's create a small sample with labels manually (simulate).
# The size is capped at the number of available rows: DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling without replacement).
sample_size = min(500, len(df))
sample = df[['email_text']].sample(sample_size, random_state=42).copy()

# Manually add labels for demo (simulate a label column with random categories).
# NOTE: these labels are random placeholders, not real annotations.
import random
labels = ['meeting_request', 'complaint', 'social', 'task_update', 'general']
# A dedicated seeded generator keeps the labels reproducible across runs,
# consistent with the fixed random_state used for the row sample above.
label_rng = random.Random(42)
sample['label'] = label_rng.choices(labels, k=len(sample))

# Save labeled data for training
sample.to_csv('../data/labeled_emails.csv', index=False)

print("Preprocessing Done! Labeled data saved.")
