# Below is a representative Jupyter Notebook outline (in Python code blocks). It demonstrates reading raw emails, cleaning text, and splitting into train/test sets.

# 1. Imports

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('punkt')

# For advanced Hebrew-specific processing, we will consider specialized libraries or rely on transformers' tokenization.

In [None]:
from sklearn.model_selection import train_test_split

# 2. Load Raw Data

In [None]:
# The input is assumed to be a CSV with columns: ['subject', 'body', 'category', 'priority']
raw_data_path = "../data/raw/emails.csv"
df = pd.read_csv(filepath_or_buffer=raw_data_path)

# 3. Basic Cleanup

In [None]:
def clean_text(text):
    """Return a normalized, lowercased copy of an email text field.

    HTML tags and URLs are stripped, runs of whitespace are collapsed to a
    single space, and the result is trimmed and lowercased.

    Args:
        text: The raw field value. Usually a ``str``; missing values
            (``None`` or float NaN, which is how pandas represents empty
            CSV cells) are accepted, and any other non-string value is
            converted with ``str()``.

    Returns:
        The cleaned string, or ``""`` when the value is missing.
    """
    # pandas hands empty CSV cells to .apply() as float NaN (NaN != NaN);
    # re.sub would raise TypeError on them, so treat them as empty text.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Remove HTML tags
    text = re.sub(r'<[^>]*>', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Normalize spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Lowercase for English. Hebrew letters have no case, so lower() leaves
    # them unchanged; any Hebrew-specific handling has to happen elsewhere.
    return text.lower()

# Run clean_text over each raw text column and store the result in '<column>_clean'
for raw_column in ('subject', 'body'):
    df[raw_column + '_clean'] = df[raw_column].apply(clean_text)

# Join the cleaned subject and body, separated by one space, into a single text field
df['full_text'] = df['subject_clean'] + " " + df['body_clean']

# 4. Train/Test Split

In [None]:
# Assumed label sets: categories such as ["HR", "Finance", "Support", ...] and
# priorities ["High", "Medium", "Low"].
# Hold out 20% of the rows for testing; stratifying on 'category' keeps the
# category proportions similar in both splits, and the fixed seed makes the
# split reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['category'],
)

# 5. Save Processed Datasets

In [None]:
from pathlib import Path

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame index out of the saved files.
train_df.to_csv(processed_dir / "train.csv", index=False)
test_df.to_csv(processed_dir / "test.csv", index=False)

print("Data preparation completed. Processed files saved to ../data/processed/")