In [None]:
import pandas as pd
import spacy
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from random import choice
import torch

# Load the spaCy "en_core_web_sm" pipeline once at module level; clean_text()
# passes every email through it to obtain lemmas, stop-word flags and named
# entities.
nlp = spacy.load("en_core_web_sm")

# Simple synonym dictionary for augmentation.
# Maps a word to the list of replacements simple_augment() may pick from.
# Keys are compared against whitespace-separated tokens exactly as written
# (case-sensitive, no punctuation stripping), so they should be lowercase
# single words.
synonym_dict = {
    'meeting': ['appointment', 'conference', 'session'],
    'schedule': ['plan', 'arrange', 'organize'],
    'issue': ['problem', 'concern', 'trouble'],
    'thanks': ['thank', 'appreciate', 'gratitude'],
    'update': ['progress', 'report', 'status']
}

def simple_augment(text, prob=0.3, synonyms=None):
    """Randomly swap known words in *text* for one of their synonyms.

    The text is split on whitespace; every token that is a key of the synonym
    table is replaced, with probability *prob*, by a randomly chosen entry
    from that key's list. Tokens are re-joined with single spaces, so runs of
    whitespace in the input are collapsed.

    Args:
        text: The string to augment. Non-string values and empty or
            whitespace-only strings are returned unchanged.
        prob: Probability in [0, 1] of replacing each matching token.
            Defaults to 0.3.
        synonyms: Mapping of word -> list of replacement words. Defaults to
            the module-level ``synonym_dict``.

    Returns:
        The augmented string, or *text* itself when it is not a non-blank
        string.

    Raises:
        ValueError: If *prob* is outside [0, 1].
    """
    if not 0.0 <= prob <= 1.0:
        raise ValueError(f"prob must be between 0 and 1, got {prob!r}")
    if not isinstance(text, str) or not text.strip():
        return text

    # Resolve the default lazily so callers can supply their own table.
    table = synonym_dict if synonyms is None else synonyms

    words = text.split()
    for i, word in enumerate(words):
        # The random draw only happens for tokens that have synonyms.
        if word in table and np.random.rand() < prob:
            words[i] = choice(table[word])
    return ' '.join(words)

# Load dataset (process in chunks for 500,000+ emails)
chunk_size = 10000

# Initialize lists to store processed data
email_texts = []
entities_list = []

# Compiled once: these patterns are applied to every email.
_WHITESPACE_RE = re.compile(r'\s+')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_ENTITY_LABELS = frozenset({"PERSON", "DATE", "TIME"})


def clean_text(text):
    """Normalize one email and extract a few named entities from it.

    Whitespace runs (including newlines) are collapsed to a single space,
    punctuation is removed, and the lower-cased result is run through the
    spaCy pipeline ``nlp``.

    Args:
        text: Raw email text. Anything that is not a ``str`` yields an empty
            result.

    Returns:
        A ``(cleaned_text, entities)`` tuple. ``cleaned_text`` is the
        space-joined lemmas of all alphabetic, non-stop-word tokens.
        ``entities`` maps an entity label (PERSON, DATE or TIME) to the text
        of an entity with that label.
    """
    if not isinstance(text, str):
        return '', {}
    text = _WHITESPACE_RE.sub(' ', text)  # Newlines and extra spaces -> one space
    text = _PUNCTUATION_RE.sub('', text)  # Remove punctuation
    # NOTE(review): entity recognition runs on lower-cased, punctuation-free
    # text, which may reduce PERSON recall -- confirm this is intended.
    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    # NOTE(review): the dict is keyed by label, so when several entities share
    # a label only the last one is kept.
    entities = {ent.label_: ent.text for ent in doc.ents if ent.label_ in _ENTITY_LABELS}
    return ' '.join(tokens), entities


# The reader is opened as a context manager so the underlying file is closed
# even though the loop exits early.
with pd.read_csv('../data/emails.csv', chunksize=chunk_size) as df_chunks:
    for chunk in df_chunks:
        # Combine the 'file' and 'message' columns into one text field.
        # NOTE(review): the earlier comment called this "subject and message",
        # but the column read here is 'file' -- confirm it is the intended one.
        chunk['email_text'] = chunk['file'].fillna('') + ' ' + chunk['message'].fillna('')

        # Clean every email once, then split the (text, entities) pairs into
        # two columns; this avoids building a pandas Series per row.
        processed = [clean_text(text) for text in chunk['email_text']]
        chunk['cleaned_text'] = [cleaned for cleaned, _ in processed]
        chunk['entities'] = [entities for _, entities in processed]

        email_texts.extend(chunk['cleaned_text'].tolist())
        entities_list.extend(chunk['entities'].tolist())

        break  # Process only first chunk

# Zero-shot classification for labeling: each email gets the intent whose
# description embedding is closest (by cosine similarity) to the email's
# embedding.
model = SentenceTransformer('all-MiniLM-L6-v2')
intents = ['meeting_request', 'complaint', 'social', 'task_update', 'general']
# intent_descriptions[i] is the natural-language description of intents[i].
intent_descriptions = [
    'Emails requesting to schedule or arrange a meeting or appointment.',
    'Emails expressing dissatisfaction, issues, or complaints.',
    'Emails with casual greetings, thanks, or social interactions.',
    'Emails providing updates or progress on tasks or projects.',
    'Emails with miscellaneous or unspecified content.'
]

# Only the first chunk_size emails are labeled; slice once and reuse.
sample_texts = email_texts[:chunk_size]

# Encode intents and emails
intent_embeddings = model.encode(intent_descriptions, convert_to_tensor=True)
email_embeddings = model.encode(sample_texts, convert_to_tensor=True, show_progress_bar=True)

# Compute cosine similarities for all (email, intent) pairs in one matrix
# product: after L2-normalizing the rows, the dot product of two rows is
# their cosine similarity.
if sample_texts:
    email_unit = torch.nn.functional.normalize(email_embeddings, p=2, dim=1)
    intent_unit = torch.nn.functional.normalize(intent_embeddings, p=2, dim=1)
    similarities = (email_unit @ intent_unit.T).cpu()
else:
    # With no emails there may be no feature dimension to normalize over,
    # so build the empty (0, n_intents) result directly.
    similarities = torch.zeros((0, len(intents)))

# Row-wise argmax picks the best-matching intent for each email.
labels = [intents[idx] for idx in similarities.argmax(dim=1).tolist()]

# Create DataFrame with one row per labeled email (a fresh DataFrame already
# has a unique 0..n-1 index).
df_sample = pd.DataFrame({
    'email_text': email_texts[:chunk_size],
    'label': labels,
    'entities': entities_list[:chunk_size]
})

# Apply augmentation
df_sample['augmented_text'] = df_sample['email_text'].apply(simple_augment)

# simple_augment only rewrites a word some of the time, so many rows come back
# unchanged. Keep only the rows whose text actually differs; otherwise the
# "augmented" set would add exact duplicates of the originals.
was_changed = df_sample['augmented_text'] != df_sample['email_text']
df_augmented = (
    df_sample.loc[was_changed, ['augmented_text', 'label', 'entities']]
    .rename(columns={'augmented_text': 'email_text'})
    .reset_index(drop=True)
)

# Concatenate original and augmented DataFrames under a fresh index
df_sample = pd.concat(
    [df_sample[['email_text', 'label', 'entities']], df_augmented],
    ignore_index=True,
)

# Shuffle with a fixed seed so the row order is reproducible, then save
df_sample = df_sample.sample(frac=1, random_state=42).reset_index(drop=True)
df_sample.to_csv('../data/labeled_emails.csv', index=False)

print(f"Preprocessing Done! Labeled data saved with {len(df_sample)} samples.")