# In [None]:
import inspect

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# === Load labeled data
# The rest of this script reads the 'email_text' and 'label' columns of this
# frame, so the CSV must provide both.
df = pd.read_csv('../data/labeled_emails.csv')

# === Dynamic reply templates with placeholders
# Keyed by intent label. Each value is a str.format template whose fields are
# drawn from {name}, {time}, {date} and {location}. Adjacent string literals
# are joined by the parser, so every value is still one single string.
reply_templates = {
    'meeting_request': (
        "Dear {name}, Thank you for your meeting request. "
        "I am available at {time} on {date} at {location}. "
        "Looking forward to our discussion."
    ),
    'complaint': (
        "Dear {name}, We sincerely apologize for the inconvenience caused. "
        "We will investigate the issue and get back to you shortly."
    ),
    'social': (
        "Dear {name}, It’s always a pleasure to hear from you! "
        "Let’s catch up soon — perhaps at {location} around {time}?"
    ),
    'task_update': (
        "Dear {name}, Thank you for the update. "
        "I’ll review the task and respond before {date}."
    ),
    'general': (
        "Dear {name}, Thank you for your email. "
        "I’ll get back to you as soon as I can."
    ),
}

# === Simulate extracted entity fields (For demo, use dummy values)
def simulate_entity_fill(row):
    """Return fixed placeholder entities standing in for real extraction.

    Args:
        row: Not used; the same dummy values come back for any argument.

    Returns:
        A newly built dict with 'name', 'date', 'time' and 'location' keys,
        each mapped to a hard-coded demo string.
    """
    return dict(
        name='John',
        date='Monday',
        time='10 AM',
        location='Main Conference Room',
    )

# === Format reply with placeholders
def format_reply(label):
    """Render the reply template for ``label`` using the simulated entities.

    Args:
        label: Intent label used as the key into ``reply_templates``.

    Returns:
        The formatted reply string. Labels without a template of their own
        get the 'general' template.
    """
    fallback = reply_templates['general']
    chosen = reply_templates[label] if label in reply_templates else fallback
    # No row is available here, so the entity stub is called with None.
    entities = simulate_entity_fill(None)
    return chosen.format(**entities)

# Each row gets the templated reply that matches its intent label.
df['reply'] = df['label'].map(format_reply)

# === Build input text with structure
# NOTE(review): the intent tag is placed after the email body, and
# EmailReplyDataset tokenizes with truncation=True, so a long email could lose
# its intent tag -- confirm whether that matters for training.
df['input_text'] = ("Email: " + df['email_text']) + (" | Intent: " + df['label'])

# === Dataset class for T5
class EmailReplyDataset(Dataset):
    """Tokenize (input text, target reply) pairs on demand for seq2seq training.

    Args:
        inputs: Indexable collection of source strings.
        targets: Indexable collection of target strings, aligned with
            ``inputs`` by position.
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors``, and returning a mapping with
            ``input_ids`` and ``attention_mask`` tensors that carry a leading
            batch dimension of 1.
        max_len: Length every sequence is padded or truncated to.
    """

    # Label value that the cross-entropy loss skips by default; used to keep
    # padding positions out of the training loss.
    IGNORE_INDEX = -100

    def __init__(self, inputs, targets, tokenizer, max_len=128):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples (driven by ``inputs``)."""
        return len(self.inputs)

    def _encode(self, text):
        """Tokenize one string to exactly ``max_len`` tokens as 'pt' tensors."""
        # Calling the tokenizer directly replaces the deprecated encode_plus.
        return self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one example.

        All three tensors are 1-D with ``max_len`` entries. Padding positions
        in ``labels`` are set to ``IGNORE_INDEX`` so they do not contribute to
        the loss.
        """
        source_enc = self._encode(self.inputs[idx])
        target_enc = self._encode(self.targets[idx])

        # squeeze(0) drops only the batch dimension; a bare squeeze() would
        # also collapse the sequence dimension when max_len == 1.
        target_mask = target_enc['attention_mask'].squeeze(0)
        labels = target_enc['input_ids'].squeeze(0).masked_fill(
            target_mask == 0, self.IGNORE_INDEX
        )

        return {
            'input_ids': source_enc['input_ids'].squeeze(0),
            'attention_mask': source_enc['attention_mask'].squeeze(0),
            'labels': labels,
        }

# === Split data
# Hold out 20% of the (input_text, reply) pairs for validation; the fixed
# random_state keeps the split the same from run to run.
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    df['input_text'].values, df['reply'].values, test_size=0.2, random_state=42
)

# === Load tokenizer and model
# Tokenizer and model are both loaded from the same 't5-small' checkpoint name.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# === Create datasets
# max_len is not passed, so EmailReplyDataset uses its default of 128.
train_dataset = EmailReplyDataset(train_inputs, train_targets, tokenizer)
val_dataset = EmailReplyDataset(val_inputs, val_targets, tokenizer)

# === Define training args
# Transformers renamed `evaluation_strategy` to `eval_strategy`, and newer
# releases no longer accept the old keyword. Pick whichever name the installed
# TrainingArguments exposes so the script runs on both old and new versions.
_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='../models/reply_generator',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; load_best_model_at_end needs the
    # two strategies to line up.
    save_strategy='epoch',
    logging_dir='../logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    # Keep at most two checkpoints on disk.
    save_total_limit=2,
    **{_eval_strategy_key: 'epoch'},
)

# === Trainer
# Only the model, arguments and the two datasets are supplied; no
# data_collator or compute_metrics is passed here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# === Train model
trainer.train()

# === Save model
# Model and tokenizer are written to the same directory that is passed as
# output_dir in the training arguments.
model.save_pretrained('../models/reply_generator')
tokenizer.save_pretrained('../models/reply_generator')

print("✅ Enhanced Reply Generator Training Complete & Model Saved!")
