# In [None]:
# notebooks/3_reply_generator.ipynb

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Read the labelled e-mail corpus into a DataFrame; the code below reads its
# 'email_text' and 'label' columns.
LABELED_EMAILS_CSV = '../data/labeled_emails.csv'
df = pd.read_csv(LABELED_EMAILS_CSV)

# Reply generation is trained on (email -> reply) pairs.
# The Enron dataset does not ship reply texts, so replies are simulated here
# with one fixed template per e-mail category.
# In a real project, use a dataset of genuine email/reply pairs or annotate
# a sample manually.

# Category label -> canned reply; 'general' doubles as the fallback entry.
reply_templates = dict(
    meeting_request="Dear sender, Thank you for your meeting request. I am available at your suggested time.",
    complaint="Dear sender, We apologize for the inconvenience. We will look into this issue immediately.",
    social="Dear sender, It was great to hear from you. Looking forward to catching up soon.",
    task_update="Dear sender, Thanks for the update. I will review and get back to you.",
    general="Dear sender, Thank you for your email. I will respond shortly.",
)

# Create prompt input and target reply text.
#
# pd.read_csv represents empty cells as NaN, and string concatenation with NaN
# yields NaN, which the tokenizer cannot encode. Normalise both columns to
# plain strings first: a missing body becomes '' and a missing label becomes
# 'general' (the same category the template lookup falls back to).
_email_text = df['email_text'].fillna('').astype(str)
_label = df['label'].fillna('general').astype(str)

# Target: the canned reply for the category; unknown categories get the
# 'general' template.
df['reply'] = _label.apply(lambda x: reply_templates.get(x, reply_templates['general']))

# Create T5-style inputs: "Email: <email_text> Category: <label>"
df['input_text'] = "Email: " + _email_text + " Category: " + _label

# Dataset class
class EmailReplyDataset(Dataset):
    """Map-style dataset of tokenized (prompt, reply) pairs for seq2seq training.

    Every item is a dict holding ``input_ids`` and ``attention_mask`` for the
    prompt and ``labels`` for the reply. Each tensor is 1-D with length
    ``max_len``.

    Args:
        inputs: Sequence of prompt strings.
        targets: Sequence of reply strings, aligned index-by-index with
            ``inputs``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors`` and returns ``input_ids`` and
            ``attention_mask`` tensors of shape ``(1, max_len)``.
        max_len: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If ``inputs`` and ``targets`` differ in length.
    """

    # Label value ignored by PyTorch's cross-entropy loss; used so that
    # padding positions in the reply do not contribute to the training loss.
    IGNORE_INDEX = -100

    def __init__(self, inputs, targets, tokenizer, max_len=128):
        # Materialise as lists so __getitem__ always indexes by position,
        # even if a pandas Series with a shuffled index is passed in.
        self.inputs = list(inputs)
        self.targets = list(targets)
        if len(self.inputs) != len(self.targets):
            raise ValueError(
                f"inputs and targets must have the same length "
                f"(got {len(self.inputs)} and {len(self.targets)})"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of (prompt, reply) pairs."""
        return len(self.inputs)

    def _encode(self, text):
        """Tokenize ``text`` to exactly ``max_len`` tokens as ``(1, max_len)`` tensors."""
        return self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return the tokenized pair at position ``idx`` as a dict of 1-D tensors."""
        input_enc = self._encode(self.inputs[idx])
        target_enc = self._encode(self.targets[idx])

        # squeeze(0) drops only the batch dimension; a bare squeeze() would
        # also collapse the sequence dimension when max_len == 1.
        labels = target_enc['input_ids'].squeeze(0)
        # Mask out padding so the loss is computed on real reply tokens only.
        labels[target_enc['attention_mask'].squeeze(0) == 0] = self.IGNORE_INDEX

        return {
            'input_ids': input_enc['input_ids'].squeeze(0),
            'attention_mask': input_enc['attention_mask'].squeeze(0),
            'labels': labels,
        }

# Hold out 20% of the (input, reply) pairs for validation; the seed is fixed
# so the split is reproducible.
split = train_test_split(
    df['input_text'].values,
    df['reply'].values,
    test_size=0.2,
    random_state=42,
)
train_inputs, val_inputs, train_targets, val_targets = split

# Checkpoint name shared by tokenizer and model, and the directory that
# receives both the training checkpoints and the final artifacts.
PRETRAINED_NAME = 't5-small'
OUTPUT_DIR = '../models/reply_generator'

# Tokenizer and the two datasets built on top of it
tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_NAME)
train_dataset = EmailReplyDataset(train_inputs, train_targets, tokenizer)
val_dataset = EmailReplyDataset(val_inputs, val_targets, tokenizer)

# Pretrained T5 weights to fine-tune
model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_NAME)

# Training configuration: evaluate and checkpoint once per epoch, keep at
# most two checkpoints, and reload the one with the best eval loss at the end.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` -- confirm against the installed version.
training_kwargs = dict(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_dir='../logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    save_total_limit=2,
)
training_args = TrainingArguments(**training_kwargs)

# Trainer without compute_metrics: only the loss is tracked, since scoring
# generated text is more involved.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run fine-tuning
trainer.train()

# Persist the model first, then the tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)

print("Reply Generator Training Complete & Model Saved!")
