### Load Data and Intent Classifier

In [2]:
import pandas as pd
import os
import torch
import pickle
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Keep only the two columns this section needs and drop any row where either
# of them is missing.
df = pd.read_csv("../data/processed/clean_emails.csv")
df = df[["clean_body", "entities"]].dropna()
print(f"Loaded {len(df)} emails.")

# Tokenizer and fine-tuned classifier are both read from the same directory.
intent_model_dir = "../models/intent_classifier"
intent_tokenizer = DistilBertTokenizerFast.from_pretrained(intent_model_dir)
intent_model = DistilBertForSequenceClassification.from_pretrained(intent_model_dir).to(device)

# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# label_encoder.pkl must only ever come from a trusted source.
# presumably a scikit-learn LabelEncoder (it exposes `classes_`) - TODO confirm
with open(os.path.join(intent_model_dir, "label_encoder.pkl"), "rb") as f:
    label_encoder = pickle.load(f)
intent_labels = label_encoder.classes_
print(f"Intent classifier loaded with labels: {list(intent_labels)}")

Using device: cpu
Loaded 85110 emails.
Intent classifier loaded with labels: ['Appreciation', 'Complaint', 'Data Request', 'Event Planning', 'Farewell', 'Finance', 'General Inquiry', 'Greeting', 'Job Inquiry', 'Legal', 'Meeting Request', 'Personal', 'Project Update', 'Reminder', 'Sales Inquiry', 'Technical Support']


### Intent Prediction and Fallback

In [3]:
import pandas as pd
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import json
import tqdm
tqdm.tqdm.pandas()  # Enable progress_apply

# Repeats the classifier setup of the "Load Data and Intent Classifier" cell so
# the names below exist when this cell runs.
# NOTE(review): `os` and `pickle` are used here but are not in this cell's
# import list; they are only in scope if the earlier cell that imports them
# has already been executed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
intent_model_dir = "../models/intent_classifier"
intent_tokenizer = DistilBertTokenizerFast.from_pretrained(intent_model_dir)
intent_model = DistilBertForSequenceClassification.from_pretrained(intent_model_dir).to(device)
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# label_encoder.pkl must only ever come from a trusted source.
with open(os.path.join(intent_model_dir, "label_encoder.pkl"), "rb") as f:
    label_encoder = pickle.load(f)
intent_labels = label_encoder.classes_

def predict_intent(text):
    """Classify one text with the loaded intent model and return its label.

    The text is tokenized (truncated to at most 512 tokens), moved to
    ``device`` and passed through ``intent_model`` in evaluation mode with
    gradients disabled. The highest-scoring class index is mapped back to its
    label through ``label_encoder``.

    Args:
        text: The text to classify. The result is read with ``.item()``, so
            exactly one prediction is expected per call.

    Returns:
        The label that ``label_encoder.inverse_transform`` yields for the
        predicted class index.
    """
    intent_model.eval()
    encoded = intent_tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        logits = intent_model(**encoded).logits

    best_class = torch.argmax(logits, dim=1).item()
    return label_encoder.inverse_transform([best_class])[0]

def parse_entities(entity_str):
    """Parse a serialized entity mapping and drop entries with empty values.

    The string may be JSON (``{"ORG": ["Enron"]}``) or the ``repr`` of a
    Python dict (``{'ORG': ['Enron']}``). Parsers are tried from strictest to
    most lenient so that values containing an apostrophe (``"O'Brien"``)
    survive; blindly swapping every ``'`` for ``"`` corrupts such strings and
    used to turn the whole row into ``{}``.

    Args:
        entity_str: Serialized mapping. Any non-string input yields ``{}``.

    Returns:
        dict: The parsed mapping without falsy values (empty lists, ``None``,
        empty strings), or ``{}`` when no parser produces a dict.
    """
    import ast  # local import: `ast` is not in this cell's import list

    if not isinstance(entity_str, str):
        return {}

    parse_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)
    parsers = (
        json.loads,        # well-formed JSON
        ast.literal_eval,  # Python dict repr, e.g. written by str(dict)
        # Legacy behaviour, kept last for mixed input such as {'DATE': null}.
        lambda raw: json.loads(raw.replace("'", '"')),
    )
    for parse in parsers:
        try:
            parsed = parse(entity_str)
        except parse_errors:
            continue
        if isinstance(parsed, dict):
            return {k: v for k, v in parsed.items() if v}
    return {}

# Keyword lists per intent, used by the rule-based fallback below.
# Insertion order matters: the first intent with a matching keyword wins.
INTENT_KEYWORDS = {
    "Meeting Request": ["schedule", "meeting", "calendar", "call", "appointment"],
    "Job Inquiry": ["job", "resume", "position", "career", "apply"],
    "Finance": ["invoice", "payment", "amount", "fund", "salary", "finance"],
    "Legal": ["contract", "agreement", "terms", "clause", "lawyer"],
    "Appreciation": ["thank", "thanks", "grateful", "appreciate", "gratitude"],
    "Complaint": ["issue", "problem", "complaint", "error", "concern"],
    "Technical Support": ["bug", "support", "crash", "error", "fix", "install"],
    "Data Request": ["send", "forward", "email", "attach", "request"],
    "Greeting": ["hello", "hi", "greetings", "good morning"],
    "Farewell": ["regards", "bye", "sincerely", "take care"],
    "Sales Inquiry": ["quote", "pricing", "discount", "offer", "deal"],
    "Project Update": ["progress", "update", "status", "report"],
    "Reminder": ["remind", "deadline", "follow up", "due"],
    "Event Planning": ["venue", "event", "conference", "webinar", "party"],
    "Personal": ["family", "friend", "wedding", "vacation", "holiday"]
}

def fallback_intent(text, min_prefix_len=4):
    """Guess an intent from keywords when no model prediction is usable.

    Keywords are matched case-insensitively and must begin at a word
    boundary, so "call" no longer fires inside "recall". Keywords shorter
    than ``min_prefix_len`` characters must also end at a word boundary:
    "hi" matches the word "hi" but not "this" or "his", and "due" does not
    match "produce". Longer keywords still act as word prefixes, so "remind"
    matches "reminder" and "attach" matches "attached".

    Args:
        text: Email body. Anything that is not a string yields the default.
        min_prefix_len: Minimum keyword length for prefix matching; shorter
            keywords are matched as whole words only.

    Returns:
        str: The first label in ``INTENT_KEYWORDS`` (insertion order) with a
        matching keyword, or "General Inquiry" when nothing matches.
    """
    import re  # local import: `re` is not in this cell's import list

    if not isinstance(text, str):
        return "General Inquiry"

    lowered = text.lower()
    for label, keywords in INTENT_KEYWORDS.items():
        for keyword in keywords:
            pattern = r"\b" + re.escape(keyword)
            if len(keyword) < min_prefix_len:
                pattern += r"\b"
            if re.search(pattern, lowered):
                return label
    return "General Inquiry"

# Reload the cleaned emails, keeping only complete rows of the two columns
# needed here, and decode the serialized entity column.
df = pd.read_csv("../data/processed/clean_emails.csv")
df = df[["clean_body", "entities"]].dropna()
df["parsed_entities"] = df["entities"].apply(parse_entities)
print("Predicting intents on dataset (this may take a while)...")
# predict_intent is called once per row, i.e. one forward pass per email; the
# recorded run of 85,110 rows took about two hours on CPU.
df["predicted_intent"] = df["clean_body"].progress_apply(predict_intent)
# Replace any prediction that is not a known label with the keyword fallback.
# NOTE(review): predict_intent returns label_encoder.inverse_transform(...) and
# intent_labels is label_encoder.classes_, so this fallback branch looks
# unreachable as written - confirm whether a confidence threshold was intended.
df["predicted_intent"] = df.apply(
    lambda row: row["predicted_intent"] if row["predicted_intent"] in intent_labels else fallback_intent(row["clean_body"]),
    axis=1
)

Predicting intents on dataset (this may take a while)...


100%|██████████| 85110/85110 [2:02:08<00:00, 11.61it/s]  


### Dynamic Reply Generation

In [5]:
import pandas as pd
import random
import json

# -------------------- Parse Named Entities --------------------
def parse_entities(entity_str):
    """Parse a serialized entity mapping and drop entries with empty values.

    The string may be JSON (``{"ORG": ["Enron"]}``) or the ``repr`` of a
    Python dict (``{'ORG': ['Enron']}``). Parsers are tried from strictest to
    most lenient so that values containing an apostrophe (``"O'Brien"``)
    survive; blindly swapping every ``'`` for ``"`` corrupts such strings and
    used to turn the whole row into ``{}``.

    Args:
        entity_str: Serialized mapping. Any non-string input yields ``{}``.

    Returns:
        dict: The parsed mapping without falsy values (empty lists, ``None``,
        empty strings), or ``{}`` when no parser produces a dict.
    """
    import ast  # local import: `ast` is not in this cell's import list

    if not isinstance(entity_str, str):
        return {}

    parse_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)
    parsers = (
        json.loads,        # well-formed JSON
        ast.literal_eval,  # Python dict repr, e.g. written by str(dict)
        # Legacy behaviour, kept last for mixed input such as {'DATE': null}.
        lambda raw: json.loads(raw.replace("'", '"')),
    )
    for parse in parsers:
        try:
            parsed = parse(entity_str)
        except parse_errors:
            continue
        if isinstance(parsed, dict):
            return {k: v for k, v in parsed.items() if v}
    return {}

# -------------------- Intent Keyword Mapping (not used directly here, for reference) --------------------
# Maps each intent label to the lowercase keywords associated with it.
# This is a copy of the table defined in the "Intent Prediction and Fallback"
# cell; neither build_target nor build_prompt in this cell reads it.
INTENT_KEYWORDS = {
    "Meeting Request": ["schedule", "meeting", "calendar", "call", "appointment"],
    "Job Inquiry": ["job", "resume", "position", "career", "apply"],
    "Finance": ["invoice", "payment", "amount", "fund", "salary", "finance"],
    "Legal": ["contract", "agreement", "terms", "clause", "lawyer"],
    "Appreciation": ["thank", "thanks", "grateful", "appreciate", "gratitude"],
    "Complaint": ["issue", "problem", "complaint", "error", "concern"],
    "Technical Support": ["bug", "support", "crash", "error", "fix", "install"],
    "Data Request": ["send", "forward", "email", "attach", "request"],
    "Greeting": ["hello", "hi", "greetings", "good morning"],
    "Farewell": ["regards", "bye", "sincerely", "take care"],
    "Sales Inquiry": ["quote", "pricing", "discount", "offer", "deal"],
    "Project Update": ["progress", "update", "status", "report"],
    "Reminder": ["remind", "deadline", "follow up", "due"],
    "Event Planning": ["venue", "event", "conference", "webinar", "party"],
    "Personal": ["family", "friend", "wedding", "vacation", "holiday"]
}

# -------------------- Target Generator --------------------
def build_target(row):
    """Build the templated reply used as the target text for one email.

    Rules are applied in order and the first match wins:

    1. any body containing "thank";
    2. the "Meeting Request" and "Job Inquiry" labels;
    3. finance, by label or by "invoice"/"payment" appearing in the body;
    4. the remaining labels ("General Inquiry" picks one of three replies at
       random);
    5. a generic acknowledgement for any other label.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            ``clean_body`` (str), ``label`` (ground-truth intent) and
            ``parsed_entities`` (dict as returned by ``parse_entities``).

    Returns:
        str: A greeting line, a newline, then the reply body.
    """
    body = row["clean_body"].lower()
    label = row["label"]  # ground-truth intent, not the classifier prediction
    entities = row["parsed_entities"]

    # Greet by first name when a usable PERSON entity exists. A blank or
    # non-string entry falls back to the generic greeting instead of raising
    # IndexError / AttributeError.
    name = None
    persons = entities.get("PERSON") if entities else None
    if isinstance(persons, list) and persons and isinstance(persons[0], str):
        name_parts = persons[0].split()
        if name_parts:
            name = name_parts[0]
    greeting = f"Dear {name}," if name else "Hi there,"

    def first_entity(kind, prefix):
        """Return prefix + the first entity of the given kind, or "" if none."""
        found = entities.get(kind, [])
        return f"{prefix}{found[0]}" if found else ""

    # Gratitude anywhere in the body takes precedence over the labelled intent.
    if "thank" in body:
        return f"{greeting}\nYou're most welcome. Let me know if you need anything else."

    if label == "Meeting Request":
        when = first_entity("DATE", " on ")
        return f"{greeting}\nThanks for your message. I'm available for a meeting{when}. Please suggest a convenient time."

    if label == "Job Inquiry":
        return f"{greeting}\nThank you for your interest. We’ll review your profile and get back to you soon."

    # Checked after the two labels above (they keep their own reply even when
    # the body mentions an invoice) but before every label below.
    if label in ("Finance", "Invoice") or "invoice" in body or "payment" in body:
        return f"{greeting}\nI've received your invoice. Our finance team will process it shortly."

    if label == "Data Request":
        org_info = first_entity("ORG", " from ")
        return f"{greeting}\nThanks for your request{org_info}. I’ll send the required data shortly."

    if label == "Event Planning":
        place = first_entity("GPE", " in ")
        return f"{greeting}\nLooking forward to the upcoming event{place}. Let me know how I can help with the planning."

    if label == "General Inquiry":
        # Unseeded random choice: targets for this label differ between runs.
        responses = [
            f"{greeting}\nThank you for your email. I’ll get back to you with more details shortly.",
            f"{greeting}\nI appreciate your inquiry. I’ll look into it and respond as soon as possible.",
            f"{greeting}\nThanks for reaching out. Let me check and get back to you."
        ]
        return random.choice(responses)

    # Labels whose reply needs nothing beyond the greeting.
    static_replies = {
        "Complaint": "I'm sorry to hear that you've encountered an issue. We’re actively looking into it and will follow up soon.",
        "Appreciation": "We truly appreciate your kind words. It means a lot to our team.",
        "Technical Support": "Thanks for reporting this issue. Our tech team is looking into it and will provide an update shortly.",
        "Sales Inquiry": "Thank you for your interest in our services. I’ll send over pricing and available offers shortly.",
        "Project Update": "Here’s the latest project update. Let me know if you need further details.",
        "Reminder": "Just a quick reminder regarding the upcoming deadline. Please confirm if everything is on track.",
        "Farewell": "Wishing you all the best in your next chapter. Stay in touch!",
        "Greeting": "Hope you're having a great day! Let me know how I can assist you.",
        "Legal": "We’ve received your legal documents. Our legal advisor will review and respond soon.",
        "Personal": "Thank you for the personal update. Wishing you all the best!",
    }
    reply = static_replies.get(label, "Thank you for your message. I will get back to you shortly.")
    return f"{greeting}\n{reply}"

# -------------------- Prompt Generator --------------------
def build_prompt(row):
    """Build the model input prompt for one email.

    The prompt has the form
    ``Intent: <label> | RecipientName: <name> | Entities: <entities> | Email: <body>``
    where ``<name>`` is the first PERSON entity (or "Unknown") and
    ``<entities>`` lists every entity type as ``TYPE: a, b`` joined by
    `` | `` (or "None" when there are no entities).

    Entity values are normalised before joining: a scalar is treated as a
    single entry rather than being split into characters, and non-string
    items are converted with ``str`` instead of raising ``TypeError``.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            ``label``, ``parsed_entities`` (dict) and ``clean_body``.

    Returns:
        str: The assembled prompt.
    """
    intent = row["label"]  # ground-truth intent, not the classifier prediction
    entities = row["parsed_entities"]
    email_text = row["clean_body"]

    def as_texts(value):
        """Return value as a list of strings, wrapping scalars and skipping None."""
        items = value if isinstance(value, (list, tuple)) else [value]
        return [str(item) for item in items if item is not None]

    person_names = as_texts(entities.get("PERSON", []))
    recipient_name = person_names[0] if person_names else "Unknown"

    if entities:
        parts = []
        for kind, values in entities.items():
            joined = ", ".join(as_texts(values))
            parts.append(f"{kind}: {joined}")
        entities_str = " | ".join(parts)
    else:
        entities_str = "None"

    return f"Intent: {intent} | RecipientName: {recipient_name} | Entities: {entities_str} | Email: {email_text}"

# -------------------- Load and Process Data --------------------
# Reload the cleaned emails; rows missing the body, label or entities are dropped.
df = pd.read_csv("../data/processed/clean_emails.csv")
df = df[["clean_body", "label", "entities"]].dropna()
# Decode the serialized entity column, then derive one prompt and one target
# reply per row (both are computed row by row via DataFrame.apply).
df["parsed_entities"] = df["entities"].apply(parse_entities)
df["prompt"] = df.apply(build_prompt, axis=1)
df["target"] = df.apply(build_target, axis=1)

# Preview the first generated prompt/target pairs.
df[["prompt", "target"]].head()


Unnamed: 0,prompt,target
0,Intent: Data Request | RecipientName: Unknown ...,"Hi there,\nThanks for your request from Enron ..."
1,Intent: General Inquiry | RecipientName: Unkno...,"Hi there,\nThanks for reaching out. Let me che..."
2,Intent: Meeting Request | RecipientName: Carr ...,"Dear Carr,\nThanks for your message. I'm avail..."
3,Intent: Job Inquiry | RecipientName: Unknown |...,"Hi there,\nThank you for your interest. We’ll ..."
4,Intent: Finance | RecipientName: Bert Meyers |...,"Dear Bert,\nI've received your invoice. Our fi..."


### Data Preparation

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer
import json

# Parse entities again if running independently
def parse_entities(entity_str):
    """Parse a serialized entity mapping and drop entries with empty values.

    The string may be JSON (``{"ORG": ["Enron"]}``) or the ``repr`` of a
    Python dict (``{'ORG': ['Enron']}``). Parsers are tried from strictest to
    most lenient so that values containing an apostrophe (``"O'Brien"``)
    survive; blindly swapping every ``'`` for ``"`` corrupts such strings and
    used to turn the whole row into ``{}``.

    Args:
        entity_str: Serialized mapping. Any non-string input yields ``{}``.

    Returns:
        dict: The parsed mapping without falsy values (empty lists, ``None``,
        empty strings), or ``{}`` when no parser produces a dict.
    """
    import ast  # local import: `ast` is not in this cell's import list

    if not isinstance(entity_str, str):
        return {}

    parse_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)
    parsers = (
        json.loads,        # well-formed JSON
        ast.literal_eval,  # Python dict repr, e.g. written by str(dict)
        # Legacy behaviour, kept last for mixed input such as {'DATE': null}.
        lambda raw: json.loads(raw.replace("'", '"')),
    )
    for parse in parsers:
        try:
            parsed = parse(entity_str)
        except parse_errors:
            continue
        if isinstance(parsed, dict):
            return {k: v for k, v in parsed.items() if v}
    return {}

# Load the cleaned emails (body, ground-truth label, serialized entities) and
# drop incomplete rows. Prompts and targets are not stored in the CSV; they are
# generated further down in this cell.
df = pd.read_csv("../data/processed/clean_emails.csv")
df = df[["clean_body", "label", "entities"]].dropna()
df["parsed_entities"] = df["entities"].apply(parse_entities)

# Create prompts and targets
def build_prompt(row):
    """Build the model input prompt for one email.

    The prompt has the form
    ``Intent: <label> | RecipientName: <name> | Entities: <entities> | Email: <body>``
    where ``<name>`` is the first PERSON entity (or "Unknown") and
    ``<entities>`` lists every entity type as ``TYPE: a, b`` joined by
    `` | `` (or "None" when there are no entities).

    Entity values are normalised before joining: a scalar is treated as a
    single entry rather than being split into characters, and non-string
    items are converted with ``str`` instead of raising ``TypeError``.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            ``label``, ``parsed_entities`` (dict) and ``clean_body``.

    Returns:
        str: The assembled prompt.
    """
    intent = row["label"]
    entities = row["parsed_entities"]
    email_text = row["clean_body"]

    def as_texts(value):
        """Return value as a list of strings, wrapping scalars and skipping None."""
        items = value if isinstance(value, (list, tuple)) else [value]
        return [str(item) for item in items if item is not None]

    person_names = as_texts(entities.get("PERSON", []))
    recipient_name = person_names[0] if person_names else "Unknown"

    if entities:
        parts = []
        for kind, values in entities.items():
            joined = ", ".join(as_texts(values))
            parts.append(f"{kind}: {joined}")
        entities_str = " | ".join(parts)
    else:
        entities_str = "None"

    return f"Intent: {intent} | RecipientName: {recipient_name} | Entities: {entities_str} | Email: {email_text}"

def build_target(row):
    """Build the templated reply used as the training target for one email.

    Rules are applied in order and the first match wins:

    1. any body containing "thank";
    2. the "Meeting Request" and "Job Inquiry" labels;
    3. finance, by label or by "invoice"/"payment" appearing in the body;
    4. the remaining labels with a dedicated reply;
    5. a generic acknowledgement for any other label. Unlike the version in
       the "Dynamic Reply Generation" cell, "General Inquiry" has no branch of
       its own here and receives this deterministic default.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            ``clean_body`` (str), ``label`` (ground-truth intent) and
            ``parsed_entities`` (dict as returned by ``parse_entities``).

    Returns:
        str: A greeting line, a newline, then the reply body.
    """
    body = row["clean_body"].lower()
    label = row["label"]
    entities = row["parsed_entities"]

    # Greet by first name when a usable PERSON entity exists. A blank or
    # non-string entry falls back to the generic greeting instead of raising
    # IndexError / AttributeError.
    name = None
    persons = entities.get("PERSON") if entities else None
    if isinstance(persons, list) and persons and isinstance(persons[0], str):
        name_parts = persons[0].split()
        if name_parts:
            name = name_parts[0]
    greeting = f"Dear {name}," if name else "Hi there,"

    def first_entity(kind, prefix):
        """Return prefix + the first entity of the given kind, or "" if none."""
        found = entities.get(kind, [])
        return f"{prefix}{found[0]}" if found else ""

    # Gratitude anywhere in the body takes precedence over the labelled intent.
    if "thank" in body:
        return f"{greeting}\nYou're most welcome. Let me know if you need anything else."

    if label == "Meeting Request":
        when = first_entity("DATE", " on ")
        return f"{greeting}\nThanks for your message. I'm available for a meeting{when}. Please suggest a convenient time."

    if label == "Job Inquiry":
        return f"{greeting}\nThank you for your interest. We’ll review your profile and get back to you soon."

    # Checked after the two labels above (they keep their own reply even when
    # the body mentions an invoice) but before every label below.
    if label in ["Finance", "Invoice"] or "invoice" in body or "payment" in body:
        return f"{greeting}\nI've received your invoice. Our finance team will process it shortly."

    if label == "Data Request":
        org_info = first_entity("ORG", " from ")
        return f"{greeting}\nThanks for your request{org_info}. I’ll send the required data shortly."

    if label == "Event Planning":
        place = first_entity("GPE", " in ")
        return f"{greeting}\nLooking forward to the upcoming event{place}. Let me know how I can help with the planning."

    # Labels whose reply needs nothing beyond the greeting.
    static_replies = {
        "Complaint": "I'm sorry to hear that you've encountered an issue. We’re actively looking into it and will follow up soon.",
        "Appreciation": "We truly appreciate your kind words. It means a lot to our team.",
        "Technical Support": "Thanks for reporting this issue. Our tech team is looking into it and will provide an update shortly.",
        "Sales Inquiry": "Thank you for your interest in our services. I’ll send over pricing and available offers shortly.",
        "Project Update": "Here’s the latest project update. Let me know if you need further details.",
        "Reminder": "Just a quick reminder regarding the upcoming deadline. Please confirm if everything is on track.",
        "Farewell": "Wishing you all the best in your next chapter. Stay in touch!",
        "Greeting": "Hope you're having a great day! Let me know how I can assist you.",
        "Legal": "We’ve received your legal documents. Our legal advisor will review and respond soon.",
        "Personal": "Thank you for the personal update. Wishing you all the best!",
    }
    reply = static_replies.get(label, "Thank you for your message. I will get back to you shortly.")
    return f"{greeting}\n{reply}"

# Derive one prompt and one target reply per email (row-wise apply).
df["prompt"] = df.apply(build_prompt, axis=1)
df["target"] = df.apply(build_target, axis=1)

# Train on a reproducible 5% sample of the rows (fixed random_state), then
# hold out 20% of that sample for validation.
df = df.sample(frac=0.05, random_state=42).reset_index(drop=True)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenizer of the pretrained "t5-small" checkpoint; tokenize_data below reads
# this module-level name.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def tokenize_data(df_split):
    """Tokenize the prompts and targets of one data split.

    Uses the module-level ``tokenizer``. Both columns are truncated and padded
    to a fixed length (512 tokens for prompts, 128 for targets) and returned
    as PyTorch tensors.

    Args:
        df_split: DataFrame with ``prompt`` and ``target`` columns.

    Returns:
        tuple: ``(inputs, targets)``, the tokenizer outputs for the prompt
        column and the target column respectively.
    """
    shared_options = {
        "truncation": True,
        "padding": "max_length",
        "return_tensors": "pt",
    }
    prompt_texts = list(df_split["prompt"])
    reply_texts = list(df_split["target"])

    encoded_prompts = tokenizer(prompt_texts, max_length=512, **shared_options)
    encoded_replies = tokenizer(reply_texts, max_length=128, **shared_options)
    return encoded_prompts, encoded_replies

# Tokenize both splits up front; every sequence is padded to its fixed maximum
# length, so the four results hold the whole split in memory.
train_inputs, train_targets = tokenize_data(train_df)
val_inputs, val_targets = tokenize_data(val_df)


### Dataset and Model Setup

In [8]:
import torch
from torch.utils.data import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Same "t5-small" tokenizer as in the Data Preparation cell; ReplyDataset below
# reads its pad_token_id.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

class ReplyDataset(Dataset):
    """Map-style dataset pairing tokenized prompts with tokenized replies.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``.
    Padding positions in the labels are replaced by ``ignore_index`` so that
    they can be excluded from the loss.

    Args:
        inputs: Tokenizer output for the prompts; must provide ``input_ids``
            and ``attention_mask`` tensors indexable along the first axis.
        targets: Tokenizer output for the replies; must provide ``input_ids``.
        pad_token_id: Token id treated as padding in the targets. When left as
            ``None`` the module-level ``tokenizer.pad_token_id`` is looked up
            on each access, which is the original behaviour.
        ignore_index: Value written over padding positions in the labels.
            Defaults to -100, the default ``ignore_index`` of PyTorch's
            cross-entropy loss.
    """

    def __init__(self, inputs, targets, pad_token_id=None, ignore_index=-100):
        self.inputs = inputs
        self.targets = targets
        self.pad_token_id = pad_token_id
        self.ignore_index = ignore_index

    def __len__(self):
        """Return the number of examples (rows of ``input_ids``)."""
        return self.inputs["input_ids"].size(0)

    def __getitem__(self, idx):
        """Return the example at ``idx`` with padding masked out of the labels."""
        pad_id = self.pad_token_id
        if pad_id is None:
            # Backward-compatible fallback to the notebook-level tokenizer.
            pad_id = tokenizer.pad_token_id

        # Clone before masking so the stored target tensor is never modified.
        labels = self.targets["input_ids"][idx].clone()
        labels[labels == pad_id] = self.ignore_index
        return {
            "input_ids": self.inputs["input_ids"][idx],
            "attention_mask": self.inputs["attention_mask"][idx],
            "labels": labels
        }

# Wrap the tensors produced by tokenize_data in the Data Preparation cell; this
# cell therefore requires that cell to have been run first.
train_dataset = ReplyDataset(train_inputs, train_targets)
val_dataset = ReplyDataset(val_inputs, val_targets)

# Start from the pretrained "t5-small" checkpoint and move it to the selected device.
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

### Training and Evaluation

In [9]:
from transformers import Trainer, TrainingArguments

# Training configuration. Checkpoints go to the same directory the final model
# is saved to at the end of this cell.
training_args = TrainingArguments(
    output_dir="../models/reply_generator",
    logging_dir="../logs",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,  # training loss is reported every 50 steps
    save_steps=500,
    save_total_limit=1,  # keep only the most recent checkpoint
    do_eval=True  # author's note: compatible with transformers 4.52.4
    # evaluation_strategy was removed by the author as unavailable in the
    # installed version.
    # NOTE(review): no evaluation schedule is configured, and the recorded log
    # shows training loss only, so evaluation appears to happen solely through
    # the explicit trainer.evaluate() call below - confirm this is intended.
)

# NOTE(review): recent transformers releases deprecate the `tokenizer=`
# argument of Trainer in favour of `processing_class=` - confirm against the
# installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Fine-tune, then run one evaluation pass over the validation set.
trainer.train()
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be loaded with from_pretrained.
model.save_pretrained("../models/reply_generator")
tokenizer.save_pretrained("../models/reply_generator")
print("✅ T5 reply generator saved successfully to ../models/reply_generator")


  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
50,2.5032
100,1.3284
150,0.7812
200,0.6241
250,0.5057
300,0.3806
350,0.3341
400,0.2725
450,0.2436
500,0.2342




Evaluation results: {'eval_loss': 0.0743793323636055, 'eval_runtime': 177.4686, 'eval_samples_per_second': 4.801, 'eval_steps_per_second': 0.304, 'epoch': 3.0}
✅ T5 reply generator saved successfully to ../models/reply_generator
