In [None]:
# Friendly Dataset Dialogue Act Classification using BERT
# Complete training pipeline for chatbot dialogue act recognition

import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# Choose the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

title_rule = "=" * 60
print(title_rule)
print("FRIENDLY DATASET DIALOGUE ACT CLASSIFICATION")
print(title_rule)

# 1. Load the train / validation / test CSV files as named splits
print("\n1. Loading Dataset...")
split_files = {
    "train": r"C:\Users\NAMAN\Documents\GitHub\Prototype-\Friendly Dataset\train.csv",
    "validation": r"C:\Users\NAMAN\Documents\GitHub\Prototype-\Friendly Dataset\validation.csv",
    "test": r"C:\Users\NAMAN\Documents\GitHub\Prototype-\Friendly Dataset\test.csv",
}
dataset = load_dataset("csv", data_files=split_files)

# Report how many rows each split contains.
print("Dataset loaded successfully!")
print(f"Train samples: {len(dataset['train'])}")
print(f"Validation samples: {len(dataset['validation'])}")
print(f"Test samples: {len(dataset['test'])}")

# Explore dataset structure
print("\nDataset structure:")
print("Train columns:", dataset['train'].column_names)
print("Sample data:")
# Preview at most three rows. Capping the count at the split size keeps a
# training split with fewer than three rows from being indexed out of range.
preview_count = min(3, len(dataset['train']))
for i in range(preview_count):
    sample = dataset['train'][i]
    print(f"Dialog: {sample['dialog']}")
    print(f"Act: {sample['act']}")
    print("---")

# 2. Preprocess dialog data
print("\n2. Preprocessing Dialog Data...")
def preprocess(batch):
    """Flatten list-valued dialogs in a batch into single strings.

    Args:
        batch: Mapping with a "dialog" key holding a sequence of dialogs.
            Each dialog is either a list of utterance strings or a value
            that is already flat (for example a plain string).

    Returns:
        The same ``batch`` object. Every list-valued dialog is replaced by
        its utterances joined with single spaces; other entries are kept
        as they are.
    """
    # Decide per entry instead of peeking at the first one only: an empty
    # batch no longer raises IndexError, and a batch mixing lists with plain
    # strings neither space-joins the characters of a string nor leaves a
    # list unflattened.
    batch["dialog"] = [
        " ".join(conv) if isinstance(conv, list) else conv
        for conv in batch["dialog"]
    ]
    return batch

dataset = dataset.map(preprocess, batched=True)
print("Dialog preprocessing completed!")

# 3. Label encoding - FIXED VERSION
print("\n3. Label Encoding...")
from sklearn.preprocessing import LabelEncoder

# Gather the act values of every split first, so the encoder also learns
# labels that occur only in the validation or test data.
all_acts = [
    act
    for split in ('train', 'validation', 'test')
    for act in dataset[split]['act']
]

# fit() returns the encoder itself, so construction and fitting can be chained.
label_encoder = LabelEncoder().fit(all_acts)

# Show the integer id assigned to each act.
print(f"Unique dialogue acts found: {len(label_encoder.classes_)}")
print("Label mapping:")
for i, act in enumerate(label_encoder.classes_):
    print(f"{i}: {act}")
# Apply label encoding to each split
def encode_labels(example):
    """Return the integer id of ``example["act"]`` under the "act_label" key.

    The id is looked up with the module-level ``label_encoder``. Its
    ``transform`` works on sequences, so the single act is wrapped in a
    one-element list and the single result is unpacked again.
    """
    (act_id,) = label_encoder.transform([example["act"]])
    return {"act_label": act_id}

# Add the integer "act_label" column to every split.
dataset = dataset.map(encode_labels)
print("Label encoding completed!")

# 4. Initialize tokenizer from local path
print("\n4. Loading Tokenizer...")
tokenizer_dir = r"C:\Users\NAMAN\Documents\GitHub\Prototype-\bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)

def tokenize(batch):
    """Tokenize the "dialog" entries of ``batch`` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 64 tokens. The
    tokenizer's output is returned unchanged.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 64,
    }
    return tokenizer(batch["dialog"], **encode_options)

dataset = dataset.map(tokenize, batched=True)
print("Tokenization completed!")

# 5. Set format and create datasets
print("\n5. Creating PyTorch Datasets...")
# Only these columns are exposed, as torch tensors, when rows are read.
tensor_columns = ["input_ids", "attention_mask", "act_label"]
dataset.set_format(type="torch", columns=tensor_columns)

# Short aliases for the three splits.
train_dataset, val_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# 6. Custom collate function - FIXED VERSION
def collate_fn(batch):
    """Merge per-example dicts into one dict of batched tensors.

    Args:
        batch: Sequence of examples, each providing "input_ids",
            "attention_mask" and "act_label".

    Returns:
        Dict with stacked "input_ids" and "attention_mask" tensors and a
        ``torch.long`` "labels" tensor built from each "act_label".
    """
    ids_per_item = []
    masks_per_item = []
    label_ids = []
    for item in batch:
        ids_per_item.append(item["input_ids"])
        masks_per_item.append(item["attention_mask"])
        # Cast to a plain int so the label tensor is built from Python ints.
        label_ids.append(int(item["act_label"]))

    # The label column is exposed under "labels", the key the training and
    # evaluation loops read.
    return {
        "input_ids": torch.stack(ids_per_item),
        "attention_mask": torch.stack(masks_per_item),
        "labels": torch.tensor(label_ids, dtype=torch.long),
    }

# 7. Create DataLoaders
batch_size = 16

# Options shared by all three loaders; only the training loader shuffles.
loader_options = {"batch_size": batch_size, "collate_fn": collate_fn}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, **loader_options)
test_loader = DataLoader(test_dataset, **loader_options)

print(f"DataLoaders created with batch size: {batch_size}")

# 8. Initialize model from local path
print("\n6. Loading BERT Model...")
# One output class per dialogue act known to the label encoder.
num_labels = len(label_encoder.classes_)
pretrained_dir = r"C:\Users\NAMAN\Documents\GitHub\Prototype-\bert-base-uncased"

model = BertForSequenceClassification.from_pretrained(pretrained_dir, num_labels=num_labels)
model.to(device)
print(f"Model loaded with {num_labels} output classes")

# 9. Setup optimizer and scheduler
num_epochs = 3
learning_rate = 5e-5

optimizer = AdamW(model.parameters(), lr=learning_rate)
# The scheduler is stepped once per batch, so its horizon is the total
# number of batches across all epochs.
num_training_steps = num_epochs * len(train_loader)
scheduler_options = {
    "optimizer": optimizer,
    "num_warmup_steps": 0,
    "num_training_steps": num_training_steps,
}
lr_scheduler = get_scheduler("linear", **scheduler_options)

print(f"Training setup - Epochs: {num_epochs}, Learning rate: {learning_rate}")

# 10. Training functions
def train_epoch(model, train_loader, optimizer, lr_scheduler, device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        train_loader: Iterable of batches holding "input_ids",
            "attention_mask" and "labels" tensors.
        optimizer: Optimizer stepped once per batch.
        lr_scheduler: Scheduler stepped once per batch, after the optimizer.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(train_loader)``.
    """
    model.train()
    running_loss = 0
    progress_bar = tqdm(train_loader, desc="Training")

    for batch in progress_bar:
        # Forward pass; the batch tensors are moved to the device on the way in.
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["labels"].to(device),
        )
        loss = outputs.loss
        loss_value = loss.item()
        running_loss += loss_value

        # Clear old gradients, backpropagate, then advance the optimizer and
        # the learning-rate schedule.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        progress_bar.set_postfix({"loss": f"{loss_value:.4f}"})

    return running_loss / len(train_loader)

def evaluate(model, val_loader, device):
    """Score ``model`` on every batch of ``val_loader`` without gradients.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        val_loader: Iterable of batches holding "input_ids",
            "attention_mask" and "labels" tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, accuracy, predictions, labels)``: the summed
        batch losses divided by ``len(val_loader)``, the accuracy score,
        and the flat lists of predicted and reference class ids.
    """
    model.eval()
    running_loss = 0
    predicted_ids = []
    reference_ids = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            labels = batch["labels"].to(device)
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=labels,
            )
            running_loss += outputs.loss.item()

            # The predicted class is the index of the largest logit.
            batch_predictions = torch.argmax(outputs.logits, dim=-1)
            predicted_ids.extend(batch_predictions.cpu().numpy())
            reference_ids.extend(labels.cpu().numpy())

    mean_loss = running_loss / len(val_loader)
    return (
        mean_loss,
        accuracy_score(reference_ids, predicted_ids),
        predicted_ids,
        reference_ids,
    )

# 11. Training loop
# Trains for num_epochs epochs, validating after each one, and keeps a
# snapshot of the weights from the epoch with the highest validation accuracy.
print("\n" + "="*50)
print("STARTING TRAINING")
print("="*50)

best_accuracy = 0
best_model_state = None  # stays None if no epoch beats an accuracy of 0
train_losses = []
val_losses = []
val_accuracies = []

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print("-" * 30)
    
    # Train
    train_loss = train_epoch(model, train_loader, optimizer, lr_scheduler, device)
    train_losses.append(train_loss)
    print(f"Train Loss: {train_loss:.4f}")
    
    # Validate
    val_loss, val_accuracy, val_predictions, val_labels = evaluate(model, val_loader, device)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)
    
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    
    # Save best model
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        # state_dict() returns tensors that share storage with the live
        # parameters, and dict.copy() is shallow, so a plain copy would keep
        # changing with every later optimizer step. Clone each tensor to get
        # a real snapshot; it is kept on the CPU so it does not occupy
        # accelerator memory.
        best_model_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
        print(f"✅ New best model saved with accuracy: {best_accuracy:.4f}")

# 12. Load best model and final evaluation
# Restore the stored best weights, if any epoch produced a snapshot.
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f"\n📥 Loaded best model with validation accuracy: {best_accuracy:.4f}")

eval_rule = "=" * 50
print("\n" + eval_rule)
print("FINAL EVALUATION ON TEST SET")
print(eval_rule)

# evaluate() returns (loss, accuracy, predictions, labels) in that order.
test_results = evaluate(model, test_loader, device)
test_loss, test_accuracy, test_predictions, test_labels = test_results

print(f"\nFinal Test Results:")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# 13. Detailed classification report
print("\n" + "="*50)
print("DETAILED CLASSIFICATION REPORT")
print("="*50)
target_names = label_encoder.classes_
# The encoder was fitted on all splits, so the test split (and the model's
# predictions) may not contain every class. Passing the full list of encoded
# ids keeps the report rows and the matrix axes aligned with target_names
# instead of depending on which ids happen to occur.
all_label_ids = list(range(len(target_names)))
print(classification_report(
    test_labels,
    test_predictions,
    labels=all_label_ids,
    target_names=target_names,
))

# 14. Confusion Matrix
plt.figure(figsize=(12, 10))
# Rows are actual classes, columns predicted classes, both in encoder order.
cm = confusion_matrix(test_labels, test_predictions, labels=all_label_ids)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=target_names, yticklabels=target_names)
plt.title('Confusion Matrix - Dialogue Act Classification')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# 15. Training curves
# Three side-by-side panels: loss curves, validation accuracy, and the
# class distribution of the test set.
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 3, 2)
plt.plot(val_accuracies, label='Validation Accuracy', color='green')
plt.title('Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 3, 3)
# Count every encoded class, giving 0 to classes absent from the test split.
# value_counts() alone drops absent classes, which would shift the remaining
# bars under the wrong tick labels.
class_counts = (
    pd.Series(test_labels)
    .value_counts()
    .reindex(range(len(target_names)), fill_value=0)
)
plt.bar(range(len(class_counts)), class_counts.values)
plt.title('Test Set Class Distribution')
plt.xlabel('Class')
plt.ylabel('Count')
plt.xticks(range(len(target_names)), target_names, rotation=45)

plt.tight_layout()
plt.show()

# 16. Save the trained model
model_save_path = r"C:\Users\NAMAN\Documents\GitHub\Prototype-\trained_friendly_bert"
# Model first, then tokenizer, both into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_save_path)

# Save label encoder
label_encoder_file = model_save_path + "/label_encoder.pkl"
with open(label_encoder_file, "wb") as encoder_out:
    pickle.dump(label_encoder, encoder_out)

# Save training history
training_history = dict(
    train_losses=train_losses,
    val_losses=val_losses,
    val_accuracies=val_accuracies,
    best_accuracy=best_accuracy,
    test_accuracy=test_accuracy,
    # Stored as a plain list rather than a NumPy array.
    label_classes=label_encoder.classes_.tolist(),
)

history_file = model_save_path + "/training_history.pkl"
with open(history_file, "wb") as history_out:
    pickle.dump(training_history, history_out)

print(f"\n✅ Model saved to: {model_save_path}")
print(f"📊 Training history saved")
print(f"🏷️ Label encoder saved")

# 17. Inference function for dialogue act prediction
def predict_dialogue_act(text, model, tokenizer, label_encoder, device, max_length=64):
    """Classify one text and report the predicted act with its probability.

    Args:
        text: Text to classify.
        model: Classifier whose output exposes ``.logits``; it is switched
            to evaluation mode.
        tokenizer: Callable that encodes ``text`` into a dict of tensors.
        label_encoder: Encoder providing ``classes_`` and
            ``inverse_transform`` to map class ids back to act names.
        device: Device the encoded tensors are moved to.
        max_length: Length the input is truncated and padded to.

    Returns:
        Dict with 'predicted_act' (the decoded label), 'confidence' (the
        softmax probability of that label) and 'all_probabilities' (a
        mapping from each class in ``label_encoder.classes_`` to its
        probability).
    """
    model.eval()

    # Encode the text and move every resulting tensor to the target device.
    encoded = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    model_inputs = {}
    for key, tensor in encoded.items():
        model_inputs[key] = tensor.to(device)

    # Inference only, so no gradients are tracked.
    with torch.no_grad():
        logits = model(**model_inputs).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)

    class_id = torch.argmax(probabilities, dim=-1).item()
    first_row = probabilities[0]
    decoded = label_encoder.inverse_transform([class_id])

    # Pair each class name with its probability, in encoder order.
    per_class = {}
    for act, prob in zip(label_encoder.classes_, first_row):
        per_class[act] = prob.item()

    return {
        'predicted_act': decoded[0],
        'confidence': first_row[class_id].item(),
        'all_probabilities': per_class
    }

summary_rule = "=" * 50
print("\n" + summary_rule)
print("TRAINING COMPLETED SUCCESSFULLY!")
print(summary_rule)
print(f"🎯 Best Validation Accuracy: {best_accuracy:.4f}")
print(f"🧪 Final Test Accuracy: {test_accuracy:.4f}")
print(f"💾 Model saved for inference")

# Example usage of the inference function
examples_rule = "=" * 40
print("\n" + examples_rule)
print("EXAMPLE DIALOGUE ACT PREDICTIONS")
print(examples_rule)

example_texts = [
    "Hello, how are you doing today?",
    "Thank you so much for your help!",
    "Can you please help me with this problem?",
    "I don't understand what you mean",
    "That sounds like a great idea!",
    "Sorry, I made a mistake there",
    "What time is it right now?",
    "Have a wonderful day!"
]

# Classify each sample sentence and print the predicted act with its confidence.
for text in example_texts:
    result = predict_dialogue_act(text, model, tokenizer, label_encoder, device)
    print(f"\nText: '{text}'")
    print(f"Dialogue Act: {result['predicted_act']} (Confidence: {result['confidence']:.3f})")

closing_rule = "=" * 60
print("\n" + closing_rule)
print("DIALOGUE ACT CLASSIFIER READY FOR CHATBOT INTEGRATION!")
print(closing_rule)