In [1]:
!pip install tokenizers sacrebleu rouge-score evaluate gradio


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting huggingface-hub<1.0,>=0.16.4 (from tokenizers)
  Downloading huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting pydantic<2.12,>=2.0 (from gradio)
  Downloading pydantic-2.11.10-py3-none-any.whl.metadata (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.6/68.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=21.0.0 (from datasets>=2.0.0->evaluate)
  Downloading pyarrow-21.0.0-cp311-cp311-manylinux_2_28_

In [2]:
import pandas as pd
import re
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from sklearn.model_selection import train_test_split

# Load dataset
data_path = '/kaggle/input/empathetic-dialogues-facebook-ai/emotion-emotion_69k.csv'
df = pd.read_csv(data_path, usecols=['Situation', 'emotion', 'empathetic_dialogues', 'labels'])
df = df.rename(columns={'Situation': 'situation', 'emotion': 'emotion', 'labels': 'agent_response'})
df = df[['situation', 'emotion', 'agent_response']]
df.dropna(subset=['situation', 'emotion', 'agent_response'], inplace=True)
df.reset_index(drop=True, inplace=True)

# Normalize text
def normalize_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower().strip()
    text = re.sub(r'\s+', ' ', text)  # Clean whitespace
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

df['situation'] = df['situation'].apply(normalize_text)
df['agent_response'] = df['agent_response'].apply(normalize_text)
df['emotion'] = df['emotion'].apply(lambda x: str(x).lower().strip())

# Split dataset (80/10/10)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Build BPE Tokenizer
def build_tokenizer(train_df):
    tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    special_tokens = ["<pad>", "<bos>", "<eos>", "<unk>", "<sep>"] + [f"<emotion_{emo}>" for emo in train_df['emotion'].unique()]
    trainer = trainers.BpeTrainer(vocab_size=30000, special_tokens=special_tokens)
    texts = (train_df['situation'] + ' ' + train_df['agent_response']).tolist()
    tokenizer.train_from_iterator(texts, trainer=trainer)
    tokenizer.save("/kaggle/working/tokenizer.json")
    return tokenizer

tokenizer = build_tokenizer(train_df)

# Encode Input & Target
def prepare_input(row):
    input_text = f"Emotion: {row['emotion']} | Situation: {row['situation']} Agent:"
    return tokenizer.encode(f"<bos> {input_text}").ids

def prepare_target(row):
    target_text = row['agent_response']
    return tokenizer.encode(f"{target_text} <eos>").ids

for split_df in [train_df, val_df, test_df]:
    split_df['input_ids'] = split_df.apply(prepare_input, axis=1)
    split_df['target_ids'] = split_df.apply(prepare_target, axis=1)

# Save preprocessed datasets
train_df.to_csv("/kaggle/working/train_preprocessed.csv", index=False)
val_df.to_csv("/kaggle/working/val_preprocessed.csv", index=False)
test_df.to_csv("/kaggle/working/test_preprocessed.csv", index=False)






In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

class EmpatheticDataset(Dataset):
    def __init__(self, df, max_len=128):
        self.inputs = df['input_ids'].tolist()
        self.targets = df['target_ids'].tolist()
        self.max_len = max_len
        self.pad_id = tokenizer.token_to_id("<pad>")

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        inp = self.inputs[idx][:self.max_len] + [self.pad_id] * max(0, self.max_len - len(self.inputs[idx]))
        tgt = self.targets[idx][:self.max_len] + [self.pad_id] * max(0, self.max_len - len(self.targets[idx]))
        return torch.tensor(inp, dtype=torch.long), torch.tensor(tgt, dtype=torch.long)

train_dataset = EmpatheticDataset(train_df)
val_dataset = EmpatheticDataset(val_df)
test_dataset = EmpatheticDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

In [5]:
import torch
import torch.nn as nn
import sacrebleu
import math

# ----------------------------
# Positional Encoding
# ----------------------------
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

# ----------------------------
# Multi-Head Attention
# ----------------------------
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out_linear = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)
        q = self.q_linear(q).view(bs, -1, self.num_heads, self.d_k).transpose(1, 2)
        k = self.k_linear(k).view(bs, -1, self.num_heads, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(bs, -1, self.num_heads, self.d_k).transpose(1, 2)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)
        out = torch.matmul(attn, v).transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out_linear(out)

# ----------------------------
# Feed Forward
# ----------------------------
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear2(torch.relu(self.linear1(x)))

# ----------------------------
# Encoder Layer
# ----------------------------
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn = self.mha(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn))
        ff = self.ff(x)
        return self.norm2(x + self.dropout(ff))

# ----------------------------
# Decoder Layer
# ----------------------------
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        self.self_mha = MultiHeadAttention(d_model, num_heads)
        self.cross_mha = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, tgt_mask):
        self_attn = self.self_mha(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attn))
        cross_attn = self.cross_mha(x, enc_out, enc_out, src_mask)
        x = self.norm2(x + self.dropout(cross_attn))
        ff = self.ff(x)
        return self.norm3(x + self.dropout(ff))

# ----------------------------
# Full Transformer Model
# ----------------------------
class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model=256, num_heads=2, num_enc_layers=2, num_dec_layers=2, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, dropout) for _ in range(num_enc_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, dropout) for _ in range(num_dec_layers)])
        self.linear = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        src_emb = self.dropout(self.pos_enc(self.embedding(src)))
        tgt_emb = self.dropout(self.pos_enc(self.embedding(tgt)))
        enc_out = src_emb
        for layer in self.encoder_layers:
            enc_out = layer(enc_out, src_mask)
        dec_out = tgt_emb
        for layer in self.decoder_layers:
            dec_out = layer(dec_out, enc_out, src_mask, tgt_mask)
        return self.linear(dec_out)

# ----------------------------
# Helper Functions
# ----------------------------
def generate_square_subsequent_mask(sz):
    mask = torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
    return mask == 0

def create_padding_mask(seq, pad_id):
    return (seq != pad_id).unsqueeze(1).unsqueeze(2)

# ----------------------------
# Training Setup
# ----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Placeholder for tokenizer - ensure this is defined elsewhere (e.g., preprocess.py)
# Example: tokenizer = some_loaded_tokenizer with methods get_vocab_size(), token_to_id(), decode()
try:
    model = Transformer(vocab_size=tokenizer.get_vocab_size()).to(device)
except NameError:
    raise NameError("Tokenizer not defined. Please run preprocess.py or define tokenizer with get_vocab_size(), token_to_id(), and decode() methods.")

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.98))
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.token_to_id("<pad>"))

# Placeholder for data loaders - ensure these are defined elsewhere (e.g., preprocess.py)
# Example: train_loader, val_loader = some_data_loading_function()
if 'train_loader' not in globals() or 'val_loader' not in globals():
    raise NameError("train_loader and/or val_loader not defined. Please run preprocess.py or define them as DataLoader objects.")

# ----------------------------
# Training Loop
# ----------------------------
def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        tgt_input = tgt[:, :-1]
        tgt_out = tgt[:, 1:]
        src_mask = create_padding_mask(src, tokenizer.token_to_id("<pad>")).to(device)
        tgt_mask = generate_square_subsequent_mask(tgt_input.size(1)).to(device) & create_padding_mask(tgt_input, tokenizer.token_to_id("<pad>")).to(device)
        preds = model(src, tgt_input, src_mask, tgt_mask)
        loss = criterion(preds.view(-1, preds.size(-1)), tgt_out.reshape(-1))
        loss.backward()
        # Add gradient clipping for stability
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# ----------------------------
# Greedy Decoding
# ----------------------------
def greedy_decode(model, src, max_len=50):
    model.eval()
    src = src.to(device)
    if len(src.shape) == 1:
        src = src.unsqueeze(0)
    batch_size = src.size(0)
    src_mask = create_padding_mask(src, tokenizer.token_to_id("<pad>")).to(device)
    enc_out = model.dropout(model.pos_enc(model.embedding(src)))
    for layer in model.encoder_layers:
        enc_out = layer(enc_out, src_mask)
    ys = torch.tensor([[tokenizer.token_to_id("<bos>")]] * batch_size, device=device)
    for _ in range(max_len):
        tgt_mask = generate_square_subsequent_mask(ys.size(1)).to(device)
        dec_out = model.dropout(model.pos_enc(model.embedding(ys)))
        for layer in model.decoder_layers:
            dec_out = layer(dec_out, enc_out, src_mask, tgt_mask)
        pred = model.linear(dec_out[:, -1, :])
        next_token = pred.argmax(1)
        ys = torch.cat((ys, next_token.unsqueeze(1)), dim=1)
        if torch.all(next_token == tokenizer.token_to_id("<eos>")):
            break
    return ys[:, 1:].cpu().tolist()

# ----------------------------
# Compute BLEU
# ----------------------------
def compute_bleu(model, loader):
    model.eval()
    refs, hyps = [], []
    with torch.no_grad():
        for src, tgt in loader:
            src = src.to(device)
            preds = greedy_decode(model, src)
            hyps.extend([tokenizer.decode(p).strip() for p in preds])
            refs.extend([tokenizer.decode(t[1:].tolist()).strip() for t in tgt])
    return sacrebleu.corpus_bleu(hyps, [[r] for r in refs]).score

# ----------------------------
# Full Training Loop
# ----------------------------
best_bleu = 0
for epoch in range(10):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    val_bleu = compute_bleu(model, val_loader)
    if val_bleu > best_bleu:
        torch.save(model.state_dict(), '/kaggle/working/best_model.pt')
        best_bleu = val_bleu
    print(f"Epoch {epoch+1}: Loss = {train_loss:.4f}, Val BLEU = {val_bleu:.2f}")

Epoch 1: Loss = 5.5120, Val BLEU = 64.68
Epoch 2: Loss = 4.8166, Val BLEU = 70.71
Epoch 3: Loss = 4.6245, Val BLEU = 75.98
Epoch 4: Loss = 4.5010, Val BLEU = 100.00
Epoch 5: Loss = 4.4085, Val BLEU = 100.00
Epoch 6: Loss = 4.3298, Val BLEU = 100.00
Epoch 7: Loss = 4.2636, Val BLEU = 100.00
Epoch 8: Loss = 4.2030, Val BLEU = 100.00
Epoch 9: Loss = 4.1476, Val BLEU = 79.53
Epoch 10: Loss = 4.0968, Val BLEU = 100.00


In [6]:
!pip install tokenizers sacrebleu rouge-score evaluate gradio

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [7]:
!pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu118
!pip install tokenizers sacrebleu rouge-score evaluate gradio transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch==2.1.2
  Downloading https://download.pytorch.org/whl/cu118/torch-2.1.2%2Bcu118-cp311-cp311-linux_x86_64.whl (2325.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 GB[0m [31m402.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torchvision==0.16.2
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.16.2%2Bcu118-cp311-cp311-linux_x86_64.whl (6.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m100.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting torchaudio==2.1.2
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.1.2%2Bcu118-cp311-cp311-linux_x86_64.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting triton==2.1.0 (from torch==2.1.2)
  Downloading https://download.pytorch.org/whl/t

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [8]:
import pandas as pd
import re
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tokenizers import Tokenizer
from sklearn.model_selection import train_test_split
import sacrebleu
from rouge_score import rouge_scorer
import numpy as np

# NOTE: This script assumes the first two code blocks from your original prompt
# (data preprocessing and model architecture) have been run. It defines the
# evaluation components.

# ----------------------------
# Helper Functions (from original code)
# ----------------------------
def generate_square_subsequent_mask(sz):
    mask = torch.triu(torch.ones(sz, sz, device=device) * float('-inf'), diagonal=1)
    return mask == 0

def create_padding_mask(seq, pad_id):
    return (seq != pad_id).unsqueeze(1).unsqueeze(2)

# --- Ensure Tokenizer, Model, and DataLoaders are loaded ---
# In a real script, you would load these from files. For this example,
# we'll assume they are available from the previous steps.
# For reproducibility, let's redefine them briefly.

# Dummy definitions for demonstration if not run in sequence
try:
    tokenizer = Tokenizer.from_file("/kaggle/working/tokenizer.json")
    pad_id = tokenizer.token_to_id("<pad>")
    bos_id = tokenizer.token_to_id("<bos>")
    eos_id = tokenizer.token_to_id("<eos>")
except (NameError, FileNotFoundError):
    print("Tokenizer not found. Please run the preprocessing script first.")
    exit()

# Assume 'Transformer' class definition exists from the original code
# Assume 'test_loader' is defined from the original code

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Transformer(vocab_size=tokenizer.get_vocab_size()).to(device)
try:
    model.load_state_dict(torch.load('/kaggle/working/best_model.pt', map_location=device))
    print("Model loaded successfully.")
except FileNotFoundError:
    print("Model weights not found. Evaluation will be on an untrained model.")

# -------------------------------------
# 5. EVALUATION IMPLEMENTATION
# -------------------------------------

## Automatic Metrics Calculation

def greedy_decode(model, src, max_len=50):
    """Generates text for a given source tensor."""
    model.eval()
    src = src.to(device)
    if src.dim() == 1:
        src = src.unsqueeze(0)
    batch_size = src.size(0)

    # Re-using encoder logic from original training code for consistency
    src_mask = create_padding_mask(src, pad_id).to(device)
    with torch.no_grad():
        enc_out = model.dropout(model.pos_enc(model.embedding(src)))
        for layer in model.encoder_layers:
            enc_out = layer(enc_out, src_mask)

    ys = torch.full((batch_size, 1), bos_id, dtype=torch.long, device=device)

    for _ in range(max_len - 1):
        with torch.no_grad():
            tgt_mask = generate_square_subsequent_mask(ys.size(1)).to(device)
            dec_out = model.dropout(model.pos_enc(model.embedding(ys)))
            for layer in model.decoder_layers:
                dec_out = layer(dec_out, enc_out, src_mask, tgt_mask)
            pred = model.linear(dec_out[:, -1, :])
            next_token = pred.argmax(dim=-1)

        # Stop if all sequences in batch have generated <eos>
        if torch.all(next_token == eos_id).item():
            break
            
        ys = torch.cat([ys, next_token.unsqueeze(1)], dim=1)

    return ys.cpu().tolist()


def calculate_metrics(model, loader):
    """Calculates BLEU, ROUGE-L, chrF, and Perplexity."""
    model.eval()
    
    # For automatic metrics
    hypotheses = [] # Model predictions
    references = [] # Ground truth
    
    # For perplexity
    total_loss = 0
    total_tokens = 0
    
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_l_f1 = []

    criterion = nn.CrossEntropyLoss(ignore_index=pad_id, reduction='sum')

    with torch.no_grad():
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            
            # --- Perplexity Calculation ---
            tgt_input = tgt[:, :-1]
            tgt_out = tgt[:, 1:]
            src_mask = create_padding_mask(src, pad_id)
            tgt_mask = generate_square_subsequent_mask(tgt_input.size(1)) & create_padding_mask(tgt_input, pad_id)
            
            preds = model(src, tgt_input, src_mask, tgt_mask)
            loss = criterion(preds.view(-1, preds.size(-1)), tgt_out.reshape(-1))
            
            # Count non-padded tokens for accurate PPL
            non_pad_tokens = (tgt_out != pad_id).sum().item()
            total_loss += loss.item()
            total_tokens += non_pad_tokens

            # --- BLEU, ROUGE, chrF Calculation ---
            generated_ids = greedy_decode(model, src)
            
            # Decode for metric calculation
            batch_hyps = [tokenizer.decode(ids).strip() for ids in generated_ids]
            batch_refs = [tokenizer.decode(ids).strip() for ids in tgt.cpu().tolist()]

            hypotheses.extend(batch_hyps)
            references.extend(batch_refs)

    # --- Final Metric Scores ---
    bleu = sacrebleu.corpus_bleu(hypotheses, [[r] for r in references])
    chrf = sacrebleu.corpus_chrf(hypotheses, [[r] for r in references])
    
    # Calculate ROUGE-L per sentence and average
    for hyp, ref in zip(hypotheses, references):
        scores = scorer.score(ref, hyp)
        rouge_l_f1.append(scores['rougeL'].fmeasure)
    
    avg_rouge_l = np.mean(rouge_l_f1) * 100
    
    # Calculate perplexity
    perplexity = math.exp(total_loss / total_tokens) if total_tokens > 0 else float('inf')

    return {
        "BLEU": bleu.score,
        "ROUGE-L": avg_rouge_l,
        "chrF": chrf.score,
        "Perplexity": perplexity
    }, hypotheses, references

## Human Evaluation Framework

def human_evaluation_interface(situation, ground_truth, model_output):
    """Simulates a human evaluation task for a single example."""
    print("\n--- Human Evaluation Example ---")
    print(f"Situation: {situation}")
    print(f"Ground Truth Response: {ground_truth}")
    print(f"Model Generated Response: {model_output}")
    print("-" * 20)
    
    try:
        fluency = int(input("Rate Fluency (1-5): "))
        relevance = int(input("Rate Relevance (1-5): "))
        adequacy = int(input("Rate Adequacy/Empathy (1-5): "))
        return {"fluency": fluency, "relevance": relevance, "adequacy": adequacy}
    except (ValueError, EOFError):
        print("Invalid input. Skipping.")
        return None

## Qualitative Examples

def show_qualitative_examples(test_df, hyps, refs, num_examples=5):
    """Prints a comparison of model output and ground truth."""
    print("\n--- Qualitative Examples ---")
    
    # We need the original text, so we'll use the test dataframe
    samples = test_df.head(num_examples)
    
    for i, (idx, row) in enumerate(samples.iterrows()):
        print(f"\n--- Example {i+1} ---")
        print(f"Emotion: {row['emotion']}")
        print(f"Situation: {row['situation']}")
        print(f"✅ Ground Truth: {refs[i]}")
        print(f"🤖 Model Output: {hyps[i]}")
        print("-" * 20)
        
# --- Main Evaluation Execution ---
if __name__ == "__main__":
    # Load the test dataframe to get original text for examples
    try:
        test_df = pd.read_csv("/kaggle/working/test_preprocessed.csv")
    except FileNotFoundError:
        print("Preprocessed test CSV not found. Cannot show qualitative examples.")
        test_df = None

    print("Running evaluation on the test set...")
    metrics, hyps, refs = calculate_metrics(model, test_loader)
    
    print("\n--- Automatic Metrics ---")
    print(f"📊 Perplexity: {metrics['Perplexity']:.2f}")
    print(f"📊 BLEU Score: {metrics['BLEU']:.2f}")
    print(f"📊 ROUGE-L (F1): {metrics['ROUGE-L']:.2f}")
    print(f"📊 chrF Score: {metrics['chrF']:.2f}")
    
    if test_df is not None:
        show_qualitative_examples(test_df, hyps, refs)
    
        # Run a few examples through the human evaluation interface
        print("\nStarting interactive human evaluation for the first 3 examples...")
        all_human_scores = []
        for i in range(3):
            scores = human_evaluation_interface(
                situation=test_df.iloc[i]['situation'],
                ground_truth=refs[i],
                model_output=hyps[i]
            )
            if scores:
                all_human_scores.append(scores)
        
        if all_human_scores:
            avg_fluency = np.mean([s['fluency'] for s in all_human_scores])
            avg_relevance = np.mean([s['relevance'] for s in all_human_scores])
            avg_adequacy = np.mean([s['adequacy'] for s in all_human_scores])
            print("\n--- Average Human Scores (from your ratings) ---")
            print(f"⭐ Fluency: {avg_fluency:.2f}")
            print(f"⭐ Relevance: {avg_relevance:.2f}")
            print(f"⭐ Adequacy/Empathy: {avg_adequacy:.2f}")

Model loaded successfully.
Running evaluation on the test set...

--- Automatic Metrics ---
📊 Perplexity: 99.25
📊 BLEU Score: 88.01
📊 ROUGE-L (F1): 8.98
📊 chrF Score: 73.12

--- Qualitative Examples ---

--- Example 1 ---
Emotion: trusting
Situation: i sent a parcel to my cousin recently and it never arrived it cost me 50
✅ Ground Truth: thank you for your kind words and wishes
🤖 Model Output: you must be very proud of your family
--------------------

--- Example 2 ---
Emotion: terrified
Situation: someone has knocked on my door in the middle of the night the past two nights it is kind startling and scary
✅ Ground Truth: did they knock and leave
🤖 Model Output: you have to do that
--------------------

--- Example 3 ---
Emotion: jealous
Situation: im jealous with people that have gyms in their homes i would work out every day if i had one
✅ Ground Truth: yes youtube will def have some awesome ideas i could stand to get rid of this gut also hahahah
🤖 Model Output: you can do that
-----

Rate Fluency (1-5):  1
Rate Relevance (1-5):  5
Rate Adequacy/Empathy (1-5):  3



--- Human Evaluation Example ---
Situation: someone has knocked on my door in the middle of the night the past two nights it is kind startling and scary
Ground Truth Response: did they knock and leave
Model Generated Response: you have to do that
--------------------


Rate Fluency (1-5):  2
Rate Relevance (1-5):  4
Rate Adequacy/Empathy (1-5):  3



--- Human Evaluation Example ---
Situation: im jealous with people that have gyms in their homes i would work out every day if i had one
Ground Truth Response: yes youtube will def have some awesome ideas i could stand to get rid of this gut also hahahah
Model Generated Response: you can do that
--------------------


Rate Fluency (1-5):  5
Rate Relevance (1-5):  4
Rate Adequacy/Empathy (1-5):  3



--- Average Human Scores (from your ratings) ---
⭐ Fluency: 2.67
⭐ Relevance: 4.33
⭐ Adequacy/Empathy: 3.00


In [9]:
from IPython.display import FileLink

# This creates a link to the file in the output of this cell
display(FileLink('best_model.pt'))

In [14]:
pip install streamlit torch tokenizers sacrebleu pandas numpy matplotlib

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting streamlit
  Downloading streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.50.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.50.0
Note: you may need to restart the kernel to use updated packages.


In [15]:
import streamlit as st
import torch
import torch.nn as nn
import math
import re
from tokenizers import Tokenizer
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# --- 1. MODEL ARCHITECTURE DEFINITION ---
# This section contains the PyTorch model classes, copied from the training script.
# It's necessary to define the architecture before loading the saved model weights.

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out_linear = nn.Linear(d_model, d_model)
        self.attention_weights = None

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)
        q = self.q_linear(q).view(bs, -1, self.num_heads, self.d_k).transpose(1, 2)
        k = self.k_linear(k).view(bs, -1, self.num_heads, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(bs, -1, self.num_heads, self.d_k).transpose(1, 2)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)
        self.attention_weights = attn # Store attention weights
        out = torch.matmul(attn, v).transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out_linear(out)

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear2(torch.relu(self.linear1(x)))

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn = self.mha(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn))
        ff = self.ff(x)
        return self.norm2(x + self.dropout(ff))

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        self.self_mha = MultiHeadAttention(d_model, num_heads)
        self.cross_mha = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, tgt_mask):
        self_attn = self.self_mha(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attn))
        cross_attn = self.cross_mha(x, enc_out, enc_out, src_mask)
        x = self.norm2(x + self.dropout(cross_attn))
        ff = self.ff(x)
        return self.norm3(x + self.dropout(ff))

class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model=256, num_heads=2, num_enc_layers=2, num_dec_layers=2, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, dropout) for _ in range(num_enc_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, dropout) for _ in range(num_dec_layers)])
        self.linear = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        src_emb = self.dropout(self.pos_enc(self.embedding(src)))
        tgt_emb = self.dropout(self.pos_enc(self.embedding(tgt)))
        enc_out = src_emb
        for layer in self.encoder_layers:
            enc_out = layer(enc_out, src_mask)
        dec_out = tgt_emb
        for i, layer in enumerate(self.decoder_layers):
            dec_out = layer(dec_out, enc_out, src_mask, tgt_mask)
        return self.linear(dec_out)

# --- 2. SETUP AND UTILITY FUNCTIONS ---

# Use @st.cache_resource to load model and tokenizer only once
@st.cache_resource
def load_model_and_tokenizer():
    """Loads the trained Transformer model and tokenizer."""
    try:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        tokenizer = Tokenizer.from_file("tokenizer.json")
        
        # Instantiate model with parameters from training
        vocab_size = tokenizer.get_vocab_size()
        model = Transformer(vocab_size=vocab_size, d_model=256, num_heads=2, num_enc_layers=2, num_dec_layers=2)
        
        # Load the saved state dictionary
        model.load_state_dict(torch.load('best_model.pt', map_location=device))
        model.to(device)
        model.eval()
        return model, tokenizer, device
    except FileNotFoundError:
        st.error("Model or tokenizer file not found. Please ensure 'best_model.pt' and 'tokenizer.json' are in the same directory.")
        return None, None, None

def normalize_text(text):
    """Cleans and standardizes text."""
    if not isinstance(text, str):
        return ""
    text = text.lower().strip()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

def create_masks(src, tgt, pad_id, device):
    """Creates masks for the Transformer model."""
    src_mask = (src != pad_id).unsqueeze(1).unsqueeze(2).to(device)
    tgt_len = tgt.size(1)
    tgt_mask = (torch.triu(torch.ones(tgt_len, tgt_len), diagonal=1) == 0).to(device)
    tgt_pad_mask = (tgt != pad_id).unsqueeze(1).unsqueeze(2).to(device)
    tgt_mask = tgt_mask & tgt_pad_mask
    return src_mask, tgt_mask

# --- 3. DECODING STRATEGIES ---

def greedy_decode(model, src, max_len=50, bos_id=2, eos_id=3, device='cpu'):
    """Greedy decoding: selects the most likely token at each step."""
    src = src.to(device)
    src_mask = (src != model.embedding.padding_idx).unsqueeze(1).unsqueeze(2).to(device) if model.embedding.padding_idx is not None else None

    with torch.no_grad():
        src_emb = model.dropout(model.pos_enc(model.embedding(src)))
        enc_out = src_emb
        for layer in model.encoder_layers:
            enc_out = layer(enc_out, src_mask)
        
        ys = torch.ones(1, 1).fill_(bos_id).type(torch.long).to(device)
        for _ in range(max_len - 1):
            tgt_mask = (torch.triu(torch.ones(ys.size(1), ys.size(1)), diagonal=1) == 0).to(device)
            out = model(src, ys, src_mask, tgt_mask)
            prob = out[:, -1]
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()
            ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
            if next_word == eos_id:
                break
    return ys

def beam_search_decode(model, src, max_len=50, beam_width=5, bos_id=2, eos_id=3, device='cpu'):
    """Beam search decoding: keeps track of k most likely sequences."""
    src = src.to(device)
    src_mask = (src != model.embedding.padding_idx).unsqueeze(1).unsqueeze(2).to(device) if model.embedding.padding_idx is not None else None

    with torch.no_grad():
        src_emb = model.dropout(model.pos_enc(model.embedding(src)))
        enc_out = src_emb
        for layer in model.encoder_layers:
            enc_out = layer(enc_out, src_mask)

        # Start with <bos> token
        sequences = [[torch.tensor([bos_id], device=device), 0.0]]

        for _ in range(max_len):
            all_candidates = []
            for seq, score in sequences:
                if seq[-1].item() == eos_id:
                    all_candidates.append([seq, score])
                    continue
                
                tgt_mask = (torch.triu(torch.ones(seq.size(0), seq.size(0)), diagonal=1) == 0).to(device)
                out = model(src, seq.unsqueeze(0), src_mask, tgt_mask)
                prob = torch.log_softmax(out[:, -1], dim=-1)
                
                topk_scores, topk_words = prob.topk(beam_width, dim=-1)

                for i in range(beam_width):
                    next_tok, next_score = topk_words[0][i], topk_scores[0][i]
                    new_seq = torch.cat([seq, next_tok.unsqueeze(0)])
                    new_score = score + next_score.item()
                    all_candidates.append([new_seq, new_score])
            
            ordered = sorted(all_candidates, key=lambda tup: tup[1], reverse=True)
            sequences = ordered[:beam_width]
            
            # Stop if all top sequences end with <eos>
            if all(s[0][-1].item() == eos_id for s in sequences):
                break

    return sequences[0][0].unsqueeze(0)


# --- 4. ATTENTION VISUALIZATION ---

def get_attention_weights(model, src, generated_seq, pad_id, device):
    """Performs a forward pass to capture attention weights."""
    model.eval()
    with torch.no_grad():
        src_mask, tgt_mask = create_masks(src, generated_seq, pad_id, device)
        
        src_emb = model.dropout(model.pos_enc(model.embedding(src)))
        tgt_emb = model.dropout(model.pos_enc(model.embedding(generated_seq)))
        
        enc_out = src_emb
        for layer in model.encoder_layers:
            enc_out = layer(enc_out, src_mask)
            
        dec_out = tgt_emb
        # We want the cross-attention from the LAST decoder layer
        for i, layer in enumerate(model.decoder_layers):
            dec_out = layer(dec_out, enc_out, src_mask, tgt_mask)
        
        # Access the stored weights from the last decoder's cross-attention module
        attention = model.decoder_layers[-1].cross_mha.attention_weights
        return attention

def plot_attention_heatmap(weights, src_tokens, tgt_tokens):
    """Plots and displays the attention heatmap."""
    fig, ax = plt.subplots(figsize=(10, 10))
    # Squeeze to remove batch and head dimensions, then average over heads
    weights = weights.squeeze(0).cpu().numpy()
    if weights.ndim > 2:
        weights = weights.mean(axis=0) # Average over heads

    cax = ax.matshow(weights, cmap='bone')
    fig.colorbar(cax)

    ax.set_xticklabels([''] + src_tokens, rotation=90)
    ax.set_yticklabels([''] + tgt_tokens)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    st.pyplot(fig)


# --- 5. STREAMLIT UI ---

st.set_page_config(layout="wide")
st.title("🤖 Empathetic Chatbot")
st.markdown("An interface to interact with a Transformer model trained on the Empathetic Dialogues dataset.")

model, tokenizer, device = load_model_and_tokenizer()

if model is not None:
    # Get available emotions from the tokenizer's special tokens
    emotions = [
        tok.replace("<emotion_", "").replace(">", "") 
        for tok in tokenizer.get_vocab().keys() if tok.startswith("<emotion_")
    ]
    emotions = sorted(list(set(emotions))) # Get unique sorted list

    # Initialize session state for conversation history
    if 'history' not in st.session_state:
        st.session_state['history'] = []

    # Sidebar for options
    with st.sidebar:
        st.header("Inference Options")
        selected_emotion = st.selectbox("Select an Emotion (optional)", ["none"] + emotions)
        decoding_strategy = st.radio("Decoding Strategy", ["Greedy Search", "Beam Search"])
        
        beam_width = 5
        if decoding_strategy == "Beam Search":
            beam_width = st.slider("Beam Width", min_value=2, max_value=10, value=5)
        
        show_attention = st.checkbox("Show Attention Heatmap")

    # Main chat interface
    for message in st.session_state.history:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # User input
    if prompt := st.chat_input("How are you feeling today?"):
        # Add user message to history
        st.session_state.history.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        # Prepare model input
        cleaned_prompt = normalize_text(prompt)
        if selected_emotion != "none":
            input_text = f"Emotion: {selected_emotion} | Situation: {cleaned_prompt} Agent:"
        else:
            input_text = f"Situation: {cleaned_prompt} Agent:"
        
        input_ids = tokenizer.encode(f"<bos> {input_text}").ids
        src = torch.tensor([input_ids], device=device)

        # Generate response based on selected strategy
        with st.spinner("Thinking..."):
            if decoding_strategy == "Greedy Search":
                output_ids = greedy_decode(model, src, bos_id=tokenizer.token_to_id("<bos>"), eos_id=tokenizer.token_to_id("<eos>"), device=device)
            else: # Beam Search
                output_ids = beam_search_decode(model, src, beam_width=beam_width, bos_id=tokenizer.token_to_id("<bos>"), eos_id=tokenizer.token_to_id("<eos>"), device=device)

        # Decode and display response
        response_text = tokenizer.decode(output_ids.squeeze(0).tolist(), skip_special_tokens=True).strip()
        st.session_state.history.append({"role": "assistant", "content": response_text})
        with st.chat_message("assistant"):
            st.markdown(response_text)

            # Display attention heatmap if requested
            if show_attention:
                with st.expander("See Attention Weights"):
                    src_tokens = tokenizer.encode(f"<bos> {input_text}").tokens
                    tgt_tokens = tokenizer.decode(output_ids.squeeze(0).tolist()).split()
                    
                    attention_weights = get_attention_weights(model, src, output_ids, tokenizer.token_to_id("<pad>"), device)
                    plot_attention_heatmap(attention_weights, src_tokens, tgt_tokens)


2025-10-17 11:04:38.937 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-10-17 11:04:39.313 Session state does not function when running a script without `streamlit run`


In [13]:
import gradio as gr
import torch
from your_preprocessing_file import normalize_text  # Adjust import path

def beam_search_decode(model, src, beam_width=3, max_len=50):
    model.eval()
    src = src.to(device)
    if len(src.shape) == 1:
        src = src.unsqueeze(0)
    batch_size = src.size(0)
    src_mask = create_padding_mask(src, tokenizer.token_to_id("<pad>")).to(device)
    enc_out = model.dropout(model.pos_enc(model.embedding(src)))
    for layer in model.encoder_layers:
        enc_out = layer(enc_out, src_mask)
    ys = torch.tensor([[tokenizer.token_to_id("<bos>")]] * batch_size, device=device).repeat(beam_width, 1)
    scores = torch.zeros(beam_width, device=device)
    end_ids = [tokenizer.token_to_id("<eos>")]
    for _ in range(max_len):
        tgt_mask = generate_square_subsequent_mask(ys.size(1)).to(device)
        dec_out = model.dropout(model.pos_enc(model.embedding(ys)))
        for layer in model.decoder_layers:
            dec_out = layer(dec_out, enc_out, src_mask, tgt_mask)
        pred = model.linear(dec_out[:, -1, :])
        log_probs = torch.log_softmax(pred, dim=-1)
        vocab_size = log_probs.size(-1)
        log_probs = log_probs.view(beam_width, -1)
        scores = scores.unsqueeze(-1) + log_probs
        scores, indices = scores.view(-1).topk(beam_width, dim=0)
        beam_ids = indices // vocab_size
        token_ids = indices % vocab_size
        ys = torch.cat((ys[beam_ids], token_ids.unsqueeze(-1)), dim=1)
        if torch.all(torch.isin(ys[:, -1], end_ids)):
            break
    best_idx = scores.argmax()
    return ys[best_idx].tolist()

def chat(emotion, situation, history, decode_strategy='greedy'):
    input_text = f"Emotion: {emotion} | Situation: {situation} Agent:" if emotion else f"Situation: {situation} Agent:"
    input_text = normalize_text(input_text)
    input_ids = torch.tensor(tokenizer.encode(f"<bos> {input_text}").ids, dtype=torch.long)
    if decode_strategy == 'greedy':
        output_ids = greedy_decode(model, input_ids)
    else:  # Beam search
        output_ids = beam_search_decode(model, input_ids)
    reply = tokenizer.decode(output_ids).strip()
    history.append((situation, reply))
    return history

with gr.Blocks() as demo:
    gr.Markdown("Empathetic Chatbot")
    emotion = gr.Textbox(label="Emotion (optional)", value="")
    situation = gr.Textbox(label="Situation")
    chatbot = gr.Chatbot()
    decode = gr.Radio(["greedy", "beam"], label="Decoding Strategy", value="greedy")
    submit = gr.Button("Submit")
    submit.click(chat, [emotion, situation, chatbot, decode], [chatbot])
    demo.launch(share=True)

ModuleNotFoundError: No module named 'your_preprocessing_file'