In [51]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import re
import math
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

Using device: cuda


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [52]:
DATA_PERCENTAGE = 100

df = pd.read_csv("/kaggle/input/empathetic-dialogues-facebook-ai/emotion-emotion_69k.csv")

def normalize_text(text):
    if pd.isnull(text): return ""
    text = text.lower()
    text = re.sub(r'\s+', ' ', text.strip())
    text = re.sub(r'\s([?.!,;:])', r'\1', text)
    text = re.sub(r'([?.!,;:])(?=\w)', r'\1 ', text)
    return text

for col in ['Situation', 'emotion', 'empathetic_dialogues', 'labels']:
    df[col] = df[col].apply(normalize_text)

emotion_counts = df['emotion'].value_counts()
valid_emotions = emotion_counts[emotion_counts >= 50].index
df = df[df['emotion'].isin(valid_emotions)]

if DATA_PERCENTAGE < 100:
    df = df.sample(frac=DATA_PERCENTAGE/100, random_state=42).reset_index(drop=True)
    print(f'Using {DATA_PERCENTAGE}% of data: {len(df)} samples')

from sklearn.model_selection import train_test_split
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f'Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}')

Train: 51672, Val: 6459, Test: 6460


In [53]:
train_text = train_df[['Situation', 'emotion', 'empathetic_dialogues', 'labels']].fillna('').agg(' '.join, axis=1)
tokens = []
for text in train_text:
    tokens.extend(word_tokenize(text))

counter = Counter(tokens)
SPECIAL = ['<pad>', '<bos>', '<eos>', '<unk>', '<sep>']
EMOTIONS = [f'<emotion_{e}>' for e in sorted(train_df['emotion'].unique())]
vocab_list = SPECIAL + EMOTIONS + [w for w, _ in counter.most_common()]
vocab = {w: i for i, w in enumerate(vocab_list)}
inv_vocab = {i: w for w, i in vocab.items()}

def encode(text, add_bos_eos=True):
    tokens = word_tokenize(text)
    if add_bos_eos:
        tokens = ['<bos>'] + tokens + ['<eos>']
    return [vocab.get(t, vocab['<unk>']) for t in tokens]

def decode(ids, remove_special=True):
    words = [inv_vocab.get(i, '<unk>') for i in ids]
    if remove_special:
        words = [w for w in words if w not in SPECIAL and not w.startswith('<emotion_')]
    return ' '.join(words)

print(f'Vocab size: {len(vocab)}')

Vocab size: 20195


In [54]:
def format_input(row):
    return f"Emotion: {row['emotion']} | Situation: {row['Situation']} | Customer: {row['empathetic_dialogues']} Agent:"

def format_target(row):
    return row['labels']

class ChatDataset(Dataset):
    def __init__(self, df):
        self.df = df.reset_index(drop=True)
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        x = encode(format_input(row))
        y = encode(format_target(row))
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

def collate_fn(batch):
    xs, ys = zip(*batch)
    xs_p = pad_sequence(xs, batch_first=True, padding_value=vocab['<pad>'])
    ys_p = pad_sequence(ys, batch_first=True, padding_value=vocab['<pad>'])
    return xs_p, ys_p

BATCH_SIZE = 64
train_loader = DataLoader(ChatDataset(train_df), batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(ChatDataset(val_df), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(ChatDataset(test_df), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [55]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))
    
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, q, k, v, mask=None):
        B = q.size(0)
        q = self.q_linear(q).view(B, -1, self.num_heads, self.d_k).transpose(1, 2)
        k = self.k_linear(k).view(B, -1, self.num_heads, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(B, -1, self.num_heads, self.d_k).transpose(1, 2)
        
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-1e9'))
        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        
        out = torch.matmul(attn, v)
        out = out.transpose(1, 2).contiguous().view(B, -1, self.num_heads * self.d_k)
        return self.out(out)

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)
    
    def forward(self, x):
        return self.linear2(self.dropout(torch.relu(self.linear1(x))))

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dropout):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, mask=None):
        x = self.norm1(x + self.dropout(self.attn(x, x, x, mask)))
        x = self.norm2(x + self.dropout(self.ff(x)))
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, enc_out, src_mask=None, tgt_mask=None):
        x = self.norm1(x + self.dropout(self.self_attn(x, x, x, tgt_mask)))
        x = self.norm2(x + self.dropout(self.cross_attn(x, enc_out, enc_out, src_mask)))
        x = self.norm3(x + self.dropout(self.ff(x)))
        return x

class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model=512, num_heads=2, num_layers=2, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, dropout) for _ in range(num_layers)])
        self.out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model
    
    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        src = self.pos_enc(self.embedding(src) * math.sqrt(self.d_model))
        tgt = self.pos_enc(self.embedding(tgt) * math.sqrt(self.d_model))
        
        for layer in self.encoder_layers:
            src = layer(src, src_mask)
        
        for layer in self.decoder_layers:
            tgt = layer(tgt, src, src_mask, tgt_mask)
        
        return self.out(tgt)

model = Transformer(len(vocab), d_model=512, num_heads=2, num_layers=2, dropout=0.1).to(device)
print(f'Model parameters: {sum(p.numel() for p in model.parameters()):,}')

Model parameters: 35,412,707


In [56]:
def make_causal_mask(size):
    mask = torch.tril(torch.ones(size, size)).unsqueeze(0).unsqueeze(0)
    return mask

def make_padding_mask(seq, pad_idx):
    return (seq != pad_idx).unsqueeze(1).unsqueeze(2)

criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])
optimizer = optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.98))

In [57]:
def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)
        tgt_input = tgt[:, :-1]
        tgt_target = tgt[:, 1:]
        
        tgt_mask = make_causal_mask(tgt_input.size(1)).to(device)
        
        optimizer.zero_grad()
        output = model(src, tgt_input, tgt_mask=tgt_mask)
        
        loss = criterion(output.reshape(-1, output.size(-1)), tgt_target.reshape(-1))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    return total_loss / len(loader)

In [58]:
def greedy_decode(model, src, max_len=50):
    model.eval()
    src = src.to(device)
    
    with torch.no_grad():
        enc = model.pos_enc(model.embedding(src) * math.sqrt(model.d_model))
        for layer in model.encoder_layers:
            enc = layer(enc)
        
        ys = torch.tensor([[vocab['<bos>']]], device=device)
        for _ in range(max_len):
            tgt_mask = make_causal_mask(ys.size(1)).to(device)
            tgt_emb = model.pos_enc(model.embedding(ys) * math.sqrt(model.d_model))
            
            for layer in model.decoder_layers:
                tgt_emb = layer(tgt_emb, enc, tgt_mask=tgt_mask)
            
            logits = model.out(tgt_emb[:, -1, :])
            next_token = logits.argmax(dim=-1).unsqueeze(0)
            ys = torch.cat([ys, next_token], dim=1)
            
            if next_token.item() == vocab['<eos>']:
                break
    
    return ys.squeeze(0).tolist()

def beam_search(model, src, beam_width=3, max_len=50):
    model.eval()
    src = src.to(device)
    
    with torch.no_grad():
        enc = model.pos_enc(model.embedding(src) * math.sqrt(model.d_model))
        for layer in model.encoder_layers:
            enc = layer(enc)
        
        beams = [(torch.tensor([[vocab['<bos>']]], device=device), 0.0)]
        
        for _ in range(max_len):
            new_beams = []
            for seq, score in beams:
                if seq[0, -1].item() == vocab['<eos>']:
                    new_beams.append((seq, score))
                    continue
                
                tgt_mask = make_causal_mask(seq.size(1)).to(device)
                tgt_emb = model.pos_enc(model.embedding(seq) * math.sqrt(model.d_model))
                
                for layer in model.decoder_layers:
                    tgt_emb = layer(tgt_emb, enc, tgt_mask=tgt_mask)
                
                logits = model.out(tgt_emb[:, -1, :])
                log_probs = torch.log_softmax(logits, dim=-1)
                top_probs, top_indices = log_probs.topk(beam_width)
                
                for i in range(beam_width):
                    next_token = top_indices[0, i].unsqueeze(0).unsqueeze(0)
                    next_score = score + top_probs[0, i].item()
                    next_seq = torch.cat([seq, next_token], dim=1)
                    new_beams.append((next_seq, next_score))
            
            beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]
            
            if all(seq[0, -1].item() == vocab['<eos>'] for seq, _ in beams):
                break
    
    return beams[0][0].squeeze(0).tolist()

In [59]:
!pip install sacrebleu rouge-score -q

In [60]:
import sacrebleu
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def evaluate(model, loader, max_samples=None, decode_method='greedy', beam_width=3):
    model.eval()
    refs, hyps = [], []
    total_loss = 0
    total_tokens = 0
    
    with torch.no_grad():
        for i, (src, tgt) in enumerate(loader):
            if max_samples is not None and i * BATCH_SIZE >= max_samples:
                break
            
            src, tgt = src.to(device), tgt.to(device)
            
            for b in range(src.size(0)):
                if decode_method == 'beam':
                    pred_ids = beam_search(model, src[b:b+1], beam_width=beam_width, max_len=50)
                else:
                    pred_ids = greedy_decode(model, src[b:b+1], max_len=50)
                
                ref_ids = tgt[b].cpu().tolist()
                
                pred_text = decode(pred_ids)
                ref_text = decode(ref_ids)
                
                hyps.append(pred_text)
                refs.append(ref_text)
            
            tgt_input = tgt[:, :-1]
            tgt_target = tgt[:, 1:]
            tgt_mask = make_causal_mask(tgt_input.size(1)).to(device)
            output = model(src, tgt_input, tgt_mask=tgt_mask)
            loss = criterion(output.reshape(-1, output.size(-1)), tgt_target.reshape(-1))
            
            non_pad = (tgt_target != vocab['<pad>']).sum().item()
            total_loss += loss.item() * non_pad
            total_tokens += non_pad
    
    bleu = sacrebleu.corpus_bleu(hyps, [refs])
    rouge_l = sum(scorer.score(r, h)['rougeL'].fmeasure for r, h in zip(refs, hyps)) / len(refs) * 100
    chrf = sacrebleu.corpus_chrf(hyps, [refs])
    ppl = math.exp(total_loss / total_tokens) if total_tokens > 0 else float('inf')
    
    return {'bleu': bleu.score, 'rouge_l': rouge_l, 'chrf': chrf.score, 'ppl': ppl, 'samples': list(zip(refs[:5], hyps[:5]))}

In [61]:
EPOCHS = 20
best_bleu = 0

for epoch in range(1, EPOCHS + 1):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    metrics = evaluate(model, val_loader, max_samples=500, decode_method='greedy')
    
    print(f"Epoch {epoch}/{EPOCHS}")
    print(f"  Train Loss: {train_loss:.4f}")
    print(f"  Val BLEU: {metrics['bleu']:.2f}, ROUGE-L: {metrics['rouge_l']:.2f}, chrF: {metrics['chrf']:.2f}, PPL: {metrics['ppl']:.2f}")
    
    if metrics['bleu'] > best_bleu:
        best_bleu = metrics['bleu']
        torch.save({
            'model': model.state_dict(),
            'vocab': vocab,
            'inv_vocab': inv_vocab,
            'metrics': metrics
        }, 'best_model.pt')
        print(f"  *** Saved best model (BLEU: {best_bleu:.2f}) ***")
    
    if epoch % 2 == 0:
        print("\n  Sample predictions:")
        for ref, hyp in metrics['samples'][:3]:
            print(f"    Ref: {ref}")
            print(f"    Hyp: {hyp}")
            print()

Epoch 1/20
  Train Loss: 4.9213
  Val BLEU: 1.21, ROUGE-L: 12.93, chrF: 12.43, PPL: 82.23
  *** Saved best model (BLEU: 1.21) ***
Epoch 2/20
  Train Loss: 4.2549
  Val BLEU: 1.46, ROUGE-L: 12.82, chrF: 11.56, PPL: 69.29
  *** Saved best model (BLEU: 1.46) ***

  Sample predictions:
    Ref: 10 and 13
    Hyp: i am not sure . i am so i am so i am so sorry .

    Ref: that 's sounds nice and romantic
    Hyp: i 'm sure you will be a lot of people .

    Ref: because girls find it more attractive . in my experience
    Hyp: i was a lot of people who did n't get it .

Epoch 3/20
  Train Loss: 4.0822
  Val BLEU: 1.65, ROUGE-L: 14.41, chrF: 12.05, PPL: 63.58
  *** Saved best model (BLEU: 1.65) ***
Epoch 4/20
  Train Loss: 3.9645
  Val BLEU: 1.70, ROUGE-L: 14.55, chrF: 11.80, PPL: 59.42
  *** Saved best model (BLEU: 1.70) ***

  Sample predictions:
    Ref: 10 and 13
    Hyp: i am so happy for you .

    Ref: that 's sounds nice and romantic
    Hyp: i 'm sure it was a good thing .

    Ref: 

In [62]:
checkpoint = torch.load('/kaggle/working/best_model.pt')
model.load_state_dict(checkpoint['model'])

print("=== GREEDY DECODING ===")
test_metrics_greedy = evaluate(model, test_loader, max_samples=1000, decode_method='greedy')
print(f"BLEU: {test_metrics_greedy['bleu']:.2f}")
print(f"ROUGE-L: {test_metrics_greedy['rouge_l']:.2f}")
print(f"chrF: {test_metrics_greedy['chrf']:.2f}")
print(f"Perplexity: {test_metrics_greedy['ppl']:.2f}")

print("\n=== BEAM SEARCH (width=3) ===")
test_metrics_beam = evaluate(model, test_loader, max_samples=None, decode_method='beam', beam_width=3)
print(f"BLEU: {test_metrics_beam['bleu']:.2f}")
print(f"ROUGE-L: {test_metrics_beam['rouge_l']:.2f}")
print(f"chrF: {test_metrics_beam['chrf']:.2f}")
print(f"Perplexity: {test_metrics_beam['ppl']:.2f}")

print("\n=== GREEDY Sample Outputs ===")
for i, (ref, hyp) in enumerate(test_metrics_greedy['samples'], 1):
    print(f"\nExample {i}:")
    print(f"Reference: {ref}")
    print(f"Generated: {hyp}")

print("\n=== BEAM SEARCH Sample Outputs ===")
for i, (ref, hyp) in enumerate(test_metrics_beam['samples'], 1):
    print(f"\nExample {i}:")
    print(f"Reference: {ref}")
    print(f"Generated: {hyp}")

=== GREEDY DECODING ===
BLEU: 2.48
ROUGE-L: 15.01
chrF: 13.47
Perplexity: 49.25

=== BEAM SEARCH (width=3) ===
BLEU: 2.05
ROUGE-L: 14.44
chrF: 11.14
Perplexity: 49.07

=== GREEDY Sample Outputs ===

Example 1:
Reference: well have you been living a happy life ? she would probably be happy about that .
Generated: i 'm sorry to hear that . i hope you have a good memories of her .

Example 2:
Reference: ah man ! you should n't feel bad about tripping . why do you feel bad about it ?
Generated: oh no ! did you hurt yourself ?

Example 3:
Reference: it is , and yeah until they are over the age of you go go every other month crazy poor babies ! !
Generated: she was a very good idea .

Example 4:
Reference: awww ... still thats upsetting
Generated: i 'm sorry to hear that . what 's the job is it ?

Example 5:
Reference: yes , but it is crazy expensive to go .
Generated: i 'm going to buy a lot of money .

=== BEAM SEARCH Sample Outputs ===

Example 1:
Reference: well have you been living a ha

In [63]:
!pip install streamlit -q

In [64]:
app_code = '''import streamlit as st
import torch
import torch.nn as nn
import math
from nltk.tokenize import word_tokenize

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer(\'pe\', pe.unsqueeze(0))
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
    def forward(self, q, k, v, mask=None):
        B = q.size(0)
        q = self.q_linear(q).view(B, -1, self.num_heads, self.d_k).transpose(1, 2)
        k = self.k_linear(k).view(B, -1, self.num_heads, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(B, -1, self.num_heads, self.d_k).transpose(1, 2)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float(\'-1e9\'))
        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        out = torch.matmul(attn, v)
        out = out.transpose(1, 2).contiguous().view(B, -1, self.num_heads * self.d_k)
        return self.out(out), attn

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)
    def forward(self, x):
        return self.linear2(self.dropout(torch.relu(self.linear1(x))))

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dropout):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, mask=None):
        attn_out, _ = self.attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_out))
        x = self.norm2(x + self.dropout(self.ff(x)))
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, enc_out, src_mask=None, tgt_mask=None):
        self_attn_out, self_attn_weights = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attn_out))
        cross_attn_out, cross_attn_weights = self.cross_attn(x, enc_out, enc_out, src_mask)
        x = self.norm2(x + self.dropout(cross_attn_out))
        x = self.norm3(x + self.dropout(self.ff(x)))
        return x, cross_attn_weights

class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model=512, num_heads=2, num_layers=2, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, dropout) for _ in range(num_layers)])
        self.out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model
    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        src = self.pos_enc(self.embedding(src) * math.sqrt(self.d_model))
        tgt = self.pos_enc(self.embedding(tgt) * math.sqrt(self.d_model))
        for layer in self.encoder_layers:
            src = layer(src, src_mask)
        for layer in self.decoder_layers:
            tgt, _ = layer(tgt, src, src_mask, tgt_mask)
        return self.out(tgt)

@st.cache_resource
def load_model():
    checkpoint = torch.load(\'best_model.pt\', map_location=\'cpu\')
    vocab = checkpoint[\'vocab\']
    inv_vocab = checkpoint[\'inv_vocab\']
    model = Transformer(len(vocab))
    model.load_state_dict(checkpoint[\'model\'])
    model.eval()
    return model, vocab, inv_vocab

def encode(text, vocab):
    tokens = [\'<bos>\'] + word_tokenize(text.lower()) + [\'<eos>\']
    return [vocab.get(t, vocab[\'<unk>\']) for t in tokens]

def decode(ids, inv_vocab):
    words = [inv_vocab.get(i, \'<unk>\') for i in ids]
    words = [w for w in words if w not in [\'<pad>\', \'<bos>\', \'<eos>\', \'<unk>\', \'<sep>\'] and not w.startswith(\'<emotion_\')]
    return \' \'.join(words)

def make_causal_mask(size):
    return torch.tril(torch.ones(size, size)).unsqueeze(0).unsqueeze(0)

def generate(model, src_text, vocab, inv_vocab, method=\'greedy\', beam_width=3, max_len=50):
    src_ids = torch.tensor([encode(src_text, vocab)])
    
    with torch.no_grad():
        enc = model.pos_enc(model.embedding(src_ids) * math.sqrt(model.d_model))
        for layer in model.encoder_layers:
            enc = layer(enc)
        
        if method == \'greedy\':
            ys = torch.tensor([[vocab[\'<bos>\']]])
            for _ in range(max_len):
                tgt_mask = make_causal_mask(ys.size(1))
                tgt_emb = model.pos_enc(model.embedding(ys) * math.sqrt(model.d_model))
                for layer in model.decoder_layers:
                    tgt_emb, _ = layer(tgt_emb, enc, tgt_mask=tgt_mask)
                logits = model.out(tgt_emb[:, -1, :])
                next_token = logits.argmax(dim=-1).unsqueeze(0)
                ys = torch.cat([ys, next_token], dim=1)
                if next_token.item() == vocab[\'<eos>\']:
                    break
            return decode(ys.squeeze(0).tolist(), inv_vocab)
        
        else:
            beams = [(torch.tensor([[vocab[\'<bos>\']]]), 0.0)]
            for _ in range(max_len):
                new_beams = []
                for seq, score in beams:
                    if seq[0, -1].item() == vocab[\'<eos>\']:
                        new_beams.append((seq, score))
                        continue
                    tgt_mask = make_causal_mask(seq.size(1))
                    tgt_emb = model.pos_enc(model.embedding(seq) * math.sqrt(model.d_model))
                    for layer in model.decoder_layers:
                        tgt_emb, _ = layer(tgt_emb, enc, tgt_mask=tgt_mask)
                    logits = model.out(tgt_emb[:, -1, :])
                    log_probs = torch.log_softmax(logits, dim=-1)
                    top_probs, top_indices = log_probs.topk(beam_width)
                    for i in range(beam_width):
                        next_token = top_indices[0, i].unsqueeze(0).unsqueeze(0)
                        next_score = score + top_probs[0, i].item()
                        next_seq = torch.cat([seq, next_token], dim=1)
                        new_beams.append((next_seq, next_score))
                beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]
                if all(seq[0, -1].item() == vocab[\'<eos>\'] for seq, _ in beams):
                    break
            return decode(beams[0][0].squeeze(0).tolist(), inv_vocab)

st.title("🤖 Empathetic Chatbot")
st.markdown("Transformer-based empathetic conversational agent")

model, vocab, inv_vocab = load_model()

if \'history\' not in st.session_state:
    st.session_state.history = []

with st.sidebar:
    st.header("Settings")
    emotion = st.selectbox("Emotion", [\'afraid\', \'angry\', \'annoyed\', \'anxious\', \'sad\', \'happy\', \'excited\', \'grateful\', \'proud\', \'surprised\'])
    situation = st.text_area("Situation (optional)", "")
    method = st.radio("Decoding", [\'greedy\', \'beam\'])
    if st.button("Clear History"):
        st.session_state.history = []

user_input = st.chat_input("Type your message...")

if user_input:
    sit = situation if situation else "general conversation"
    input_text = f"Emotion: {emotion} | Situation: {sit} | Customer: {user_input} Agent:"
    
    with st.spinner("Thinking..."):
        response = generate(model, input_text, vocab, inv_vocab, method=method)
    
    st.session_state.history.append((\'user\', user_input))
    st.session_state.history.append((\'bot\', response))

for role, msg in st.session_state.history:
    with st.chat_message(role):
        st.write(msg)
'''

with open('app.py', 'w', encoding='utf-8') as f:
    f.write(app_code)

print("Streamlit app saved to app.py")
print("Run with: streamlit run app.py")

Streamlit app saved to app.py
Run with: streamlit run app.py


In [65]:
report = f'''# Empathetic Chatbot Evaluation Report

## Model Architecture
- Transformer encoder-decoder (built from scratch)
- Embedding dimension: 512
- Attention heads: 2
- Encoder/Decoder layers: 2 each
- Dropout: 0.1
- Vocabulary size: {len(vocab)}
- Total parameters: {sum(p.numel() for p in model.parameters()):,}

## Dataset Split
- Train: {len(train_df)} samples (80%)
- Validation: {len(val_df)} samples (10%)
- Test: {len(test_df)} samples (10%)

## Training Configuration
- Optimizer: Adam (lr=1e-4, betas=(0.9, 0.98))
- Batch size: 64
- Loss: CrossEntropyLoss (ignore padding)
- Teacher forcing: Yes
- Epochs: {EPOCHS}
- Best model selection: Validation BLEU

## Test Set Results (Full Dataset)

### Greedy Decoding
- BLEU: {test_metrics_greedy["bleu"]:.2f}
- ROUGE-L: {test_metrics_greedy["rouge_l"]:.2f}
- chrF: {test_metrics_greedy["chrf"]:.2f}
- Perplexity: {test_metrics_greedy["ppl"]:.2f}

### Beam Search (width=3)
- BLEU: {test_metrics_beam["bleu"]:.2f}
- ROUGE-L: {test_metrics_beam["rouge_l"]:.2f}
- chrF: {test_metrics_beam["chrf"]:.2f}
- Perplexity: {test_metrics_beam["ppl"]:.2f}

## Qualitative Examples (Greedy)
'''

for i, (ref, hyp) in enumerate(test_metrics_greedy['samples'], 1):
    report += f'''\n### Example {i}
**Reference:** {ref}
**Generated:** {hyp}
'''

report += '\n## Qualitative Examples (Beam Search)\n'

for i, (ref, hyp) in enumerate(test_metrics_beam['samples'], 1):
    report += f'''\n### Example {i}
**Reference:** {ref}
**Generated:** {hyp}
'''

report += '''\n## Implementation Details
- Multi-head attention with residual connections and layer normalization
- Sinusoidal positional encoding
- Causal masking in decoder self-attention
- Greedy and beam search decoding (beam width=3)
- Special tokens: <pad>, <bos>, <eos>, <unk>, <sep>, <emotion_X>

## Deployment
- Framework: Streamlit
- Features: Interactive chat, emotion selection, conversation history, decoding method selection
- Run: `streamlit run app.py`
'''

with open('/kaggle/working/EVALUATION_REPORT.md', 'w', encoding='utf-8') as f:
    f.write(report)

print("Evaluation report saved to /kaggle/working/EVALUATION_REPORT.md")

Evaluation report saved to /kaggle/working/EVALUATION_REPORT.md


In [66]:
readme = '''# Empathetic Conversational Chatbot

Transformer encoder-decoder model for empathetic dialogue generation.

## Setup
```bash
pip install torch pandas numpy nltk scikit-learn sacrebleu rouge-score streamlit
```

## Training
Run all cells in `project2_complete.ipynb`

## Inference
```bash
streamlit run app.py
```

## Files
- `project2_complete.ipynb`: Complete training pipeline
- `app.py`: Streamlit chatbot interface
- `best_model.pt`: Trained model checkpoint
- `EVALUATION_REPORT.md`: Metrics and analysis

## Model Architecture
- Transformer encoder-decoder (from scratch)
- 512-dim embeddings, 2 heads, 2 layers
- Positional encoding, multi-head attention, residual connections
- Teacher forcing during training
- Greedy and beam search decoding

## Dataset
Empathetic Dialogues (Kaggle)
- Input: Emotion + Situation + Customer utterance
- Output: Agent empathetic reply
- Split: 80/10/10 train/val/test

## Results
See `EVALUATION_REPORT.md` for detailed metrics and examples.
'''

with open('README.md', 'w', encoding='utf-8') as f:
    f.write(readme)

print("README.md created")

README.md created
