In [3]:
import numpy as np
import argparse
from shutil import copyfile
from create_data import createData
from create_data import divideData

In [6]:
import json

def load_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_json(data, file_path):
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

def add_dialogue_acts(dialogue_acts, data_files):
    for file_path in data_files:
        data = load_json(file_path)
        
        for dialogue in data:
            dialogue_idx = dialogue['dialogue_idx'].replace('.json', '')

            if dialogue_idx in dialogue_acts:
                acts = dialogue_acts[dialogue_idx]

                for turn in dialogue['dialogue']:
                    turn_idx = str(turn['turn_idx'])
                    if turn_idx in acts:
                        turn['dialogue_act'] = acts[turn_idx]

        save_json(data, file_path)


In [7]:
dialogue_acts_path = '/kaggle/input/multiwoz2-4/dialogue_acts.json'
data_files = ['/kaggle/working/train_dials.json', '/kaggle/working/dev_dials.json', '/kaggle/working/test_dials.json']

dialogue_acts = load_json(dialogue_acts_path)

add_dialogue_acts(dialogue_acts, data_files)

print("Dialogue acts have been added to all data files.")

Dialogue acts have been added to all data files.


# Joint Act Classification and Slot Filling

In [8]:
import torch
from torch import nn
from transformers import BertPreTrainedModel, BertModel
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
import numpy as np
from tqdm import tqdm
import json


from transformers import BertTokenizer, BertForTokenClassification, AdamW
from sklearn.metrics import classification_report


In [None]:
class JointDialogueDataset(Dataset):
    def __init__(self, file_path, tokenizer, label2id=None, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label2id = label2id
        
        # For dialogue acts
        self.act_binarizer = MultiLabelBinarizer()
        
        if self.label2id is None:
            unique_labels = {'O'}
            with open(file_path, 'r') as f:
                data = json.load(f)
                for dialogue in data:
                    for turn in dialogue['dialogue']:
                        labels = turn.get('turn_label', [])
                        for slot_name, _ in labels:
                            unique_labels.add(f'B-{slot_name}')
                            unique_labels.add(f'I-{slot_name}')
            
            self.label2id = {label: idx for idx, label in enumerate(sorted(unique_labels))}
        
        self.id2label = {idx: label for label, idx in self.label2id.items()}
        self.data = self.load_and_process_data(file_path)
        
    def load_and_process_data(self, file_path):
        with open(file_path, 'r') as f:
            data = json.load(f)
        
        # First pass: collect all dialogue acts for binarizer fitting
        all_acts = []
        for dialogue in data:
            for turn in dialogue['dialogue']:
                acts = turn.get('dialogue_act', None)
                if isinstance(acts, dict):
                    all_acts.append(list(acts.keys()))
                elif isinstance(acts, str):
                    all_acts.append([acts])
                else:
                    all_acts.append([])
    
        self.act_binarizer.fit(all_acts)
        
        # Second pass: process the data
        processed_data = []
        for dialogue_idx, dialogue in enumerate(data):
            for turn in dialogue['dialogue']:
                text = turn['transcript']
                slot_labels = turn.get('turn_label', [])
                
                
                acts = turn.get('dialogue_act', None) # Process dialogue acts
                if isinstance(acts, dict):
                    act_labels = list(acts.keys())
                elif isinstance(acts, str):
                    act_labels = [acts]
                else:
                    act_labels = []
                act_binary = self.act_binarizer.transform([act_labels])[0]
                
                tokens = self.tokenizer.tokenize(text)
                
                token_labels = ['O'] * len(tokens)
                
                
                for slot_name, slot_value in slot_labels: # Map slots to BIO tags
                    slot_tokens = self.tokenizer.tokenize(slot_value)
                    
                    # Find slot tokens in text
                    for i in range(len(tokens)):
                        if tokens[i:i+len(slot_tokens)] == slot_tokens:
                            token_labels[i] = f'B-{slot_name}'
                            for j in range(1, len(slot_tokens)):
                                if i+j < len(token_labels):
                                    token_labels[i+j] = f'I-{slot_name}'
                
                encoding = self.tokenizer(
                    text,
                    padding='max_length',
                    truncation=True,
                    max_length=self.max_len,
                    return_tensors='pt'
                )
                
                label_ids = ['O']  # [CLS]
                label_ids.extend(token_labels)
                label_ids.append('O')  # [SEP]
                
                # Pad to max length
                if len(label_ids) < self.max_len:
                    label_ids.extend(['O'] * (self.max_len - len(label_ids)))
                else:
                    label_ids = label_ids[:self.max_len]
                
                # Convert string labels to IDs
                label_ids = [self.label2id[label] for label in label_ids]
                labels_tensor = torch.tensor(label_ids)
                
                processed_data.append({
                    'input_ids': encoding['input_ids'][0],
                    'attention_mask': encoding['attention_mask'][0],
                    'act_labels': torch.tensor(act_binary, dtype=torch.float),
                    'slot_labels': labels_tensor
                })
        
        return processed_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [None]:
class JointBertModel(BertPreTrainedModel):
    def __init__(self, config, num_act_labels, num_slot_labels):
        super().__init__(config)
        self.num_act_labels = num_act_labels
        self.num_slot_labels = num_slot_labels
        
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        
        self.act_classifier = nn.Linear(config.hidden_size, num_act_labels)
        self.slot_classifier = nn.Linear(config.hidden_size, num_slot_labels)
        
        self.act_loss = nn.BCEWithLogitsLoss()
        self.slot_loss = nn.CrossEntropyLoss()
        
        self.init_weights()

    def forward(self, input_ids, attention_mask, act_labels=None, slot_labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        pooled_output = outputs[1]
    
        # Dialogue Act predict
        pooled_output = self.dropout(pooled_output)
        act_logits = self.act_classifier(pooled_output)
        
        # Slot Filling predict
        sequence_output = self.dropout(sequence_output)
        slot_logits = self.slot_classifier(sequence_output)
    
        outputs = {'act_logits': act_logits, 'slot_logits': slot_logits}
        
        if act_labels is not None and slot_labels is not None:
            act_loss = self.act_loss(act_logits, act_labels).mean()  #  scalar
            slot_loss = self.slot_loss(
                slot_logits.view(-1, self.num_slot_labels), 
                slot_labels.view(-1)
            ).mean()  
    
            # Weight total loss
            total_loss = 0.75 * act_loss + 0.25 * slot_loss
            outputs['loss'] = total_loss
    
        return outputs

In [None]:
def evaluate_joint_model(model, eval_dataloader, device, label2id, act_binarizer, threshold=0.5):
    model.eval()
    act_true = []
    act_pred = []
    slot_true = []
    slot_pred = []
    
    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            act_labels = batch['act_labels'].to(device)
            slot_labels = batch['slot_labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            
            # dialogue act predictions
            act_logits = torch.sigmoid(outputs['act_logits'])
            act_predictions = (act_logits >= threshold).cpu().numpy()
            act_true.extend(act_labels.cpu().numpy())
            act_pred.extend(act_predictions)
            
            # slot predictions
            slot_logits = outputs['slot_logits']
            slot_predictions = torch.argmax(slot_logits, dim=2)

            active_accuracy = attention_mask.view(-1) == 1
            slot_true.extend(slot_labels.view(-1)[active_accuracy].cpu().numpy())
            slot_pred.extend(slot_predictions.view(-1)[active_accuracy].cpu().numpy())

    results = {
        'act_f1_micro': f1_score(act_true, act_pred, average='micro'),
        'act_f1_macro': f1_score(act_true, act_pred, average='macro'),
        'slot_f1_micro': f1_score(slot_true, slot_pred, average='micro'),
        'slot_f1_macro': f1_score(slot_true, slot_pred, average='macro')
    }
    
    return results

In [9]:
def train_joint_model(train_dataloader, eval_dataloader, model, device,  label2id, act_binarizer, num_epochs=20):
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    best_f1 = 0
    base_model = model.module if hasattr(model, 'module') else model
    
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")

        model.train()
        total_loss = 0
        progress_bar = tqdm(train_dataloader, desc="Training")
        
        for batch in progress_bar:
            optimizer.zero_grad()
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            act_labels = batch['act_labels'].to(device)
            slot_labels = batch['slot_labels'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                act_labels=act_labels,
                slot_labels=slot_labels
            )
            
            loss = outputs['loss']
            if loss.dim() > 0:  # Reduce multi-element tensor to a scalar
                loss = loss.mean()
            
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            
            progress_bar.set_postfix({'loss': f"{loss.item():.4f}"})

        eval_results = evaluate_joint_model(
            model, eval_dataloader, device, label2id, act_binarizer)
        
        print("\nEvaluation Results:")
        for metric, value in eval_results.items():
            print(f"{metric}: {value:.4f}")
        
        # Saving
        avg_f1 = sum(eval_results.values()) / len(eval_results)
        if avg_f1 > best_f1:
            best_f1 = avg_f1
            base_model.save_pretrained('./best_joint_model')
            print(f"New best model saved with average F1: {best_f1:.4f}")

# Training

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [12]:
train_dataset = JointDialogueDataset('train_dials.json', tokenizer)
eval_dataset = JointDialogueDataset('dev_dials.json', tokenizer, label2id=train_dataset.label2id)


In [13]:
train_dataset.label2id

{'B-attraction-area': 0,
 'B-attraction-name': 1,
 'B-attraction-type': 2,
 'B-hospital-department': 3,
 'B-hotel-area': 4,
 'B-hotel-book day': 5,
 'B-hotel-book people': 6,
 'B-hotel-book stay': 7,
 'B-hotel-internet': 8,
 'B-hotel-name': 9,
 'B-hotel-parking': 10,
 'B-hotel-pricerange': 11,
 'B-hotel-stars': 12,
 'B-hotel-type': 13,
 'B-restaurant-area': 14,
 'B-restaurant-book day': 15,
 'B-restaurant-book people': 16,
 'B-restaurant-book time': 17,
 'B-restaurant-food': 18,
 'B-restaurant-name': 19,
 'B-restaurant-pricerange': 20,
 'B-taxi-arriveby': 21,
 'B-taxi-departure': 22,
 'B-taxi-destination': 23,
 'B-taxi-leaveat': 24,
 'B-train-arriveby': 25,
 'B-train-book people': 26,
 'B-train-day': 27,
 'B-train-departure': 28,
 'B-train-destination': 29,
 'B-train-leaveat': 30,
 'I-attraction-area': 31,
 'I-attraction-name': 32,
 'I-attraction-type': 33,
 'I-hospital-department': 34,
 'I-hotel-area': 35,
 'I-hotel-book day': 36,
 'I-hotel-book people': 37,
 'I-hotel-book stay': 38,


In [14]:
train_dataset.act_binarizer.classes_

array(['Attraction-Inform', 'Attraction-NoOffer', 'Attraction-Recommend',
       'Attraction-Request', 'Attraction-Select', 'Booking-Book',
       'Booking-Inform', 'Booking-NoBook', 'Booking-Request',
       'Hotel-Inform', 'Hotel-NoOffer', 'Hotel-Recommend',
       'Hotel-Request', 'Hotel-Select', 'No Annotation',
       'Restaurant-Inform', 'Restaurant-NoOffer', 'Restaurant-Recommend',
       'Restaurant-Request', 'Restaurant-Select', 'Taxi-Inform',
       'Taxi-Request', 'Train-Inform', 'Train-NoOffer', 'Train-OfferBook',
       'Train-OfferBooked', 'Train-Request', 'Train-Select',
       'general-bye', 'general-greet', 'general-reqmore',
       'general-welcome'], dtype=object)

In [15]:
train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=256)

In [16]:
model = JointBertModel.from_pretrained(
    'bert-base-uncased',
    num_act_labels=len(train_dataset.act_binarizer.classes_),
    num_slot_labels=len(train_dataset.label2id)
).to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of JointBertModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['act_classifier.bias', 'act_classifier.weight', 'slot_classifier.bias', 'slot_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

model = model.to(device)

Using 2 GPUs!


In [18]:
train_joint_model(
    train_dataloader, 
    eval_dataloader, 
    model, 
    device,
    train_dataset.label2id,
    train_dataset.act_binarizer
)


Epoch 1/20


Training: 100%|██████████| 222/222 [10:37<00:00,  2.87s/it, loss=0.1326]
Evaluating: 100%|██████████| 29/29 [00:29<00:00,  1.00s/it]



Evaluation Results:
act_f1_micro: 0.0000
act_f1_macro: 0.0000
slot_f1_micro: 0.9134
slot_f1_macro: 0.0209
New best model saved with average F1: 0.2336

Epoch 2/20


Training: 100%|██████████| 222/222 [10:43<00:00,  2.90s/it, loss=0.1348]
Evaluating: 100%|██████████| 29/29 [00:28<00:00,  1.00it/s]



Evaluation Results:
act_f1_micro: 0.0000
act_f1_macro: 0.0000
slot_f1_micro: 0.9374
slot_f1_macro: 0.2024
New best model saved with average F1: 0.2849

Epoch 3/20


Training: 100%|██████████| 222/222 [10:44<00:00,  2.90s/it, loss=0.1042]
Evaluating: 100%|██████████| 29/29 [00:29<00:00,  1.00s/it]



Evaluation Results:
act_f1_micro: 0.1127
act_f1_macro: 0.0172
slot_f1_micro: 0.9557
slot_f1_macro: 0.4136
New best model saved with average F1: 0.3748

Epoch 4/20


Training: 100%|██████████| 222/222 [10:44<00:00,  2.90s/it, loss=0.0962]
Evaluating: 100%|██████████| 29/29 [00:29<00:00,  1.00s/it]



Evaluation Results:
act_f1_micro: 0.2885
act_f1_macro: 0.0713
slot_f1_micro: 0.9654
slot_f1_macro: 0.5226
New best model saved with average F1: 0.4619

Epoch 5/20


Training: 100%|██████████| 222/222 [10:45<00:00,  2.91s/it, loss=0.0914]
Evaluating: 100%|██████████| 29/29 [00:28<00:00,  1.00it/s]



Evaluation Results:
act_f1_micro: 0.3724
act_f1_macro: 0.1081
slot_f1_micro: 0.9726
slot_f1_macro: 0.6277
New best model saved with average F1: 0.5202

Epoch 6/20


Training: 100%|██████████| 222/222 [10:43<00:00,  2.90s/it, loss=0.0847]
Evaluating: 100%|██████████| 29/29 [00:28<00:00,  1.00it/s]



Evaluation Results:
act_f1_micro: 0.4284
act_f1_macro: 0.1603
slot_f1_micro: 0.9778
slot_f1_macro: 0.7005
New best model saved with average F1: 0.5668

Epoch 7/20


Training: 100%|██████████| 222/222 [10:42<00:00,  2.89s/it, loss=0.0852]
Evaluating: 100%|██████████| 29/29 [00:29<00:00,  1.00s/it]



Evaluation Results:
act_f1_micro: 0.4938
act_f1_macro: 0.2356
slot_f1_micro: 0.9794
slot_f1_macro: 0.7329
New best model saved with average F1: 0.6104

Epoch 8/20


Training: 100%|██████████| 222/222 [10:43<00:00,  2.90s/it, loss=0.0744]
Evaluating: 100%|██████████| 29/29 [00:29<00:00,  1.00s/it]



Evaluation Results:
act_f1_micro: 0.5220
act_f1_macro: 0.2881
slot_f1_micro: 0.9808
slot_f1_macro: 0.7613
New best model saved with average F1: 0.6380

Epoch 9/20


Training: 100%|██████████| 222/222 [10:42<00:00,  2.90s/it, loss=0.0756]
Evaluating: 100%|██████████| 29/29 [00:29<00:00,  1.00s/it]



Evaluation Results:
act_f1_micro: 0.5199
act_f1_macro: 0.2858
slot_f1_micro: 0.9814
slot_f1_macro: 0.7838
New best model saved with average F1: 0.6427

Epoch 10/20


Training: 100%|██████████| 222/222 [10:44<00:00,  2.90s/it, loss=0.0646]
Evaluating: 100%|██████████| 29/29 [00:29<00:00,  1.00s/it]



Evaluation Results:
act_f1_micro: 0.5261
act_f1_macro: 0.2923
slot_f1_micro: 0.9824
slot_f1_macro: 0.8136
New best model saved with average F1: 0.6536

Epoch 11/20


Training: 100%|██████████| 222/222 [10:42<00:00,  2.89s/it, loss=0.0565]
Evaluating: 100%|██████████| 29/29 [00:29<00:00,  1.00s/it]



Evaluation Results:
act_f1_micro: 0.5368
act_f1_macro: 0.3204
slot_f1_micro: 0.9825
slot_f1_macro: 0.8192
New best model saved with average F1: 0.6647

Epoch 12/20


Training: 100%|██████████| 222/222 [10:42<00:00,  2.90s/it, loss=0.0645]
Evaluating: 100%|██████████| 29/29 [00:29<00:00,  1.00s/it]



Evaluation Results:
act_f1_micro: 0.5408
act_f1_macro: 0.3352
slot_f1_micro: 0.9834
slot_f1_macro: 0.8258
New best model saved with average F1: 0.6713

Epoch 13/20


Training: 100%|██████████| 222/222 [10:42<00:00,  2.90s/it, loss=0.0580]
Evaluating: 100%|██████████| 29/29 [00:28<00:00,  1.00it/s]



Evaluation Results:
act_f1_micro: 0.5472
act_f1_macro: 0.3520
slot_f1_micro: 0.9834
slot_f1_macro: 0.8335
New best model saved with average F1: 0.6790

Epoch 14/20


Training: 100%|██████████| 222/222 [10:43<00:00,  2.90s/it, loss=0.0613]
Evaluating: 100%|██████████| 29/29 [00:28<00:00,  1.00it/s]



Evaluation Results:
act_f1_micro: 0.5421
act_f1_macro: 0.3495
slot_f1_micro: 0.9840
slot_f1_macro: 0.8424
New best model saved with average F1: 0.6795

Epoch 15/20


Training: 100%|██████████| 222/222 [10:41<00:00,  2.89s/it, loss=0.0519]
Evaluating: 100%|██████████| 29/29 [00:28<00:00,  1.00it/s]



Evaluation Results:
act_f1_micro: 0.5452
act_f1_macro: 0.3628
slot_f1_micro: 0.9836
slot_f1_macro: 0.8201

Epoch 16/20


Training: 100%|██████████| 222/222 [10:43<00:00,  2.90s/it, loss=0.0531]
Evaluating: 100%|██████████| 29/29 [00:29<00:00,  1.00s/it]



Evaluation Results:
act_f1_micro: 0.5542
act_f1_macro: 0.3617
slot_f1_micro: 0.9839
slot_f1_macro: 0.8254
New best model saved with average F1: 0.6813

Epoch 17/20


Training: 100%|██████████| 222/222 [10:42<00:00,  2.89s/it, loss=0.0535]
Evaluating: 100%|██████████| 29/29 [00:28<00:00,  1.00it/s]



Evaluation Results:
act_f1_micro: 0.5466
act_f1_macro: 0.3667
slot_f1_micro: 0.9844
slot_f1_macro: 0.8315
New best model saved with average F1: 0.6823

Epoch 18/20


Training: 100%|██████████| 222/222 [10:43<00:00,  2.90s/it, loss=0.0569]
Evaluating: 100%|██████████| 29/29 [00:29<00:00,  1.00s/it]



Evaluation Results:
act_f1_micro: 0.5513
act_f1_macro: 0.3674
slot_f1_micro: 0.9841
slot_f1_macro: 0.8306
New best model saved with average F1: 0.6833

Epoch 19/20


Training: 100%|██████████| 222/222 [10:45<00:00,  2.91s/it, loss=0.0478]
Evaluating: 100%|██████████| 29/29 [00:29<00:00,  1.00s/it]



Evaluation Results:
act_f1_micro: 0.5483
act_f1_macro: 0.3748
slot_f1_micro: 0.9843
slot_f1_macro: 0.8318
New best model saved with average F1: 0.6848

Epoch 20/20


Training: 100%|██████████| 222/222 [10:43<00:00,  2.90s/it, loss=0.0561]
Evaluating: 100%|██████████| 29/29 [00:29<00:00,  1.00s/it]



Evaluation Results:
act_f1_micro: 0.5588
act_f1_macro: 0.3847
slot_f1_micro: 0.9846
slot_f1_macro: 0.8334
New best model saved with average F1: 0.6904


# Evaluating

In [10]:
import torch
from transformers import BertTokenizer
from sklearn.metrics import classification_report, accuracy_score, f1_score
import random
import numpy as np

In [11]:
def load_and_evaluate_model(test_file, model_path, train_dataset, device):
    model = JointBertModel.from_pretrained(
        model_path,
        num_act_labels=len(train_dataset.act_binarizer.classes_),
        num_slot_labels=len(train_dataset.label2id)
    ).to(device)
    model.eval()

    test_dataset = JointDialogueDataset(
        test_file, 
        train_dataset.tokenizer,
        label2id=train_dataset.label2id
    )
    test_dataloader = DataLoader(test_dataset, batch_size=512)

    act_true = []
    act_pred = []
    slot_true = []
    slot_pred = []
    attention_masks = []

    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            
            # Act predict
            act_logits = torch.sigmoid(outputs['act_logits'])
            act_predictions = (act_logits >= 0.5).cpu().numpy()
            act_true.extend(batch['act_labels'].numpy())
            act_pred.extend(act_predictions)
            
            # Slot predict
            slot_logits = outputs['slot_logits']
            slot_predictions = torch.argmax(slot_logits, dim=2).cpu().numpy()
            slot_true.extend(batch['slot_labels'].numpy())
            slot_pred.extend(slot_predictions)
            attention_masks.extend(batch['attention_mask'].numpy())

    # classification report
    # dialogue acts
    act_class_names = train_dataset.act_binarizer.classes_
    act_true_flat = np.array(act_true)
    act_pred_flat = np.array(act_pred)

    print("\n=== Dialogue Act Classification Results ===")
    print("\nClassification Report:")
    print(classification_report(act_true_flat, act_pred_flat, target_names=act_class_names))
    
    act_accuracy = accuracy_score(act_true_flat.flatten(), act_pred_flat.flatten())
    act_f1_micro = f1_score(act_true_flat, act_pred_flat, average='micro')
    act_f1_macro = f1_score(act_true_flat, act_pred_flat, average='macro')
    
    print(f"\nOverall Dialogue Act Metrics:")
    print(f"Accuracy: {act_accuracy:.4f}")
    print(f"Micro F1: {act_f1_micro:.4f}")
    print(f"Macro F1: {act_f1_macro:.4f}")

    # slot filling
    slot_true_flat = []
    slot_pred_flat = []
    id2label = {v: k for k, v in train_dataset.label2id.items()}
    
    for true, pred, mask in zip(slot_true, slot_pred, attention_masks):
        for t, p, m in zip(true, pred, mask):
            if m == 1:  # Only evaluate non-padded tokens
                slot_true_flat.append(t)
                slot_pred_flat.append(p)

    sorted_labels = [id2label[i] for i in range(len(id2label))]
    
    print("\n=== Slot Filling Results ===")
    print("\nClassification Report:")
    print(classification_report(
        slot_true_flat, 
        slot_pred_flat, 
        labels=range(len(sorted_labels)),  # Explicitly specify labels range
        target_names=sorted_labels
    ))
    
    slot_accuracy = accuracy_score(slot_true_flat, slot_pred_flat)
    slot_f1_micro = f1_score(slot_true_flat, slot_pred_flat, average='micro')
    slot_f1_macro = f1_score(slot_true_flat, slot_pred_flat, average='macro')
    
    print(f"\nOverall Slot Filling Metrics:")
    print(f"Accuracy: {slot_accuracy:.4f}")
    print(f"Micro F1: {slot_f1_micro:.4f}")
    print(f"Macro F1: {slot_f1_macro:.4f}")


    print("\n=== Sample Predictions ===")
    random_indices = random.sample(range(len(test_dataset)), 10)
    
    for idx in random_indices:
        sample = test_dataset[idx]
        input_ids = sample['input_ids'].unsqueeze(0).to(device)
        attention_mask = sample['attention_mask'].unsqueeze(0).to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        
        # dialogue act predict
        act_logits = torch.sigmoid(outputs['act_logits'])
        act_predictions = (act_logits >= 0.5).cpu().numpy()[0]
        predicted_acts = train_dataset.act_binarizer.classes_[act_predictions.astype(bool)]
        true_acts = train_dataset.act_binarizer.classes_[sample['act_labels'].numpy().astype(bool)]
        
        # slot predict
        slot_logits = outputs['slot_logits']
        slot_predictions = torch.argmax(slot_logits, dim=2).cpu().numpy()[0]
    
        text = train_dataset.tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
        
        id2label = {v: k for k, v in train_dataset.label2id.items()}
        
        # Add any missing labels to id2label with a placeholder. THIS IS A TEMPORARY FIX
        # For using labels not of training
        for i in range(63): 
            if i not in id2label:
                id2label[i] = f"UNK_{i}"  
        
        true_slots = [
            id2label.get(true.item(), f"UNK_{true.item()}")  #  avoid KeyError
            for i, (true, mask) in enumerate(zip(sample['slot_labels'], attention_mask.cpu().numpy()[0]))
            if mask == 1  
        ][1:-1]  
        
        pred_slots = [
            id2label.get(pred, f"UNK_{pred}")  #  avoid KeyError
            for i, (pred, mask) in enumerate(zip(slot_predictions, attention_mask.cpu().numpy()[0]))
            if mask == 1
        ][1:-1] 
        
        print(f"\nSample {idx + 1}:")
        print(f"Text: {text}")
        print(f"True Dialogue Acts: {list(true_acts)}")
        print(f"Predicted Dialogue Acts: {list(predicted_acts)}")
        print("True Slots:", true_slots)
        print("Predicted Slots:", pred_slots)
        print("-" * 80)

    return {
        'act_accuracy': act_accuracy,
        'act_f1_micro': act_f1_micro,
        'act_f1_macro': act_f1_macro,
        'slot_accuracy': slot_accuracy,
        'slot_f1_micro': slot_f1_micro,
        'slot_f1_macro': slot_f1_macro
    }



In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = JointDialogueDataset('train_dials.json', tokenizer)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [14]:
test_dataset = JointDialogueDataset('test_dials.json', tokenizer,label2id=train_dataset.label2id)


In [16]:
metrics = load_and_evaluate_model(
    test_file="test_dials.json",
    model_path="/kaggle/input/test/transformers/default/1",
    train_dataset=test_dataset,
    device=device
)

Evaluating: 100%|██████████| 15/15 [00:46<00:00,  3.11s/it]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



=== Dialogue Act Classification Results ===

Classification Report:
                      precision    recall  f1-score   support

   Attraction-Inform       0.63      0.31      0.41       833
  Attraction-NoOffer       0.78      0.68      0.72        56
Attraction-Recommend       0.42      0.09      0.15       147
  Attraction-Request       0.46      0.42      0.44       159
   Attraction-Select       0.00      0.00      0.00        55
        Booking-Book       0.47      0.23      0.31       526
      Booking-Inform       0.63      0.51      0.56       558
      Booking-NoBook       0.85      0.66      0.75       131
     Booking-Request       0.79      0.65      0.71       321
        Hotel-Inform       0.61      0.51      0.55       809
       Hotel-NoOffer       0.49      0.57      0.53        65
     Hotel-Recommend       1.00      0.01      0.01       138
       Hotel-Request       0.64      0.60      0.62       280
        Hotel-Select       0.00      0.00      0.00        79


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Overall Slot Filling Metrics:
Accuracy: 0.9850
Micro F1: 0.9850
Macro F1: 0.8499

=== Sample Predictions ===

Sample 3301:
Text: okay, thanks. i also need a taxi to commute between these 2 place - s. i want to leave the hotel by 11 : 45.
True Dialogue Acts: ['Attraction-Inform']
Predicted Dialogue Acts: []
True Slots: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-taxi-leaveat', 'I-taxi-leaveat', 'I-taxi-leaveat', 'O']
Predicted Slots: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-taxi-leaveat', 'I-taxi-leaveat', 'I-taxi-leaveat', 'O']
--------------------------------------------------------------------------------

Sample 560:
Text: i need help finding an expensive place to eat in the west please.
True Dialogue Acts: []
Predicted Dialogue Acts: []
True Slots: ['O', 'O', 'O', 'O', 'O', 'B-restaurant-pricerange', 

In [19]:
metrics = load_and_evaluate_model(
    test_file="test_dials.json",
    model_path="/kaggle/input/test/transformers/default/1",
    train_dataset=test_dataset,
    device=device
)

Evaluating: 100%|██████████| 15/15 [01:01<00:00,  4.12s/it]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



=== Dialogue Act Classification Results ===

Classification Report:
                      precision    recall  f1-score   support

   Attraction-Inform       0.63      0.31      0.41       833
  Attraction-NoOffer       0.78      0.68      0.72        56
Attraction-Recommend       0.42      0.09      0.15       147
  Attraction-Request       0.46      0.42      0.44       159
   Attraction-Select       0.00      0.00      0.00        55
        Booking-Book       0.47      0.23      0.31       526
      Booking-Inform       0.63      0.51      0.56       558
      Booking-NoBook       0.85      0.66      0.75       131
     Booking-Request       0.79      0.65      0.71       321
        Hotel-Inform       0.61      0.51      0.55       809
       Hotel-NoOffer       0.49      0.57      0.53        65
     Hotel-Recommend       1.00      0.01      0.01       138
       Hotel-Request       0.64      0.60      0.62       280
        Hotel-Select       0.00      0.00      0.00        79


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Overall Slot Filling Metrics:
Accuracy: 0.9850
Micro F1: 0.9850
Macro F1: 0.8499

=== Sample Predictions ===

Sample 6444:
Text: great, thanks. oh, and i do not actually need a booking for that train, so i think that s all i needed. sorry i keep confusing the matter.
True Dialogue Acts: ['Attraction-Inform']
Predicted Dialogue Acts: ['Train-Inform']
True Slots: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted Slots: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
--------------------------------------------------------------------------------

Sample 4900:
Text: yes, please. i would like a reservation for 2 at 18 : 00 on friday.
True Dialogue Acts: ['Booking-Inform', 'Restaurant-Inform']
Predicted Dialogue Acts: ['Booking-Inform', 'Re