### THIS IS THE IMPROVED VERSION OF DIALOGUEACT.IPYNB
It treats dialogue-act classification as a multi-label classification problem, since a single system turn can carry several dialogue acts.

In [1]:
import numpy as np
import argparse
from shutil import copyfile
from create_data import createData
from create_data import divideData

In [3]:
# Build the WOZ-style dialogues from the raw MultiWOZ 2.4 dump and split them.
print('Create WOZ-like dialogues. Get yourself a coffee, this might take a while.')
# createData/divideData take an argparse-style namespace, so one is built by hand
# here instead of parsing a command line.
args = argparse.Namespace(
    main_dir="/kaggle/input/multiwoz2-4",
    mwz_ver="2.4",
    target_path="/kaggle/working/"
)

delex_data = createData(args)
print('Divide dialogues...')
# NOTE(review): presumably writes train/dev/test_dials.json under target_path
# (later cells read those files from /kaggle/working) -- confirm in create_data.py.
divideData(delex_data,args)

Create WOZ-like dialogues. Get yourself a coffee, this might take a while.
Divide dialogues...
# of dialogues: Train 8420, Val 1000, Test 999


In [4]:
import json

def load_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def save_json(data, file_path):
    """Write ``data`` to ``file_path`` as UTF-8 JSON.

    The output is indented by four spaces and non-ASCII characters are written
    verbatim rather than as ``\\uXXXX`` escapes.
    """
    with open(file_path, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, ensure_ascii=False, indent=4)

def add_dialogue_acts(dialogue_acts, data_files):
    """Attach dialogue-act annotations to every turn of each dialogue file, in place.

    Args:
        dialogue_acts: Mapping from dialogue id (without the ``.json`` suffix) to a
            mapping from turn index (as a string) to that turn's act annotation.
        data_files: Paths of the dialogue JSON files to rewrite.

    Each file is loaded, every turn with a matching annotation gains a
    ``'dialogue_act'`` key, and the file is overwritten with the result.
    Dialogues or turns without an annotation are left untouched.
    """
    for path in data_files:
        dialogues = load_json(path)

        for entry in dialogues:
            # The split files keep the '.json' suffix in the id; the act file does not.
            key = entry['dialogue_idx'].replace('.json', '')
            if key not in dialogue_acts:
                continue

            per_turn_acts = dialogue_acts[key]
            for turn in entry['dialogue']:
                turn_key = str(turn['turn_idx'])
                if turn_key in per_turn_acts:
                    turn['dialogue_act'] = per_turn_acts[turn_key]

        save_json(dialogues, path)

In [5]:
# Merge the act annotations into the three split files produced by the
# data-creation cell. The files are rewritten in place by add_dialogue_acts.
dialogue_acts_path = '/kaggle/input/multiwoz2-4/dialogue_acts.json'
data_files = ['/kaggle/working/train_dials.json', '/kaggle/working/dev_dials.json', '/kaggle/working/test_dials.json']

dialogue_acts = load_json(dialogue_acts_path)

add_dialogue_acts(dialogue_acts, data_files)

print("Dialogue acts have been added to all data files.")

Dialogue acts have been added to all data files.


# Dialogue Act multi-value

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, f1_score
import json
import numpy as np
from tqdm import tqdm

In [None]:
class DialogueActDataset(Dataset):
    """Torch dataset pairing system utterances with multi-hot dialogue-act labels.

    Args:
        samples: Iterable of ``(text, label)`` pairs. A dict label contributes its
            keys as act names; a str label is treated as a single act.
        tokenizer: Callable used to encode one text at a time in ``__getitem__``.
        max_length: Length every encoded sequence is padded/truncated to.
        label_binarizer: Optional, already fitted binarizer (any object exposing
            ``transform``). When given, it is reused so that the label columns
            follow exactly the same order as the split it was fitted on. Pass the
            training dataset's ``label_binarizer`` when building dev/test
            datasets. When omitted, a new ``MultiLabelBinarizer`` is fitted on
            ``samples``, which is only safe if this split contains every act
            the model was trained on.

    Raises:
        ValueError: If a label is neither a dict nor a str.
    """

    def __init__(self, samples, tokenizer, max_length=128, label_binarizer=None):
        self.texts = []
        self.acts = []
        # Single pass so that one-shot iterables (generators) are handled too.
        for text, label in samples:
            if isinstance(label, dict):
                act_names = list(label.keys())
            elif isinstance(label, str):
                act_names = [label]
            else:
                raise ValueError(f"Unexpected label type: {type(label)}. Expected dict or str.")
            self.texts.append(text)
            self.acts.append(act_names)

        if label_binarizer is None:
            self.label_binarizer = MultiLabelBinarizer()
            self.labels = self.label_binarizer.fit_transform(self.acts)
        else:
            # Reuse the fitted label space instead of re-deriving it from this split.
            self.label_binarizer = label_binarizer
            self.labels = label_binarizer.transform(self.acts)

        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors
            (the batch dimension added by ``return_tensors='pt'`` is flattened
            away) and a float ``'labels'`` tensor holding the multi-hot target.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Float targets are what BCE-style losses expect.
            'labels': torch.tensor(label, dtype=torch.float)
        }


In [None]:
class BertForMultilabelClassification(BertForSequenceClassification):
    """BERT sequence classifier trained with an independent sigmoid per label.

    ``forward`` returns ``(probabilities, loss)`` rather than the parent class'
    output object: the first element is already passed through a sigmoid, so
    callers threshold it directly. ``loss`` is ``None`` when no labels are given.
    """

    def __init__(self, config):
        super().__init__(config)
        # Works on raw logits; the sigmoid is folded into the loss.
        self.bce_loss = torch.nn.BCEWithLogitsLoss()

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Run the encoder and classification head.

        Args:
            input_ids: Token id tensor passed to the BERT encoder.
            attention_mask: Attention mask passed to the BERT encoder.
            labels: Optional multi-hot targets with the same shape as the logits.
            **kwargs: Forwarded unchanged to the BERT encoder.

        Returns:
            Tuple ``(probabilities, loss)``.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            **kwargs
        )

        # Classify from the hidden state of the first token of each sequence.
        # NOTE(review): this does not apply self.dropout before the classifier;
        # left unchanged so existing checkpoints behave identically.
        logits = self.classifier(outputs[0][:, 0, :])

        loss = None
        if labels is not None:
            # BCE needs floating-point targets; cast defensively so integer
            # multi-hot labels do not raise a dtype error.
            loss = self.bce_loss(logits, labels.to(logits.dtype))

        # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid.
        return torch.sigmoid(logits), loss

In [None]:
def load_and_process_data(file_path):
    """Collect ``(system_transcript, dialogue_act)`` pairs from a dialogue file.

    Args:
        file_path: Path of a JSON file holding a list of dialogues, each with a
            ``'dialogue'`` list of turn dicts.

    Returns:
        List of ``(text, label)`` tuples, one per turn that carries a non-empty
        ``'dialogue_act'`` entry. Turns without annotation (or with an empty
        one) are skipped.
    """
    # The files are written as UTF-8 with ensure_ascii=False, so read them back
    # with the same encoding instead of the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    samples = []
    for dialogue in data:
        for turn in dialogue['dialogue']:
            label = turn.get('dialogue_act')
            if label:
                samples.append((turn['system_transcript'], label))
    return samples

In [None]:
def evaluate_model(model, eval_dataloader, device, threshold=0.5):
    """Score ``model`` on ``eval_dataloader`` with micro and macro F1.

    The model's first output is treated as per-label probabilities and turned
    into 0/1 predictions with ``probability >= threshold``.

    Returns:
        dict with ``'f1_micro'``, ``'f1_macro'``, and the ``'true_labels'`` /
        ``'predicted_labels'`` arrays the scores were computed from.
    """
    model.eval()
    gold_rows, prob_rows = [], []

    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            targets = batch['labels'].to(device)
            probabilities, _ = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            gold_rows.extend(targets.cpu().numpy())
            prob_rows.extend(probabilities.cpu().numpy())

    y_true = np.array(gold_rows)
    y_pred = (np.array(prob_rows) >= threshold).astype(int)

    results = {
        'f1_micro': f1_score(y_true, y_pred, average='micro'),
        'f1_macro': f1_score(y_true, y_pred, average='macro'),
        'true_labels': y_true,
        'predicted_labels': y_pred,
    }
    return results

In [None]:
def predict_new_texts(texts, model, tokenizer, label_binarizer, device, threshold=0.5, max_length=128):
    """Predict dialogue-act labels for raw texts.

    Args:
        texts: Texts to classify, passed to ``tokenizer`` as one batch.
        model: Model whose call returns ``(probabilities, loss)``.
        tokenizer: Tokenizer callable producing ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        label_binarizer: Fitted binarizer used to map 0/1 rows back to label
            names. It must be the one that defined the model's label order.
        device: Device the input tensors are moved to.
        threshold: A label is predicted when its probability is >= threshold.
        max_length: Padding/truncation length. Defaults to 128, the value that
            was previously hard-coded, and should match the length used in training.

    Returns:
        Whatever ``label_binarizer.inverse_transform`` returns for the
        thresholded predictions (one entry per input text).
    """
    model.eval()

    inputs = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    with torch.no_grad():
        # The model already applies the sigmoid, so these are probabilities.
        probabilities, _ = model(
            input_ids=inputs['input_ids'].to(device),
            attention_mask=inputs['attention_mask'].to(device),
        )

    predictions = (probabilities.cpu().numpy() >= threshold).astype(int)
    return label_binarizer.inverse_transform(predictions)

In [7]:
def train_model(train_dataloader, eval_dataloader, model, device, num_epochs=20,
                learning_rate=2e-5, save_dir='./model'):
    """Fine-tune ``model`` and keep the checkpoint with the best dev micro F1.

    Args:
        train_dataloader: Batches with ``'input_ids'``, ``'attention_mask'`` and
            ``'labels'`` used for optimisation.
        eval_dataloader: Batches scored after every epoch via ``evaluate_model``.
        model: Model whose call returns ``(probabilities, loss)``.
        device: Device the batch tensors are moved to.
        num_epochs: Number of passes over ``train_dataloader``.
        learning_rate: AdamW learning rate (default is the previously
            hard-coded 2e-5).
        save_dir: Directory passed to ``model.save_pretrained`` whenever the
            micro F1 improves (default is the previously hard-coded './model').

    Only the model is saved. The label order lives in the training dataset's
    binarizer and has to be preserved separately for later inference.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    best_f1 = 0

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")

        model.train()
        total_loss = 0
        progress_bar = tqdm(train_dataloader, desc="Training")

        for batch in progress_bar:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            _, loss = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            batch_loss = loss.item()
            total_loss += batch_loss
            loss.backward()
            optimizer.step()

            progress_bar.set_postfix({'loss': f"{batch_loss:.4f}"})

        # max(..., 1) avoids a ZeroDivisionError on an empty dataloader.
        avg_train_loss = total_loss / max(len(train_dataloader), 1)
        print(f"Average training loss: {avg_train_loss:.4f}")

        eval_results = evaluate_model(model, eval_dataloader, device)
        print("\nEvaluation Results:")
        print(f"Micro F1: {eval_results['f1_micro']:.4f}")
        print(f"Macro F1: {eval_results['f1_macro']:.4f}")

        # Checkpoint only on strict improvement of the dev micro F1.
        if eval_results['f1_micro'] > best_f1:
            best_f1 = eval_results['f1_micro']
            model.save_pretrained(save_dir)
            print(f"New best model saved with Micro F1: {best_f1:.4f}")


# Training

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Use the same absolute locations the split files were written to (and that the
# test split is later read from) instead of relying on the working directory.
train_samples = load_and_process_data("/kaggle/working/train_dials.json")
eval_samples = load_and_process_data("/kaggle/working/dev_dials.json")

Using device: cuda


In [17]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_dataset = DialogueActDataset(train_samples, tokenizer)
eval_dataset = DialogueActDataset(eval_samples, tokenizer)

# Each dataset fits its own label binarizer, so the dev label columns only line
# up with the training columns if both splits contain exactly the same acts.
# Fail loudly here rather than report F1 scores on misaligned columns.
assert list(eval_dataset.label_binarizer.classes_) == list(train_dataset.label_binarizer.classes_), (
    "Train and dev splits produced different label sets; "
    "evaluation columns would not match the model outputs."
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [18]:
# Sanity-check split sizes and the label space derived from the training split.
print(f"\nTraining samples: {len(train_dataset)}")
print(f"Evaluation samples: {len(eval_dataset)}")
# classes_ gives the column order of the multi-hot label vectors.
print("\nUnique dialogue acts:", train_dataset.label_binarizer.classes_)
print(f"Number of unique labels: {len(train_dataset.label_binarizer.classes_)}")


Training samples: 48241
Evaluation samples: 6371

Unique dialogue acts: ['Attraction-Inform' 'Attraction-NoOffer' 'Attraction-Recommend'
 'Attraction-Request' 'Attraction-Select' 'Booking-Book' 'Booking-Inform'
 'Booking-NoBook' 'Booking-Request' 'Hotel-Inform' 'Hotel-NoOffer'
 'Hotel-Recommend' 'Hotel-Request' 'Hotel-Select' 'No Annotation'
 'Restaurant-Inform' 'Restaurant-NoOffer' 'Restaurant-Recommend'
 'Restaurant-Request' 'Restaurant-Select' 'Taxi-Inform' 'Taxi-Request'
 'Train-Inform' 'Train-NoOffer' 'Train-OfferBook' 'Train-OfferBooked'
 'Train-Request' 'Train-Select' 'general-bye' 'general-greet'
 'general-reqmore' 'general-welcome']
Number of unique labels: 32


In [19]:
# One output unit per dialogue act found in the training split.
num_labels = len(train_dataset.label_binarizer.classes_)
# Start from the pretrained encoder; the classification head is newly initialised
# (see the warning printed below this cell).
model = BertForMultilabelClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels
).to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForMultilabelClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Shuffle only the training batches; evaluation order is kept fixed.
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=128)

In [21]:
# Runs the default 20 epochs; the best checkpoint (by dev micro F1) is written to ./model.
train_model(train_dataloader, eval_dataloader, model, device)


Epoch 1/20


Training: 100%|██████████| 377/377 [16:09<00:00,  2.57s/it, loss=0.0922]


Average training loss: 0.1691


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]



Evaluation Results:
Micro F1: 0.7963
Macro F1: 0.4564
New best model saved with Micro F1: 0.7963

Epoch 2/20


Training: 100%|██████████| 377/377 [16:24<00:00,  2.61s/it, loss=0.0597]


Average training loss: 0.0738


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]



Evaluation Results:
Micro F1: 0.8594
Macro F1: 0.6752
New best model saved with Micro F1: 0.8594

Epoch 3/20


Training: 100%|██████████| 377/377 [16:25<00:00,  2.61s/it, loss=0.0578]


Average training loss: 0.0533


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]



Evaluation Results:
Micro F1: 0.8681
Macro F1: 0.7036
New best model saved with Micro F1: 0.8681

Epoch 4/20


Training: 100%|██████████| 377/377 [16:26<00:00,  2.62s/it, loss=0.0447]


Average training loss: 0.0441


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.00s/it]



Evaluation Results:
Micro F1: 0.8723
Macro F1: 0.7336
New best model saved with Micro F1: 0.8723

Epoch 5/20


Training: 100%|██████████| 377/377 [16:26<00:00,  2.62s/it, loss=0.0402]


Average training loss: 0.0382


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]



Evaluation Results:
Micro F1: 0.8713
Macro F1: 0.7419

Epoch 6/20


Training: 100%|██████████| 377/377 [16:26<00:00,  2.62s/it, loss=0.0314]


Average training loss: 0.0339


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]



Evaluation Results:
Micro F1: 0.8735
Macro F1: 0.7459
New best model saved with Micro F1: 0.8735

Epoch 7/20


Training: 100%|██████████| 377/377 [16:26<00:00,  2.62s/it, loss=0.0302]


Average training loss: 0.0305


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]



Evaluation Results:
Micro F1: 0.8717
Macro F1: 0.7555

Epoch 8/20


Training: 100%|██████████| 377/377 [16:26<00:00,  2.62s/it, loss=0.0255]


Average training loss: 0.0274


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]



Evaluation Results:
Micro F1: 0.8701
Macro F1: 0.7582

Epoch 9/20


Training: 100%|██████████| 377/377 [16:26<00:00,  2.62s/it, loss=0.0275]


Average training loss: 0.0246


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]



Evaluation Results:
Micro F1: 0.8697
Macro F1: 0.7506

Epoch 10/20


Training: 100%|██████████| 377/377 [16:26<00:00,  2.62s/it, loss=0.0241]


Average training loss: 0.0223


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]



Evaluation Results:
Micro F1: 0.8697
Macro F1: 0.7446

Epoch 11/20


Training: 100%|██████████| 377/377 [16:26<00:00,  2.62s/it, loss=0.0184]


Average training loss: 0.0202


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]



Evaluation Results:
Micro F1: 0.8642
Macro F1: 0.7497

Epoch 12/20


Training: 100%|██████████| 377/377 [16:27<00:00,  2.62s/it, loss=0.0200]


Average training loss: 0.0183


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]



Evaluation Results:
Micro F1: 0.8657
Macro F1: 0.7522

Epoch 13/20


Training: 100%|██████████| 377/377 [16:27<00:00,  2.62s/it, loss=0.0157]


Average training loss: 0.0165


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]



Evaluation Results:
Micro F1: 0.8608
Macro F1: 0.7479

Epoch 14/20


Training: 100%|██████████| 377/377 [16:26<00:00,  2.62s/it, loss=0.0138]


Average training loss: 0.0150


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]



Evaluation Results:
Micro F1: 0.8639
Macro F1: 0.7531

Epoch 15/20


Training: 100%|██████████| 377/377 [16:27<00:00,  2.62s/it, loss=0.0120]


Average training loss: 0.0138


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]



Evaluation Results:
Micro F1: 0.8612
Macro F1: 0.7539

Epoch 16/20


Training: 100%|██████████| 377/377 [16:26<00:00,  2.62s/it, loss=0.0089]


Average training loss: 0.0124


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]



Evaluation Results:
Micro F1: 0.8607
Macro F1: 0.7441

Epoch 17/20


Training: 100%|██████████| 377/377 [16:26<00:00,  2.62s/it, loss=0.0089]


Average training loss: 0.0114


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]



Evaluation Results:
Micro F1: 0.8616
Macro F1: 0.7490

Epoch 18/20


Training: 100%|██████████| 377/377 [16:27<00:00,  2.62s/it, loss=0.0104]


Average training loss: 0.0105


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]



Evaluation Results:
Micro F1: 0.8586
Macro F1: 0.7532

Epoch 19/20


Training: 100%|██████████| 377/377 [16:28<00:00,  2.62s/it, loss=0.0069]


Average training loss: 0.0097


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]



Evaluation Results:
Micro F1: 0.8642
Macro F1: 0.7482

Epoch 20/20


Training: 100%|██████████| 377/377 [16:27<00:00,  2.62s/it, loss=0.0064]


Average training loss: 0.0091


Evaluating: 100%|██████████| 50/50 [00:50<00:00,  1.01s/it]


Evaluation Results:
Micro F1: 0.8611
Macro F1: 0.7469





# Evaluating

In [9]:
import random
import torch
from transformers import BertTokenizer
from torch.utils.data import DataLoader
import random
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score
import pandas as pd


In [10]:
# Evaluation setup: pick the device and load the same tokenizer used for training.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [11]:
# Load the fine-tuned checkpoint from the attached Kaggle input directory.
model = BertForMultilabelClassification.from_pretrained('/kaggle/input/test/transformers/default/1').to(device)
# Switch to inference mode; as the last expression this also displays the module tree.
model.eval()

BertForMultilabelClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=

In [12]:
test_samples = load_and_process_data("/kaggle/working/test_dials.json")
test_dataset = DialogueActDataset(test_samples, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=128)

# The test dataset derives its label columns from the test split itself, while
# the model's output columns follow the training split. At minimum the widths
# must agree; otherwise every metric below is computed on misaligned columns.
assert len(test_dataset.label_binarizer.classes_) == model.config.num_labels, (
    "Test split label count does not match the model's number of outputs."
)

In [13]:
def evaluate_test_set(model, test_dataloader, device, threshold=0.5, label_names=None):
    """Compute test-set metrics and a per-label classification report.

    Args:
        model: Model whose call returns ``(probabilities, loss)``.
        test_dataloader: Batches with ``'input_ids'``, ``'attention_mask'`` and
            ``'labels'``.
        device: Device the input tensors are moved to.
        threshold: A label is predicted when its probability is >= threshold.
        label_names: Optional names for the label columns. When omitted they are
            taken from ``test_dataloader.dataset.label_binarizer.classes_`` if
            available, instead of from a notebook-global dataset variable.

    Returns:
        dict with:
            ``'accuracy'``: accuracy over the flattened label matrix, i.e. per
                label decision. Because most entries are 0 this is dominated by
                true negatives and reads much higher than the F1 scores.
            ``'subset_accuracy'``: fraction of samples whose full label set is
                predicted exactly.
            ``'f1_micro'``, ``'f1_macro'``: F1 scores.
            ``'classification_report'``: text report per label.
    """
    model.eval()
    all_true_labels = []
    all_predicted_labels = []

    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluating Test Set"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels']

            probabilities, _ = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = (probabilities >= threshold).cpu().numpy().astype(int)
            all_true_labels.extend(labels.numpy())
            all_predicted_labels.extend(predictions)

    all_true_labels = np.array(all_true_labels)
    all_predicted_labels = np.array(all_predicted_labels)
    accuracy = accuracy_score(all_true_labels.flatten(), all_predicted_labels.flatten())
    # On 2-D multilabel indicator input accuracy_score is the exact-match ratio.
    subset_accuracy = accuracy_score(all_true_labels, all_predicted_labels)
    f1_micro = f1_score(all_true_labels, all_predicted_labels, average='micro')
    f1_macro = f1_score(all_true_labels, all_predicted_labels, average='macro')

    if label_names is None:
        binarizer = getattr(getattr(test_dataloader, 'dataset', None), 'label_binarizer', None)
        label_names = None if binarizer is None else binarizer.classes_

    report = classification_report(
        all_true_labels,
        all_predicted_labels,
        target_names=label_names,
        zero_division=0
    )

    return {
        'accuracy': accuracy,
        'subset_accuracy': subset_accuracy,
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'classification_report': report
    }


In [15]:
# Score the loaded checkpoint on the held-out test split (default threshold 0.5).
eval_results = evaluate_test_set(model, test_dataloader, device)

Evaluating Test Set: 100%|██████████| 50/50 [00:44<00:00,  1.13it/s]


In [16]:
print("\nTest Set Metrics:")
# 'accuracy' is computed over the flattened label matrix (one decision per
# sample and label), so it is not comparable to the F1 scores below.
print(f"Accuracy: {eval_results['accuracy']:.4f}")
print(f"Micro F1: {eval_results['f1_micro']:.4f}")
print(f"Macro F1: {eval_results['f1_macro']:.4f}")

print("\nDetailed Classification Report:")
print(eval_results['classification_report'])


Test Set Metrics:
Accuracy: 0.9879
Micro F1: 0.8752
Macro F1: 0.7707

Detailed Classification Report:
                      precision    recall  f1-score   support

   Attraction-Inform       0.91      0.91      0.91       833
  Attraction-NoOffer       0.77      0.84      0.80        56
Attraction-Recommend       0.77      0.71      0.74       147
  Attraction-Request       0.79      0.78      0.78       159
   Attraction-Select       0.70      0.58      0.63        55
        Booking-Book       0.89      0.95      0.92       526
      Booking-Inform       0.93      0.91      0.92       558
      Booking-NoBook       0.96      0.97      0.97       131
     Booking-Request       0.95      0.94      0.94       321
        Hotel-Inform       0.86      0.90      0.88       809
       Hotel-NoOffer       0.86      0.75      0.80        65
     Hotel-Recommend       0.68      0.68      0.68       138
       Hotel-Request       0.83      0.84      0.84       280
        Hotel-Select       0

In [None]:
def predict_random_samples(model, test_dataset, device, n_samples=10, threshold=0.5):
    """Show true vs. predicted labels for randomly chosen dataset samples.

    Args:
        model: Model whose call returns ``(probabilities, loss)``.
        test_dataset: Dataset exposing ``texts``, ``acts``, ``tokenizer``,
            ``max_length`` and ``label_binarizer``.
        device: Device the input tensors are moved to.
        n_samples: Number of samples to draw without replacement. Requests larger
            than the dataset are clamped to the dataset size instead of raising.
        threshold: A label is predicted when its probability is >= threshold.

    Returns:
        List of formatted strings, one per drawn sample. Empty when the dataset
        is empty or ``n_samples`` is not positive.
    """
    # random.sample raises ValueError when asked for more items than exist.
    n_samples = max(0, min(n_samples, len(test_dataset)))
    if n_samples == 0:
        return []

    random_indices = random.sample(range(len(test_dataset)), n_samples)

    selected_texts = [test_dataset.texts[i] for i in random_indices]
    true_labels = [test_dataset.acts[i] for i in random_indices]

    inputs = test_dataset.tokenizer(
        selected_texts,
        add_special_tokens=True,
        max_length=test_dataset.max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    model.eval()
    with torch.no_grad():
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        probabilities, _ = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = (probabilities >= threshold).cpu().numpy().astype(int)

    predicted_labels = test_dataset.label_binarizer.inverse_transform(predictions)

    return [
        (
            f"Text: {text}\n"
            f"True Labels: {', '.join(gold)}\n"
            f"Predicted Labels: {', '.join(predicted)}"
        )
        for text, gold, predicted in zip(selected_texts, true_labels, predicted_labels)
    ]

In [17]:
# Qualitative check: print a few random test utterances with gold and predicted acts.
# No random seed is set, so the drawn samples differ between runs.
random_predictions = predict_random_samples(model, test_dataset, device)

for i, prediction in enumerate(random_predictions, start=1):
    print(f"Sample {i}:\n{prediction}\n{'-'*50}")

Sample 1:
Text: booking was successful . the table will be reserved for 15 minutes . reference number is : f29amxp7 .
True Labels: Booking-Book
Predicted Labels: Booking-Book
--------------------------------------------------
Sample 2:
Text: the first train to leave stevenage after 09:45 leaves at 09:54 and costs 12.80 pounds . would you like me to book that for you ?
True Labels: Train-Inform, Train-OfferBook
Predicted Labels: Train-Inform, Train-OfferBook
--------------------------------------------------
Sample 3:
Text: may i book you taxi or anything else ?
True Labels: general-reqmore
Predicted Labels: general-reqmore
--------------------------------------------------
Sample 4:
Text: call them at 01223363682
True Labels: Hotel-Inform
Predicted Labels: Hotel-Inform
--------------------------------------------------
Sample 5:
Text: all right , a red skoda will pick you up . the contact number is 07278222346 .
True Labels: Taxi-Inform
Predicted Labels: Taxi-Inform
-------------------