In [4]:
import numpy as np
import argparse
from shutil import copyfile
from create_data import createData
from create_data import divideData

In [5]:
print('Create WOZ-like dialogues. Get yourself a coffee, this might take a while.')

# Settings handed to both create_data helpers below, packed as a Namespace
# so they can be read as attributes (args.main_dir, args.mwz_ver, ...).
args = argparse.Namespace(**{
    "main_dir": "/kaggle/input/multiwoz2-4",
    "mwz_ver": "2.4",
    "target_path": "/kaggle/working/",
})

# Build the dialogues first, then hand them over for splitting.
delex_data = createData(args)

print('Divide dialogues...')
divideData(delex_data, args)

Create WOZ-like dialogues. Get yourself a coffee, this might take a while.
Downloading and unzipping the MultiWOZ 2.4 dataset
Divide dialogues...
# of dialogues: Train 8420, Val 1000, Test 999


In [4]:
import json

def load_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def save_json(data, file_path):
    """Serialize ``data`` as JSON into ``file_path``, overwriting any existing file.

    The file is written as UTF-8 with a 4-space indent; non-ASCII characters
    are stored verbatim rather than as ``\\uXXXX`` escapes.
    """
    with open(file_path, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, ensure_ascii=False, indent=4)

def add_dialogue_acts(dialogue_acts, data_files):
    """Attach dialogue-act annotations to the turns stored in each data file.

    Args:
        dialogue_acts: Mapping of dialogue id (without the ``.json`` suffix) to
            a mapping of turn index (as a string) to the act annotation.
        data_files: Paths of JSON files to annotate. Every file is loaded,
            updated, and written back to the same path.

    Dialogues or turns with no matching entry in ``dialogue_acts`` are left
    unchanged.
    """
    for path in data_files:
        dialogues = load_json(path)

        for entry in dialogues:
            # Annotation keys carry no file extension, dialogue ids may.
            key = entry['dialogue_idx'].replace('.json', '')
            if key not in dialogue_acts:
                continue
            turn_acts = dialogue_acts[key]

            for turn in entry['dialogue']:
                # Annotation turn keys are strings; turn_idx is converted to match.
                turn_key = str(turn['turn_idx'])
                if turn_key in turn_acts:
                    turn['dialogue_act'] = turn_acts[turn_key]

        # Persist the annotated dialogues in place.
        save_json(dialogues, path)



In [5]:
# Source of the act annotations and the three split files to annotate in place.
dialogue_acts_path = '/kaggle/input/multiwoz2-4/dialogue_acts.json'
data_files = [
    '/kaggle/working/train_dials.json',
    '/kaggle/working/dev_dials.json',
    '/kaggle/working/test_dials.json',
]

dialogue_acts = load_json(dialogue_acts_path)
add_dialogue_acts(dialogue_acts, data_files)

print("Dialogue acts have been added to all data files.")

Dialogue acts have been added to all data files.


##### Testing area

A small sandbox for inspecting the annotated data and getting a better understanding of its structure.

In [6]:
import json

def download_first_10_from_json(filename, n=10, output_path='first_10_elements.json'):
    """
    Copies the leading elements of a JSON array file into a smaller preview file.

    Despite its name, nothing is downloaded: the file is read from disk, its
    first ``n`` elements are written to ``output_path``, and the same elements
    are returned.

    Args:
        filename: Path of the JSON file to read. Its top-level value must
            support slicing (e.g. a JSON array).
        n: Number of leading elements to keep. Defaults to 10.
        output_path: Where the preview file is written. Defaults to
            'first_10_elements.json', resolved against the current working
            directory.

    Returns:
        A list containing the first ``n`` elements of the JSON data (fewer if
        the file holds fewer elements).
    """

    # The data files in this notebook are written as UTF-8 with non-ASCII
    # characters kept verbatim, so read with an explicit encoding instead of
    # relying on the platform default.
    with open(filename, 'r', encoding='utf-8') as file:
        data = json.load(file)

    first_elements = data[:n]

    with open(output_path, 'w', encoding='utf-8') as output_file:
        json.dump(first_elements, output_file)

    return first_elements

# Write a small preview file from the dev split and keep the preview in memory.
filename = '/kaggle/working/dev_dials.json'
first_10_elements = download_first_10_from_json(filename=filename)

In [7]:

# The preview file is written by download_first_10_from_json to a path relative
# to the current working directory, so read it back the same way instead of
# assuming the working directory is /kaggle/working.
file_path = "first_10_elements.json"

with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Collect (system utterance, dialogue act) pairs; turns without an act are skipped.
samples = []
for dialogue in data:
    for turn in dialogue['dialogue']:
        x = turn['system_transcript']
        y = turn.get('dialogue_act', None)
        if y:
            samples.append((x, y))

print(f"Extracted {len(samples)} (x, y) pairs.")
for i, (x, y) in enumerate(samples[:5]):
    print(f"Sample {i+1}:")
    print(f"  x: {x}")
    print(f"  y: {y}")

Extracted 66 (x, y) pairs.
Sample 1:
  x: i can help you with that . what is your price range ?
  y: {'Hotel-Request': [['Price', '?']]}
Sample 2:
  x: if you would like something cheap , i recommend the allenbell . for something moderate -ly priced , i would recommend the warkworth house .
  y: {'Hotel-Recommend': [['Price', 'cheap'], ['Price', ' moderately priced'], ['Name', 'Allenbell'], ['Name', ' Warkworth House']]}
Sample 3:
  x: what day will you be staying ?
  y: {'Booking-Request': [['Day', '?']]}
Sample 4:
  x: booking was successful . reference number is : bmukptg6 . can i help you with anything else today ?
  y: {'general-reqmore': [['none', 'none']], 'Booking-Book': [['Ref', 'BMUKPTG6']]}
Sample 5:
  x: there are a number of trains leaving throughout the day . what time would you like to travel ?
  y: {'Train-Inform': [['Choice', 'a number'], ['Leave', 'throughout the day']], 'Train-Request': [['Leave', '?']]}


# Dialogue Act

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import json
import numpy as np
from tqdm import tqdm

In [None]:
class DialogueActDataset(Dataset):
    """Single-label dialogue-act classification dataset.

    Each sample is a ``(text, label)`` pair. A label is either a dict keyed by
    act name, in which case only the FIRST key is used as the class and any
    further acts on the turn are ignored, or a plain string naming the act.

    Args:
        samples: Iterable of ``(text, label)`` pairs.
        tokenizer: Callable applied to one text per item; it must accept the
            keyword arguments used in ``__getitem__`` and return a mapping
            with ``'input_ids'`` and ``'attention_mask'`` entries that
            support ``.flatten()``.
        max_length: Length passed to the tokenizer for padding/truncation.
        label_encoder: Optional already-fitted encoder exposing
            ``transform``. Pass the training dataset's encoder when building
            evaluation/test datasets so that every split shares one
            label-to-id mapping. When omitted, a new ``LabelEncoder`` is
            fitted on this dataset's own labels (the original behaviour).

    Raises:
        ValueError: If a label is neither a non-empty dict nor a string.
    """

    def __init__(self, samples, tokenizer, max_length=128, label_encoder=None):
        self.texts = [x for x, _ in samples]
        self.acts = [self._primary_act(y) for _, y in samples]

        if label_encoder is None:
            self.label_encoder = LabelEncoder()
            self.labels = self.label_encoder.fit_transform(self.acts)
        else:
            # Reuse the supplied mapping: fitting a second encoder on another
            # split can assign different ids to the same act whenever that
            # split is missing a class.
            self.label_encoder = label_encoder
            self.labels = self.label_encoder.transform(self.acts)

        self.tokenizer = tokenizer
        self.max_length = max_length

    @staticmethod
    def _primary_act(y):
        """Return the act name used as the class for one raw label."""
        if isinstance(y, dict):
            if not y:
                raise ValueError("Empty dialogue-act dict; expected at least one act.")
            return next(iter(y))
        if isinstance(y, str):
            return y
        raise ValueError(f"Unexpected label type: {type(y)}. Expected dict or str.")

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # The tokenizer is called on a single text; flatten drops the
            # leading batch dimension of its output.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
def load_and_process_data(file_path):
    """Read a dialogue JSON file and collect (system utterance, act) pairs.

    Args:
        file_path: Path of a JSON file holding a list of dialogues, each with
            a ``'dialogue'`` list of turn dicts.

    Returns:
        A list of ``(system_transcript, dialogue_act)`` tuples, one per turn
        that carries a non-empty ``'dialogue_act'`` entry. Turns without one
        are skipped.
    """
    # The data files are written as UTF-8 with non-ASCII characters kept
    # verbatim, so read them back with the same explicit encoding rather than
    # the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    samples = []
    for dialogue in data:
        for turn in dialogue['dialogue']:
            act = turn.get('dialogue_act')
            if act:
                samples.append((turn['system_transcript'], act))
    return samples

In [None]:
def evaluate_model(model, eval_dataloader, device):
    """Run ``model`` over ``eval_dataloader`` and collect labels and predictions.

    The model is switched to eval mode and gradients are disabled. Each batch
    must provide ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.

    Returns:
        A ``(true_labels, predicted_labels)`` pair of flat lists, where the
        prediction for an item is the arg-max over the model's logits.
    """
    model.eval()
    gold, guessed = [], []

    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].to(device)

            logits = model(input_ids=ids, attention_mask=mask).logits
            batch_preds = torch.argmax(logits, dim=-1)

            gold.extend(batch_labels.cpu().numpy())
            guessed.extend(batch_preds.cpu().numpy())

    return gold, guessed

In [None]:
def predict_new_texts(texts, model, tokenizer, label_encoder, device, max_length=128):
    """Predict a dialogue-act label for each text.

    Args:
        texts: A single string or an iterable of strings to classify.
        model: Classifier called with ``input_ids`` and ``attention_mask``
            whose output exposes ``.logits``.
        tokenizer: Tokenizer called on the whole list of texts at once.
        label_encoder: Fitted encoder used to map predicted class ids back to
            label names via ``inverse_transform``.
        device: Device the input tensors are moved to before the forward pass.
        max_length: Padding/truncation length passed to the tokenizer.
            Defaults to 128.

    Returns:
        The output of ``label_encoder.inverse_transform`` for the predicted
        ids, one label per input text; an empty NumPy array when ``texts`` is
        empty.
    """
    # Normalize the input: a bare string is one text, not a sequence of characters.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Nothing to predict: skip the tokenizer and model, which cannot handle
    # an empty batch.
    if not texts:
        return np.array([])

    model.eval()

    # tokenize
    inputs = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    with torch.no_grad():
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)

    # Convert numeric predictions back to label names.
    return label_encoder.inverse_transform(predictions.cpu().numpy())

In [7]:
def train_model(train_dataloader, eval_dataloader, model, device, num_epochs=20,
                lr=2e-5, save_dir='./model', patience=None):
    """Fine-tune ``model`` and keep the checkpoint with the best eval accuracy.

    After every epoch the model is evaluated on ``eval_dataloader``; whenever
    the accuracy beats the best value seen so far, the model is saved to
    ``save_dir`` via ``model.save_pretrained``.

    Args:
        train_dataloader: Yields batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        eval_dataloader: Batches used for the per-epoch evaluation.
        model: Model whose forward pass returns an object with ``.loss`` when
            called with ``labels``.
        device: Device the batch tensors are moved to.
        num_epochs: Maximum number of passes over ``train_dataloader``.
        lr: Learning rate for the AdamW optimizer. Defaults to 2e-5.
        save_dir: Directory the best model is written to. Defaults to
            './model'.
        patience: If set, stop once this many consecutive epochs have failed
            to improve the best accuracy. ``None`` (default) always runs all
            ``num_epochs`` epochs.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    best_accuracy = 0
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")

        model.train()
        total_loss = 0
        progress_bar = tqdm(train_dataloader, desc="Training")
        for batch in progress_bar:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

            progress_bar.set_postfix({'loss': f"{loss.item():.4f}"})

        # max(..., 1) avoids a ZeroDivisionError on an empty dataloader.
        avg_train_loss = total_loss / max(len(train_dataloader), 1)
        print(f"Average training loss: {avg_train_loss:.4f}")

        # Evaluation
        true_labels, predicted_labels = evaluate_model(model, eval_dataloader, device)
        accuracy = accuracy_score(true_labels, predicted_labels)
        print("\nEvaluation Results:")
        print(f"Accuracy: {accuracy:.4f}")
        print("\nClassification Report:")
        # zero_division=0 reports 0 for classes that were never predicted
        # without emitting an undefined-metric warning every epoch.
        print(classification_report(true_labels, predicted_labels, zero_division=0))

        # saving
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            epochs_without_improvement = 0
            model.save_pretrained(save_dir)
            print(f"New best model saved with accuracy: {accuracy:.4f}")
        else:
            epochs_without_improvement += 1
            if patience is not None and epochs_without_improvement >= patience:
                print(f"No improvement for {epochs_without_improvement} epoch(s); stopping early.")
                break

# Training

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Read the splits from the directory they were written to, rather than
# relying on the current working directory.
train_samples = load_and_process_data("/kaggle/working/train_dials.json")
eval_samples = load_and_process_data("/kaggle/working/dev_dials.json")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_dataset = DialogueActDataset(train_samples, tokenizer)
eval_dataset = DialogueActDataset(eval_samples, tokenizer)

# The evaluation labels must use the TRAINING label-to-id mapping. Each dataset
# fits its own encoder by default, so the ids would silently disagree whenever
# the dev split lacks one of the training classes. Re-encode with the training
# encoder; transform() raises ValueError if the dev split contains an act that
# never occurs in training.
eval_dataset.label_encoder = train_dataset.label_encoder
eval_dataset.labels = train_dataset.label_encoder.transform(eval_dataset.acts)


print(f"\nTraining samples: {len(train_dataset)}")
print(f"Evaluation samples: {len(eval_dataset)}")
print("\nUnique dialogue acts:", train_dataset.label_encoder.classes_)
print(f"Number of unique labels: {len(train_dataset.label_encoder.classes_)}")

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]


Training samples: 48241
Evaluation samples: 6371

Unique dialogue acts: ['Attraction-Inform' 'Attraction-NoOffer' 'Attraction-Recommend'
 'Attraction-Request' 'Attraction-Select' 'Booking-Book' 'Booking-Inform'
 'Booking-NoBook' 'Booking-Request' 'Hotel-Inform' 'Hotel-NoOffer'
 'Hotel-Recommend' 'Hotel-Request' 'Hotel-Select' 'No Annotation'
 'Restaurant-Inform' 'Restaurant-NoOffer' 'Restaurant-Recommend'
 'Restaurant-Request' 'Restaurant-Select' 'Taxi-Inform' 'Taxi-Request'
 'Train-Inform' 'Train-NoOffer' 'Train-OfferBook' 'Train-OfferBooked'
 'Train-Request' 'Train-Select' 'general-bye' 'general-greet'
 'general-reqmore' 'general-welcome']
Number of unique labels: 32




In [10]:
# One output unit per dialogue-act class known to the training encoder.
num_labels = len(train_dataset.label_encoder.classes_)

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=num_labels
)
model = model.to(device)

# Only the training batches are shuffled; evaluation keeps dataset order.
batch_size = 128
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Launch fine-tuning with the default number of epochs.
train_model(
    train_dataloader=train_dataloader,
    eval_dataloader=eval_dataloader,
    model=model,
    device=device,
)


Epoch 1/20


Training: 100%|██████████| 377/377 [16:03<00:00,  2.56s/it, loss=0.7317]


Average training loss: 1.5551


Evaluating: 100%|██████████| 50/50 [00:48<00:00,  1.04it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Evaluation Results:
Accuracy: 0.8060

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.74      0.76       325
           1       1.00      0.08      0.15        24
           2       0.65      0.82      0.73       154
           3       0.68      0.77      0.72       167
           4       0.00      0.00      0.00        42
           5       0.82      0.86      0.84       227
           6       0.83      0.92      0.87       487
           7       0.77      0.97      0.86       110
           8       0.80      0.88      0.84       163
           9       0.77      0.72      0.74       353
          10       0.58      0.60      0.59        50
          11       0.64      0.55      0.59       125
          12       0.76      0.93      0.84       284
          13       0.71      0.46      0.56        54
          14       0.18      0.03      0.04        80
          15       0.66      0.62      0.64       189
          16       

Training: 100%|██████████| 377/377 [16:08<00:00,  2.57s/it, loss=0.8202]


Average training loss: 0.6770


Evaluating: 100%|██████████| 50/50 [00:47<00:00,  1.04it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Evaluation Results:
Accuracy: 0.8269

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.76      0.79       325
           1       0.80      0.50      0.62        24
           2       0.71      0.88      0.79       154
           3       0.76      0.85      0.80       167
           4       0.71      0.48      0.57        42
           5       0.85      0.85      0.85       227
           6       0.83      0.93      0.87       487
           7       0.87      0.96      0.91       110
           8       0.85      0.85      0.85       163
           9       0.82      0.78      0.80       353
          10       0.53      0.82      0.64        50
          11       0.70      0.60      0.65       125
          12       0.82      0.85      0.83       284
          13       0.50      0.59      0.54        54
          14       0.12      0.03      0.04        80
          15       0.79      0.62      0.70       189
          16       

Training: 100%|██████████| 377/377 [16:08<00:00,  2.57s/it, loss=0.4871]


Average training loss: 0.5453


Evaluating: 100%|██████████| 50/50 [00:48<00:00,  1.04it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Evaluation Results:
Accuracy: 0.8259

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.79      0.80       325
           1       0.67      0.50      0.57        24
           2       0.78      0.81      0.80       154
           3       0.74      0.86      0.79       167
           4       0.71      0.48      0.57        42
           5       0.88      0.79      0.83       227
           6       0.88      0.90      0.89       487
           7       0.91      0.96      0.94       110
           8       0.78      0.91      0.84       163
           9       0.80      0.85      0.82       353
          10       0.65      0.72      0.69        50
          11       0.82      0.61      0.70       125
          12       0.84      0.75      0.79       284
          13       0.55      0.67      0.61        54
          14       0.15      0.04      0.06        80
          15       0.77      0.66      0.71       189
          16       

Training: 100%|██████████| 377/377 [16:08<00:00,  2.57s/it, loss=0.3364]


Average training loss: 0.4701


Evaluating: 100%|██████████| 50/50 [00:48<00:00,  1.04it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Evaluation Results:
Accuracy: 0.8322

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.81      0.80       325
           1       0.50      0.58      0.54        24
           2       0.76      0.79      0.78       154
           3       0.78      0.84      0.81       167
           4       0.70      0.62      0.66        42
           5       0.87      0.80      0.83       227
           6       0.88      0.90      0.89       487
           7       0.92      0.97      0.95       110
           8       0.83      0.87      0.85       163
           9       0.81      0.79      0.80       353
          10       0.62      0.74      0.67        50
          11       0.76      0.60      0.67       125
          12       0.83      0.86      0.85       284
          13       0.48      0.65      0.55        54
          14       0.21      0.05      0.08        80
          15       0.74      0.66      0.70       189
          16       

Training: 100%|██████████| 377/377 [16:08<00:00,  2.57s/it, loss=0.5494]


Average training loss: 0.4096


Evaluating: 100%|██████████| 50/50 [00:48<00:00,  1.04it/s]



Evaluation Results:
Accuracy: 0.8281

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.81      0.80       325
           1       0.59      0.54      0.57        24
           2       0.80      0.72      0.76       154
           3       0.75      0.86      0.80       167
           4       0.61      0.52      0.56        42
           5       0.86      0.82      0.84       227
           6       0.87      0.90      0.89       487
           7       0.93      0.97      0.95       110
           8       0.80      0.91      0.85       163
           9       0.78      0.85      0.81       353
          10       0.59      0.70      0.64        50
          11       0.82      0.58      0.68       125
          12       0.82      0.83      0.82       284
          13       0.52      0.69      0.59        54
          14       0.21      0.05      0.08        80
          15       0.68      0.71      0.70       189
          16       

Training: 100%|██████████| 377/377 [16:08<00:00,  2.57s/it, loss=0.3657]


Average training loss: 0.3544


Evaluating: 100%|██████████| 50/50 [00:48<00:00,  1.04it/s]



Evaluation Results:
Accuracy: 0.8300

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.73      0.80       325
           1       0.59      0.54      0.57        24
           2       0.68      0.87      0.76       154
           3       0.76      0.83      0.80       167
           4       0.66      0.50      0.57        42
           5       0.85      0.82      0.84       227
           6       0.84      0.93      0.88       487
           7       0.91      0.95      0.93       110
           8       0.84      0.82      0.83       163
           9       0.82      0.78      0.80       353
          10       0.67      0.66      0.67        50
          11       0.66      0.65      0.66       125
          12       0.81      0.89      0.85       284
          13       0.47      0.65      0.54        54
          14       0.24      0.09      0.13        80
          15       0.74      0.69      0.72       189
          16       

Training: 100%|██████████| 377/377 [16:08<00:00,  2.57s/it, loss=0.3914]


Average training loss: 0.3065


Evaluating: 100%|██████████| 50/50 [00:47<00:00,  1.04it/s]



Evaluation Results:
Accuracy: 0.8203

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.75      0.79       325
           1       0.56      0.58      0.57        24
           2       0.70      0.84      0.77       154
           3       0.81      0.79      0.80       167
           4       0.58      0.52      0.55        42
           5       0.84      0.81      0.83       227
           6       0.88      0.85      0.86       487
           7       0.90      0.97      0.93       110
           8       0.87      0.81      0.84       163
           9       0.78      0.80      0.79       353
          10       0.58      0.64      0.61        50
          11       0.58      0.67      0.62       125
          12       0.81      0.83      0.82       284
          13       0.52      0.63      0.57        54
          14       0.26      0.06      0.10        80
          15       0.75      0.63      0.68       189
          16       

Training: 100%|██████████| 377/377 [16:08<00:00,  2.57s/it, loss=0.3056]


Average training loss: 0.2580


Evaluating: 100%|██████████| 50/50 [00:48<00:00,  1.04it/s]



Evaluation Results:
Accuracy: 0.8146

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.75      0.79       325
           1       0.46      0.50      0.48        24
           2       0.71      0.84      0.77       154
           3       0.76      0.82      0.79       167
           4       0.62      0.57      0.59        42
           5       0.85      0.73      0.79       227
           6       0.83      0.90      0.87       487
           7       0.92      0.95      0.93       110
           8       0.88      0.78      0.83       163
           9       0.78      0.82      0.80       353
          10       0.58      0.68      0.62        50
          11       0.67      0.62      0.64       125
          12       0.83      0.77      0.80       284
          13       0.55      0.59      0.57        54
          14       0.16      0.07      0.10        80
          15       0.72      0.68      0.70       189
          16       

Training: 100%|██████████| 377/377 [16:08<00:00,  2.57s/it, loss=0.1498]


Average training loss: 0.2227


Evaluating: 100%|██████████| 50/50 [00:48<00:00,  1.04it/s]



Evaluation Results:
Accuracy: 0.8137

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.74      0.77       325
           1       0.57      0.54      0.55        24
           2       0.68      0.85      0.75       154
           3       0.71      0.86      0.78       167
           4       0.67      0.52      0.59        42
           5       0.88      0.77      0.82       227
           6       0.88      0.84      0.86       487
           7       0.91      0.95      0.93       110
           8       0.83      0.82      0.83       163
           9       0.82      0.80      0.81       353
          10       0.58      0.64      0.61        50
          11       0.60      0.67      0.63       125
          12       0.81      0.80      0.81       284
          13       0.50      0.63      0.56        54
          14       0.13      0.05      0.07        80
          15       0.71      0.67      0.69       189
          16       

Training: 100%|██████████| 377/377 [16:08<00:00,  2.57s/it, loss=0.3247]


Average training loss: 0.1894


Evaluating: 100%|██████████| 50/50 [00:48<00:00,  1.04it/s]



Evaluation Results:
Accuracy: 0.8076

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.78      0.78       325
           1       0.56      0.58      0.57        24
           2       0.72      0.80      0.76       154
           3       0.72      0.81      0.76       167
           4       0.53      0.24      0.33        42
           5       0.87      0.74      0.80       227
           6       0.86      0.82      0.84       487
           7       0.92      0.92      0.92       110
           8       0.81      0.83      0.82       163
           9       0.77      0.84      0.80       353
          10       0.57      0.66      0.61        50
          11       0.59      0.67      0.63       125
          12       0.81      0.75      0.78       284
          13       0.50      0.61      0.55        54
          14       0.13      0.06      0.08        80
          15       0.68      0.75      0.71       189
          16       

Training: 100%|██████████| 377/377 [16:08<00:00,  2.57s/it, loss=0.1817]


Average training loss: 0.1651


Evaluating: 100%|██████████| 50/50 [00:47<00:00,  1.04it/s]



Evaluation Results:
Accuracy: 0.8143

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.72      0.80       325
           1       0.48      0.58      0.53        24
           2       0.72      0.78      0.75       154
           3       0.81      0.75      0.78       167
           4       0.57      0.62      0.59        42
           5       0.85      0.80      0.82       227
           6       0.86      0.88      0.87       487
           7       0.89      0.96      0.93       110
           8       0.84      0.82      0.83       163
           9       0.81      0.79      0.80       353
          10       0.59      0.70      0.64        50
          11       0.68      0.66      0.67       125
          12       0.80      0.80      0.80       284
          13       0.47      0.65      0.54        54
          14       0.24      0.07      0.11        80
          15       0.77      0.67      0.72       189
          16       

Training: 100%|██████████| 377/377 [16:08<00:00,  2.57s/it, loss=0.1892]


Average training loss: 0.1399


Evaluating: 100%|██████████| 50/50 [00:48<00:00,  1.04it/s]



Evaluation Results:
Accuracy: 0.8157

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.74      0.79       325
           1       0.46      0.50      0.48        24
           2       0.66      0.87      0.75       154
           3       0.74      0.84      0.78       167
           4       0.68      0.36      0.47        42
           5       0.87      0.80      0.83       227
           6       0.86      0.85      0.86       487
           7       0.89      0.92      0.90       110
           8       0.79      0.86      0.82       163
           9       0.79      0.79      0.79       353
          10       0.64      0.58      0.61        50
          11       0.54      0.70      0.61       125
          12       0.81      0.83      0.82       284
          13       0.52      0.69      0.59        54
          14       0.30      0.07      0.12        80
          15       0.70      0.75      0.73       189
          16       

Training: 100%|██████████| 377/377 [16:09<00:00,  2.57s/it, loss=0.1442]


Average training loss: 0.1234


Evaluating: 100%|██████████| 50/50 [00:48<00:00,  1.04it/s]



Evaluation Results:
Accuracy: 0.8094

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.81      0.80       325
           1       0.54      0.54      0.54        24
           2       0.80      0.72      0.76       154
           3       0.77      0.86      0.81       167
           4       0.65      0.48      0.55        42
           5       0.79      0.78      0.78       227
           6       0.88      0.82      0.85       487
           7       0.91      0.95      0.93       110
           8       0.85      0.85      0.85       163
           9       0.78      0.85      0.82       353
          10       0.63      0.64      0.63        50
          11       0.65      0.66      0.65       125
          12       0.78      0.87      0.82       284
          13       0.49      0.59      0.54        54
          14       0.15      0.07      0.10        80
          15       0.69      0.71      0.70       189
          16       

Training: 100%|██████████| 377/377 [16:08<00:00,  2.57s/it, loss=0.1151]


Average training loss: 0.1117


Evaluating: 100%|██████████| 50/50 [00:48<00:00,  1.04it/s]



Evaluation Results:
Accuracy: 0.8112

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.81      0.80       325
           1       0.44      0.50      0.47        24
           2       0.76      0.72      0.74       154
           3       0.71      0.85      0.77       167
           4       0.60      0.36      0.45        42
           5       0.84      0.76      0.80       227
           6       0.86      0.86      0.86       487
           7       0.90      0.96      0.93       110
           8       0.83      0.83      0.83       163
           9       0.77      0.82      0.79       353
          10       0.61      0.66      0.63        50
          11       0.61      0.66      0.63       125
          12       0.79      0.84      0.81       284
          13       0.48      0.52      0.50        54
          14       0.18      0.07      0.11        80
          15       0.73      0.69      0.71       189
          16       

Training: 100%|██████████| 377/377 [16:08<00:00,  2.57s/it, loss=0.1445]


Average training loss: 0.1025


Evaluating: 100%|██████████| 50/50 [00:48<00:00,  1.04it/s]



Evaluation Results:
Accuracy: 0.8131

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.76      0.80       325
           1       0.50      0.54      0.52        24
           2       0.75      0.80      0.77       154
           3       0.75      0.85      0.80       167
           4       0.54      0.48      0.51        42
           5       0.82      0.83      0.83       227
           6       0.84      0.88      0.86       487
           7       0.88      0.96      0.92       110
           8       0.87      0.81      0.84       163
           9       0.80      0.79      0.80       353
          10       0.58      0.62      0.60        50
          11       0.62      0.67      0.64       125
          12       0.80      0.77      0.79       284
          13       0.53      0.61      0.57        54
          14       0.20      0.06      0.10        80
          15       0.69      0.74      0.71       189
          16       

Training: 100%|██████████| 377/377 [16:08<00:00,  2.57s/it, loss=0.1381]


Average training loss: 0.0938


Evaluating: 100%|██████████| 50/50 [00:48<00:00,  1.04it/s]



Evaluation Results:
Accuracy: 0.8109

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.80      0.79       325
           1       0.48      0.58      0.53        24
           2       0.79      0.71      0.75       154
           3       0.78      0.85      0.81       167
           4       0.50      0.48      0.49        42
           5       0.79      0.81      0.80       227
           6       0.84      0.90      0.87       487
           7       0.90      0.95      0.92       110
           8       0.85      0.81      0.83       163
           9       0.79      0.81      0.80       353
          10       0.62      0.70      0.66        50
          11       0.68      0.64      0.66       125
          12       0.82      0.75      0.79       284
          13       0.49      0.63      0.55        54
          14       0.17      0.06      0.09        80
          15       0.67      0.70      0.68       189
          16       

Training: 100%|██████████| 377/377 [16:08<00:00,  2.57s/it, loss=0.0385]


Average training loss: 0.0870


Evaluating: 100%|██████████| 50/50 [00:48<00:00,  1.04it/s]



Evaluation Results:
Accuracy: 0.8093

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.75      0.79       325
           1       0.50      0.62      0.56        24
           2       0.69      0.80      0.74       154
           3       0.78      0.80      0.79       167
           4       0.51      0.55      0.53        42
           5       0.79      0.80      0.79       227
           6       0.85      0.87      0.86       487
           7       0.91      0.96      0.94       110
           8       0.85      0.80      0.82       163
           9       0.82      0.80      0.81       353
          10       0.62      0.58      0.60        50
          11       0.61      0.70      0.65       125
          12       0.81      0.80      0.81       284
          13       0.55      0.59      0.57        54
          14       0.20      0.07      0.11        80
          15       0.72      0.70      0.71       189
          16       

Training: 100%|██████████| 377/377 [16:07<00:00,  2.57s/it, loss=0.0885]


Average training loss: 0.0796


Evaluating: 100%|██████████| 50/50 [00:48<00:00,  1.04it/s]



Evaluation Results:
Accuracy: 0.8076

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.77      0.79       325
           1       0.50      0.50      0.50        24
           2       0.67      0.81      0.73       154
           3       0.74      0.82      0.78       167
           4       0.55      0.43      0.48        42
           5       0.83      0.81      0.82       227
           6       0.87      0.86      0.86       487
           7       0.89      0.96      0.93       110
           8       0.83      0.83      0.83       163
           9       0.78      0.85      0.81       353
          10       0.57      0.66      0.61        50
          11       0.66      0.59      0.62       125
          12       0.78      0.81      0.80       284
          13       0.51      0.63      0.56        54
          14       0.14      0.05      0.07        80
          15       0.70      0.70      0.70       189
          16       

Training: 100%|██████████| 377/377 [16:08<00:00,  2.57s/it, loss=0.0361]


Average training loss: 0.0757


Evaluating: 100%|██████████| 50/50 [00:48<00:00,  1.04it/s]



Evaluation Results:
Accuracy: 0.8120

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.76      0.80       325
           1       0.54      0.62      0.58        24
           2       0.69      0.82      0.75       154
           3       0.75      0.82      0.79       167
           4       0.65      0.48      0.55        42
           5       0.85      0.74      0.80       227
           6       0.87      0.87      0.87       487
           7       0.90      0.95      0.93       110
           8       0.86      0.84      0.85       163
           9       0.78      0.83      0.80       353
          10       0.60      0.64      0.62        50
          11       0.66      0.62      0.64       125
          12       0.78      0.81      0.80       284
          13       0.54      0.46      0.50        54
          14       0.18      0.09      0.12        80
          15       0.69      0.70      0.70       189
          16       

Training: 100%|██████████| 377/377 [16:08<00:00,  2.57s/it, loss=0.0944]


Average training loss: 0.0768


Evaluating: 100%|██████████| 50/50 [00:48<00:00,  1.04it/s]


Evaluation Results:
Accuracy: 0.8142

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.73      0.79       325
           1       0.55      0.46      0.50        24
           2       0.70      0.81      0.75       154
           3       0.80      0.81      0.81       167
           4       0.50      0.60      0.54        42
           5       0.82      0.81      0.82       227
           6       0.85      0.87      0.86       487
           7       0.90      0.96      0.93       110
           8       0.85      0.87      0.86       163
           9       0.79      0.79      0.79       353
          10       0.61      0.68      0.64        50
          11       0.64      0.67      0.65       125
          12       0.78      0.85      0.81       284
          13       0.49      0.61      0.55        54
          14       0.15      0.07      0.10        80
          15       0.71      0.71      0.71       189
          16       




# Evaluating

In [8]:
# Use the GPU whenever PyTorch reports one as available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [9]:
# Location of the test split inside the Kaggle working directory.
TEST_DIALS_PATH = "/kaggle/working/test_dials.json"

# Turn the raw test dialogues into evaluation samples.
# Later cells unpack each sample as a (text, act) pair.
test_samples = load_and_process_data(TEST_DIALS_PATH)
print(f"\nTest samples: {len(test_samples)}")


Test samples: 6369


In [10]:
# Hub identifier of the tokenizer vocabulary.
# NOTE(review): assumes this is the same checkpoint the classifier was
# fine-tuned from -- confirm against the training section.
TOKENIZER_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# Wrap the test samples in the dataset class used by this notebook; the
# resulting object also exposes the `label_encoder` used by later cells.
test_dataset = DialogueActDataset(test_samples, tokenizer)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [11]:
# Load the fine-tuned sequence classifier from the attached Kaggle model
# directory and move it to the selected device.
model = BertForSequenceClassification.from_pretrained('/kaggle/input/dialogueactdetection/transformers/default/1').to(device)

# NOTE(review): the label encoder is taken from the *test* dataset, not from the
# training run. The class-index -> act-name mapping is therefore only correct if
# the test split yields the same set of classes the model was trained on --
# confirm, or persist the training-time encoder alongside the model.
label_encoder = test_dataset.label_encoder

# Cheap guard against a silently shifted label mapping: the classifier head and
# the encoder must at least agree on the number of classes.
assert len(label_encoder.classes_) == model.config.num_labels, (
    f"Label encoder has {len(label_encoder.classes_)} classes but the model "
    f"was trained with {model.config.num_labels} labels; predicted indices "
    "would be decoded to the wrong dialogue acts."
)

In [12]:
# Batch the test set (no shuffling is requested) and collect the gold and
# predicted labels for every sample.
test_dataloader = DataLoader(test_dataset, batch_size=512)
true_labels, predicted_labels = evaluate_model(model, test_dataloader, device)

# Per-class precision / recall / F1, with rows named after the encoder's classes.
# zero_division=0 reports the same 0.0 that the default ("warn") would for
# classes with no predicted or no true samples, but without emitting an
# UndefinedMetricWarning for each of them.
print("\nClassification Report:")
print(
    classification_report(
        true_labels,
        predicted_labels,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
)

Evaluating: 100%|██████████| 13/13 [00:42<00:00,  3.28s/it]


Classification Report:
                      precision    recall  f1-score   support

   Attraction-Inform       0.81      0.82      0.81       342
  Attraction-NoOffer       0.71      0.76      0.74        33
Attraction-Recommend       0.75      0.74      0.74       143
  Attraction-Request       0.81      0.84      0.82       159
   Attraction-Select       0.65      0.66      0.65        50
        Booking-Book       0.82      0.86      0.84       216
      Booking-Inform       0.86      0.88      0.87       482
      Booking-NoBook       0.96      0.95      0.96       129
     Booking-Request       0.84      0.90      0.87       175
        Hotel-Inform       0.81      0.81      0.81       338
       Hotel-NoOffer       0.74      0.84      0.79        44
     Hotel-Recommend       0.78      0.63      0.70       138
       Hotel-Request       0.79      0.87      0.83       271
        Hotel-Select       0.67      0.77      0.72        66
       No Annotation       0.48      0.16    


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
from sklearn.metrics import accuracy_score, f1_score

# Headline numbers for the whole test set: plain accuracy, and F1 averaged over
# classes with each class weighted by its support.
accuracy, f1 = (
    accuracy_score(true_labels, predicted_labels),
    f1_score(true_labels, predicted_labels, average='weighted'),
)

print("\nAccuracy:", accuracy)
print("F1 Score (Weighted):", f1)


Accuracy: 0.833254828073481
F1 Score (Weighted): 0.8237818733166952


In [15]:
import random

# Spot-check settings. A fixed seed keeps the sampled utterances (and therefore
# the printed examples) identical across Restart & Run All.
SAMPLE_SEED = 42
NUM_SPOT_CHECKS = 10

# Use a private RNG so seeding here does not disturb the global `random` state.
spot_check_rng = random.Random(SAMPLE_SEED)

# Never ask for more samples than exist; `sample` raises ValueError otherwise.
random_samples = spot_check_rng.sample(
    test_samples, min(NUM_SPOT_CHECKS, len(test_samples))
)

# Each sample is a (text, act) pair; split them into parallel lists.
random_texts = [x for x, _ in random_samples]
random_real_acts = [y for _, y in random_samples]

# Predict an act for each sampled utterance with the loaded model.
predicted_acts = predict_new_texts(random_texts, model, tokenizer, label_encoder, device)

In [16]:
# Print every sampled utterance next to its gold annotation and the model's
# prediction. In the captured output the gold act is the full annotation dict,
# while the prediction is a single act label.
triples = zip(random_texts, random_real_acts, predicted_acts)
for i, (text, real_act, pred_act) in enumerate(triples):
    print(
        f"\nUtterance {i + 1}: {text}",
        f"Real act: {real_act}",
        f"Predicted act: {pred_act}",
        sep="\n",
    )


Utterance 1: i am so sorry about any confusion . i have asian oriental food at yippee noodle bar , and british food such as the oak bistro .
Real act: {'Restaurant-Inform': [['Name', 'Yippee Noodle Bar'], ['Name', ' The Oak Bistro'], ['Food', 'asian oriental'], ['Food', ' British']]}
Predicted act: Restaurant-Inform

Utterance 2: here is your booking information:booking was successful . the table will be reserved for 15 minutes . reference number is : b162fv8d .
Real act: {'Booking-Book': [['Ref', 'B162FV8D']]}
Predicted act: Booking-Book

Utterance 3: we have 2 option for you . we have the ashley hotel and the lovell hotel . both have your commendations you want .
Real act: {'Hotel-Inform': [['Name', 'ashley hotel'], ['Name', ' the lovell hotel'], ['Choice', 'two'], ['Choice', ' Both']]}
Predicted act: Hotel-Inform

Utterance 4: there are 7 museums , i would recommend cambridge book and print gallery .
Real act: {'Attraction-Recommend': [['Name', 'cambridge book and print gallery']],