In [5]:
from datasets import load_dataset

# Load the Train, Validation and Test data

dataset = load_dataset("multi_woz_v22")
train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

Downloading and preparing dataset multi_woz_v22/v2.2_active_only (download: 263.78 MiB, generated: 49.33 MiB, post-processed: Unknown size, total: 313.11 MiB) to /root/.cache/huggingface/datasets/multi_woz_v22/v2.2_active_only/2.2.0/7452f16a8b502e97df5c04cc4ee5436464762fa93b1ce778dd14181e79d8b51a...
                

Downloading data files #3:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #1:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #2:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #15:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #6:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #0:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #5:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #9:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #8:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #10:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #13:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #4:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #7:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #12:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #14:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #11:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data:   0%|          | 0.00/439k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/457k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/439k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/22 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/8437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset multi_woz_v22 downloaded and prepared to /root/.cache/huggingface/datasets/multi_woz_v22/v2.2_active_only/2.2.0/7452f16a8b502e97df5c04cc4ee5436464762fa93b1ce778dd14181e79d8b51a. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
def filterDomains(data):
    """
    Filters a list of dictionaries by only including entries with services
    either "restaurant" or "hotel" and having only one service.

    Parameters:
    - data: list of dictionaries containing a "services" key, which is a list of services.

    Returns:
    - List of filtered dictionaries.
    """
    return [entry for entry in data if set(entry["services"]).issubset({"restaurant", "hotel"})]

# Only keep dialogues related to Restaurants or Hotels.

train_data_filtered = filterDomains(train_data)
val_data_filtered = filterDomains(val_data)
test_data_filtered = filterDomains(test_data)

In [None]:
print(train_data_filtered[0])

# Identifying the slots

In [39]:
def label_utterances(dialogue):
    labeled_data = []
    data = dialogue['turns']
    
    # Loop through each turn in the dialogue
    for i, turn_id in enumerate(data['turn_id']):
        utterance = data['utterance'][i]
        tokens = utterance.split()  
        labels = ['O'] * len(tokens)  # Initialize labels as 'O' (Outside)

        # Check if there are slot values for this turn
        if 'dialogue_acts' in data and i < len(data['dialogue_acts']):
            dialogue_act = data['dialogue_acts'][i]
            span_info = dialogue_act.get('span_info', {})
            for act_slot_name, act_slot_value, span_start, span_end in zip(
                    span_info.get('act_slot_name', []),
                    span_info.get('act_slot_value', []),
                    span_info.get('span_start', []),
                    span_info.get('span_end', [])):
                
                # Find the tokens that correspond to the start and end indices
                start_token_idx = len(utterance[:span_start].split())
                end_token_idx = len(utterance[:span_end].split()) - 1

                if start_token_idx < len(tokens) and end_token_idx < len(tokens):
                    # Label tokens using IOB format with the actual ground truth slot value
                    labels[start_token_idx] = f"B-{act_slot_name}"
                    for j in range(start_token_idx + 1, end_token_idx + 1):
                        labels[j] = f"I-{act_slot_name}"
                else:
                    print(f"Warning: Index out of range for utterance '{utterance}' with span {span_start}-{span_end}")

        # Store the tokenized utterance along with its labels
        labeled_data.append((tokens, labels))
        
    return labeled_data


In [40]:
import pandas as pd
import numpy as np

def toDF(data):
    all_labeled_data = []
    for dialogue in data:
        all_labeled_data.extend(label_utterances(dialogue))
    return pd.DataFrame(all_labeled_data, columns=['Tokens', 'Labels'])
    
# Create DataFrames of labeled utterances
train_df = toDF(train_data_filtered)
test_df = toDF(test_data_filtered)
val_df = toDF(val_data_filtered)



In [41]:
print(train_df.shape)
print(train_df["Tokens"].iloc[12])
print(train_df["Labels"].iloc[12])

(28928, 2)
['Guten', 'Tag,', 'I', 'am', 'staying', 'overnight', 'in', 'Cambridge', 'and', 'need', 'a', 'place', 'to', 'sleep.', 'I', 'need', 'free', 'parking', 'and', 'internet.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [42]:
import torch
import torch.nn as nn
from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset

# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [43]:

all_labels = [label for sublist in train_df['Labels'].tolist() for label in sublist]
all_labels += [label for sublist in val_df['Labels'].tolist() for label in sublist]
all_labels += [label for sublist in test_df['Labels'].tolist() for label in sublist]
unique_labels = sorted(set(all_labels))

unique_labels.__sizeof__()

392

In [44]:
label_map = {label: i for i, label in enumerate(unique_labels)}

In [45]:
# Tokenize the  data

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_data(tokenizer, text_sequences, label_sequences, label_map):
    tokenized_texts, tokenized_labels = [], []
    
    for tokens, labels in zip(text_sequences, label_sequences):
        tokenized_text, tokenized_label = [], []
        
        for word, label in zip(tokens, labels):
            # Tokenize each word and its corresponding label
            word_tokens = tokenizer.tokenize(word)
            tokenized_text.extend(word_tokens)
            tokenized_label.extend([label_map[label]] * len(word_tokens))  # Convert label to integer
        
        tokenized_texts.append(tokenizer.convert_tokens_to_ids(tokenized_text))
        tokenized_labels.append(tokenized_label)
    
    return tokenized_texts, tokenized_labels

# Tokenize the data
train_texts, train_labels = tokenize_data(tokenizer, train_df['Tokens'].tolist(), train_df['Labels'].tolist(), label_map)
val_texts, val_labels = tokenize_data(tokenizer, val_df['Tokens'].tolist(), val_df['Labels'].tolist(), label_map)
test_texts, test_labels = tokenize_data(tokenizer, test_df['Tokens'].tolist(), test_df['Labels'].tolist(), label_map)


In [46]:
from torch.nn.utils.rnn import pad_sequence

def createDataLoader(tokenized_texts, tokenized_labels ):
    # Convert lists to PyTorch tensors and pad
    tokenized_texts = pad_sequence([torch.tensor(seq, dtype=torch.long) for seq in tokenized_texts], padding_value=0, batch_first=True)
    tokenized_labels = pad_sequence([torch.tensor(seq, dtype=torch.long) for seq in tokenized_labels], padding_value=-100, batch_first=True)
    
    # Create attention masks
    attention_masks = (tokenized_texts != 0).long()
    
    # Create a TensorDataset
    train_dataset = TensorDataset(tokenized_texts, attention_masks, tokenized_labels)
    
    # Create DataLoader    
    return DataLoader(train_dataset, batch_size=16, shuffle=True)
    

In [47]:
train_dataloader = createDataLoader(train_texts, train_labels)
val_dataloader = createDataLoader(val_texts, val_labels)
test_dataloader = createDataLoader(test_texts, test_labels)

In [48]:
!pip install pytorch-crf



In [49]:
from transformers import BertForTokenClassification, BertConfig

# Define the number of labels
num_labels = len(label_map)  # Make sure label_map is defined in your environment

# Create a configuration object with `num_labels` set
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Create the model with the standard token classification head
model = BertForTokenClassification(config).to(device)


In [50]:
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)
criterion = nn.CrossEntropyLoss(ignore_index=-100)
epochs = 30
patience = 2

best_val_loss = float('inf')
patience_counter = 0

for epoch in range(epochs):
    model.train()
    train_loss = 0
    
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    avg_train_loss = train_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{epochs} | Train Loss: {avg_train_loss}')

    # Validation phase
    model.eval()
    val_loss = 0
    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            val_loss += loss.item()
    
    avg_val_loss = val_loss / len(val_dataloader)
    print(f'Epoch {epoch + 1}/{epochs} | Validation Loss: {avg_val_loss}')

    # Check if the validation loss is lower than the best one seen so far
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), f'checkpoint_epoch_{epoch+1}.pt')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print('Early stopping!')
            break
        
torch.save(model.state_dict(), 'final_model.pt')
print('Training complete. Final model saved.')

Epoch 1/30 | Train Loss: 0.3078518355001522
Epoch 1/30 | Validation Loss: 0.20128920966616043
Epoch 2/30 | Train Loss: 0.15346546305561917
Epoch 2/30 | Validation Loss: 0.14335279355828578
Epoch 3/30 | Train Loss: 0.11943981812935141
Epoch 3/30 | Validation Loss: 0.15083306312847597
Epoch 4/30 | Train Loss: 0.10267668989637921
Epoch 4/30 | Validation Loss: 0.1330796833221729
Epoch 5/30 | Train Loss: 0.09109290149444055
Epoch 5/30 | Validation Loss: 0.11846111788629339
Epoch 6/30 | Train Loss: 0.08189989837075572
Epoch 6/30 | Validation Loss: 0.11852552244583002
Epoch 7/30 | Train Loss: 0.07310678646320258
Epoch 7/30 | Validation Loss: 0.11713243020125307
Epoch 8/30 | Train Loss: 0.06580413280124982
Epoch 8/30 | Validation Loss: 0.1258521921789417
Epoch 9/30 | Train Loss: 0.05879561370985574
Epoch 9/30 | Validation Loss: 0.12026349045336246
Early stopping!
Training complete. Final model saved.


In [51]:
!pip install seqeval
from seqeval.metrics import classification_report as seqeval_classification_report
import numpy as np
import torch

# Reverse the label map to translate from numeric to string labels
label_map_reverse = {v: k for k, v in label_map.items()}

model.eval()
total_loss = 0
all_predictions = []
all_true_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_attention_masks, b_labels = batch

        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_masks, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Move logits and labels to CPU
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Convert logits to token predictions
        predictions = np.argmax(logits, axis=-1)

        # For each item in the batch...
        for i in range(b_input_ids.size(0)):
            # ...collect those tokens that are not padding tokens
            true_label_sequence = [label_map_reverse[label_id] for j, label_id in enumerate(label_ids[i]) if b_attention_masks[i][j] != 0]
            pred_label_sequence = [label_map_reverse[pred_id] for j, pred_id in enumerate(predictions[i]) if b_attention_masks[i][j] != 0]

            # ...extend the true labels and predicted labels lists
            all_true_labels.append(true_label_sequence)
            all_predictions.append(pred_label_sequence)

# Calculate average loss over all the batches
avg_loss = total_loss / len(test_dataloader)
print(f"Test loss: {avg_loss}")

# Use seqeval to compute a classification report
seqeval_report = seqeval_classification_report(all_true_labels, all_predictions)
print(seqeval_report)


Test loss: 0.11181684477386349
              precision    recall  f1-score   support

     address       0.66      0.87      0.75       113
        area       0.76      0.80      0.78       433
     bookday       0.84      0.97      0.90       313
  bookpeople       0.87      0.89      0.88       186
    bookstay       0.79      0.86      0.82       125
    booktime       0.95      0.97      0.96       372
      choice       0.88      0.89      0.89       226
        food       0.91      0.86      0.88       297
        name       0.66      0.76      0.70       644
       phone       0.96      0.96      0.96       365
    postcode       0.97      0.96      0.97       192
  pricerange       0.90      0.93      0.91       370
         ref       0.99      0.98      0.98       863
       stars       0.94      0.97      0.95       185
        type       0.73      0.88      0.80       358

   micro avg       0.85      0.90      0.88      5042
   macro avg       0.85      0.90      0.88      

In [52]:
def query_model(model, tokenizer, label_map, sentence, device):
    model.eval()  
    
    # Tokenize the new utterance using the tokenize_data function
    tokenized_texts, _ = tokenize_data(tokenizer, [sentence.split()], [['O']*len(sentence.split())], label_map) # Add dummy labels.
    
    # Convert to PyTorch tensors
    input_ids = torch.tensor(tokenized_texts).to(device)
    attention_mask = (input_ids != tokenizer.pad_token_id).long().to(device)

    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
        logits = outputs.logits

    predictions = np.argmax(logits.detach().cpu().numpy(), axis=2)
    predicted_labels = [list(label_map.keys())[list(label_map.values()).index(p)] for p in predictions[0] if p != -100]

    return predicted_labels

new_utterance = "I would like to book a table for two at a Mexican restaurant."
predicted_labels = query_model(model, tokenizer, label_map, new_utterance, device)
print(predicted_labels)

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-bookpeople', 'O', 'O', 'B-food', 'O', 'O']


# Mapping slots to values

In [57]:
def label_slots(dialogue):
    labeled_data = []
    data = dialogue['turns']
    
    # Loop through each turn in the dialogue
    for i, turn_id in enumerate(data['turn_id']):
        utterance = data['utterance'][i]
        tokens = utterance.split()  

        # Check if there are slot values for this turn
        if 'dialogue_acts' in data and i < len(data['dialogue_acts']):
            dialogue_act = data['dialogue_acts'][i]
            span_info = dialogue_act.get('span_info', {})
            for act_slot_name, act_slot_value, span_start, span_end in zip(
                    span_info.get('act_slot_name', []),
                    span_info.get('act_slot_value', []),
                    span_info.get('span_start', []),
                    span_info.get('span_end', [])):
                
                # Find the tokens that correspond to the start and end indices
                start_token_idx = len(utterance[:span_start].split())
                end_token_idx = len(utterance[:span_end].split()) - 1

                if start_token_idx < len(tokens) and end_token_idx < len(tokens):
                    # Label tokens using IOB format with the actual ground truth slot value
                    slot = f"{utterance[span_start:span_end]}"
                    value = act_slot_value
                    
                    labeled_data.append((slot, value))
                else:
                    print(f"Warning: Index out of range for utterance '{utterance}' with span {span_start}-{span_end}")
                

        # Store the tokenized utterance along with its labels
        
        
    return labeled_data

In [58]:
import pandas as pd
def slotsToDF(data):
    all_labeled_data = []
    for dialogue in data:
        all_labeled_data.extend(label_slots(dialogue))
    return pd.DataFrame(all_labeled_data, columns=['Slots', 'Values'])
    
# Create DataFrames of labeled utterances
train_df = slotsToDF(train_data_filtered)
test_df = slotsToDF(test_data_filtered)
val_df = slotsToDF(val_data_filtered)



In [67]:
mismatched_rows = train_df[train_df['Slots'] != train_df['Values']]

# Display the filtered rows
display(mismatched_rows.to_string())

"                                     Slots                               Values\n0                                   center                               centre\n25                              don't care                             dontcare\n26                              don't care                             dontcare\n33                               Cityroomz                            cityroomz\n53                                 Chinese                              chinese\n57                                Saturday                             saturday\n73                                 Italian                              italian\n81                                 Chinese                              chinese\n83                                    East                                 east\n86                                  Monday                               monday\n95                                  Bistro                               bistro\n110                        