In [13]:
from datasets import load_dataset
import torch
import torch.nn as nn
from transformers import BertTokenizerFast
from torch.utils.data import DataLoader, TensorDataset

# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the Train, Validation and Test data

dataset = load_dataset("multi_woz_v22")
train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

  0%|          | 0/3 [00:00<?, ?it/s]

In [14]:
def filterDomains(data):
    """
    Filters a list of dictionaries by only including entries with services
    either "restaurant" or "hotel" and having only one service.

    Parameters:
    - data: list of dictionaries containing a "services" key, which is a list of services.

    Returns:
    - List of filtered dictionaries.
    """
    return [entry for entry in data if set(entry["services"]).issubset({"restaurant", "hotel"})]

# Only keep dialogues related to Restaurants or Hotels.

train_data_filtered = filterDomains(train_data)
val_data_filtered = filterDomains(val_data)
test_data_filtered = filterDomains(test_data)

In [15]:
print(train_data_filtered[0])

{'dialogue_id': 'PMUL4398.json', 'services': ['restaurant', 'hotel'], 'turns': {'turn_id': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], 'speaker': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1], 'utterance': ['i need a place to dine in the center thats expensive', 'I have several options for you; do you prefer African, Asian, or British food?', 'Any sort of food would be fine, as long as it is a bit expensive. Could I get the phone number for your recommendation?', 'There is an Afrian place named Bedouin in the centre. How does that sound?', 'Sounds good, could I get that phone number? Also, could you recommend me an expensive hotel?', "Bedouin's phone is 01223367660. As far as hotels go, I recommend the University Arms Hotel in the center of town.", 'Yes. Can you book it for me?', 'Sure, when would you like that reservation?', 'i want to book it for 2 people and 2 nights starting from saturday.', 'Your booking was successful. Your reference number is FRGZWQL2 . May I help you

# Identifying the slots

In [16]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


def label_utterances(dialogue):
    labeled_data = []
    data = dialogue['turns']
    
    # Loop through each turn in the dialogue
    for i, turn_id in enumerate(data['turn_id']):
        utterance = data['utterance'][i]
        # Tokenize the utterance and get the offset mappings
        encoded_input = tokenizer(utterance, add_special_tokens=False, return_offsets_mapping=True)
        tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'])
        labels = ['O'] * len(tokens)  # Initialize labels as 'O' (Outside)
        offset_mapping = encoded_input['offset_mapping']
        # Check if there are slot values for this turn
        if 'dialogue_acts' in data and i < len(data['dialogue_acts']):
            dialogue_act = data['dialogue_acts'][i]
            span_info = dialogue_act.get('span_info', {})
            for act_slot_name, act_slot_value, span_start, span_end in zip(
                    span_info.get('act_slot_name', []),
                    span_info.get('act_slot_value', []),
                    span_info.get('span_start', []),
                    span_info.get('span_end', [])):
                
                # Find the tokens that correspond to the start and end indices
                # start_token_idx = next((idx for idx, offset in enumerate(offset_mapping) if offset[0] == span_start), None)
                # end_token_idx = next((idx for idx, offset in enumerate(offset_mapping) if offset[1] == span_end), None)
                
                # Utilize the offset_mapping to find the token index for the start and end of the span
                start_token_idx = None
                end_token_idx = None
                
                for idx, offset in enumerate(offset_mapping):
                    if start_token_idx is None and offset[0] == span_start:
                        start_token_idx = idx
                    if offset[1] == span_end:
                        end_token_idx = idx
                        break
                
                if start_token_idx is not None and end_token_idx is not None:
                    if start_token_idx < len(tokens) and end_token_idx < len(tokens):
                        # Label tokens using IOB format with the actual ground truth slot value
                        labels[start_token_idx] = f"B-{act_slot_name}"
                        for j in range(start_token_idx + 1, end_token_idx + 1):
                            labels[j] = f"I-{act_slot_name}"
                    else:
                        print(f"Warning: Index out of range for utterance '{utterance}' with span {span_start}-{span_end}")

        # Store the tokenized utterance along with its labels
        labeled_data.append((tokens, labels))
        
    return labeled_data


In [17]:
import pandas as pd
import numpy as np

def toDF(data):
    all_labeled_data = []
    for dialogue in data:
        all_labeled_data.extend(label_utterances(dialogue))
    return pd.DataFrame(all_labeled_data, columns=['Tokens', 'Labels'])
    
# Create DataFrames of labeled utterances
train_df = toDF(train_data_filtered)
test_df = toDF(test_data_filtered)
val_df = toDF(val_data_filtered)

In [18]:
print(train_df.shape)
print(train_df["Tokens"].iloc[9])
print(train_df["Labels"].iloc[9])

(28928, 2)
['your', 'booking', 'was', 'successful', '.', 'your', 'reference', 'number', 'is', 'fr', '##g', '##z', '##w', '##q', '##l', '##2', '.', 'may', 'i', 'help', 'you', 'further', '?']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ref', 'I-ref', 'I-ref', 'I-ref', 'I-ref', 'I-ref', 'I-ref', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [19]:

all_labels = [label for sublist in train_df['Labels'].tolist() for label in sublist]
all_labels += [label for sublist in val_df['Labels'].tolist() for label in sublist]
all_labels += [label for sublist in test_df['Labels'].tolist() for label in sublist]
unique_labels = sorted(set(all_labels))

unique_labels.__sizeof__()

print(unique_labels)

['B-address', 'B-area', 'B-arriveby', 'B-bookday', 'B-bookpeople', 'B-bookstay', 'B-booktime', 'B-choice', 'B-day', 'B-department', 'B-departure', 'B-destination', 'B-entrancefee', 'B-food', 'B-leaveat', 'B-name', 'B-openhours', 'B-phone', 'B-postcode', 'B-price', 'B-pricerange', 'B-ref', 'B-stars', 'B-type', 'I-address', 'I-area', 'I-arriveby', 'I-bookday', 'I-bookpeople', 'I-bookstay', 'I-booktime', 'I-choice', 'I-department', 'I-departure', 'I-destination', 'I-entrancefee', 'I-food', 'I-leaveat', 'I-name', 'I-openhours', 'I-phone', 'I-postcode', 'I-price', 'I-pricerange', 'I-ref', 'I-stars', 'I-type', 'O']


In [20]:
label_map = {label: i for i, label in enumerate(unique_labels)}

In [42]:
def create_dataset(df, tokenizer, label_map):
    # Lists to store the tokenized inputs and labels
    input_ids = []
    attention_masks = []
    label_ids = []

    # Iterate over the DataFrame rows
    for _, row in df.iterrows():
        tokens = row['Tokens']
        labels = row['Labels']
        encoded_input = tokenizer(
            tokens,
            is_split_into_words=True,
            add_special_tokens=True,
            return_attention_mask=True,
            padding='max_length',
            truncation=True,
            max_length=256  
        )
        
        # Convert the IOB labels to their corresponding IDs
        iob_label_ids = [label_map.get(label, label_map['O']) for label in labels]

        padding_length = len(encoded_input['input_ids']) - len(iob_label_ids)
        iob_label_ids.extend([-100] * padding_length)

        # Append the results to the lists
        input_ids.append(encoded_input['input_ids'])
        attention_masks.append(encoded_input['attention_mask'])
        label_ids.append(iob_label_ids)

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids, dtype=torch.long)
    attention_masks = torch.tensor(attention_masks, dtype=torch.long)
    label_ids = torch.tensor(label_ids, dtype=torch.long)

    # Create the TensorDataset
    dataset = TensorDataset(input_ids, attention_masks, label_ids)

    return dataset


In [43]:
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [44]:
from transformers import BertForTokenClassification, BertConfig

# Define the number of labels
num_labels = len(label_map)  # Make sure label_map is defined in your environment

# Create a configuration object with `num_labels` set
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Create the model with the standard token classification head
model = BertForTokenClassification(config).to(device)


In [None]:
from tqdm.auto import tqdm

optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)
criterion = nn.CrossEntropyLoss(ignore_index=-100)
epochs = 30
patience = 0

best_val_loss = float('inf')
patience_counter = 0

for epoch in range(epochs):
    model.train()
    train_loss = 0
    train_progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{epochs} Training', leave=False)
    
    for batch in train_progress_bar:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_progress_bar.set_postfix(train_loss=loss.item())
    
    avg_train_loss = train_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{epochs} | Train Loss: {avg_train_loss}')

    # Validation phase
    model.eval()
    val_loss = 0
    val_progress_bar = tqdm(val_dataloader, desc=f'Epoch {epoch+1}/{epochs} Validation', leave=False)
    for batch in val_progress_bar:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            val_loss += loss.item()
            val_progress_bar.set_postfix(val_loss=loss.item())
    
    avg_val_loss = val_loss / len(val_dataloader)
    print(f'Epoch {epoch + 1}/{epochs} | Validation Loss: {avg_val_loss}')

    # Check if the validation loss is lower than the best one seen so far
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), f'checkpoint_epoch_{epoch+1}.pt')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print('Early stopping!')
            break
        
torch.save(model.state_dict(), 'final_model.pt')
print('Training complete. Final model saved.')

Epoch 1/30 Training:   0%|          | 0/1808 [00:00<?, ?it/s]

In [41]:
from seqeval.metrics import classification_report as seqeval_classification_report
import numpy as np
import torch

# Reverse the label map to translate from numeric to string labels
label_map_reverse = {v: k for k, v in label_map.items()}

model.eval()
total_loss = 0
all_predictions = []
all_true_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_attention_masks, b_labels = batch

        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_masks, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Move logits and labels to CPU
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Convert logits to token predictions
        predictions = np.argmax(logits, axis=-1)
        
        print(b_attention_masks)

        # For each item in the batch...
        for i in range(b_input_ids.size(0)):
            pred_label_sequence = [label_map_reverse[pred_id] for j, pred_id in enumerate(predictions[i]) if b_attention_masks[i][j] != 0 and pred_id != -100]

            true_label_sequence = [label_map_reverse[label_id] for j, label_id in enumerate(label_ids[i]) if b_attention_masks[i][j] != 0 and label_id != -100]

            # Ensure the true and predicted sequences have the same length
            if len(true_label_sequence) != len(pred_label_sequence):
                print(f"Length mismatch in sequence {i}: true labels {len(true_label_sequence)} vs. predicted labels {len(pred_label_sequence)}")
                # Output the actual sequences to help diagnose the issue
                print("True labels:", true_label_sequence)
                print("Pred labels:", pred_label_sequence)
                # continue  # Skip appending these labels to avoid the ValueError in seqeval

            # ...extend the true labels and predicted labels lists
            all_true_labels.append(true_label_sequence)
            all_predictions.append(pred_label_sequence)

# Calculate average loss over all the batches
avg_loss = total_loss / len(test_dataloader)
print(f"Test loss: {avg_loss}")

# Use seqeval to compute a classification report
seqeval_report = seqeval_classification_report(all_true_labels, all_predictions)
print(seqeval_report)


tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
Length mismatch in sequence 0: true labels 9 vs. predicted labels 11
True labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Length mismatch in sequence 1: true labels 31 vs. predicted labels 46
True labels: ['O', 'B-name', 'I-name', 'I-name', 'I-name', 'I-name', 'I-name', 'O', 'O', 'B-food', 'I-food', 'O', 'O', 'O', 'O', 'B-area', 'I-area', 'I-area', 'O', 'O', 'B-pricerange', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred labels: ['B-name', 'I-name', 'I-name', 'I-name', 'I-name', 'I-name', 'O', 'O', 'B-food', 'I-food', 'I-food', 'I-food', 'I-food', 'O', 'O', 'O', 'O', 'O', 'I-area', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',

KeyboardInterrupt: 

In [None]:
def query_model(model, tokenizer, label_map, utterance, device):
    model.eval()  
    
    # Tokenize the new utterance using the tokenize_data function
    tokenized_text = tokenizer(utterance, add_special_tokens=False, return_offsets_mapping=True)
    
    encoded_input = tokenizer(
    tokenized_text,
    is_split_into_words=True,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=256  
    )
    
    input_ids = encoded_input['input_ids']
    attention_masks = encoded_input['attention_mask']
    
    # Convert to PyTorch tensors
    input_ids = torch.tensor(input_ids, dtype=torch.long)
    attention_masks = torch.tensor(attention_masks, dtype=torch.long)

    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
        logits = outputs.logits

    predictions = np.argmax(logits.detach().cpu().numpy(), axis=2)
    predicted_labels = [list(label_map.keys())[list(label_map.values()).index(p)] for p in predictions[0] if p != -100]

    return predicted_labels

new_utterance = "I would like to book a table for two at a Mexican restaurant."
predicted_labels = query_model(model, tokenizer, label_map, new_utterance, device)
print(predicted_labels)