In [1]:
from datasets import load_dataset

# Load the Train, Validation and Test data

dataset = load_dataset("multi_woz_v22")
train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

No config specified, defaulting to: multi_woz_v22/v2.2_active_only
Found cached dataset multi_woz_v22 (C:/Users/camil/.cache/huggingface/datasets/multi_woz_v22/v2.2_active_only/2.2.0/6719c8b21478299411a0c6fdb7137c3ebab2e6425129af831687fb7851c69eb5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
def filterDomains(data):
    """
    Filters a list of dictionaries by only including entries with services
    either "restaurant" or "hotel" and having only one service.

    Parameters:
    - data: list of dictionaries containing a "services" key, which is a list of services.

    Returns:
    - List of filtered dictionaries.
    """
    return [entry for entry in data if set(entry["services"]).issubset({"restaurant", "hotel"})]

# Only keep dialogues related to Restaurants or Hotels.

train_data_filtered = filterDomains(train_data)
val_data_filtered = filterDomains(val_data)
test_data_filtered = filterDomains(test_data)

In [10]:
print(train_data_filtered[0])

{'dialogue_id': 'PMUL4398.json', 'services': ['restaurant', 'hotel'], 'turns': {'turn_id': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], 'speaker': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1], 'utterance': ['i need a place to dine in the center thats expensive', 'I have several options for you; do you prefer African, Asian, or British food?', 'Any sort of food would be fine, as long as it is a bit expensive. Could I get the phone number for your recommendation?', 'There is an Afrian place named Bedouin in the centre. How does that sound?', 'Sounds good, could I get that phone number? Also, could you recommend me an expensive hotel?', "Bedouin's phone is 01223367660. As far as hotels go, I recommend the University Arms Hotel in the center of town.", 'Yes. Can you book it for me?', 'Sure, when would you like that reservation?', 'i want to book it for 2 people and 2 nights starting from saturday.', 'Your booking was successful. Your reference number is FRGZWQL2 . May I help you

In [11]:
def label_utterances(dialogue):
    labeled_data = []
    data = dialogue['turns']
    
    # Loop through each turn in the dialogue
    for i, turn_id in enumerate(data['turn_id']):
        utterance = data['utterance'][i]
        tokens = utterance.split()  
        labels = ['O'] * len(tokens)  # Initialize labels as 'O' (Outside)

        # Check if there are slot values for this turn
        if 'dialogue_acts' in data and i < len(data['dialogue_acts']):
            dialogue_act = data['dialogue_acts'][i]
            span_info = dialogue_act.get('span_info', {})
            for act_slot_name, act_slot_value, span_start, span_end in zip(
                    span_info.get('act_slot_name', []),
                    span_info.get('act_slot_value', []),
                    span_info.get('span_start', []),
                    span_info.get('span_end', [])):
                
                # Find the tokens that correspond to the start and end indices
                start_token_idx = len(utterance[:span_start].split())
                end_token_idx = len(utterance[:span_end].split()) - 1

                if start_token_idx < len(tokens) and end_token_idx < len(tokens):
                    # Label tokens using IOB format with the actual ground truth slot value
                    labels[start_token_idx] = f"B-{act_slot_name}:{act_slot_value}"
                    for j in range(start_token_idx + 1, end_token_idx + 1):
                        labels[j] = f"I-{act_slot_name}:{act_slot_value}"
                else:
                    print(f"Warning: Index out of range for utterance '{utterance}' with span {span_start}-{span_end}")

        # Store the tokenized utterance along with its labels
        labeled_data.append((tokens, labels))
        
    return labeled_data


In [12]:
import pandas as pd
import numpy as np

def toDF(data):
    all_labeled_data = []
    for dialogue in data:
        all_labeled_data.extend(label_utterances(dialogue))
    return pd.DataFrame(all_labeled_data, columns=['Tokens', 'Labels'])
    
# Create DataFrames of labeled utterances
train_df = toDF(train_data_filtered)
test_df = toDF(test_data_filtered)
val_df = toDF(val_data_filtered)



In [13]:
print(train_df.shape)
print(train_df["Tokens"].iloc[12])
print(train_df["Labels"].iloc[12])

(28928, 2)
['Guten', 'Tag,', 'I', 'am', 'staying', 'overnight', 'in', 'Cambridge', 'and', 'need', 'a', 'place', 'to', 'sleep.', 'I', 'need', 'free', 'parking', 'and', 'internet.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
import torch
import torch.nn as nn
from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset

# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:

all_labels = [label for sublist in train_df['Labels'].tolist() for label in sublist]
all_labels += [label for sublist in val_df['Labels'].tolist() for label in sublist]
all_labels += [label for sublist in test_df['Labels'].tolist() for label in sublist]
unique_labels = sorted(set(all_labels))

unique_labels.__sizeof__()

524488

In [8]:
label_map = {label: i for i, label in enumerate(unique_labels)}

In [9]:
# Tokenize the  data

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_data(tokenizer, text_sequences, label_sequences, label_map):
    tokenized_texts, tokenized_labels = [], []
    
    for tokens, labels in zip(text_sequences, label_sequences):
        tokenized_text, tokenized_label = [], []
        
        for word, label in zip(tokens, labels):
            # Tokenize each word and its corresponding label
            word_tokens = tokenizer.tokenize(word)
            tokenized_text.extend(word_tokens)
            tokenized_label.extend([label_map[label]] * len(word_tokens))  # Convert label to integer
        
        tokenized_texts.append(tokenizer.convert_tokens_to_ids(tokenized_text))
        tokenized_labels.append(tokenized_label)
    
    return tokenized_texts, tokenized_labels

# Tokenize the data
train_texts, train_labels = tokenize_data(tokenizer, train_df['Tokens'].tolist(), train_df['Labels'].tolist(), label_map)
val_texts, val_labels = tokenize_data(tokenizer, val_df['Tokens'].tolist(), val_df['Labels'].tolist(), label_map)
test_texts, test_labels = tokenize_data(tokenizer, test_df['Tokens'].tolist(), test_df['Labels'].tolist(), label_map)


In [10]:
from torch.nn.utils.rnn import pad_sequence

def createDataLoader(tokenized_texts, tokenized_labels ):
    # Convert lists to PyTorch tensors and pad
    tokenized_texts = pad_sequence([torch.tensor(seq, dtype=torch.long) for seq in tokenized_texts], padding_value=0, batch_first=True)
    tokenized_labels = pad_sequence([torch.tensor(seq, dtype=torch.long) for seq in tokenized_labels], padding_value=-100, batch_first=True)
    
    # Create attention masks
    attention_masks = (tokenized_texts != 0).long()
    
    # Create a TensorDataset
    train_dataset = TensorDataset(tokenized_texts, attention_masks, tokenized_labels)
    
    # Create DataLoader    
    return DataLoader(train_dataset, batch_size=32, shuffle=True)
    

In [11]:

train_dataloader = createDataLoader(train_texts, train_labels)
val_dataloader = createDataLoader(val_texts, val_labels)
test_dataloader = createDataLoader(test_texts, test_labels)

In [12]:
from transformers import BertForTokenClassification

# Initialize the BERT-based model for token classification
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_map)
).to(device)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss(ignore_index=-100)
epochs = 30
patience = 2

best_val_loss = float('inf')
patience_counter = 0

for epoch in range(epochs):
    model.train()
    train_loss = 0
    
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    avg_train_loss = train_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{epochs} | Train Loss: {avg_train_loss}')

    # Validation phase
    model.eval()
    val_loss = 0
    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            val_loss += loss.item()
    
    avg_val_loss = val_loss / len(val_dataloader)
    print(f'Epoch {epoch + 1}/{epochs} | Validation Loss: {avg_val_loss}')

    # Check if the validation loss is lower than the best one seen so far
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), f'checkpoint_epoch_{epoch+1}.pt')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print('Early stopping!')
            break
        
torch.save(model.state_dict(), 'final_model.pt')
print('Training complete. Final model saved.')

OutOfMemoryError: CUDA out of memory. Tried to allocate 62.00 MiB. GPU 0 has a total capacty of 4.00 GiB of which 0 bytes is free. Of the allocated memory 3.04 GiB is allocated by PyTorch, and 395.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
from sklearn.metrics import classification_report

model.eval()  
total_loss = 0
all_predictions = []
all_true_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_attention_masks, b_labels = batch

        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_masks, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Move logits and labels to CPU
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Convert logits to predicted labels
        predictions = np.argmax(logits, axis=2)
        
        # Flatten the predictions and true labels to compute metrics
        all_predictions.extend(predictions.flatten())
        all_true_labels.extend(label_ids.flatten())

# Calculate average loss over all the batches
avg_loss = total_loss / len(test_dataloader)
print(f"Test loss: {avg_loss}")

# Classification report
print(classification_report(all_true_labels, all_predictions, labels=list(label_map.values()), target_names=list(label_map.keys())))


In [None]:
def query_model(model, tokenizer, label_map, sentence, device):
    model.eval()  
    
    # Tokenize the new utterance using the tokenize_data function
    tokenized_texts, _ = tokenize_data(tokenizer, [sentence.split()], [['O']*len(sentence.split())], label_map) # Add dummy labels.
    
    # Convert to PyTorch tensors
    input_ids = torch.tensor(tokenized_texts).to(device)
    attention_mask = (input_ids != tokenizer.pad_token_id).long().to(device)

    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
        logits = outputs.logits

    predictions = np.argmax(logits.detach().cpu().numpy(), axis=2)
    predicted_labels = [list(label_map.keys())[list(label_map.values()).index(p)] for p in predictions[0] if p != -100]

    return predicted_labels

new_utterance = "I would like to book a table for two at a Mexican restaurant."
predicted_labels = query_model(model, tokenizer, label_map, new_utterance, device)
print(predicted_labels)