In [19]:
from datasets import load_dataset

# Load the Train, Validation and Test data

dataset = load_dataset("multi_woz_v22")
train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

In [20]:
def filterDomains(data):
    """
    Filters a list of dictionaries by only including entries with services
    either "restaurant" or "hotel" and having only one service.

    Parameters:
    - data: list of dictionaries containing a "services" key, which is a list of services.

    Returns:
    - List of filtered dictionaries.
    """
    return [entry for entry in data if set(entry["services"]).issubset({"restaurant", "hotel"})]

# Only keep dialogues related to Restaurants or Hotels.

train_data_filtered = filterDomains(train_data)
val_data_filtered = filterDomains(val_data)
test_data_filtered = filterDomains(test_data)

In [21]:
def label_utterances(dialogue):
    labeled_data = []
    data = dialogue['turns']
    
    # Loop through each turn in the dialogue
    for i, turn_id in enumerate(data['turn_id']):
        utterance = data['utterance'][i]
        tokens = utterance.split()  
        labels = ['O'] * len(tokens)  # Initialize labels as 'O' (Outside)

        # Check if there are slot values for this turn
        if i < len(data['dialogue_acts']):
            dialogue_act = data['dialogue_acts'][i]
            span_info = dialogue_act.get('span_info', {})
            for act_slot_name, act_slot_value, span_start, span_end in zip(
                    span_info.get('act_slot_name', []),
                    span_info.get('act_slot_value', []),
                    span_info.get('span_start', []),
                    span_info.get('span_end', [])):
                
                # Find the tokens that correspond to the start and end indices
                start_token_idx = len(utterance[:span_start].split())
                end_token_idx = len(utterance[:span_end].split()) - 1

                if start_token_idx < len(labels) and end_token_idx < len(labels):
                    # Label tokens using IOB format
                    labels[start_token_idx] = f"B-{act_slot_name}:{act_slot_value}"
                    for j in range(start_token_idx + 1, end_token_idx + 1):
                        labels[j] = f"I-{act_slot_name}:{act_slot_value}"
                else:
                    print(f"Warning: Index out of range for utterance '{utterance}' with span {span_start}-{span_end}")

        # Store the tokenized utterance along with its labels
        labeled_data.append((tokens, labels))
        
    return labeled_data



In [22]:
import pandas as pd

def toDF(data):
    all_labeled_data = []
    for dialogue in data:
        all_labeled_data.extend(label_utterances(dialogue))
    return pd.DataFrame(all_labeled_data, columns=['Tokens', 'Labels'])
    
    


# Create DataFrames of labeled utterances
train_df = toDF(train_data_filtered)
test_df = toDF(test_data_filtered)
val_df = toDF(val_data_filtered)



In [23]:
print(train_df.shape)
train_df.head()

(28928, 2)


Unnamed: 0,Tokens,Labels
0,"[i, need, a, place, to, dine, in, the, center,...","[O, O, O, O, O, O, O, O, B-area:centre, O, B-p..."
1,"[I, have, several, options, for, you;, do, you...","[O, O, B-choice:several, O, O, O, O, O, O, B-f..."
2,"[Any, sort, of, food, would, be, fine,, as, lo...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[There, is, an, Afrian, place, named, Bedouin,...","[O, O, O, B-food:Afrian, O, O, B-name:Bedouin,..."
4,"[Sounds, good,, could, I, get, that, phone, nu...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-p..."


In [24]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel, EncoderDecoderModel
from torch.utils.data import DataLoader, TensorDataset

In [25]:
all_labels = [label for sublist in train_df['Labels'].tolist() for label in sublist]
unique_labels = set(all_labels)

unique_labels.__sizeof__()

524488

In [26]:
label_map = {label: i for i, label in enumerate(unique_labels)}

In [27]:
# Tokenize the  data

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_data(tokenizer, text_sequences, label_sequences, label_map):
    tokenized_texts, tokenized_labels = [], []
    
    for tokens, labels in zip(text_sequences, label_sequences):
        tokenized_text, tokenized_label = [], []
        
        for word, label in zip(tokens, labels):
            # Tokenize each word and its corresponding label
            word_tokens = tokenizer.tokenize(word)
            tokenized_text.extend(word_tokens)
            tokenized_label.extend([label_map[label]] * len(word_tokens))  # Convert label to integer
        
        tokenized_texts.append(tokenizer.convert_tokens_to_ids(tokenized_text))
        tokenized_labels.append(tokenized_label)
    
    return tokenized_texts, tokenized_labels

# Tokenize the data
tokenized_texts, tokenized_labels = tokenize_data(tokenizer, train_df['Tokens'].tolist(), train_df['Labels'].tolist(), label_map)


In [28]:
from torch.nn.utils.rnn import pad_sequence

# Convert lists to PyTorch tensors and pad
tokenized_texts = pad_sequence([torch.tensor(seq, dtype=torch.long) for seq in tokenized_texts], padding_value=0, batch_first=True)
tokenized_labels = pad_sequence([torch.tensor(seq, dtype=torch.long) for seq in tokenized_labels], padding_value=-100, batch_first=True)

In [29]:
# Create attention masks
attention_masks = (tokenized_texts != 0).long()

# Create a TensorDataset
train_dataset = TensorDataset(tokenized_texts, attention_masks, tokenized_labels)  # attention_masks should be created similarly to tokenized_texts

# Create DataLoaders for training and validation
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)


In [30]:
from transformers import BertForTokenClassification

# Initialize the BERT-based model for token classification
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_map)  # Number of unique labels in your dataset
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss(ignore_index=-100)  # Assuming -100 is the padding value for labels

# Number of training epochs
n_epochs = 3

# Training Loop
for epoch in range(n_epochs):
    model.train()
    total_loss = 0
    
    for batch in train_dataloader:
        batch_texts, batch_attention_masks, batch_labels = bat
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids=batch_texts, attention_mask=batch_attention_masks, labels=batch_labels)
        
        # Get the loss from the outputs tuple: (loss, logits)
        loss = outputs[0]
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss}")

KeyboardInterrupt: 