In [None]:
from datasets import load_dataset
import torch
import torch.nn as nn
from transformers import BertTokenizerFast
from torch.utils.data import DataLoader, TensorDataset

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is reused by the model-creation, training and evaluation cells.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the dataset named "multi_woz_v22" and keep a handle on each of its
# three splits (train / validation / test).

dataset = load_dataset("multi_woz_v22")
train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

In [None]:
def filterDomains(data):
    """
    Keep the entries whose services all belong to {"restaurant", "hotel"}.

    An entry passes when its "services" list is a subset of the two allowed
    domains: restaurant only, hotel only, both together, or an empty list.
    Any entry that mentions another service is dropped.

    Parameters:
    - data: iterable of dictionaries, each with a "services" key holding a list of service names.

    Returns:
    - List of the entries that passed the filter, in their original order.
    """
    allowed_services = {"restaurant", "hotel"}
    kept_entries = []
    for entry in data:
        if set(entry["services"]) <= allowed_services:
            kept_entries.append(entry)
    return kept_entries

# Only keep dialogues related to Restaurants or Hotels.
# The same filter is applied to each split; the results are plain Python lists.

train_data_filtered = filterDomains(train_data)
val_data_filtered = filterDomains(val_data)
test_data_filtered = filterDomains(test_data)

In [None]:
# Show one filtered training dialogue to inspect the record structure.
print(train_data_filtered[0])

# Identifying the slots

In [None]:
# WordPiece tokenizer shared by the labelling and dataset-building functions
# below; label_utterances asks it for character offsets (return_offsets_mapping).
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


def label_utterances(dialogue):
    """
    Convert every turn of one dialogue into a (tokens, labels) pair for slot tagging.

    Each utterance is split into WordPiece tokens with the module-level
    `tokenizer`. Tokens covered by a `span_info` annotation receive IOB labels
    ("B-<slot>" for the first token, "I-<slot>" for the following ones); every
    other token is labelled "O". When the dialogue carries dialogue acts, the
    tokens of "<previous act>|<current act>" followed by "[SEP]" are prepended
    to the utterance tokens and labelled "X" so they can be ignored later.

    Parameters:
    - dialogue: dict with a "turns" entry holding parallel lists
      ("turn_id", "utterance" and optionally "dialogue_acts").

    Returns:
    - List of (tokens, labels) tuples, one per turn; both lists of a tuple
      have the same length.
    """

    def first_act_type(dialogue_acts, turn_idx):
        """Return the first act type of a turn, or "" when the turn has none."""
        act_types = dialogue_acts[turn_idx]['dialog_act']['act_type']
        return (act_types[0] or "") if len(act_types) > 0 else ""

    def find_token_span(offset_mapping, span_start, span_end):
        """Return (first, last) token indices matching the character span, or None.

        The span is matched only when one token starts exactly at `span_start`
        and a token at or after it ends exactly at `span_end`.
        """
        start_token_idx = None
        for idx, (token_start, token_end) in enumerate(offset_mapping):
            if start_token_idx is None and token_start == span_start:
                start_token_idx = idx
            if token_end == span_end:
                # The first token ending at span_end closes the search.
                if start_token_idx is None:
                    return None
                return start_token_idx, idx
        return None

    labeled_data = []
    data = dialogue['turns']

    # Loop through each turn in the dialogue
    for i, _turn_id in enumerate(data['turn_id']):
        utterance = data['utterance'][i]
        # Tokenize the utterance and get the offset mappings
        encoded_input = tokenizer(utterance, add_special_tokens=False, return_offsets_mapping=True)
        tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'])
        labels = ['O'] * len(tokens)  # Initialize labels as 'O' (Outside)
        offset_mapping = encoded_input['offset_mapping']

        # Check if there are slot values for this turn
        if 'dialogue_acts' in data and i < len(data['dialogue_acts']):
            dialogue_acts = data['dialogue_acts']
            span_info = dialogue_acts[i].get('span_info', {})
            for act_slot_name, _act_slot_value, span_start, span_end in zip(
                    span_info.get('act_slot_name', []),
                    span_info.get('act_slot_value', []),
                    span_info.get('span_start', []),
                    span_info.get('span_end', [])):

                token_span = find_token_span(offset_mapping, span_start, span_end)
                if token_span is None:
                    # Span does not line up with token boundaries: leave the tokens as 'O'.
                    continue

                start_token_idx, end_token_idx = token_span
                labels[start_token_idx] = f"B-{act_slot_name}"
                for j in range(start_token_idx + 1, end_token_idx + 1):
                    labels[j] = f"I-{act_slot_name}"

            # Each act is resolved on its own, so a turn without act types no
            # longer blanks out the act of its neighbour.
            prev_dialogue_act = first_act_type(dialogue_acts, i - 1) if i > 0 else ""
            current_dialogue_act = first_act_type(dialogue_acts, i)

            dialogue_act_str = f"{prev_dialogue_act}|{current_dialogue_act}"

            act_tokens = tokenizer.tokenize(dialogue_act_str)
            act_labels = ['X'] * len(act_tokens)
            tokens = act_tokens + ['[SEP]'] + tokens  # Add a separator token between acts and the utterance
            labels = act_labels + ['X'] + labels  # Add an 'X' label for the separator token

        # Store the tokenized utterance along with its labels
        labeled_data.append((tokens, labels))

    return labeled_data


In [None]:
import pandas as pd
import numpy as np

def toDF(data, label_function=label_utterances):
    """
    Label every dialogue and gather the per-turn results in one DataFrame.

    Parameters:
    - data: iterable of dialogues accepted by `label_function`.
    - label_function: callable returning a list of (tokens, labels) pairs for one dialogue.

    Returns:
    - DataFrame with one row per labelled turn and columns 'Tokens' and 'Labels'.
    """
    rows = [pair for dialogue in data for pair in label_function(dialogue)]
    return pd.DataFrame(rows, columns=['Tokens', 'Labels'])
    
# Create DataFrames of labeled utterances, one per split
# (columns 'Tokens' and 'Labels', one row per dialogue turn).
train_df = toDF(train_data_filtered)
test_df = toDF(test_data_filtered)
val_df = toDF(val_data_filtered)

In [None]:
# Dump the labelled training frame to a spreadsheet in the working directory for manual inspection.
train_df.to_excel("output.xlsx")  

In [None]:
# Sanity check: frame size, then the tokens and labels of one turn (row 9) side by side.
print(train_df.shape)
print(train_df["Tokens"].iloc[9])
print(train_df["Labels"].iloc[9])

In [None]:

# Collect every label that occurs in any split so the label inventory is
# identical for training, validation and test.
all_labels = [
    label
    for split_df in (train_df, val_df, test_df)
    for sublist in split_df['Labels'].tolist()
    for label in sublist
]

# We will ignore the 'X' label (dialogue-act prefix and separator positions).
# Set difference is used instead of list.remove so the cell does not fail
# when no 'X' label is present.
unique_labels = sorted(set(all_labels) - {'X'})

print(unique_labels)

In [None]:
# Map each label string to an integer class id, following the sorted order of unique_labels.
label_map = {label: i for i, label in enumerate(unique_labels)}

In [None]:
def encode_pre_tokenized(pre_tokenized_tokens, tokenizer, max_length):
    """
    Encode an already-tokenized sequence into fixed-length model inputs.

    The tokens are wrapped as "[CLS] tokens... [SEP]", converted to ids and
    padded with the tokenizer's pad id up to `max_length`. Sequences that
    would not fit are truncated so that the closing "[SEP]" is always kept.

    Parameters:
    - pre_tokenized_tokens: list of token strings (may itself contain "[SEP]").
    - tokenizer: object providing `convert_tokens_to_ids` and `pad_token_id`.
    - max_length: length of every returned list; expected to be at least 2.

    Returns:
    - dict with three lists of length `max_length`:
      "input_ids", "attention_mask" (1 for real tokens, 0 for padding) and
      "word_ids", where entry i is the index in `pre_tokenized_tokens` of the
      token at input position i, or None for "[CLS]", the closing "[SEP]"
      and padding.
    """
    # Two positions are reserved for [CLS] and the closing [SEP].
    kept_tokens = list(pre_tokenized_tokens)[:max(max_length - 2, 0)]
    tokens = ['[CLS]'] + kept_tokens + ['[SEP]']

    # One entry per input position, pointing back at the source token.
    word_ids = [None] + list(range(len(kept_tokens))) + [None]

    # Convert tokens to input IDs
    input_ids = list(tokenizer.convert_tokens_to_ids(tokens))

    # Create the attention mask
    attention_mask = [1] * len(input_ids)

    # Pad the sequence if it is shorter than max_length
    padding_length = max_length - len(input_ids)
    if padding_length > 0:
        input_ids += [tokenizer.pad_token_id] * padding_length
        attention_mask += [0] * padding_length
        word_ids += [None] * padding_length

    return {"input_ids": input_ids, "attention_mask": attention_mask, "word_ids": word_ids}

In [None]:
def decode_input_ids(input_ids, tokenizer):
    """
    Convert token id(s) to token text with the WordPiece "##" marker removed.

    Parameters:
    - input_ids: a single integer id or a list of integer ids.
    - tokenizer: object providing `convert_ids_to_tokens`.

    Returns:
    - A string when a single id is given, otherwise a list of strings
      (one per id).
    """
    # Convert input IDs to tokens
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    # A single id yields a string; a list of ids yields a list of strings.
    if isinstance(tokens, str):
        return tokens.replace('##', '')
    return [token.replace('##', '') for token in tokens]

In [None]:
def create_dataset(df, tokenizer, label_map, max_length=256):
    """
    Build a TensorDataset of (input_ids, attention_mask, label_ids) from labelled rows.

    Every row provides a token list and a label list of the same length.
    Tokens are encoded with `encode_pre_tokenized`, which places "[CLS]" at
    position 0, so token k is found at input position k + 1 and receives the
    id of label k. Positions without a usable label ("[CLS]", "[SEP]",
    padding, and labels missing from `label_map` such as 'X') are set to -100.

    Parameters:
    - df: table with 'Tokens' and 'Labels' columns (one row per utterance).
    - tokenizer: tokenizer forwarded to `encode_pre_tokenized`.
    - label_map: dict mapping label strings to integer class ids.
    - max_length: fixed sequence length of every encoded row (default 256).

    Returns:
    - TensorDataset holding three long tensors: input ids, attention masks
      and aligned label ids.
    """
    special_tokens = ("[CLS]", "[SEP]")

    # Lists to store the tokenized inputs and labels
    input_ids = []
    attention_masks = []
    label_ids = []

    for tokens, labels in zip(df['Tokens'], df['Labels']):
        encoded_input = encode_pre_tokenized(tokens, tokenizer, max_length)
        sequence_length = len(encoded_input['input_ids'])

        # Start with "ignore" everywhere, then fill in the labelled positions.
        aligned_labels = [-100] * sequence_length

        # Position 0 is [CLS] and one position is reserved for the closing
        # [SEP], so at most sequence_length - 2 tokens carry a label.
        labelled_count = min(len(tokens), len(labels), sequence_length - 2)
        for k in range(labelled_count):
            if tokens[k] not in special_tokens:
                aligned_labels[k + 1] = label_map.get(labels[k], -100)  # unknown labels ('X') are ignored

        # Append the results to the lists
        input_ids.append(encoded_input['input_ids'])
        attention_masks.append(encoded_input['attention_mask'])
        label_ids.append(aligned_labels)

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids, dtype=torch.long)
    attention_masks = torch.tensor(attention_masks, dtype=torch.long)
    label_ids = torch.tensor(label_ids, dtype=torch.long)

    # Create the TensorDataset
    dataset = TensorDataset(input_ids, attention_masks, label_ids)

    return dataset

In [None]:
# Batch the three splits. Only the training loader is shuffled: validation and
# test metrics do not depend on sample order, and a fixed order keeps
# per-batch evaluation output reproducible between runs.
train_dataloader = DataLoader(create_dataset(train_df, tokenizer, label_map), batch_size=16, shuffle=True)
val_dataloader = DataLoader(create_dataset(val_df, tokenizer, label_map), batch_size=16, shuffle=False)
test_dataloader = DataLoader(create_dataset(test_df, tokenizer, label_map), batch_size=16, shuffle=False)

In [None]:
from transformers import BertForTokenClassification, BertConfig

# Define the number of labels
num_labels = len(label_map)  # Make sure label_map is defined in your environment

# Create a configuration object with `num_labels` set
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Create the model with the standard token classification head.
# from_pretrained loads the pretrained encoder weights; constructing
# BertForTokenClassification(config) directly would only build the
# architecture with randomly initialised weights.
model = BertForTokenClassification.from_pretrained('bert-base-uncased', config=config).to(device)


In [None]:
from tqdm.auto import tqdm

# Optimiser over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
# NOTE(review): `criterion` is not referenced in the loop below; the loss is
# read from `outputs.loss`, which the model returns when `labels` is passed.
criterion = nn.CrossEntropyLoss(ignore_index=-100)
epochs = 1
# Number of consecutive non-improving epochs tolerated before stopping.
patience = 2


# Initialize the early stopping counter
best_val_loss = float('inf')
patience_counter = 0

# Training loop
for epoch in range(epochs):
    model.train()
    train_loss = 0
    train_progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{epochs} Training', leave=False)
    
    # Training phase
    for batch in train_progress_bar:
        # Each batch is (input_ids, attention_mask, labels); move all to the device
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_progress_bar.set_postfix(train_loss=loss.item())
    
    # Mean of the per-batch losses
    avg_train_loss = train_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{epochs} | Train Loss: {avg_train_loss}')

    # Validation phase
    model.eval()
    val_loss = 0
    val_progress_bar = tqdm(val_dataloader, desc=f'Epoch {epoch+1}/{epochs} Validation', leave=False)
    for batch in val_progress_bar:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # No gradients are needed for validation
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            val_loss += loss.item()
            val_progress_bar.set_postfix(val_loss=loss.item())
    
    avg_val_loss = val_loss / len(val_dataloader)
    print(f'Epoch {epoch + 1}/{epochs} | Validation Loss: {avg_val_loss}')

    # Check if the validation loss is lower than the best one seen so far
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        # A checkpoint is written only for improving epochs; the file name
        # carries the 1-based epoch number.
        torch.save(model.state_dict(), f'checkpoint_epoch_{epoch+1}.pt')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print('Early stopping!')
            break
# NOTE(review): nothing is saved at this point; the only saves happen in the
# improvement branch above.
print('Training complete. Final model saved.')

In [None]:
!pip install seqeval
from seqeval.metrics import classification_report as seqeval_classification_report
import numpy as np
import torch

# Evaluate the current `model` on the test loader: accumulate the loss and
# collect, per sequence, the predicted and true label strings at every
# position whose true label is not -100.

# Reverse the label map to translate from numeric to string labels
label_map_reverse = {v: k for k, v in label_map.items()}

model.eval()
total_loss = 0
all_predictions = []
all_true_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_attention_masks, b_labels = batch

        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_masks, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Move logits and labels to CPU
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Convert logits to token predictions
        predictions = np.argmax(logits, axis=-1)

        # For each item in the batch...
        for i in range(b_input_ids.size(0)):
            # Skip predictions for tokens with label_id == -100
            pred_label_sequence = []
            true_label_sequence = []
            for j, (pred_id, label_id) in enumerate(zip(predictions[i], label_ids[i])):
                if b_attention_masks[i][j] != 0 and label_id != -100:
                    pred_label_sequence.append(label_map_reverse.get(pred_id, 'O'))  # Default to 'O' if key is not found
                    true_label_sequence.append(label_map_reverse[label_id])

            # Ensure the true and predicted sequences have the same length
            # (both lists are appended to together above, so this is a pure safety net)
            if len(true_label_sequence) != len(pred_label_sequence):
                print(f"Length mismatch in sequence {i}: true labels {len(true_label_sequence)} vs. predicted labels {len(pred_label_sequence)}")
                # Output the actual sequences to help diagnose the issue
                print("True labels:", true_label_sequence)
                print("Pred labels:", pred_label_sequence)
                continue

            # ...extend the true labels and predicted labels lists
            all_true_labels.append(true_label_sequence)
            all_predictions.append(pred_label_sequence)

# Calculate average loss over all the batches
avg_loss = total_loss / len(test_dataloader)
print(f"Test loss: {avg_loss}")

# Use seqeval to compute a classification report
seqeval_report = seqeval_classification_report(all_true_labels, all_predictions)
print(seqeval_report)


In [None]:
def query_model(model, tokenizer, label_map, utterance, device):
    """
    Predict one slot label per token of a raw utterance.

    The utterance is tokenized with special tokens, padded/truncated to 256
    positions and passed through the model. The arg-max label of every
    non-padding position is collected and the first and last of them (the
    positions of the special tokens) are dropped.

    NOTE(review): the utterance is encoded without the dialogue-act prefix
    that label_utterances prepends to the training inputs.

    Parameters:
    - model: token-classification model whose output exposes `.logits`.
    - tokenizer: callable tokenizer returning 'input_ids' and 'attention_mask' tensors.
    - label_map: dict mapping label strings to integer ids.
    - utterance: text to tag.
    - device: device the input tensors are moved to.

    Returns:
    - List of predicted label strings for the non-special, non-padding positions.
    """
    model.eval()

    # Numeric id -> label string
    id_to_label = {idx: name for name, idx in label_map.items()}

    # Tokenize the new utterance directly with the tokenizer
    encoded_input = tokenizer(
        utterance,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=256,
        return_tensors='pt'
    )

    input_ids = encoded_input['input_ids'].to(device)
    attention_masks = encoded_input['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids, token_type_ids=None, attention_mask=attention_masks).logits

    # Most likely label index at every position
    predicted_ids = torch.argmax(logits, dim=2).squeeze().tolist()
    mask_values = attention_masks.squeeze().tolist()

    # Keep attended positions only, then strip the two special-token positions
    visible_labels = []
    for label_id, mask in zip(predicted_ids, mask_values):
        if mask == 0 or label_id == -100:
            continue
        visible_labels.append(id_to_label.get(label_id))

    return visible_labels[1:-1]

# Example usage: tag a single hand-written utterance with the current model
# and print the predicted label of each token.
new_utterance = "Can I book a table for five at a Spanish restaurant, the restaurant must be cheap?"
predicted_labels = query_model(model, tokenizer, label_map, new_utterance, device)
print(predicted_labels)


# Mapping slots to values

In [None]:
def label_slots(dialogue):
    """
    Pair the text of every annotated span with its annotated slot value.

    For each turn that has dialogue acts, every `span_info` entry yields a
    tuple (utterance[span_start:span_end], act_slot_value). An entry is
    skipped, with a printed warning, when the whitespace-word positions
    derived from its character offsets fall outside the utterance.

    Parameters:
    - dialogue: dict with a "turns" entry holding parallel lists
      ("turn_id", "utterance" and optionally "dialogue_acts").

    Returns:
    - List of (span text, slot value) tuples across all turns of the dialogue.
    """
    pairs = []
    turns = dialogue['turns']

    for turn_idx, _turn_id in enumerate(turns['turn_id']):
        utterance = turns['utterance'][turn_idx]
        word_count = len(utterance.split())

        # Turns without dialogue-act annotations contribute nothing
        if 'dialogue_acts' not in turns or turn_idx >= len(turns['dialogue_acts']):
            continue

        span_info = turns['dialogue_acts'][turn_idx].get('span_info', {})
        annotations = zip(
            span_info.get('act_slot_name', []),
            span_info.get('act_slot_value', []),
            span_info.get('span_start', []),
            span_info.get('span_end', []),
        )

        for _act_slot_name, act_slot_value, span_start, span_end in annotations:
            # Whitespace-word positions of the first and last word of the span
            first_word_idx = len(utterance[:span_start].split())
            last_word_idx = len(utterance[:span_end].split()) - 1

            if first_word_idx >= word_count or last_word_idx >= word_count:
                print(f"Warning: Index out of range for utterance '{utterance}' with span {span_start}-{span_end}")
                continue

            pairs.append((utterance[span_start:span_end], act_slot_value))

    return pairs

In [None]:
import pandas as pd
def slotsToDF(data):
    """
    Gather the (span text, slot value) pairs of every dialogue in one DataFrame.

    Parameters:
    - data: iterable of dialogues accepted by `label_slots`.

    Returns:
    - DataFrame with columns 'Slots' (span text) and 'Values' (annotated value).
    """
    rows = [pair for dialogue in data for pair in label_slots(dialogue)]
    return pd.DataFrame(rows, columns=['Slots', 'Values'])
    
# Create DataFrames of (span text, slot value) pairs, one per split.
# NOTE(review): this rebinds train_df / test_df / val_df, which earlier in the
# notebook hold the 'Tokens'/'Labels' frames produced by toDF; cells that
# expect those frames must not be re-run after this one.
train_df = slotsToDF(train_data_filtered)
test_df = slotsToDF(test_data_filtered)
val_df = slotsToDF(val_data_filtered)

In [None]:
# Rows where the text found in the utterance differs from the annotated value
# (exact string comparison, before any normalisation).
mismatched_rows = train_df[train_df['Slots'] != train_df['Values']]

# Display the filtered rows
display(mismatched_rows)

In [None]:
def map_indiferece(text):
    """
    Normalise expressions of indifference to the canonical value "dontcare".

    Parameters:
    - text: slot text to normalise (compared as-is, so callers should
      lower-case and strip it first).

    Returns:
    - "dontcare" when `text` equals one of the known indifference phrases,
      otherwise `text` unchanged.
    """
    indiferences = ("don't care", "any", "don't mind", "no preference", "whatever", "doesn't matter")

    # Check against the whole list; returning inside a loop on the first
    # non-matching phrase would leave every phrase but the first unmapped.
    if text in indiferences:
        return "dontcare"
    return text
    
    

def text_to_num(text):
    """
    Convert a spelled-out number from "zero" to "twenty" into its digit string.

    The lookup is case-insensitive. Text that is not one of these number
    words yields False.
    """
    number_words = (
        'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
        'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
        'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty',
    )
    # The position of each word in the tuple is its numeric value
    digits_by_word = {word: str(value) for value, word in enumerate(number_words)}

    lowered = text.lower()
    if lowered in digits_by_word:
        return digits_by_word[lowered]
    return False


# Define the post process function
def post_process_slot_value(slot_value):
    """
    Normalise a slot string so surface text and annotated values can be compared.

    The value is lower-cased and stripped; a spelled-out number recognised by
    `text_to_num` is returned as its digit string, otherwise the result of
    `map_indiferece` on the normalised text is returned.
    """
    normalized = slot_value.lower().strip()

    # Spelled-out numbers take precedence over the indifference mapping
    as_digits = text_to_num(normalized)
    if as_digits:
        return as_digits

    return map_indiferece(normalized)

# train_df here is the slots frame with 'Slots' and 'Values' columns.
# Normalise the span text in the 'Slots' column in place.
train_df['Slots'] = train_df['Slots'].apply(post_process_slot_value)

# Compare against the equally normalised annotated values; the 'Values'
# column itself is left unchanged.
mismatched_rows = train_df[train_df['Slots'] != train_df['Values'].apply(post_process_slot_value)]

# Display the rows that still differ after normalisation
display(mismatched_rows)

In [None]:
# Spot check of the indifference mapping on one phrase from its list.
map_indiferece("any")

In [None]:
from typing import Dict, List

# Same tokenizer as created earlier in the notebook; rebinding it here is
# redundant when the cells are run in order.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


def label_utterances_with_gt_values(dialogue):
    """
    Convert every turn of one dialogue into (tokens, labels) with value-carrying labels.

    Works like `label_utterances`, except that the IOB labels also embed the
    annotated slot value: "B-<slot>:<value>" for the first token of a span
    and "I-<slot>:<value>" for the following ones. Tokens outside any span
    are labelled "O". When the dialogue carries dialogue acts, the tokens of
    "<previous act>|<current act>" followed by "[SEP]" are prepended and
    labelled "X".

    Parameters:
    - dialogue: dict with a "turns" entry holding parallel lists
      ("turn_id", "utterance" and optionally "dialogue_acts").

    Returns:
    - List of (tokens, labels) tuples, one per turn; both lists of a tuple
      have the same length.
    """

    def first_act_type(dialogue_acts, turn_idx):
        """Return the first act type of a turn, or "" when the turn has none."""
        act_types = dialogue_acts[turn_idx]['dialog_act']['act_type']
        return (act_types[0] or "") if len(act_types) > 0 else ""

    def find_token_span(offset_mapping, span_start, span_end):
        """Return (first, last) token indices matching the character span, or None.

        The span is matched only when one token starts exactly at `span_start`
        and a token at or after it ends exactly at `span_end`.
        """
        start_token_idx = None
        for idx, (token_start, token_end) in enumerate(offset_mapping):
            if start_token_idx is None and token_start == span_start:
                start_token_idx = idx
            if token_end == span_end:
                # The first token ending at span_end closes the search.
                if start_token_idx is None:
                    return None
                return start_token_idx, idx
        return None

    labeled_data = []
    data = dialogue['turns']

    # Loop through each turn in the dialogue
    for i, _turn_id in enumerate(data['turn_id']):
        utterance = data['utterance'][i]
        # Tokenize the utterance and get the offset mappings
        encoded_input = tokenizer(utterance, add_special_tokens=False, return_offsets_mapping=True)
        tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'])
        labels = ['O'] * len(tokens)  # Initialize labels as 'O' (Outside)
        offset_mapping = encoded_input['offset_mapping']

        # Check if there are slot values for this turn
        if 'dialogue_acts' in data and i < len(data['dialogue_acts']):
            dialogue_acts = data['dialogue_acts']
            span_info = dialogue_acts[i].get('span_info', {})
            for act_slot_name, act_slot_value, span_start, span_end in zip(
                    span_info.get('act_slot_name', []),
                    span_info.get('act_slot_value', []),
                    span_info.get('span_start', []),
                    span_info.get('span_end', [])):

                token_span = find_token_span(offset_mapping, span_start, span_end)
                if token_span is None:
                    # Span does not line up with token boundaries: leave the tokens as 'O'.
                    continue

                # Label tokens using IOB format with the actual ground truth slot value
                start_token_idx, end_token_idx = token_span
                labels[start_token_idx] = f"B-{act_slot_name}:{act_slot_value}"
                for j in range(start_token_idx + 1, end_token_idx + 1):
                    labels[j] = f"I-{act_slot_name}:{act_slot_value}"

            # Each act is resolved on its own, so a turn without act types no
            # longer blanks out the act of its neighbour.
            prev_dialogue_act = first_act_type(dialogue_acts, i - 1) if i > 0 else ""
            current_dialogue_act = first_act_type(dialogue_acts, i)

            dialogue_act_str = f"{prev_dialogue_act}|{current_dialogue_act}"

            act_tokens = tokenizer.tokenize(dialogue_act_str)
            act_labels = ['X'] * len(act_tokens)
            tokens = act_tokens + ['[SEP]'] + tokens  # Add a separator token between acts and the utterance
            labels = act_labels + ['X'] + labels  # Add an 'X' label for the separator token

        # Store the tokenized utterance along with its labels
        labeled_data.append((tokens, labels))

    return labeled_data


class DialogSlotMemory():
    """Per-dialogue store that keeps, for each slot name, the values seen so far in insertion order."""

    slot_list_dict: Dict[str, List[str]] = {}

    def __init__(self):
        # Every instance starts with its own empty mapping
        self.slot_list_dict = {}

    def add_slot(self, slot_name: str, slot_value: str):
        """Append `slot_value` to the history of `slot_name`, creating the history if needed."""
        self.slot_list_dict.setdefault(slot_name, []).append(slot_value)

    def get_slot_values(self, slot_name: str):
        """Return the full value history of `slot_name`; raises KeyError for an unknown slot."""
        return self.slot_list_dict[slot_name]

    def get_most_recent_slot_value(self, slot_name: str):
        """Return the last value stored for `slot_name`, or None for an unknown slot."""
        if slot_name not in self.slot_list_dict:
            return None
        history = self.slot_list_dict[slot_name]
        return history[-1]

    def get_all_slot_values(self):
        """Return the underlying slot-name -> value-list mapping."""
        return self.slot_list_dict

    def get_all_slot_names(self):
        """Return a view of the slot names stored so far."""
        return self.slot_list_dict.keys()
        
class ConversationDataset:
    """
    Plain container bundling one dialogue's encoded data with its slot memory.

    Only annotations are declared here; no attribute exists on an instance
    until it is assigned (see generate_separate_dialogue_datasets).
    """
    # Identifier of the dialogue
    id_dialog: str
    # Slot values collected for this dialogue
    memory: DialogSlotMemory
    # Encoded turns of the dialogue
    dataset: TensorDataset

def generate_separate_dialogue_datasets(data, label_function=label_utterances) -> List[ConversationDataset]:
    """
    Build one ConversationDataset per dialogue.

    Each dialogue is labelled on its own with `label_function`, encoded with
    the module-level `tokenizer` and `label_map`, and paired with a fresh,
    empty DialogSlotMemory.

    Parameters:
    - data: iterable of dialogue dictionaries; each must provide 'dialogue_id'
      and whatever `label_function` reads.
    - label_function: callable returning (tokens, labels) pairs for one dialogue.

    Returns:
    - List of ConversationDataset objects, in the order of `data`.
    """

    def build_conversation(dialogue):
        """Assemble the container for a single dialogue."""
        conversation = ConversationDataset()
        conversation.memory = DialogSlotMemory()
        conversation.id_dialog = dialogue['dialogue_id']
        dialogue_df = toDF([dialogue], label_function=label_function)
        conversation.dataset = create_dataset(dialogue_df, tokenizer, label_map)
        return conversation

    return [build_conversation(dialogue) for dialogue in data]


def remove_tokens_before_sep(ids: torch.Tensor, tokenizer: BertTokenizerFast):
    """
    Drop everything up to and including the first separator token.

    Parameters:
    - ids: 1-D tensor of token IDs.
    - tokenizer: tokenizer whose `sep_token_id` identifies the separator.

    Returns:
    - Tensor with the IDs that follow the first separator. An IndexError is
      raised when `ids` contains no separator.
    """
    is_separator = ids == tokenizer.sep_token_id
    separator_positions = is_separator.nonzero(as_tuple=True)[0]
    first_separator = separator_positions[0]
    return ids[first_separator + 1:]

In [None]:
!pip install seqeval
from seqeval.metrics import classification_report as seqeval_classification_report
import numpy as np
import torch

# Per-dialogue evaluation: every test dialogue is run through the model as
# one batch, predictions are scored with seqeval, and the tokens predicted as
# part of a slot are stored in that dialogue's DialogSlotMemory.

# Load model from checkpoint
# NOTE(review): the file name is fixed to the checkpoint of epoch 1.
model.load_state_dict(torch.load('checkpoint_epoch_1.pt'))
model.eval()


# Separate the test data into separate datasets for each dialogue
test_datasets = generate_separate_dialogue_datasets(test_data_filtered)

# Reverse the label map to translate from numeric to string labels
label_map_reverse = {v: k for k, v in label_map.items()}

model.eval()
total_loss = 0
all_predictions = []
all_true_labels = []

with torch.no_grad():
    for dataset in test_datasets:
        # Get the input_ids, attention_masks and labels from the dataset
        input_ids = dataset.dataset.tensors[0]
        attention_masks = dataset.dataset.tensors[1]
        labels = dataset.dataset.tensors[2]

        # Move tensors to the correct device
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        labels = labels.to(device)
        

        # All turns of the dialogue are passed through the model together
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Move logits and labels to CPU
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()

        # Convert logits to token predictions
        predictions = np.argmax(logits, axis=-1)

        # For each item in the batch...
        for i in range(input_ids.size(0)):
            # Skip predictions for tokens with label_id == -100
            pred_label_sequence = []
            true_label_sequence = []
            for j, (pred_id, label_id) in enumerate(zip(predictions[i], label_ids[i])):
                if attention_masks[i][j] != 0 and label_id != -100:
                    pred_label_sequence.append(label_map_reverse.get(pred_id, 'O'))

                    # Get the true label from the dataset
                    true_label_id = label_ids[i][j]
                    true_label_sequence.append(label_map_reverse[true_label_id])

            # Ensure the true and predicted sequences have the same length
            # (both lists are appended to together above, so this is a pure safety net)
            if len(true_label_sequence) != len(pred_label_sequence):
                print(f"Length mismatch in sequence {i}: true labels {len(true_label_sequence)} vs. predicted labels {len(pred_label_sequence)}")
                # Output the actual sequences to help diagnose the issue
                print("True labels:", true_label_sequence)
                print("Pred labels:", pred_label_sequence)
                continue
                
            # ...extend the true labels and predicted labels lists
            all_true_labels.append(true_label_sequence)
            all_predictions.append(pred_label_sequence)
            
            # Map slot values to slot names based on the predicted labels and add them to the memory
            # Skip all the tokens before (and including) the [SEP] token           
            # ([1:] drops the first position before searching for the separator)
            ids = dataset.dataset.tensors[0][i][1:]
            ids = remove_tokens_before_sep(ids, tokenizer)
            # NOTE(review): the pairing below assumes the kept predictions line up
            # one-to-one with the tokens that follow the first [SEP] - confirm.
            for token, pred_label in zip(ids, pred_label_sequence):
                if pred_label != 'O':
                    # Strip the two-character "B-" / "I-" prefix to get the slot name
                    slot_name = pred_label[2:]
                    
                    # Get the slot value
                    slot_value = tokenizer.convert_tokens_to_string([tokenizer.convert_ids_to_tokens(token.item())])                    
                    dataset.memory.add_slot(slot_name, slot_value)
            
            # Print the memory for the current dialogue
            print(f"Memory for dialogue {dataset.id_dialog}: {dataset.memory.get_all_slot_values()}")

# Calculate average loss over all the dialogues (one forward pass per dialogue)
avg_loss = total_loss / len(test_datasets)
print(f"Test loss: {avg_loss}")

# Use seqeval to compute a classification report
seqeval_report = seqeval_classification_report(all_true_labels, all_predictions)
print(seqeval_report)

In [None]:
# Full evaluation: per-dialogue scoring plus slot memory filling, additionally
# collecting "<predicted label>:<token text>" strings for every slot token.
# NOTE(review): the memory substitution for "same" and the concatenation of
# values mentioned for this cell are not implemented in the code below.

!pip install seqeval
from seqeval.metrics import classification_report as seqeval_classification_report
import numpy as np
import torch

# Load model from checkpoint
# NOTE(review): the file name is fixed to the checkpoint of epoch 1.
model.load_state_dict(torch.load('checkpoint_epoch_1.pt'))
model.eval()


# Separate the test data into separate datasets for each dialogue
test_datasets = generate_separate_dialogue_datasets(test_data_filtered)
# NOTE(review): test_datasets_with_values is built here but not read again in this cell.
test_datasets_with_values = generate_separate_dialogue_datasets(test_data_filtered, label_function=label_utterances_with_gt_values)

# Reverse the label map to translate from numeric to string labels
label_map_reverse = {v: k for k, v in label_map.items()}

model.eval()
total_loss = 0
all_predictions = []
all_true_labels = []
# One list per evaluated sequence, holding "<label>:<token text>" strings
all_predictions_with_values = []

with torch.no_grad():
    for dataset in test_datasets:
        # Get the input_ids, attention_masks and labels from the dataset
        input_ids = dataset.dataset.tensors[0]
        attention_masks = dataset.dataset.tensors[1]
        labels = dataset.dataset.tensors[2]

        # Move tensors to the correct device
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        labels = labels.to(device)
        

        # All turns of the dialogue are passed through the model together
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Move logits and labels to CPU
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()

        # Convert logits to token predictions
        predictions = np.argmax(logits, axis=-1)

        # For each item in the batch...
        for i in range(input_ids.size(0)):
            # Skip predictions for tokens with label_id == -100
            pred_label_sequence = []
            true_label_sequence = []
            for j, (pred_id, label_id) in enumerate(zip(predictions[i], label_ids[i])):
                if attention_masks[i][j] != 0 and label_id != -100:
                    # Get the slot name from the predicted label
                    pred_label_sequence.append(label_map_reverse.get(pred_id, 'O'))

                    # Get the true label from the dataset
                    true_label_id = label_ids[i][j]
                    true_label_sequence.append(label_map_reverse[true_label_id])

            # Ensure the true and predicted sequences have the same length
            # (both lists are appended to together above, so this is a pure safety net)
            if len(true_label_sequence) != len(pred_label_sequence):
                print(f"Length mismatch in sequence {i}: true labels {len(true_label_sequence)} vs. predicted labels {len(pred_label_sequence)}")
                # Output the actual sequences to help diagnose the issue
                print("True labels:", true_label_sequence)
                print("Pred labels:", pred_label_sequence)
                continue
                
            # ...extend the true labels and predicted labels lists
            all_true_labels.append(true_label_sequence)
            all_predictions.append(pred_label_sequence)
            
            # Map slot values to slot names based on the predicted labels and add them to the memory
            # Skip all the tokens before (and including) the [SEP] token           
            # ([1:] drops the first position before searching for the separator)
            ids = dataset.dataset.tensors[0][i][1:]
            ids = remove_tokens_before_sep(ids, tokenizer)

            # NOTE(review): the pairing below assumes the kept predictions line up
            # one-to-one with the tokens that follow the first [SEP] - confirm.
            pred_label_value_sequence = []
            for token_id, pred_label in zip(ids, pred_label_sequence):
                if pred_label != 'O':
                    # Strip the two-character "B-" / "I-" prefix to get the slot name
                    slot_name = pred_label[2:]
                    
                    # Get the slot value (text of this single token)
                    slot_value = decode_input_ids(token_id.item(), tokenizer)
                    dataset.memory.add_slot(slot_name, slot_value)

                    pred_label_value_sequence.append(f"{pred_label}:{slot_value}")
                
            all_predictions_with_values.append(pred_label_value_sequence)
            
            # Print the memory for the current dialogue
            print(f"Memory for dialogue {dataset.id_dialog}: {dataset.memory.get_all_slot_values()}")

# Calculate average loss over all the dialogues (one forward pass per dialogue)
avg_loss = total_loss / len(test_datasets)
print(f"Test loss: {avg_loss}")

# Use seqeval to compute a classification report
# NOTE(review): all_predictions_with_values is collected above but not scored here.
seqeval_report = seqeval_classification_report(all_true_labels, all_predictions)
print(seqeval_report)