In [66]:
from datasets import load_dataset

# Download (or load from cache) MultiWOZ 2.2 and pull out its three
# standard splits in one go.

dataset = load_dataset("multi_woz_v22")
train_data, val_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [67]:
def filterDomains(data, domains=("restaurant", "hotel")):
    """
    Keep only the entries whose services all belong to ``domains``.

    An entry is kept when its "services" list is non-empty and every service
    in it is one of ``domains``. With the default domains this means
    restaurant-only, hotel-only and restaurant+hotel entries are kept, while
    entries touching any other service (or listing no service at all) are
    dropped.

    Parameters:
    - data: iterable of dictionaries containing a "services" key, which is a list of services.
    - domains: service names to accept; a single string is treated as one
      service name. Defaults to ("restaurant", "hotel").

    Returns:
    - List of filtered dictionaries, in their original order.
    """
    # A bare string would otherwise be exploded into a set of characters.
    if isinstance(domains, str):
        domains = (domains,)
    allowed = set(domains)

    # The empty set is a subset of every set, so entries without any service
    # must be rejected explicitly or they slip through the subset test.
    return [
        entry
        for entry in data
        if entry["services"] and set(entry["services"]).issubset(allowed)
    ]

# Restrict every split to the restaurant / hotel dialogues.

train_data_filtered, val_data_filtered, test_data_filtered = (
    filterDomains(split) for split in (train_data, val_data, test_data)
)

In [68]:
def _token_char_spans(utterance, tokens):
    """Return the (start, end) character offsets of each whitespace token of ``utterance``."""
    spans = []
    cursor = 0
    for token in tokens:
        # Tokens come from utterance.split(), so each one is found, in order,
        # at or after the end of the previous token.
        start = utterance.index(token, cursor)
        cursor = start + len(token)
        spans.append((start, cursor))
    return spans


def label_utterances(dialogue):
    """
    Tokenize every turn of a dialogue on whitespace and tag the tokens in IOB format.

    For each span listed in the turn's ``dialogue_acts[i]['span_info']``, the
    first token overlapping the character range [span_start, span_end) is
    labeled ``B-<slot name>:<slot value>`` and the following overlapping
    tokens ``I-<slot name>:<slot value>``. All other tokens are labeled 'O'.

    Parameters:
    - dialogue: dictionary with a 'turns' key holding parallel lists under
      'turn_id', 'utterance' and 'dialogue_acts'.

    Returns:
    - List of (tokens, labels) tuples, one per turn, where both elements are
      lists of equal length.
    """
    labeled_data = []
    turns = dialogue['turns']

    # Loop through each turn in the dialogue
    for i, _ in enumerate(turns['turn_id']):
        utterance = turns['utterance'][i]
        tokens = utterance.split()
        labels = ['O'] * len(tokens)  # Initialize labels as 'O' (Outside)
        token_spans = _token_char_spans(utterance, tokens)

        # Check if there are slot values for this turn
        if i < len(turns['dialogue_acts']):
            span_info = turns['dialogue_acts'][i].get('span_info', {})
            for act_slot_name, act_slot_value, span_start, span_end in zip(
                    span_info.get('act_slot_name', []),
                    span_info.get('act_slot_value', []),
                    span_info.get('span_start', []),
                    span_info.get('span_end', [])):

                # Select tokens by character overlap with the span. Counting
                # the tokens of utterance[:span_start] is off by one whenever
                # the span starts inside a token (e.g. "(centre)"), because
                # the partial token before the span gets counted too.
                covered = [
                    idx for idx, (tok_start, tok_end) in enumerate(token_spans)
                    if tok_start < span_end and tok_end > span_start
                ]

                if not covered:
                    print(f"Warning: Index out of range for utterance '{utterance}' with span {span_start}-{span_end}")
                    continue

                # Label tokens using IOB format
                labels[covered[0]] = f"B-{act_slot_name}:{act_slot_value}"
                for idx in covered[1:]:
                    labels[idx] = f"I-{act_slot_name}:{act_slot_value}"

        # Store the tokenized utterance along with its labels
        labeled_data.append((tokens, labels))

    return labeled_data



In [72]:
import pandas as pd

def toDF(data):
    """
    Flatten the labeled turns of every dialogue into a single DataFrame.

    Parameters:
    - data: iterable of dialogues accepted by label_utterances.

    Returns:
    - DataFrame with one row per turn and the columns 'Tokens' and 'Labels'.
    """
    rows = [
        labeled_turn
        for dialogue in data
        for labeled_turn in label_utterances(dialogue)
    ]
    return pd.DataFrame(rows, columns=['Tokens', 'Labels'])
    
    


# Turn each filtered split into a DataFrame of (Tokens, Labels) rows.
train_df, test_df, val_df = (
    toDF(split)
    for split in (train_data_filtered, test_data_filtered, val_data_filtered)
)



In [75]:
# Sanity check: print the (rows, columns) shape of the training frame, then
# end the cell with head() so the notebook renders its first rows.
print(train_df.shape)
train_df.head()

(28928, 2)


Unnamed: 0,Tokens,Labels
0,"[i, need, a, place, to, dine, in, the, center,...","[O, O, O, O, O, O, O, O, B-area:centre, O, B-p..."
1,"[I, have, several, options, for, you;, do, you...","[O, O, B-choice:several, O, O, O, O, O, O, B-f..."
2,"[Any, sort, of, food, would, be, fine,, as, lo...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[There, is, an, Afrian, place, named, Bedouin,...","[O, O, O, B-food:Afrian, O, O, B-name:Bedouin,..."
4,"[Sounds, good,, could, I, get, that, phone, nu...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-p..."
