# Import dependencies and dataset

In [31]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json
from json_repair import repair_json
import numpy as np
import pickle
import os
import heapq
import tqdm
import random

In [2]:
from huggingface_hub import login
login("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")

In [3]:
dataset = load_dataset("pirxus/spokenwoz-whisper")
dataset = dataset.remove_columns("audio")

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [15]:
print(dataset)
dataset['train'][5]

DatasetDict({
    train: Dataset({
        features: ['wav_id', 'turn_index', 'text', 'agent_text', 'domains', 'slots', 'context'],
        num_rows: 73950
    })
    dev: Dataset({
        features: ['wav_id', 'turn_index', 'text', 'agent_text', 'domains', 'slots', 'context'],
        num_rows: 9104
    })
    test: Dataset({
        features: ['wav_id', 'turn_index', 'text', 'agent_text', 'domains', 'slots', 'context'],
        num_rows: 17652
    })
})


{'wav_id': 'MUL0003',
 'turn_index': 10,
 'text': 'I will go there on Friday.',
 'agent_text': 'and most of the time do like to arrive there.',
 'domains': "['restaurant']",
 'slots': "{'restaurant': {'day': 'Friday', 'people': '1', 'area': 'centre', 'food': 'Indian'}}",
 'context': {'turn_index': [0, 2, 4, 6, 8],
  'text': ['Hello, is this Customer Service Center?',
   "Well, I'm looking for a place to dine. Do you have any recommendation for me?",
   "Well, I'd like to stay in the place with convenient transportation.",
   "I'd like to try some Indian food.",
   'Yes, please. Book a table for one people.'],
  'agent_text': ['Yes, this is Cosmos Service Center. How may I help?',
   'Of course, there are plenty of restaurants. Do you have any specific area?',
   'Okay, do you have any specific food type?',
   'Let me check it for you. Yes, we got you a panel located in the center of the city, could meet your requirement. Do you want to book table?',
   'Okay, and what day would you lik

# Sentence embedder

In [19]:
embedder_d2f = SentenceTransformer('sergioburdisso/dialog2flow-joint-bert-base')
embedder_labse = SentenceTransformer('sentence-transformers/LaBSE')

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

In [17]:
def extract_dialogue(sample, only_user=False, include_state=True):
    dialogue = ""
    for j in range(len(sample["context"]["text"])):
        if only_user:
            dialogue += " " + sample["context"]["text"][j]
        else:
            dialogue += " User: " + sample["context"]["text"][j]
            dialogue += " Agent: " + sample["context"]["agent_text"][j]
    if not only_user:
        dialogue += " User:"
    dialogue += " " + sample["text"]
    if include_state:
        dialogue += " Domains: " + sample["domains"]
        dialogue += " Slots: " + sample["slots"]
    return dialogue

In [21]:
def embed_dataset(embedder, dataset, path, batch_size=128):
    #If the dataset was already embedded, load it from a file to save time
    if os.path.exists(path):
        with open(path, "rb") as f:
            data = pickle.load(f)
        return data
    embedded_rows = [
    {
        'wav_id': sample['wav_id'],
        'turn_index': sample['turn_index'],
        'embedding': embedding
    }
    for batch_start in range(0, len(dataset), batch_size)
    for sample, embedding in zip(
        dataset[batch_start:batch_start+batch_size],
        embedder.encode(
            [extract_dialogue(sample) for sample in dataset[batch_start:batch_start+batch_size]],
            show_progress_bar=False
        )
    )]
    with open(path, "wb") as f:
        pickle.dump(embedded_rows, f)
    return embedded_rows 

print("Loading D2F embedded dataset...")
embedded_train_d2f = embed_dataset(embedder_d2f, list(dataset['train'])[:512], "embedded_train.d2f")
print("Loading LaBSE embedded dataset...")
embedded_train_labse = embed_dataset(embedder_labse, list(dataset['train'])[:512], "embedded_train.labse")

Loading D2F embedded dataset
Loading LaBSE embedded dataset


In [22]:
def find_k_most_similar(sample, k, embedder, dataset, embedded_dataset):
    sample_emb = embedder.encode(extract_dialogue(sample))
    
    M = np.stack([row['embedding'] for row in embedded_dataset])
    s = np.array(sample_emb)
    
    dists = np.linalg.norm(M - s, axis=1) # L2 distance
    dists[dists == 0] = np.inf # Ignore exact match

    idx = np.argpartition(dists, k)[:k]
    idx = idx[np.argsort(dists[idx])]

    samples   = [dataset[i] for i in idx]
    distances = dists[idx].tolist()
    return samples, distances

#find_k_most_similar(dataset['train'][120], 3, embedder_d2f, dataset['train'], embedded_train_d2f)

# Prompt creation

In [26]:
def create_prompt(sample, k, embedder_name, only_user=False):
    assert(embedder_name == "D2F" or embedder_name == "LaBSE")
    if embedder_name == "D2F":
        embedder = embedder_d2f
        embedded_dataset = embedded_train_d2f
    else:
        embedder = embedder_labse
        embedded_dataset = embedded_train_labse
        
    (demonstrations, _) = find_k_most_similar(sample, k, embedder, dataset['train'], embedded_dataset)
    prompt = 'Instruction: Identify the slot keys and values.\n'
    for i in range(k):
        prompt += extract_dialogue(demonstrations[k-i-1], only_user, include_state=True) + "\n"
    #New sample
    prompt += extract_dialogue(sample, only_user, include_state=False)
    prompt += " Domains: " + sample["domains"]
    prompt += " Slots: "
    return prompt
    
#create_prompt(dataset['train'][120], 3, "D2F")   

# LLM and testing

In [36]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
tokenizer.pad_token_id = tokenizer.eos_token_id

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [29]:
sample = dataset['train'][100]
prompt = create_prompt(sample, 3, "D2F", True)
print(prompt)

Instruction: Identify the slot keys and values.
 Hello? Yes, I'm looking for the information in Cambridge. I'm looking for a restaurant. I suppose the restaurant should be in the expensive price range. I will travel in the eastern part of the city, so I prefer to eat there. That's right. Do you have any recommendation of the restaurant? Okay. Okay, Mano. Okay, I suppose I would like to try the second one. No, but I need some other information. My know their post code Go ahead. Thank you so much. May I know the address? God, Ed, thank you very much. Also, I'm looking for a place to stay. Well, I suppose the hotel should be in the type of guest house. And I suppose we should be in the same area of the restaurant. Also, I suppose it should have a style of floor. Domains: ['hotel', 'restaurant'] Slots: {'hotel': {'area': 'East', 'stars': '4', 'type': 'guest house'}, 'restaurant': {'area': 'East', 'name': 'Royal Standard'}}
 Hello? Yes, I'm looking for the information in Cambridge. I'm look

In [30]:
def repair_output(output):
    try:
        obj = repair_json(output)
        return obj
    except Exception as e:
        print("Exception during json repair: ", e)
        return ""
    return output

In [32]:
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
output_string = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:])
print(repair_output(output_string))
print("Ground truth: ", sample["slots"])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"hotel": {"pricerange": "expensive", "internet": "free", "parking": "lots"}}
Ground truth:  {'hotel': {'internet': 'yes', 'parking': 'yes', 'pricerange': 'expensive', 'type': 'hotel'}}


# Metrics and evaluation

In [33]:
def joint_goal_accuracy(results, ground_truths):
    return 0.4

def slot_error_rate(results, ground_truths):
    return 0.3


In [42]:
def test_model_utterances(samples, k, embedder_name, device):
    results = [[], []]
    for sample in tqdm.tqdm(samples, desc="Processing samples"):
        prompts = [create_prompt(sample, k, embedder_name, only_user=True), create_prompt(sample, k, embedder_name, only_user=False)]
        for i in tqdm.tqdm(range(len(prompts)), desc="Processing prompts", leave=False):
            inputs = tokenizer(prompts[i], return_tensors="pt").to(model.device)
            outputs = model.generate(**inputs, max_new_tokens=100)
            result = repair_output(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
            results[i].append(result)
    return results

def test_model_demonstrations(samples, ks, only_user, embedder_name, device):
    results = [[] for k in ks]
    for sample in tqdm.tqdm(samples, desc="Processing samples"):
        prompts = [create_prompt(sample, k, embedder_name, only_user=only_user) for k in ks]
        for i in tqdm.tqdm(range(len(prompts)), desc="Processing prompts", leave=False):
            inputs = tokenizer(prompts[i], return_tensors="pt").to(model.device)
            outputs = model.generate(**inputs, max_new_tokens=100)
            result = repair_output(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
            results[i].append(result)
    return results

def test_model_demonstrations(samples, k, only_user, device):
    results = [[], []]
    for sample in tqdm.tqdm(samples, desc="Processing samples"):
        prompts = [create_prompt(sample, k, embedder_name="D2F", only_user=only_user), create_prompt(sample, k, embedder_name="LaBSE", only_user=only_user)]
        for i in tqdm.tqdm(range(len(prompts)), desc="Processing prompts", leave=False):
            inputs = tokenizer(prompts[i], return_tensors="pt").to(model.device)
            outputs = model.generate(**inputs, max_new_tokens=100)
            result = repair_output(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
            results[i].append(result)
    return results

In [37]:
random.seed(0)

def evaluate_model(dataset, n, device, test_type):
    assert(test_type == "utterances" or test_type == "demonstrations" or test_type == "embedder")
    print("Selecting samples...")
    positions = random.sample(range(len(dataset)), n)
    samples = [dataset[p] for p in positions]
    print("Testing model...")
    if test_type == "utterances":
        results = test_model_utterances(samples, k=3, embedder_name="D2F", device=device)
    elif test_type == "demonstrations":
        results = test_model_demonstrations(samples, k=[1, 3, 5, 10], only_user=True, embedder_name="D2F", device=device)
    elif test_type == "demonstrations":
        results = test_model_embedder(samples, k=3, only_user = True, device=device)
    gts = [sample["slots"] for sample in samples]
    print("Evaluating joint goal accuracies...")
    jgas = [joint_goal_accuracy(r, gts) for r in results]
    print("Evaluating slot error rates...")
    sers = [slot_error_rate(r, gts) for r in results]
    return jgas, sers

In [43]:
jgas_utterances, sers_utterances = evaluate_model(dataset['test'], 2, device, test_type="utterances")
#jgas_demonstrations, sers_demonstrations = evaluate_model(dataset['test'], 1, device, test_type="demonstrations")
#jgas_embedder, sers_embedder = evaluate_model(dataset['test'], 1, device, test_type="embedder")

Selecting samples...
Testing model...


Processing samples:   0%|                                                                        | 0/1 [00:00<?, ?it/s]
Processing prompts:   0%|                                                                        | 0/2 [00:00<?, ?it/s][ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.

Processing prompts:  50%|███████████████████████████████▌                               | 1/2 [03:14<03:14, 194.32s/it][ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.

Processing prompts: 100%|███████████████████████████████████████████████████████████████| 2/2 [05:33<00:00, 162.17s/it][A
Processing samples: 100%|███████████████████████████████████████████████████████████████| 1/1 [05:58<00:00, 358.70s/it][A

Evaluating joint goal accuracies...
Evaluating slot error rates...
Utterances results: 
[0.4, 0.4]
[0.3, 0.3]





In [None]:
print("Utterances results: ")
print(jgas_utterances)
print(sers_utterances)
#print("Demonstrations results: ")
#print(jgas_demonstrations)
#print(sers_demonstrations)
#print("Embedder results: ")
#print(jgas_embedder)
#print(sers_embedder)

Tomiinek/MultiWOZ_Evaluation.git