# Install dependencies and dataset

In [1]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from hf_olmo import OLMoForCausalLM, OLMoTokenizerFast
import json
import numpy as np
import pickle
import os
import heapq

In [2]:
from huggingface_hub import login
login("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")

In [3]:
dataset = load_dataset("pirxus/spokenwoz-whisper")
dataset = dataset.remove_columns("audio")

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [4]:
print(dataset)
dataset['train'][5]

DatasetDict({
    train: Dataset({
        features: ['wav_id', 'turn_index', 'text', 'agent_text', 'domains', 'slots', 'context'],
        num_rows: 73950
    })
    dev: Dataset({
        features: ['wav_id', 'turn_index', 'text', 'agent_text', 'domains', 'slots', 'context'],
        num_rows: 9104
    })
    test: Dataset({
        features: ['wav_id', 'turn_index', 'text', 'agent_text', 'domains', 'slots', 'context'],
        num_rows: 17652
    })
})


{'wav_id': 'MUL0003',
 'turn_index': 10,
 'text': 'I will go there on Friday.',
 'agent_text': 'and most of the time do like to arrive there.',
 'domains': "['restaurant']",
 'slots': "{'restaurant': {'day': 'Friday', 'people': '1', 'area': 'centre', 'food': 'Indian'}}",
 'context': {'turn_index': [0, 2, 4, 6, 8],
  'text': ['Hello, is this Customer Service Center?',
   "Well, I'm looking for a place to dine. Do you have any recommendation for me?",
   "Well, I'd like to stay in the place with convenient transportation.",
   "I'd like to try some Indian food.",
   'Yes, please. Book a table for one people.'],
  'agent_text': ['Yes, this is Cosmos Service Center. How may I help?',
   'Of course, there are plenty of restaurants. Do you have any specific area?',
   'Okay, do you have any specific food type?',
   'Let me check it for you. Yes, we got you a panel located in the center of the city, could meet your requirement. Do you want to book table?',
   'Okay, and what day would you lik

# Sentence embedder

In [5]:
embedder = SentenceTransformer('sergioburdisso/dialog2flow-joint-bert-base')

In [6]:
def extract_dialogue(sample, only_user=False, include_state=True):
    dialogue = ""
    for j in range(len(sample["context"]["text"])):
        if only_user:
            dialogue += " " + sample["context"]["text"][j]
        else:
            dialogue += " User: " + sample["context"]["text"][j]
            dialogue += " Agent: " + sample["context"]["agent_text"][j]
    if not only_user:
        dialogue += " User:"
    dialogue += " " + sample["text"]
    if include_state:
        dialogue += " Domains: " + sample["domains"]
        dialogue += " Slots: " + sample["slots"]
    return dialogue

In [7]:
def embed_dataset(embedder, dataset, path, batch_size=128):
    #If the dataset was already embedded, load it from a file to save time
    if os.path.exists(path):
        with open(path, "rb") as f:
            data = pickle.load(f)
        return data
    embedded_rows = [
    {
        'wav_id': sample['wav_id'],
        'turn_index': sample['turn_index'],
        'embedding': embedding
    }
    for batch_start in range(0, len(dataset), batch_size)
    for sample, embedding in zip(
        dataset[batch_start:batch_start+batch_size],
        embedder.encode(
            [extract_dialogue(sample) for sample in dataset[batch_start:batch_start+batch_size]],
            show_progress_bar=False
        )
    )]
    with open(path, "wb") as f:
        pickle.dump(embedded_rows, f)
    return embedded_rows 

embedded_train = embed_dataset(embedder, list(dataset['train'])[:512], "embedded_train.dataset")

In [8]:
def find_k_most_similar(sample, k, embedder, dataset, embedded_dataset):
    sample_emb = embedder.encode(extract_dialogue(sample))
    
    M = np.stack([row['embedding'] for row in embedded_dataset])
    s = np.array(sample_emb)
    
    dists = np.linalg.norm(M - s, axis=1) # L2 distance
    dists[dists == 0] = np.inf # Ignore exact match

    idx = np.argpartition(dists, k)[:k]
    idx = idx[np.argsort(dists[idx])]

    samples   = [dataset[i] for i in idx]
    distances = dists[idx].tolist()
    return samples, distances

#find_k_most_similar(dataset['train'][120], 8, embedder, dataset['train'], embedded_train)

# Prompt creation

In [35]:
def create_prompt(sample, k, only_user=False):
    (demonstrations, _) = find_k_most_similar(sample, k, embedder, dataset['train'], embedded_train)
    prompt = 'Instruction: Identify the slot keys and values.\n'
    for i in range(k):
        prompt += extract_dialogue(demonstrations[i], only_user, include_state=True) + "\n"
    #New sample
    prompt += extract_dialogue(sample, only_user, include_state=False)
    prompt += " Domains: " + sample["domains"]
    prompt += " Slots: "
    return prompt
    
#create_prompt(dataset['train'][120], 3)   

# LLM and testing

In [10]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:01<?, ?B/s]

In [38]:
sample = dataset['train'][100]
prompt = create_prompt(sample, 3, True)
print(prompt)
messages = [
    #{"role": "system", "content": "Generate slot keys and values."},
    {"role": "user", "content": prompt}
]

Instruction: Identify the slot keys and values.
 Hi. Hi, I need to stay overnight in Cambridge. Please find me a hotel. I'd like to find an expensive one. Domains: ['hotel'] Slots: {'hotel': {'pricerange': 'expensive', 'type': 'hotel'}}
 Hello? Yes, I'm looking for the information in Cambridge. I'm looking for a restaurant. I suppose the restaurant should be in the expensive price range. I will travel in the eastern part of the city, so I prefer to eat there. That's right. Do you have any recommendation of the restaurant? Okay. Okay, Mano. Okay, I suppose I would like to try the second one. No, but I need some other information. My know their post code Go ahead. Thank you so much. May I know the address? God, Ed, thank you very much. Also, I'm looking for a place to stay. Well, I suppose the hotel should be in the type of guest house. Domains: ['hotel', 'restaurant'] Slots: {'hotel': {'type': 'guest house'}, 'restaurant': {'area': 'East', 'name': 'Royal Standard'}}
 Hello? Yes, I'm loo

In [37]:
#inputs = tokenizer.apply_chat_template(
#    messages,
#    add_generation_prompt=True,
#    tokenize=True,
#    return_dict=True,
#    return_tensors="pt",
#).to(model.device)

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
print("Ground truth: ", sample["slots"])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {'hotel': {'pricerange': 'expensive', 'type': 'hotel', 'internet': 'free', 'parking': 'lots'}}
 Hello? Yes, I'm looking for the
Ground truth:  {'hotel': {'internet': 'yes', 'parking': 'yes', 'pricerange': 'expensive', 'type': 'hotel'}}
