# Import dependencies and load the dataset

In [1]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from hf_olmo import OLMoForCausalLM, OLMoTokenizerFast
import json
import numpy as np
import pickle
import os
import heapq

In [2]:
from huggingface_hub import login

# Never commit access tokens to a notebook. The token is read from the
# HF_TOKEN environment variable; when it is unset, `token=None` makes
# `login` fall back to its interactive prompt.
# NOTE(review): the token that was previously hard-coded here must be
# revoked in the Hugging Face account settings, since it has been exposed.
login(token=os.environ.get("HF_TOKEN"))

In [3]:
# Load the SpokenWOZ (Whisper-transcribed) splits and drop the "audio" column
# right away; the remaining columns are the ones printed in the next cell.
dataset = load_dataset("pirxus/spokenwoz-whisper").remove_columns("audio")

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [4]:
# Show the available splits with their columns and row counts, then display
# one training row to illustrate the schema of a single dialogue turn.
print(dataset)
dataset['train'][5]

DatasetDict({
    train: Dataset({
        features: ['wav_id', 'turn_index', 'text', 'agent_text', 'domains', 'slots', 'context'],
        num_rows: 73950
    })
    dev: Dataset({
        features: ['wav_id', 'turn_index', 'text', 'agent_text', 'domains', 'slots', 'context'],
        num_rows: 9104
    })
    test: Dataset({
        features: ['wav_id', 'turn_index', 'text', 'agent_text', 'domains', 'slots', 'context'],
        num_rows: 17652
    })
})


{'wav_id': 'MUL0003',
 'turn_index': 10,
 'text': 'I will go there on Friday.',
 'agent_text': 'and most of the time do like to arrive there.',
 'domains': "['restaurant']",
 'slots': "{'restaurant': {'day': 'Friday', 'people': '1', 'area': 'centre', 'food': 'Indian'}}",
 'context': {'turn_index': [0, 2, 4, 6, 8],
  'text': ['Hello, is this Customer Service Center?',
   "Well, I'm looking for a place to dine. Do you have any recommendation for me?",
   "Well, I'd like to stay in the place with convenient transportation.",
   "I'd like to try some Indian food.",
   'Yes, please. Book a table for one people.'],
  'agent_text': ['Yes, this is Cosmos Service Center. How may I help?',
   'Of course, there are plenty of restaurants. Do you have any specific area?',
   'Okay, do you have any specific food type?',
   'Let me check it for you. Yes, we got you a panel located in the center of the city, could meet your requirement. Do you want to book table?',
   'Okay, and what day would you lik

# Sentence embedder

In [5]:
# Sentence-embedding model; later cells pass it to embed_dataset and
# find_k_most_similar to encode JSON-serialised dataset rows.
embedder = SentenceTransformer('sergioburdisso/dialog2flow-joint-bert-base')

In [6]:
def embed_dataset(embedder, dataset, path, batch_size=128):
    """Embed every row of ``dataset`` and cache the result at ``path``.

    Each row is serialised with ``json.dumps`` and the resulting strings are
    passed to ``embedder.encode`` in batches of ``batch_size``.

    Args:
        embedder: Object exposing ``encode(texts, show_progress_bar=False)``
            that returns one embedding per input string, in input order.
        dataset: Sized collection of JSON-serialisable dict rows containing
            the keys ``'wav_id'`` and ``'turn_index'``. Rows are fetched one
            at a time with integer indexing, so both plain lists and
            dataset objects whose slices are not lists of rows are supported.
        path: File used as a pickle cache. If it already exists it is loaded
            and returned as-is, without looking at ``dataset`` or ``embedder``.
        batch_size: Number of rows encoded per ``embedder.encode`` call.

    Returns:
        A list with one dict per row, in dataset order, holding ``'wav_id'``,
        ``'turn_index'`` and ``'embedding'``.

    Note:
        The cache is keyed on ``path`` only: delete the file (or use another
        path) after changing the dataset, the subset size or the embedder.
    """
    # If the dataset was already embedded, load it from a file to save time.
    # NOTE(review): pickle.load can execute arbitrary code; only point `path`
    # at cache files produced by this notebook.
    if os.path.exists(path):
        with open(path, "rb") as f:
            return pickle.load(f)

    embedded_rows = []
    total = len(dataset)
    for batch_start in range(0, total, batch_size):
        batch_end = min(batch_start + batch_size, total)
        # Index row by row instead of slicing: slicing is only row-wise for
        # list-like inputs, and this also builds each batch exactly once.
        batch = [dataset[i] for i in range(batch_start, batch_end)]
        embeddings = embedder.encode(
            [json.dumps(row) for row in batch],
            show_progress_bar=False
        )
        for row, embedding in zip(batch, embeddings):
            embedded_rows.append({
                'wav_id': row['wav_id'],
                'turn_index': row['turn_index'],
                'embedding': embedding
            })

    with open(path, "wb") as f:
        pickle.dump(embedded_rows, f)
    return embedded_rows

# Embed only the first 512 training rows. Rows are fetched by index so the
# whole training split is not materialised just to keep a small prefix.
# embed_dataset returns the cached file when it already exists, so delete
# "embedded_train.dataset" after changing the subset size.
embedded_train = embed_dataset(
    embedder,
    [dataset['train'][i] for i in range(min(512, len(dataset['train'])))],
    "embedded_train.dataset"
)

In [7]:
#Verifying that the embedded dataset is correct and well sorted
def sample_distance(sample, embedded_sample, embedder):
    """Return the L2 distance between a stored embedding and a fresh one.

    ``sample`` is serialised with ``json.dumps`` and encoded with
    ``embedder``; the result is compared against
    ``embedded_sample['embedding']``.
    """
    fresh_embedding = embedder.encode(json.dumps(sample))
    difference = embedded_sample['embedding'] - fresh_embedding
    return np.linalg.norm(difference)

# Sanity check: re-embedding training row 120 and comparing it with the
# stored embedding at the same position should print a distance of 0.0 if
# the embedded rows are in the same order as the training split.
print(sample_distance(dataset['train'][120], embedded_train[120], embedder))


0.0


In [8]:
def find_k_most_similar(sample, k, embedder, dataset, embedded_dataset):
    """Return up to ``k`` rows of ``dataset`` closest to ``sample``.

    ``sample`` is serialised with ``json.dumps``, encoded with ``embedder``
    and compared (L2 distance) against every ``row['embedding']`` in
    ``embedded_dataset``. Rows at distance exactly 0 are treated as the
    sample itself and are never returned.

    Args:
        sample: JSON-serialisable query row.
        k: Maximum number of neighbours to return.
        embedder: Object exposing ``encode(text)``.
        dataset: Integer-indexable collection; ``dataset[i]`` must be the row
            that ``embedded_dataset[i]`` was computed from.
        embedded_dataset: Sequence of dicts with an ``'embedding'`` entry.

    Returns:
        ``(samples, distances)``: the selected rows and their distances as a
        list of floats, both ordered from nearest to farthest. Fewer than
        ``k`` entries are returned when there are not enough candidates.
    """
    if k <= 0 or len(embedded_dataset) == 0:
        return [], []

    query = np.asarray(embedder.encode(json.dumps(sample)))
    matrix = np.stack([row['embedding'] for row in embedded_dataset])

    dists = np.linalg.norm(matrix - query, axis=1)  # L2 distance
    dists[dists == 0] = np.inf  # Ignore exact match

    # Only rows with a finite distance are eligible, so an excluded exact
    # match can never be handed back when k is large.
    candidates = np.flatnonzero(np.isfinite(dists))
    k = min(k, candidates.size)
    if k == 0:
        return [], []

    if k < candidates.size:
        # argpartition needs kth < size; k - 1 puts the k smallest first.
        nearest = candidates[np.argpartition(dists[candidates], k - 1)[:k]]
    else:
        nearest = candidates
    nearest = nearest[np.argsort(dists[nearest])]

    samples = [dataset[int(i)] for i in nearest]
    distances = dists[nearest].tolist()
    return samples, distances

#find_k_most_similar(dataset['train'][120], 8, embedder, dataset['train'], embedded_train)

# Prompt creation

In [33]:
def _format_dialogue(sample, only_user):
    """Render one sample's dialogue history, current user turn and domains."""
    history = sample["context"]
    parts = []
    for j, user_turn in enumerate(history["text"]):
        parts.append(" User: " + user_turn)
        if not only_user:
            parts.append(" Agent: " + history["agent_text"][j])
    parts.append(" User: " + sample["text"])
    parts.append(" Domains: " + sample["domains"])
    return "".join(parts)


def create_prompt(sample, k, only_user=False):
    """Build a few-shot slot-filling prompt for ``sample``.

    The prompt starts with a fixed instruction, followed by the most similar
    training rows (looked up with ``find_k_most_similar`` over the notebook's
    ``embedder``, ``dataset['train']`` and ``embedded_train``) rendered as
    solved demonstrations, and ends with ``sample`` itself followed by an
    opening ``{`` for the model to continue.

    Args:
        sample: Row with ``'context'`` (holding ``'text'`` and
            ``'agent_text'`` lists), ``'text'`` and ``'domains'``.
        k: Number of demonstrations requested.
        only_user: When True, agent turns are left out of the dialogue
            history of both the demonstrations and the query.

    Returns:
        The prompt as a single string.
    """
    demonstrations, _ = find_k_most_similar(sample, k, embedder, dataset['train'], embedded_train)
    prompt = 'Instruction: Identify the slot keys and values.'
    # Iterate over what was actually returned rather than range(k), so a
    # shorter result list cannot raise IndexError.
    for demonstration in demonstrations:
        prompt += _format_dialogue(demonstration, only_user)
        prompt += " Slots: " + demonstration["slots"]
    #New sample
    prompt += _format_dialogue(sample, only_user)
    prompt += " Slots: {"
    return prompt
    
#create_prompt(dataset['train'][120], 3)   

# LLM

In [21]:
# "allenai/OLMo-1B-hf" is the checkpoint converted for native transformers
# support, so trust_remote_code is not needed; leaving it enabled would only
# allow code from the model repository to be executed.
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-1B-hf")
model = AutoModelForCausalLM.from_pretrained("allenai/OLMo-1B-hf")

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


In [40]:
# Build a 3-shot prompt for training row 100 using user turns only, and
# print it for inspection.
prompt = create_prompt(dataset['train'][100], k=3, only_user=True)
print(prompt)

Instruction: Identify the slot keys and values. User: Hello, I'm from the University of Cambridge. User: Yes, I'm looking for information in Cambridge. User: Well, I'm looking for a place to stay. User: Well, the hotel should include free Wi-Fi. Domains: ['hotel'] Slots: {'hotel': {'internet': 'yes'}} User: Hello? User: Can you please assist me in looking for a train, please? User: It should go to Stansted Airport and it should leave from Cambridge. User: Yes, can you please make it for boys to leave after 9.45pm and it should be on Friday. User: That should be on Friday. User: Okay. No problem at all. User: Could you please provide me with the price, the train ID, and the travel time? User: The train IE and also the travel time. User: Okay, thank you very much for that. User: Yes, could you also assist me in looking for a place to stay, please? User: It should basically have some free spacing for cars and also should be in the west and relatively have a star rating of 4 and lastly it 

In [42]:
# Tokenise the prompt and move all returned tensors to the model's device.
inputs = tokenizer(prompt, return_tensors="pt").to(device)
input_ids = inputs["input_ids"]

# Pass the attention mask explicitly instead of discarding it, so that
# generate() does not have to infer it from the input ids. The two tensors
# are passed by name rather than with **inputs so that any extra fields the
# tokenizer may return are not forwarded to the model.
output = model.generate(
    input_ids=input_ids,
    attention_mask=inputs["attention_mask"],
    max_new_tokens=50,
    do_sample=True,
    top_k=100,
    top_p=0.5
)
# The decoded text contains the prompt followed by the sampled continuation.
print(tokenizer.decode(output[0], skip_special_tokens=True))

Instruction: Identify the slot keys and values. User: Hello, I'm from the University of Cambridge. User: Yes, I'm looking for information in Cambridge. User: Well, I'm looking for a place to stay. User: Well, the hotel should include free Wi-Fi. Domains: ['hotel'] Slots: {'hotel': {'internet': 'yes'}} User: Hello? User: Can you please assist me in looking for a train, please? User: It should go to Stansted Airport and it should leave from Cambridge. User: Yes, can you please make it for boys to leave after 9.45pm and it should be on Friday. User: That should be on Friday. User: Okay. No problem at all. User: Could you please provide me with the price, the train ID, and the travel time? User: The train IE and also the travel time. User: Okay, thank you very much for that. User: Yes, could you also assist me in looking for a place to stay, please? User: It should basically have some free spacing for cars and also should be in the west and relatively have a star rating of 4 and lastly it 