# Install dependencies and dataset

In [34]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from hf_olmo import OLMoForCausalLM, OLMoTokenizerFast
import json
import numpy as np
import pickle
import os
import heapq

In [2]:
from huggingface_hub import login
login("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")

In [3]:
dataset = load_dataset("pirxus/spokenwoz-whisper")
dataset
dataset = dataset.remove_columns("audio")

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [4]:
print(dataset)
dataset['train'][5]

DatasetDict({
    train: Dataset({
        features: ['wav_id', 'turn_index', 'text', 'agent_text', 'domains', 'slots', 'context'],
        num_rows: 73950
    })
    dev: Dataset({
        features: ['wav_id', 'turn_index', 'text', 'agent_text', 'domains', 'slots', 'context'],
        num_rows: 9104
    })
    test: Dataset({
        features: ['wav_id', 'turn_index', 'text', 'agent_text', 'domains', 'slots', 'context'],
        num_rows: 17652
    })
})


{'wav_id': 'MUL0003',
 'turn_index': 10,
 'text': 'I will go there on Friday.',
 'agent_text': 'and most of the time do like to arrive there.',
 'domains': "['restaurant']",
 'slots': "{'restaurant': {'day': 'Friday', 'people': '1', 'area': 'centre', 'food': 'Indian'}}",
 'context': {'turn_index': [0, 2, 4, 6, 8],
  'text': ['Hello, is this Customer Service Center?',
   "Well, I'm looking for a place to dine. Do you have any recommendation for me?",
   "Well, I'd like to stay in the place with convenient transportation.",
   "I'd like to try some Indian food.",
   'Yes, please. Book a table for one people.'],
  'agent_text': ['Yes, this is Cosmos Service Center. How may I help?',
   'Of course, there are plenty of restaurants. Do you have any specific area?',
   'Okay, do you have any specific food type?',
   'Let me check it for you. Yes, we got you a panel located in the center of the city, could meet your requirement. Do you want to book table?',
   'Okay, and what day would you lik

# Sentence embedder

In [5]:
embedder = SentenceTransformer('sergioburdisso/dialog2flow-joint-bert-base')

In [22]:
def embed_dataset(embedder, dataset, path, batch_size=128):
    #If the dataset was already embedded, load it from a file to save time
    if os.path.exists(path):
        with open(path, "rb") as f:
            data = pickle.load(f)
        return data
    embedded_rows = [
    {
        'wav_id': sample['wav_id'],
        'turn_index': sample['turn_index'],
        'embedding': embedding
    }
    for batch_start in range(0, len(dataset), batch_size)
    for sample, embedding in zip(
        dataset[batch_start:batch_start+batch_size],
        embedder.encode(
            [json.dumps(sample) for sample in dataset[batch_start:batch_start+batch_size]],
            show_progress_bar=False
        )
    )]
    with open(path, "wb") as f:
        pickle.dump(embedded_rows, f)
    return embedded_rows 

embedded_train = embed_dataset(embedder, list(dataset['train'])[:512], "embedded_train.dataset")

In [26]:
#Verifying that the embedded dataset is correct and well sorted
def sample_distance(sample, embedded_sample, embedder):
    return np.linalg.norm(embedded_sample['embedding'] - embedder.encode(json.dumps(sample)))

#Verifying that the embedded dataset is correct and well sorted
print(sample_distance(dataset['train'][120], embedded_train[120], embedder))


0.0


In [47]:
def find_k_most_similar(sample, k, embedder, dataset, embedded_dataset):
    sample_emb = embedder.encode(json.dumps(sample))
    
    M = np.stack([row['embedding'] for row in embedded_dataset])
    s = np.array(sample_emb)
    
    dists = np.linalg.norm(M - s, axis=1) # L2 distance
    dists[dists == 0] = np.inf # Ignore exact match

    idx = np.argpartition(dists, k)[:k]
    idx = idx[np.argsort(dists[idx])]

    samples   = [dataset[i] for i in idx]
    distances = dists[idx].tolist()
    return samples, distances

#find_k_most_similar(dataset['train'][120], 8, embedder, dataset['train'], embedded_train)

([{'wav_id': 'MUL0006',
   'turn_index': 14,
   'text': 'My phone number is 842-',
   'agent_text': 'Okay.',
   'domains': "['hotel', 'profile', 'restaurant']",
   'slots': "{'hotel': {'name': 'Acorn Guest House'}, 'profile': {'name': 'Milan Cruz'}, 'restaurant': {'day': 'Tuesday', 'people': '5', 'time': '13:45', 'name': 'Pipasha Restaurant'}}",
   'context': {'turn_index': [0, 2, 4, 6, 8, 10, 12],
    'text': ["Hi, I'm looking for a restaurant called the Pipacha Restaurant.",
     'Today is Monday. Can you book a table for five persons at 1.45 p.m. tomorrow?',
     'My name is Milan Cruz, M-I-L-A-N-C-R-U-Z.',
     "No, that's fine.",
     "Also, I'm looking for a hotel called Atcorn Guesthouse.",
     'I want to know the area and also the hotel type and price range.',
     "Oh, that's great. Also, I need a taxi between these two places. I need a taxi to arrive at the restaurant by the book time."],
    'agent_text': ["The Pipa restaurant? Yes, I'm checking for you. And I have found th

# Prompt creation

In [None]:
def create_prompt(sample, k, only_user=False):
    (demonstrations, _) = find_k_most_similar(sample, k, embedder, dataset['train'], embedded_train)
    prompt = 'Instruction: Identify the slot value.\n'
    for i in k:
        for j in range(len(demonstrations[i].context.text))
            prompt += "User: " + demonstrations[i].context.text[j]
            prompt += "Agent: " + demonstrations[i].context.agent_text[j]
    return prompt
    
create_prompt(dataset['train'][120], 3)
create_prompt(dataset['train'][120], 3, True)    

# LLM

In [6]:
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-1B-hf")
model = AutoModelForCausalLM.from_pretrained("allenai/OLMo-1B-hf", trust_remote_code=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


In [7]:
prompt = 'Instruction: Identify the slot value. User: Can you help me get a taxi to Pizza Hut Fen Ditton? Agent: Sure. Where do you want to depart from? User: I want to depart from Sidney, Sussex College, also I need a reservation there. Domain: ["taxi", "restaurant"] Slots: {"taxi": {"arriveBy": "not mentioned", "departure": "sidney sussex college", "destination": "pizza hut fenditton", "leaveAt": "not mentioned"}, "restaurant": {"area": "centre", "day": "not mentioned", "food": "not mentioned", "name": "not mentioned", "people": "not mentioned", "pricerange": "expensive", "time": "not mentioned"}} User: I would like a taxi from Saint John’s College to Pizza Hut Fen Ditton. Domain: [“taxi"] Slots: {“arriveBy":'

In [8]:
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"].to(device)

output = model.generate(
    input_ids=input_ids,
    max_new_tokens=20,
    do_sample=True,
    top_k=50,
    top_p=0.95
)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Instruction: Identify the slot value. User: Can you help me get a taxi to Pizza Hut Fen Ditton? Agent: Sure. Where do you want to depart from? User: I want to depart from Sidney, Sussex College, also I need a reservation there. Domain: ["taxi", "restaurant"] Slots: {"taxi": {"arriveBy": "not mentioned", "departure": "sidney sussex college", "destination": "pizza hut fenditton", "leaveAt": "not mentioned"}, "restaurant": {"area": "centre", "day": "not mentioned", "food": "not mentioned", "name": "not mentioned", "people": "not mentioned", "pricerange": "expensive", "time": "not mentioned"}} User: I would like a taxi from Saint John’s College to Pizza Hut Fen Ditton. Domain: [“taxi"] Slots: {“arriveBy": "sidney sussex college", "depart
