In [8]:
!pip install transformers accelerate torch datasets sentence_transformers --upgrade

Collecting sentence_transformers
  Using cached sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting scikit-learn (from sentence_transformers)
  Using cached scikit_learn-1.7.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting scipy (from sentence_transformers)
  Using cached scipy-1.15.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn->sentence_transformers)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence_transformers)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached sentence_transformers-5.1.0-py3-none-any.whl (483 kB)
Using cached scikit_learn-1.7.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.7 MB)
Downloading joblib-1.5.2-py3-none-any.whl (308 kB)
Using cached scipy-1.15.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 

# Checking locally trained model

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import re
import json

model_id = "./tinyllama-1.1b-lora-final"

tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

system_prompt = """You are 'Al', a helpful AI Assistant that controls the devices in a house. Complete the following task as instructed with the information provided only.
The current time and date is 08:12 AM on Thursday March 14, 2024
Services: light.turn_off(), light.turn_on(brightness,rgb_color), fan.turn_on(), fan.turn_off()
Devices:
light.office 'Office Light' = on;80%
fan.office 'Office fan' = off
light.kitchen 'Kitchen Light' = on;80%;red
light.bedroom 'Bedroom Light' = off
"""

user_message = "Turn off the kitchen light"

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_message}
]

# Prepare chat-formatted prompt
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate model output
outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    temperature=0.7,
    top_p=0.9,
    do_sample=True
)

decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Keep only assistant's message
if "assistant" in decoded:
    decoded = decoded.split("assistant", 1)[-1].strip()

# Extract JSON block
json_match = re.search(r"```homeassistant\s*({.*})\s*```", decoded, re.DOTALL)
json_block = json_match.group(1) if json_match else None

# Print final clean output
print("Assistant says:")
if json_match:
    natural_text = decoded.split("```homeassistant")[0].strip()
    print(natural_text)
    print("\nHome Assistant Command:")
    print(json_block)
else:
    print(decoded)


Assistant says:
|>
Turning off the kitchen light.

Home Assistant Command:
{"service": "light.turn_off", "target_device": "light.kitchen"}


# Evaluation pipeline

In [2]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json
import re

# CONFIG
MODEL_NAME = "./tinyllama-1.1b-lora-final"  
DATASET_NAME = "acon96/Home-Assistant-Requests"  
SPLIT = "test"  
MAX_SAMPLES = 50  

# Load dataset from Hugging Face
dataset = load_dataset(DATASET_NAME, split=SPLIT)

if MAX_SAMPLES:
    dataset = dataset.select(range(MAX_SAMPLES))

# Load model and tokenizer
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
)

# Helper: Extract system/user/assistant messages
def extract_messages(entry):
    """Extract system, user, assistant messages from list-of-dicts dataset format"""
    system_prompt, user_message, assistant_message = "", "", ""
    if isinstance(entry, list):
        for msg in entry:
            if msg["from"] == "system":
                system_prompt = msg["value"]
            elif msg["from"] == "user":
                user_message = msg["value"]
            elif msg["from"] == "assistant":
                assistant_message = msg["value"]
    return system_prompt, user_message, assistant_message

# Helper: Extract JSON from text
def extract_json(text):
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if match:
        try:
            return json.loads(match.group(0))
        except json.JSONDecodeError:
            return None
    return None

# Helper: Extract entity from JSON
def extract_entity(json_obj):
    if not json_obj:
        return None
    # Try "target_device" key
    if "target_device" in json_obj:
        return json_obj["target_device"]
    # Or nested inside target
    if "target" in json_obj and "entity_id" in json_obj["target"]:
        return json_obj["target"]["entity_id"]
    return None

# Evaluation loop
exact_json_matches = 0
exact_text_matches = 0
entity_matches = 0

for example in dataset:
    # Extract conversation messages
    #system_prompt, user_message, assistant_message = extract_messages(example["messages"])
    system_prompt, user_message, assistant_message = extract_messages(example)

    # Prepare prompt for model
    prompt = f"{system_prompt}\nUser: {user_message}\nAssistant:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate model output
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=256)
    model_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract JSON objects
    reference_json = extract_json(assistant_message)
    predicted_json = extract_json(model_response)

    # JSON exact match
    if reference_json == predicted_json:
        exact_json_matches += 1

    # Text exact match (ignores JSON)
    if assistant_message.strip() == model_response.strip():
        exact_text_matches += 1

    # Entity-level match
    if extract_entity(reference_json) == extract_entity(predicted_json):
        entity_matches += 1

# Compute metrics
total = len(dataset)
print(f"Total samples evaluated: {total}")
print(f"Exact JSON match accuracy: {exact_json_matches / total:.2%}")
print(f"Exact text match accuracy: {exact_text_matches / total:.2%}")
print(f"Entity-level accuracy: {entity_matches / total:.2%}")


`torch_dtype` is deprecated! Use `dtype` instead!


Total samples evaluated: 50
Exact JSON match accuracy: 100.00%
Exact text match accuracy: 0.00%
Entity-level accuracy: 100.00%


In [3]:
print(dataset[0])

{'conversations': [{'from': 'system', 'value': "You are 'Al', a helpful AI Assistant that controls the devices in a house. Complete the following task as instructed or answer the following question with the information provided only.\nServices: cover.close_cover(), cover.open_cover(), cover.stop_cover(), cover.toggle(), climate.set_fan_mode(fan_mode), climate.set_humidity(humidity), climate.set_hvac_mode(), climate.set_preset_mode(), climate.set_temperature(temperature), climate.toggle(), climate.turn_off(), climate.turn_on(), fan.decrease_speed(), fan.increase_speed(), fan.toggle(), fan.turn_off(), fan.turn_on(), light.toggle(), light.turn_off(), light.turn_on(rgb_color,brightness), lock.lock(), lock.unlock(), media_player.media_next_track(), media_player.media_pause(), media_player.media_play(), media_player.media_play_pause(), media_player.media_previous_track(), media_player.media_stop(), media_player.toggle(), media_player.turn_off(), media_player.turn_on(), media_player.volume_do

# Evaluation Pipeline - Detailed version

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import re
import json
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util

In [2]:
# Load model & tokenizer 
model_id = "./tinyllama-1.1b-lora-final"
tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

`torch_dtype` is deprecated! Use `dtype` instead!


In [9]:
# Load dataset & truncate
dataset = load_dataset("acon96/Home-Assistant-Requests", split="test[10:25]")

In [7]:
# Extract messages
def extract_messages(example):
    """
    Extract system, user, and assistant messages from dataset example['conversations'].
    """
    system_prompt = ""
    user_message = ""
    assistant_message = ""

    for m in example["conversations"]:
        if m["from"] == "system":
            system_prompt = m["value"]
        elif m["from"] == "user":
            user_message = m["value"]
        elif m["from"] == "assistant":
            assistant_message = m["value"]

    return system_prompt, user_message, assistant_message


# HELPERS 
def extract_codeblock_json(text):
    """Extract JSON from a triple-backtick code block, ignoring language name."""
    if not text:
        return None
    codeblock_match = re.search(r"```(?:\w+)?\s*([\s\S]*?)```", text)
    if codeblock_match:
        candidate = codeblock_match.group(1).strip()
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            return None
    return None

def extract_entity_from_json(j):
    """Extract first entity (target_device) from JSON dict."""
    if isinstance(j, dict):
        if "target_device" in j:
            return j["target_device"]
        # Also check nested keys
        for v in j.values():
            if isinstance(v, dict):
                ent = extract_entity_from_json(v)
                if ent:
                    return ent
    return None


# Extract JSON block from model output
def extract_json_block(text):
    match = re.search(r"```homeassistant\s*({.*})\s*```", text, re.DOTALL)
    return match.group(1).strip() if match else None

## Semantic Textual Similarity

In [10]:
# Load a pre-trained model for semantic textual similarity
sts_model = SentenceTransformer('all-MiniLM-L6-v2')

# Similarity evaluation
def semantic_similarity(ref_text, pred_text):
    # Generate embeddings for both sentences
    emb_ref = sts_model.encode(ref_text, convert_to_tensor=True)
    emb_pred = sts_model.encode(pred_text, convert_to_tensor=True)
    # Compute cosine similarity - ranges from -1 (opposite) to 1 (identical)
    score = util.pytorch_cos_sim(emb_ref, emb_pred).item()
    return score


# Counters for accuracy
total_samples = len(dataset)
json_correct = 0
entity_correct = 0

print("\n=== Sample-wise Evaluation ===\n")
for idx, example in enumerate(dataset):
    system_prompt, user_message, reference_text = extract_messages(example)
    ref_json = extract_codeblock_json(reference_text)
    ref_entity = extract_entity_from_json(ref_json) if ref_json else None

    # Build chat prompt
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message}
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Keep only assistant's part if needed
    if "assistant" in decoded:
        decoded = decoded.split("assistant", 1)[-1].strip()

    # Extract predicted JSON
    pred_json = extract_json_block(decoded)

    # Extract predicted entity
    pred_entity = None
    if pred_json:
        try:
            pred_dict = json.loads(pred_json)
            pred_entity = pred_dict.get("target_device", None)
        except json.JSONDecodeError:
            pred_entity = None

    pred_text = decoded.split("```")
    # Fix: ensure pred_text is a single string
    pred_text_str = pred_text[0].strip() if isinstance(pred_text, list) else str(pred_text).strip()

    # Compare results
    json_match_flag = (pred_json == json.dumps(ref_json)) if ref_json else False
    entity_match_flag = (pred_entity == ref_entity) if ref_entity else False

    # Update counters
    if json_match_flag:
        json_correct += 1
    if entity_match_flag:
        entity_correct += 1

    # Per-sample print
    print(f"Sample {idx} [ENTITY:{'✓' if entity_match_flag else '✗'}]")
    print(f" User: {user_message}")
    print(f" Ref text: {reference_text}")
    print(f" Pred text: {pred_text}")
    print(f" Ref json: {ref_json} | Pred json: {pred_json}")
    print(f" Ref entity: {ref_entity} | Pred entity: {pred_entity}")
    print("Similarity evaluation\n")
    similarity_score = semantic_similarity(reference_text.strip(), pred_text_str)
    print(f"Semantic similarity: {similarity_score:.2f}")
    print("-" * 60)


# Final accuracy results
json_accuracy = json_correct / total_samples * 100
entity_accuracy = entity_correct / total_samples * 100

print("\n=== Overall Accuracy ===")
#print(f"JSON Accuracy: {json_correct}/{total_samples} ({json_accuracy:.2f}%)")
print(f"Entity Accuracy: {entity_correct}/{total_samples} ({entity_accuracy:.2f}%)")


=== Sample-wise Evaluation ===

Sample 0 [ENTITY:✓]
 User: toggle the basement fan
 Ref text: flipping Basement state for you
```homeassistant
{"service": "fan.toggle", "target_device": "fan.basement"}
```
 Pred text: ["|>\nswitching the fan's state as requested.\n", 'homeassistant\n{"service": "fan.toggle", "target_device": "fan.basement"}\n', '']
 Ref json: {'service': 'fan.toggle', 'target_device': 'fan.basement'} | Pred json: {"service": "fan.toggle", "target_device": "fan.basement"}
 Ref entity: fan.basement | Pred entity: fan.basement
Similarity evaluation

Semantic similarity: 0.51
------------------------------------------------------------
Sample 1 [ENTITY:✓]
 User: toggle the bedroom fan
 Ref text: flipping Bedroom state for you
```homeassistant
{"service": "fan.toggle", "target_device": "fan.bedroom"}
```
 Pred text: ['|>\ntoggling the fan now.\n', 'homeassistant\n{"service": "fan.toggle", "target_device": "fan.bedroom"}\n', '']
 Ref json: {'service': 'fan.toggle', 'target_

## Evaluation using BERTScore

In [11]:
!pip install -q bert-score accelerate

In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import re
import json
from datasets import load_dataset
from bert_score import score

# Load model & tokenizer 
model_id = "./tinyllama-1.1b-lora-final"
tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Load dataset & truncate
dataset = load_dataset("acon96/Home-Assistant-Requests", split="test[50:65]")

# Extract messages
def extract_messages(example):
    system_prompt, user_message, assistant_message = "", "", ""
    for m in example["conversations"]:
        if m["from"] == "system":
            system_prompt = m["value"]
        elif m["from"] == "user":
            user_message = m["value"]
        elif m["from"] == "assistant":
            assistant_message = m["value"]
    return system_prompt, user_message, assistant_message

# HELPERS 
def extract_codeblock_json(text):
    if not text:
        return None
    codeblock_match = re.search(r"```(?:\w+)?\s*([\s\S]*?)```", text)
    if codeblock_match:
        candidate = codeblock_match.group(1).strip()
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            return None
    return None

def extract_entity_from_json(j):
    if isinstance(j, dict):
        if "target_device" in j:
            return j["target_device"]
        for v in j.values():
            if isinstance(v, dict):
                ent = extract_entity_from_json(v)
                if ent:
                    return ent
    return None

def extract_json_block(text):
    match = re.search(r"```homeassistant\s*({.*})\s*```", text, re.DOTALL)
    return match.group(1).strip() if match else None

# Counters for accuracy
total_samples = len(dataset)
json_correct = 0
entity_correct = 0

# Collect refs & preds for BERTScore
all_refs = []
all_preds = []

print("\n=== Sample-wise Evaluation ===\n")
for idx, example in enumerate(dataset):
    system_prompt, user_message, reference_text = extract_messages(example)
    ref_json = extract_codeblock_json(reference_text)
    ref_entity = extract_entity_from_json(ref_json) if ref_json else None

    # Build chat prompt
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message}
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if "assistant" in decoded:
        decoded = decoded.split("assistant", 1)[-1].strip()

    # Extract predicted JSON
    pred_json = extract_json_block(decoded)

    pred_entity = None
    if pred_json:
        try:
            pred_dict = json.loads(pred_json)
            pred_entity = pred_dict.get("target_device", None)
        except json.JSONDecodeError:
            pred_entity = None

    # Text outside JSON (natural language response)
    pred_text = decoded.split("```")
    pred_text_str = pred_text[0].strip() if isinstance(pred_text, list) else str(pred_text).strip()

    # Compare results
    json_match_flag = (pred_json == json.dumps(ref_json)) if ref_json else False
    entity_match_flag = (pred_entity == ref_entity) if ref_entity else False

    if json_match_flag:
        json_correct += 1
    if entity_match_flag:
        entity_correct += 1

    # Collect for BERTScore
    all_refs.append(reference_text.strip())
    all_preds.append(pred_text_str)

    print(f"Sample {idx} [ENTITY:{'✓' if entity_match_flag else '✗'}]")
    print(f" User: {user_message}")
    print(f" Ref text: {reference_text}")
    print(f" Pred text: {pred_text_str}")
    print(f" Ref json: {ref_json} | Pred json: {pred_json}")
    print(f" Ref entity: {ref_entity} | Pred entity: {pred_entity}")
    print("-" * 60)

# === Final metrics ===
json_accuracy = json_correct / total_samples * 100
entity_accuracy = entity_correct / total_samples * 100

print("\n=== Overall Accuracy ===")
print(f"Entity Accuracy: {entity_correct}/{total_samples} ({entity_accuracy:.2f}%)")

# === BERTScore ===
P, R, F1 = score(all_preds, all_refs, lang="en", verbose=True)
print("\n=== BERTScore Results ===")
print(f"Precision: {P.mean().item():.4f}")
print(f"Recall:    {R.mean().item():.4f}")
print(f"F1 Score:  {F1.mean().item():.4f}")



=== Sample-wise Evaluation ===

Sample 0 [ENTITY:✓]
 User: please lock the guest room door.
 Ref text: locking Guest Room Door
```homeassistant
{"service": "lock.lock", "target_device": "lock.guest_room_door"}
```
 Pred text: |>
locking Guest Room Door
 Ref json: {'service': 'lock.lock', 'target_device': 'lock.guest_room_door'} | Pred json: {"service": "lock.lock", "target_device": "lock.guest_room_door"}
 Ref entity: lock.guest_room_door | Pred entity: lock.guest_room_door
------------------------------------------------------------
Sample 1 [ENTITY:✓]
 User: attic door needs locking.
 Ref text: locking the door for you.
```homeassistant
{"service": "lock.lock", "target_device": "lock.attic_door"}
```
 Pred text: |>
attic door lock is now in process of being locked
 Ref json: {'service': 'lock.lock', 'target_device': 'lock.attic_door'} | Pred json: {"service": "lock.lock", "target_device": "lock.attic_door"}
 Ref entity: lock.attic_door | Pred entity: lock.attic_door
----------------

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.12 seconds, 127.78 sentences/sec

=== BERTScore Results ===
Precision: 0.8575
Recall:    0.7897
F1 Score:  0.8221
