In [1]:
!pip install torch transformers datasets peft accelerate bitsandbytes scikit-learn tqdm trl # Tejaaa yeh bhi daalna tha lol

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting trl
  Downloading trl-0.25.1-py3-none-any.whl.metadata (11 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.25.1-py3-none-any.whl (465 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.5/465.5 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, trl
Successfully installed bitsandbytes-0.48.2 trl-0.25.1


In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import torch
import os
import json
from tqdm import tqdm
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    get_linear_schedule_with_warmup
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from torch.utils.data import DataLoader


def load_tool_calling_dataset(dataset_name="Salesforce/xlam-function-calling-60k", split="train", start=None, end=None) -> Dataset:
    """
    Loads a tool-calling dataset from the Hugging Face Hub.
    """
    try:
        dataset = load_dataset(dataset_name, split=split)
        if start is not None and end is not None:
            dataset = dataset.select(range(start, end))
        return dataset
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

def preprocess_data(examples, tokenizer):
    """
    Preprocesses the data into a structured prompt format for LFM2.
    LFM2 uses a simple chat template with <|im_start|> and <|im_end|> tokens.
    """
    processed_examples = {"text": []}
    for i in range(len(examples['query'])):
        query = examples['query'][i]
        tools = json.loads(examples['tools'][i])
        answers = json.loads(examples['answers'][i])

        # Create a simplified tool definition
        tool_defs = []
        for tool in tools:
            func_name = tool.get('name')
            params = tool.get('parameters', {}).get('properties', {})
            param_defs = [f"{pname}: {p.get('type') if isinstance(p, dict) else p}" for pname, p in params.items()]
            tool_defs.append(f"def {func_name}({', '.join(param_defs)}):")

        tool_defs_str = "\n".join(tool_defs)

        # Create the tool calls
        tool_calls = []
        for answer in answers:
            func_name = answer.get('name')
            args = answer.get('arguments', {})
            tool_calls.append(f"{func_name}(**{json.dumps(args)})")

        tool_calls_str = "\n".join(tool_calls)

        # Build the prompt using LFM2 chat template
        # LFM2 uses <|im_start|> and <|im_end|> tokens similar to ChatML format
        prompt = (
            f"<|im_start|>system\n"
            f"You are an expert AI assistant with access to a suite of tools. Use them to answer the user's question.\n"
            f"Available tools:\n{tool_defs_str}<|im_end|>\n"
            f"<|im_start|>user\n"
            f"{query}<|im_end|>\n"
            f"<|im_start|>assistant\n"
            f"<tool_code>\n{tool_calls_str}<|im_end|>"
        )
        processed_examples["text"].append(prompt)

    return processed_examples

def tokenize_function(examples, tokenizer, max_length=1024):
    """
    Tokenize the text examples.
    """
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
        padding='max_length',
        return_tensors=None
    )
    # Set labels to be the same as input_ids for causal LM
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

def collate_fn(batch):
    """
    Custom collate function to prepare batches.
    """
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    labels = torch.stack([item['labels'] for item in batch])

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

def train_epoch(model, dataloader, optimizer, scheduler, device, gradient_accumulation_steps=2):
    """
    Train for one epoch.
    """
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    progress_bar = tqdm(dataloader, desc="Training")

    for step, batch in enumerate(progress_bar):
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        # Check for NaN/Inf loss immediately
        if torch.isnan(outputs.loss) or torch.isinf(outputs.loss):
            print(f"Detected NaN/Inf loss at step {step}. Skipping step.")
            optimizer.zero_grad()
            continue

        loss = outputs.loss / gradient_accumulation_steps
        total_loss += loss.item()

        # Backward pass
        loss.backward()

        # Update weights every gradient_accumulation_steps
        if (step + 1) % gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # Update progress bar
        if (step + 1) % 10 == 0:
            progress_bar.set_postfix({'loss': loss.item() * gradient_accumulation_steps})

    # Calculate average loss
    if len(dataloader) > 0:
        return total_loss / len(dataloader)
    return 0.0

def main():
    # Set CUDA device
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Model and tokenizer names
    model_name = "LiquidAI/LFM2-350M"

    # Hyperparameters
    num_epochs = 1
    batch_size = 2
    gradient_accumulation_steps = 2
    learning_rate = 3e-4
    warmup_steps = 100
    max_length = 1024

    # Load dataset
    print("Loading dataset...")
    dataset = load_tool_calling_dataset()
    if not dataset:
        print("Failed to load dataset. Exiting.")
        return

    # Use a subset for faster training
    dataset = dataset.select(range(6000))
    print(f"Dataset size: {len(dataset)}")

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Set padding token if not present
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    tokenizer.padding_side = 'left'

    # Preprocess the dataset
    print("Preprocessing dataset...")
    processed_dataset = dataset.map(
        lambda examples: preprocess_data(examples, tokenizer),
        batched=True,
        remove_columns=dataset.column_names
    )

    # Tokenize the dataset
    print("Tokenizing dataset...")
    tokenized_dataset = processed_dataset.map(
        lambda examples: tokenize_function(examples, tokenizer, max_length=max_length),
        batched=True,
        remove_columns=processed_dataset.column_names
    )

    # Set format for PyTorch
    tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    # Create DataLoader
    dataloader = DataLoader(
        tokenized_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
        pin_memory=True
    )

    print(f"Number of batches: {len(dataloader)}")

    # Load model with quantization
    print("Loading model...")

    use_quantization = True
    try:
        from transformers import BitsAndBytesConfig as BnBConfig
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True
        )
        print("Using 4-bit quantization with torch.bfloat16 compute dtype...")
    except ImportError:
        print("bitsandbytes not available, loading model in float16...")
        use_quantization = False
        quantization_config = None

    if use_quantization:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            trust_remote_code=True,
            device_map="auto"
        )
        model = prepare_model_for_kbit_training(model)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16,
            device_map="auto"
        )

    model.gradient_checkpointing_enable()

    # LoRA configuration for LFM2
    # LFM2 uses a different architecture, so we target appropriate modules
    print("Applying LoRA...")
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        # Target modules for LFM2 architecture
        # LFM2 typically has q_proj, k_proj, v_proj, o_proj for attention
        # and gate_proj, up_proj, down_proj for FFN layers
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    # Calculate total training steps
    total_steps = len(dataloader) * num_epochs // gradient_accumulation_steps

    # Learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    # Training loop
    print("\nStarting training...")
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")
        avg_loss = train_epoch(model, dataloader, optimizer, scheduler, device, gradient_accumulation_steps)
        print(f"Average loss: {avg_loss:.4f}")

        # Save checkpoint after each epoch
        checkpoint_dir = f"./models/lfm2-350m-tool-calling/checkpoint-epoch-{epoch+1}"
        os.makedirs(checkpoint_dir, exist_ok=True)
        model.save_pretrained(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)
        print(f"Checkpoint saved to {checkpoint_dir}")

    print("\nTraining finished.")

    # Save the final model
    final_dir = "./models/lfm2-350m-tool-calling-final"
    os.makedirs(final_dir, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
    print(f"Final model saved to {final_dir}")

if __name__ == "__main__":
    main()

Using device: cuda
Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

xlam_function_calling_60k.json:   0%|          | 0.00/96.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/60000 [00:00<?, ? examples/s]

Dataset size: 6000
Loading tokenizer...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Preprocessing dataset...


Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Tokenizing dataset...


Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Number of batches: 3000
Loading model...
Using 4-bit quantization with torch.bfloat16 compute dtype...


config.json:   0%|          | 0.00/999 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/709M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Applying LoRA...
trainable params: 491,520 || all params: 354,975,488 || trainable%: 0.1385

Starting training...

Epoch 1/1


Training:   0%|          | 0/3000 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
Training: 100%|██████████| 3000/3000 [1:29:25<00:00,  1.79s/it, loss=12.8]


Average loss: 6.5394
Checkpoint saved to ./models/lfm2-350m-tool-calling/checkpoint-epoch-1

Training finished.
Final model saved to ./models/lfm2-350m-tool-calling-final


In [4]:
!zip -r /content/LFM_FT.zip /content/models

  adding: content/models/ (stored 0%)
  adding: content/models/lfm2-350m-tool-calling/ (stored 0%)
  adding: content/models/lfm2-350m-tool-calling/checkpoint-epoch-1/ (stored 0%)
  adding: content/models/lfm2-350m-tool-calling/checkpoint-epoch-1/README.md (deflated 65%)
  adding: content/models/lfm2-350m-tool-calling/checkpoint-epoch-1/adapter_model.safetensors (deflated 7%)
  adding: content/models/lfm2-350m-tool-calling/checkpoint-epoch-1/special_tokens_map.json (deflated 72%)
  adding: content/models/lfm2-350m-tool-calling/checkpoint-epoch-1/adapter_config.json (deflated 58%)
  adding: content/models/lfm2-350m-tool-calling/checkpoint-epoch-1/chat_template.jinja (deflated 71%)
  adding: content/models/lfm2-350m-tool-calling/checkpoint-epoch-1/tokenizer.json (deflated 81%)
  adding: content/models/lfm2-350m-tool-calling/checkpoint-epoch-1/tokenizer_config.json (deflated 96%)
  adding: content/models/lfm2-350m-tool-calling-final/ (stored 0%)
  adding: content/models/lfm2-350m-tool-call

In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel
import json
from tqdm import tqdm
import re
import gc
import os
from datasets import load_dataset, Dataset
import logging
from datetime import datetime
import numpy as np

# --- CONFIGURATION & LOGGING ---
RESULTS_FILE = "evaluation_results_lfm2.json"
LOG_FILE = f"eval_log_lfm2_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[
                        logging.FileHandler(LOG_FILE),
                        logging.StreamHandler()
                    ])
logger = logging.getLogger(__name__)

# --- HELPER FUNCTIONS ---

def load_tool_calling_dataset(dataset_name="Salesforce/xlam-function-calling-60k", split="train", start=None, end=None) -> Dataset:
    try:
        dataset = load_dataset(dataset_name, split=split)
        if start is not None and end is not None:
            dataset = dataset.select(range(start, end))
        return dataset
    except Exception as e:
        logger.error(f"Error loading dataset: {e}")
        return None

def format_prompt_for_eval(sample):
    """
    Format prompt using LFM2's ChatML-style template with <|im_start|> and <|im_end|> tokens.
    """
    query = sample['query']
    tools = json.loads(sample['tools']) if isinstance(sample['tools'], str) else sample['tools']

    tool_defs = []
    for tool in tools:
        func_name = tool.get('name')
        params = tool.get('parameters', {}).get('properties', {})
        param_defs = [f"{pname}: {p.get('type') if isinstance(p, dict) else p}" for pname, p in params.items()]
        tool_defs.append(f"def {func_name}({', '.join(param_defs)}):")

    tool_defs_str = "\n".join(tool_defs)

    prompt = (
        f"<|im_start|>system\n"
        f"You are an expert AI assistant with access to a suite of tools. Use them to answer the user's question.\n"
        f"Available tools:\n{tool_defs_str}<|im_end|>\n"
        f"<|im_start|>user\n"
        f"{query}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    return prompt

def parse_tool_calls(prediction_str: str):
    """
    Parse tool calls from LFM2 output.
    Handles both <|im_end|> and <tool_code> markers.
    """
    # Strip the end token
    prediction_str = prediction_str.split('<|im_end|>')[0].strip()

    # Extract tool code section if present
    if "<tool_code>" in prediction_str:
        prediction_str = prediction_str.split("<tool_code>")[-1].strip()

    pattern = re.compile(r"(\w+)\((.*?)\)")
    matches = pattern.findall(prediction_str)

    calls = []
    for match in matches:
        func_name = match[0]
        args_str = match[1].strip()

        # Skip Python built-ins
        if func_name in ['print', 'return', 'exit', 'assert']:
            continue

        try:
            if args_str.startswith('**'):
                args_str = args_str[2:].strip()

            args_dict = json.loads(args_str)
            calls.append({"name": func_name, "arguments": args_dict})

        except Exception:
            logger.warning(f"Failed to parse arguments for call: {func_name}({args_str}) - Skipping.")
            continue

    return calls

def calculate_metrics(predictions, ground_truths):
    """Calculates Exact Match, Call Precision, Call Recall, and Call F1"""
    gt_list = json.loads(ground_truths) if isinstance(ground_truths, str) else ground_truths

    pred_set = {json.dumps(p, sort_keys=True) for p in predictions}
    gt_set = {json.dumps(g, sort_keys=True) for g in gt_list}

    correct_calls = len(pred_set.intersection(gt_set))

    precision = correct_calls / len(pred_set) if len(pred_set) > 0 else 0
    recall = correct_calls / len(gt_set) if len(gt_set) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    exact_match = 1 if pred_set == gt_set and len(gt_set) > 0 else 0

    return precision, recall, f1, exact_match

def calculate_tool_select_metrics(predictions, ground_truths):
    """
    Calculates Tool Select F1 score based only on the set of function names used.
    """
    gt_list = json.loads(ground_truths) if isinstance(ground_truths, str) else ground_truths

    # Get the unique set of predicted and ground truth function names
    pred_tools = {p['name'] for p in predictions}
    gt_tools = {g['name'] for g in gt_list}

    # Calculate components
    correct_tool_selections = len(pred_tools.intersection(gt_tools))

    ts_precision = correct_tool_selections / len(pred_tools) if len(pred_tools) > 0 else 0
    ts_recall = correct_tool_selections / len(gt_tools) if len(gt_tools) > 0 else 0

    # Calculate Tool Select F1
    ts_f1 = 2 * (ts_precision * ts_recall) / (ts_precision + ts_recall) if (ts_precision + ts_recall) > 0 else 0

    return ts_f1

def calculate_hallucination_rate(predictions, available_tools):
    """Calculate the rate of hallucinated tool calls (non-existent tools or parameters)"""
    if not predictions:
        return 0

    available_tools_list = json.loads(available_tools) if isinstance(available_tools, str) else available_tools

    available_tool_map = {
        tool['name']: set(tool.get('parameters', {}).get('properties', {}).keys())
        for tool in available_tools_list
    }

    hallucinated_calls_count = 0

    for pred in predictions:
        if pred['name'] not in available_tool_map:
            hallucinated_calls_count += 1
            continue

        valid_args = available_tool_map[pred['name']]
        for arg_name in pred['arguments']:
            if arg_name not in valid_args:
                hallucinated_calls_count += 1
                break

    return hallucinated_calls_count / len(predictions)

def save_metrics_to_json(raw_metrics, total_samples, file_path=RESULTS_FILE):
    """Aggregates and saves all evaluation results to a JSON file."""

    final_results = {}

    for model_name, res_list in raw_metrics.items():
        num_valid_samples = len(res_list)

        if num_valid_samples == 0:
            agg_data = {
                "total_samples": total_samples,
                "valid_samples": 0,
                "error": "No valid predictions processed."
            }
        else:
            # Aggregate all raw metrics
            p_sum = sum(res['p'] for res in res_list)
            r_sum = sum(res['r'] for res in res_list)
            f1_sum = sum(res['f1'] for res in res_list)
            em_sum = sum(res['em'] for res in res_list)
            hr_sum = sum(res['hr'] for res in res_list)
            ts_f1_sum = sum(res['ts_f1'] for res in res_list)

            agg_data = {
                "total_samples": total_samples,
                "valid_samples": num_valid_samples,
                "avg_precision": p_sum / num_valid_samples,
                "avg_recall": r_sum / num_valid_samples,
                "avg_f1_score": f1_sum / num_valid_samples,
                "avg_tool_select_f1_score": ts_f1_sum / num_valid_samples,
                "exact_match_rate": em_sum / num_valid_samples,
                "avg_hallucination_rate": hr_sum / num_valid_samples,
                "individual_samples": res_list
            }

        final_results[model_name] = agg_data

    with open(file_path, 'w') as f:
        json.dump(final_results, f, indent=4)

    logger.info(f"All evaluation metrics saved to {file_path}")
    return final_results

# --- Main Evaluation Logic ---
def main():
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    logger.info("Starting LFM2 evaluation script...")

    # Update model names for LFM2
    base_model_name = "LiquidAI/LFM2-350M"
    peft_model_path = "./models/lfm2-350m-tool-calling-final"

    # Load evaluation subset
    eval_dataset = load_tool_calling_dataset(split="train", start=6000, end=7000)
    if not eval_dataset:
        logger.error("Failed to load evaluation dataset. Exiting.")
        return

    logger.info(f"Loaded evaluation subset of size: {len(eval_dataset)}")
    total_samples = len(eval_dataset)

    # Load Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'

    prompts = [format_prompt_for_eval(sample) for sample in eval_dataset]

    raw_metrics = {
        "fine_tuned": [],
        "base": []
    }

    generation_params = {
        "max_new_tokens": 128,
        "pad_token_id": tokenizer.pad_token_id,
        "batch_size": 4,
        "return_full_text": True,
        "do_sample": False
    }

    # --- Evaluate Fine-Tuned Model ---
    logger.info("Loading and evaluating fine-tuned LFM2 model...")
    try:
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            device_map="auto"
        )
        ft_model = PeftModel.from_pretrained(base_model, peft_model_path)
        ft_model = ft_model.merge_and_unload()

        ft_pipe = pipeline(
            "text-generation",
            model=ft_model,
            tokenizer=tokenizer,
            device_map="auto",
            torch_dtype=torch.bfloat16
        )

        ft_outputs = ft_pipe(prompts, **generation_params)

        for i, sample in tqdm(enumerate(eval_dataset), desc="Metrics (Fine-Tuned)", total=total_samples):
            ground_truths = sample['answers']
            available_tools = sample['tools']

            ft_raw_output = ""
            if ft_outputs and len(ft_outputs[i]) > 0:
                ft_raw_output = ft_outputs[i][0].get('generated_text', '')

            if not ft_raw_output:
                logger.warning(f"Skipping fine-tuned sample {i} due to failed/empty generation.")
                continue

            # Extract the assistant's response after the last <|im_start|>assistant
            ft_pred_str = ft_raw_output.split('<|im_start|>assistant\n')[-1].strip()

            ft_preds = parse_tool_calls(ft_pred_str)

            # Calculate metrics
            ts_f1 = calculate_tool_select_metrics(ft_preds, ground_truths)
            p, r, f1, em = calculate_metrics(ft_preds, ground_truths)
            hr = calculate_hallucination_rate(ft_preds, available_tools)

            raw_metrics["fine_tuned"].append({'p': p, 'r': r, 'f1': f1, 'em': em, 'hr': hr, 'ts_f1': ts_f1})

    except Exception as e:
        logger.error(f"Error during Fine-Tuned Model evaluation: {e}")

    del base_model, ft_model, ft_pipe
    gc.collect()
    torch.cuda.empty_cache()
    logger.info("Finished fine-tuned model evaluation and cleared memory.")

    # --- Evaluate Base Model ---
    logger.info("\nLoading and evaluating base LFM2 model...")
    try:
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            device_map="auto"
        )
        base_pipe = pipeline(
            "text-generation",
            model=base_model,
            tokenizer=tokenizer,
            device_map="auto",
            torch_dtype=torch.bfloat16
        )
        base_outputs = base_pipe(prompts, **generation_params)

        logger.setLevel(logging.CRITICAL) # Suppress warnings during fast TQDM loop

        for i, sample in tqdm(enumerate(eval_dataset), desc="Metrics (Base)", total=total_samples):
            ground_truths = sample['answers']
            available_tools = sample['tools']

            base_raw_output = ""
            if base_outputs and len(base_outputs[i]) > 0:
                base_raw_output = base_outputs[i][0].get('generated_text', '')

            if not base_raw_output:
                continue

            # Extract the assistant's response after the last <|im_start|>assistant
            base_pred_str = base_raw_output.split('<|im_start|>assistant\n')[-1].strip()

            base_preds = parse_tool_calls(base_pred_str)

            # Calculate metrics
            ts_f1 = calculate_tool_select_metrics(base_preds, ground_truths)
            p, r, f1, em = calculate_metrics(base_preds, ground_truths)
            hr = calculate_hallucination_rate(base_preds, available_tools)

            raw_metrics["base"].append({'p': p, 'r': r, 'f1': f1, 'em': em, 'hr': hr, 'ts_f1': ts_f1})

        logger.setLevel(logging.INFO) # Re-enable logging

    except Exception as e:
        logger.error(f"Error during Base Model evaluation: {e}")

    del base_model, base_pipe
    gc.collect()
    torch.cuda.empty_cache()
    logger.info("Finished base model evaluation and cleared memory.")

    # --- SAVE AND PRINT RESULTS ---

    # Save the raw metrics and aggregated results to JSON
    aggregated_results = save_metrics_to_json(raw_metrics, total_samples)

    # Print the final summary
    logger.info("\n--- LFM2 Evaluation Results Summary ---")
    print("\n--- LFM2 Evaluation Results Summary ---")

    for model_name, res_data in aggregated_results.items():

        if "error" in res_data:
            log_message = f"\nModel: {model_name.replace('_', ' ').title()} - {res_data['error']}"
        else:
            log_message = (
                f"\nModel: {model_name.replace('_', ' ').title()} (N={res_data['valid_samples']}/{res_data['total_samples']} total)\n"
                f"  Exact Match Rate: {res_data['exact_match_rate']:.4f}\n"
                f"  Tool Call F1: {res_data['avg_f1_score']:.4f}\n"
                f"  Tool Select F1: {res_data['avg_tool_select_f1_score']:.4f}\n"
                f"  Precision: {res_data['avg_precision']:.4f}\n"
                f"  Recall: {res_data['avg_recall']:.4f}\n"
                f"  Hallucination Rate: {res_data['avg_hallucination_rate']:.4f}"
            )

        logger.info(log_message)
        print(log_message)

if __name__ == "__main__":
    main()

INFO:__main__:Starting LFM2 evaluation script...
INFO:__main__:Loaded evaluation subset of size: 1000
INFO:__main__:Loading and evaluating fine-tuned LFM2 model...
Device set to use cuda:0
Metrics (Fine-Tuned):   1%|          | 9/1000 [00:00<00:00, 1945.51it/s]
ERROR:__main__:Error during Fine-Tuned Model evaluation: 'int' object is not iterable
INFO:__main__:Finished fine-tuned model evaluation and cleared memory.
INFO:__main__:
Loading and evaluating base LFM2 model...
Device set to use cuda:0
Metrics (Base):  45%|████▍     | 448/1000 [00:00<00:00, 8119.47it/s]



--- LFM2 Evaluation Results Summary ---

Model: Fine Tuned (N=9/1000 total)
  Exact Match Rate: 0.0000
  Tool Call F1: 0.0000
  Tool Select F1: 0.8852
  Precision: 0.0000
  Recall: 0.0000
  Hallucination Rate: 1.0000

Model: Base (N=448/1000 total)
  Exact Match Rate: 0.0000
  Tool Call F1: 0.0000
  Tool Select F1: 0.0022
  Precision: 0.0000
  Recall: 0.0000
  Hallucination Rate: 0.0112
