# Compare Base vs Fine-Tuned Model

Code authored by: Shaw Talebi

### imports

In [1]:
import pandas as pd
from datasets import load_dataset
from functions import run_inference, calculate_metrics, confusion_matrix


### load data

In [2]:
# fetch the HDFS log-block dataset from the Hugging Face Hub; no split is
# requested here, later cells index the result by split name (e.g. "test")
hf_dataset_id = "shawhin/HDFS_v1_blocks"
ds = load_dataset(hf_dataset_id)

In [11]:
# size of the evaluation subset drawn from the test split
num_test = 50

# shuffle with a fixed seed so the same examples are picked on every run,
# then keep the first `num_test` of them
shuffled_test = ds["test"].shuffle(seed=0)
test_sample = shuffled_test.select(range(num_test))

# class balance of the subset: summing the label column counts the
# anomalous examples, everything else is reported as normal
test_positive = sum(test_sample["label"])
test_negative = len(test_sample) - test_positive

print(f"Test: {test_positive} anomalous, {test_negative} normal")

Test: 2 anomalous, 48 normal


### response format

In [4]:
# JSON schema of the object the model must return: a boolean verdict plus a
# free-text justification, both mandatory, with no extra keys allowed
anomaly_flag_schema = {
    "type": "object",
    "properties": {
        "Anomalous": {
            "type": "boolean",
            "description": "True if the log block is anomalous, false otherwise.",
        },
        "reasoning": {
            "type": "string",
            "description": "Explanation of the decision regarding whether the block is anomalous.",
        },
    },
    "required": ["Anomalous", "reasoning"],
    "additionalProperties": False,
}

# structured-output request wrapper handed to run_inference in later cells
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "anomaly_flag",
        "strict": True,
        "schema": anomaly_flag_schema,
    },
}

### evaluate base model

In [12]:
base_model = "o4-mini-2025-04-16"

# ground-truth flags for the sampled test examples, cast to bool;
# this list is reused when scoring the fine-tuned model
test_labels = [bool(flag) for flag in test_sample["label"]]

# run the base model on the test sample (not a validation split) and score it
base_predictions = run_inference(base_model, test_sample, response_format)
base_metrics = calculate_metrics(base_predictions, test_labels)

# header line followed by one indented "name: value" line per metric
report_lines = [f"Base model ({base_model}) metrics:"]
report_lines += [f"  {name}: {score:.4f}" for name, score in base_metrics.items()]
print("\n".join(report_lines))

Running o4-mini-2025-04-16: 100%|██████████| 50/50 [04:29<00:00,  5.39s/it]

Base model (o4-mini-2025-04-16) metrics:
  accuracy: 0.6600
  precision: 0.0000
  recall: 0.0000
  f1: 0.0000





### evaluate fine-tuned model

In [None]:
# identifier of the fine-tuned model to evaluate; left blank in the notebook,
# fill in your own fine-tuned model ID before running this cell
ft_model = ""

# fail fast with an actionable message: without this check the blank
# placeholder would be passed straight to run_inference
if not ft_model.strip():
    raise ValueError(
        "ft_model is empty: set it to your fine-tuned model ID before running this cell."
    )

# run fine-tuned model on the same test sample and labels used for the base model
ft_predictions = run_inference(ft_model, test_sample, response_format)
ft_metrics = calculate_metrics(ft_predictions, test_labels)

print(f"Fine-tuned model ({ft_model}) metrics:")
for metric, value in ft_metrics.items():
    print(f"  {metric}: {value:.4f}")

### compare models

In [None]:
# tabulate both metric dicts: after the transpose there is one row per model
# and one column per metric
metrics_by_model = {
    "Base Model": base_metrics,
    "Fine-tuned Model": ft_metrics,
}
comparison = pd.DataFrame(metrics_by_model).T

# append the per-metric delta (fine-tuned minus base) as an extra row
improvement = {}
for metric_name in base_metrics:
    improvement[metric_name] = ft_metrics[metric_name] - base_metrics[metric_name]
comparison.loc["Improvement"] = improvement

print("Model Comparison:")
print(comparison.round(4).to_string())

# NOTE(review): confusion_matrix is called with labels first, whereas
# calculate_metrics in earlier cells takes predictions first — confirm both
# argument orders against the `functions` module
print("\nConfusion Matrices:")
base_cm = confusion_matrix(test_labels, base_predictions)
print(f"\nBase Model:\n{base_cm}")
ft_cm = confusion_matrix(test_labels, ft_predictions)
print(f"\nFine-tuned Model:\n{ft_cm}")