# Compare Base vs Fine-Tuned Model

Code authored by: Shaw Talebi

### imports

In [1]:
import pandas as pd
from datasets import load_dataset, concatenate_datasets
from functions import run_inference, calculate_metrics, confusion_matrix


### load data

In [2]:
# load data from HF hub
ds = load_dataset("shawhin/HDFS_v1_blocks")

In [3]:
num_test = 50
test_sample = ds["test"].shuffle(seed=0).select(range(num_test))

# check class distribution in samples
test_positive = sum(test_sample["label"])

print(f"Test: {test_positive} anomalous, {len(test_sample) - test_positive} normal")

Test: 2 anomalous, 48 normal


In [4]:
# Split training data by class
test_anomalous = ds["train"].filter(lambda x: x["label"] == 1).shuffle(seed=42)
test_normal = ds["train"].filter(lambda x: x["label"] == 0).shuffle(seed=42)

# Balanced 50-50 split for training
test_sample = concatenate_datasets([
    test_anomalous.select(range(int(num_test * 0.1))),
    test_normal.select(range(int(num_test * 0.9)))
]).shuffle(seed=42)

Filter:   0%|          | 0/460048 [00:00<?, ? examples/s]

Filter:   0%|          | 0/460048 [00:00<?, ? examples/s]

### response format

In [5]:
# define structured output schema
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "anomaly_flag",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "Anomalous": {
                    "type": "boolean",
                    "description": "True if the log block is anomalous, false otherwise."
                },
                "reasoning": {
                    "type": "string",
                    "description": "Explanation of the decision regarding whether the block is anomalous."
                }
            },
            "required": ["Anomalous", "reasoning"],
            "additionalProperties": False
        }
    }
}

### evaluate base model

In [6]:
base_model = "o4-mini-2025-04-16"

# run base model on validation set
test_labels = [bool(ex["label"]) for ex in test_sample]

base_predictions = run_inference(base_model, test_sample, response_format)
base_metrics = calculate_metrics(base_predictions, test_labels)

print(f"Base model ({base_model}) metrics:")
for metric, value in base_metrics.items():
    print(f"  {metric}: {value:.4f}")

Running o4-mini-2025-04-16: 100%|██████████| 50/50 [05:10<00:00,  6.21s/it]

Base model (o4-mini-2025-04-16) metrics:
  accuracy: 0.7600
  precision: 0.1818
  recall: 0.4000
  f1: 0.2500





### evaluate fine-tuned model

In [7]:
ft_model = "ft:o4-mini-2025-04-16:shawhin-talebi-ventures-llc:hdfs-classification:D2XE1CWu"

# run fine-tuned model on validation set
ft_predictions = run_inference(ft_model, test_sample, response_format)
ft_metrics = calculate_metrics(ft_predictions, test_labels)

print(f"Fine-tuned model ({ft_model}) metrics:")
for metric, value in ft_metrics.items():
    print(f"  {metric}: {value:.4f}")

Running ft:o4-mini-2025-04-16:shawhin-talebi-ventures-llc:hdfs-classification:D2XE1CWu: 100%|██████████| 50/50 [05:00<00:00,  6.01s/it]

Fine-tuned model (ft:o4-mini-2025-04-16:shawhin-talebi-ventures-llc:hdfs-classification:D2XE1CWu) metrics:
  accuracy: 0.8000
  precision: 0.2727
  recall: 0.6000
  f1: 0.3750





### compare models

In [8]:
# compare base vs fine-tuned model
comparison = pd.DataFrame({
    "Base Model": base_metrics,
    "Fine-tuned Model": ft_metrics
}).T

# add improvement row
improvement = {k: ft_metrics[k] - base_metrics[k] for k in base_metrics}
comparison.loc["Improvement"] = improvement

print("Model Comparison:")
print(comparison.round(4).to_string())

print("\nConfusion Matrices:")
print(f"\nBase Model:\n{confusion_matrix(test_labels, base_predictions)}")
print(f"\nFine-tuned Model:\n{confusion_matrix(test_labels, ft_predictions)}")

Model Comparison:
                  accuracy  precision  recall     f1
Base Model            0.76     0.1818     0.4  0.250
Fine-tuned Model      0.80     0.2727     0.6  0.375
Improvement           0.04     0.0909     0.2  0.125

Confusion Matrices:

Base Model:
[[36  9]
 [ 3  2]]

Fine-tuned Model:
[[37  8]
 [ 2  3]]
