# Compare Base vs Fine-Tuned Model

Code authored by: Shaw Talebi

### imports

In [1]:
from pathlib import Path

import pandas as pd
from datasets import load_dataset, concatenate_datasets
from functions import run_inference, calculate_metrics, confusion_matrix

### load data

In [2]:
# Fetch the HDFS_v1 log-block dataset from the Hugging Face Hub
ds = load_dataset(path="shawhin/HDFS_v1_blocks")

In [3]:
num_test = 50
anomaly_fraction = 0.1  # share of the test sample drawn from the anomalous class

# Split testing data by class; the fixed seed keeps the selection reproducible
test_anomalous = ds["test"].filter(lambda x: x["label"] == 1).shuffle(seed=42)
test_normal = ds["test"].filter(lambda x: x["label"] == 0).shuffle(seed=42)

# Take the normal count as the remainder instead of truncating num_test * 0.9
# on its own: truncating both products independently can drop an example
# (e.g. num_test = 25 would give 2 + 22 = 24 rows instead of 25).
num_anomalous = int(num_test * anomaly_fraction)
num_normal = num_test - num_anomalous

# Imbalanced 10/90 (anomalous/normal) sample for testing
test_sample = concatenate_datasets([
    test_anomalous.select(range(num_anomalous)),
    test_normal.select(range(num_normal))
]).shuffle(seed=42)

# Sanity check: the sample must contain exactly the requested number of rows
assert len(test_sample) == num_test, "test sample size does not match num_test"

# compute test set distribution
test_positive = sum(test_sample["label"])

print(f"Test: {test_positive} anomalous, {len(test_sample) - test_positive} normal")

Test: 5 anomalous, 45 normal


### response format

In [4]:
# Structured-output schema: every model reply must be a JSON object holding
# a boolean verdict ("Anomalous") and a free-text justification ("reasoning").
anomalous_property = {
    "type": "boolean",
    "description": "True if the log block is anomalous, false otherwise."
}

reasoning_property = {
    "type": "string",
    "description": "Explanation of the decision regarding whether the block is anomalous."
}

# Both fields are required and no extra keys are allowed
anomaly_flag_schema = {
    "type": "object",
    "properties": {
        "Anomalous": anomalous_property,
        "reasoning": reasoning_property
    },
    "required": ["Anomalous", "reasoning"],
    "additionalProperties": False
}

response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "anomaly_flag",
        "strict": True,
        "schema": anomaly_flag_schema
    }
}

### evaluate base model

In [5]:
base_model = "o4-mini-2025-04-16"

# Ground-truth labels for the test sample, cast to booleans so they are
# directly comparable with the boolean "Anomalous" field in each response
test_labels = [bool(label) for label in test_sample["label"]]

# Run the base model on the test sample
base_results = run_inference(base_model, test_sample, response_format)

# Keep only the boolean verdict from each structured response
base_predictions = []
for result in base_results:
    base_predictions.append(result["Anomalous"])

base_metrics = calculate_metrics(base_predictions, test_labels)

# Report every metric to four decimal places
print(f"Base model ({base_model}) metrics:")
for metric, value in base_metrics.items():
    print(f"  {metric}: {value:.4f}")

Running o4-mini-2025-04-16: 100%|███████████████| 50/50 [06:28<00:00,  7.77s/it]

Base model (o4-mini-2025-04-16) metrics:
  accuracy: 0.7800
  precision: 0.2000
  recall: 0.4000
  f1: 0.2667





### evaluate fine-tuned model

In [6]:
ft_model = "ft:o4-mini-2025-04-16:shawhin-talebi-ventures-llc:hdfs-classification:D2XE1CWu"

# Run the fine-tuned model on the same test sample used for the base model
ft_results = run_inference(ft_model, test_sample, response_format)

# Keep only the boolean verdict from each structured response
ft_predictions = []
for result in ft_results:
    ft_predictions.append(result["Anomalous"])

# Score against the ground-truth labels computed in the base-model cell
ft_metrics = calculate_metrics(ft_predictions, test_labels)

# Report every metric to four decimal places
print(f"Fine-tuned model ({ft_model}) metrics:")
for metric, value in ft_metrics.items():
    print(f"  {metric}: {value:.4f}")

Running ft:o4-mini-2025-04-16:shawhin-talebi-ventures-llc:hdfs-classification:D2

Fine-tuned model (ft:o4-mini-2025-04-16:shawhin-talebi-ventures-llc:hdfs-classification:D2XE1CWu) metrics:
  accuracy: 0.9200
  precision: 0.5714
  recall: 0.8000
  f1: 0.6667





### compare models

In [7]:
# Side-by-side metrics table: one row per model, one column per metric
metrics_by_model = {
    "Base Model": base_metrics,
    "Fine-tuned Model": ft_metrics
}
comparison = pd.DataFrame(metrics_by_model).transpose()

# Per-metric gain of the fine-tuned model over the base model,
# appended as an extra row of the table
improvement = {}
for metric_name, base_value in base_metrics.items():
    improvement[metric_name] = ft_metrics[metric_name] - base_value
comparison.loc["Improvement"] = improvement

print("Model Comparison:")
rounded_comparison = comparison.round(4)
print(rounded_comparison.to_string())

# Confusion matrices for both models against the same ground-truth labels
print("\nConfusion Matrices:")
base_confusion = confusion_matrix(test_labels, base_predictions)
print(f"\nBase Model:\n{base_confusion}")
ft_confusion = confusion_matrix(test_labels, ft_predictions)
print(f"\nFine-tuned Model:\n{ft_confusion}")

Model Comparison:
                  accuracy  precision  recall      f1
Base Model            0.78     0.2000     0.4  0.2667
Fine-tuned Model      0.92     0.5714     0.8  0.6667
Improvement           0.14     0.3714     0.4  0.4000

Confusion Matrices:

Base Model:
[[37  8]
 [ 3  2]]

Fine-tuned Model:
[[42  3]
 [ 1  4]]


### save evaluation results

In [8]:
# Create evaluation results DataFrame: one row per test example, holding the
# block itself, both models' reasoning and verdicts, and the ground truth.
output_path = "data/eval_results.csv"

# Rows are paired with model outputs by position, so a length mismatch would
# silently misalign or drop results; fail loudly instead.
if not (len(base_results) == len(ft_results) == len(test_sample)):
    raise ValueError(
        "Result counts do not match the test sample: "
        f"{len(base_results)} base, {len(ft_results)} fine-tuned, "
        f"{len(test_sample)} examples"
    )

results_data = [
    {
        "block_id": ex["block_id"],
        "block_content": ex["text"],
        "baseline_reasoning": base_result["reasoning"],
        "baseline_prediction": base_result["Anomalous"],
        "ft_reasoning": ft_result["reasoning"],
        "ft_prediction": ft_result["Anomalous"],
        "ground_truth": bool(ex["label"])
    }
    for ex, base_result, ft_result in zip(test_sample, base_results, ft_results)
]

results_df = pd.DataFrame(results_data)

# to_csv does not create missing directories; make sure the target folder
# exists so the (expensive) inference results are not lost on a fresh checkout.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(output_path, index=False)
print(f"Saved {len(results_df)} evaluation results to {output_path}")

Saved 50 evaluation results to data/eval_results.csv
