In [1]:
from pathlib import Path
import sys

# Make the project's `src/` directory importable.
# NOTE: this assumes the notebook is started from a direct sub-directory of the
# project root, because the root is taken as the parent of the working directory.
PROJECT_ROOT = Path.cwd().parent
SRC_DIR = str(PROJECT_ROOT / "src")
# Guard the insert so that re-running this cell does not stack duplicate
# entries on sys.path.
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

# Startup checks (imported after the path setup above; presumably the module
# lives under `src/` - verify against the project layout)
from startup_checks import ensure_project_dirs, check_required_artifacts

ensure_project_dirs()
# tokenization_config.json, comparison artifacts and BERT/FinBERT artifacts are required
check_required_artifacts()

All required artifacts are found. Notebook is ready


In [2]:
from config import BERT_KEY, FINBERT_KEY, HF_MODELS
from analysis import load_model_artifacts, build_metrics_comparison_table, get_primary_score, get_confusion_matrix, top_confusions, get_split_scores
from artifacts_utils import load_tokenization_config, save_final_model_selection

from datetime import datetime

import pandas as pd

In [3]:
# Load the saved artifacts for both models (BERT first, then FinBERT),
# followed by the shared tokenization config.
bert_art, finbert_art = (
    load_model_artifacts(model_key) for model_key in (BERT_KEY, FINBERT_KEY)
)
token_cfg = load_tokenization_config()

# Show the tokenization settings that were loaded.
print("Tokenization config:")
display(token_cfg)

Tokenization config:


{'baseline_max_length': 96,
 'max_length': 96,
 'padding': 'max_length',
 'truncation': True,
 'selection_method': 'truncation_efficiency_tradeoff',
 'truncation_threshold_pct': 1.0}

In [4]:
# Side-by-side metric tables for both models, one table per split.
model_artifacts_map = {BERT_KEY: bert_art, FINBERT_KEY: finbert_art}

metric_keys = [
    "eval_loss",
    "eval_accuracy",
    "eval_macro_f1",
    "eval_macro_precision",
    "eval_macro_recall",
]

# Same call for each split; only the `split` argument differs ("val", then "test").
df_val_compare, df_test_compare = (
    build_metrics_comparison_table(
        model_artifacts_map=model_artifacts_map,
        split=split_name,
        metric_keys=metric_keys,
    )
    for split_name in ("val", "test")
)

display(df_val_compare, df_test_compare)

Unnamed: 0,model,split,eval_loss,eval_accuracy,eval_macro_f1,eval_macro_precision,eval_macro_recall
0,bert-base,val,0.48184,0.796233,0.741575,0.735577,0.749504
1,finbert,val,0.42835,0.806507,0.775123,0.761411,0.811377


Unnamed: 0,model,split,eval_loss,eval_accuracy,eval_macro_f1,eval_macro_precision,eval_macro_recall
0,bert-base,test,0.438863,0.805128,0.752194,0.744269,0.763485
1,finbert,test,0.419923,0.803419,0.7675,0.755469,0.801047


In [5]:
# Model selection: the validation primary score decides the winner; the test
# split is printed for reporting only.
bert_val_f1 = get_primary_score(bert_art, "val")
finbert_val_f1 = get_primary_score(finbert_art, "val")

# `>=` means an exact tie is resolved in favour of BERT.
if bert_val_f1 >= finbert_val_f1:
    winner = BERT_KEY
else:
    winner = FINBERT_KEY

print("BERT val macro_f1:", bert_val_f1)
print("FinBERT val macro_f1:", finbert_val_f1)
print("Selected model (by val):", winner)

print("\nTest macro_f1 (report only):")
print("BERT test macro_f1:", get_primary_score(bert_art, "test"))
print("FinBERT test macro_f1:", get_primary_score(finbert_art, "test"))

# Signed gap on validation: negative when FinBERT scores higher.
delta = bert_val_f1 - finbert_val_f1
print("\nDelta (BERT - FinBERT) val macro_f1:", delta)

BERT val macro_f1: 0.7415754421031572
FinBERT val macro_f1: 0.7751226453612198
Selected model (by val): finbert

Test macro_f1 (report only):
BERT test macro_f1: 0.7521942203266261
FinBERT test macro_f1: 0.7675000443474712

Delta (BERT - FinBERT) val macro_f1: -0.0335472032580626


In [6]:
# Sanity checks on the saved evaluation artifacts: both models must agree on
# label order and confusion-matrix normalisation mode, and each test confusion
# matrix must be square with one row/column per label.
bert_label_order, finbert_label_order = (
    art["label_map"]["label_order"] for art in (bert_art, finbert_art)
)

print("Label order (BERT):   ", bert_label_order)
print("Label order (FinBERT):", finbert_label_order)

assert bert_label_order == finbert_label_order, "Label order mismatch between models"

bert_num_labels, finbert_num_labels = len(bert_label_order), len(finbert_label_order)

# Test-split confusion matrices and the normalisation mode recorded with them.
bert_cm, finbert_cm = (
    get_confusion_matrix(art, "test") for art in (bert_art, finbert_art)
)
bert_cm_normalize, finbert_cm_normalize = (
    art["evaluation"]["test"]["metadata"]["confusion_matrix_normalize"]
    for art in (bert_art, finbert_art)
)

print("\nCM normalize (BERT):", bert_cm_normalize)
print("CM normalize (FinBERT):", finbert_cm_normalize)

assert bert_cm_normalize == finbert_cm_normalize, "CM normalize mode mismatch between models"

print("\nConfusion matrix (BERT):\n", bert_cm)
print("Confusion matrix (FinBERT):\n", finbert_cm)

# Shape check: (num_labels, num_labels) for each model.
assert bert_cm.shape == (bert_num_labels, bert_num_labels), f"Unexpected BERT CM shape: {bert_cm.shape}"
assert finbert_cm.shape == (finbert_num_labels, finbert_num_labels), f"Unexpected FinBERT CM shape: {finbert_cm.shape}"

Label order (BERT):    ['negative', 'neutral', 'positive']
Label order (FinBERT): ['negative', 'neutral', 'positive']

CM normalize (BERT): true
CM normalize (FinBERT): true

Confusion matrix (BERT):
 [[0.59302326 0.19767442 0.20930233]
 [0.1086262  0.82108626 0.07028754]
 [0.05913978 0.06451613 0.87634409]]
Confusion matrix (FinBERT):
 [[0.75581395 0.05813953 0.18604651]
 [0.17252396 0.77635783 0.05111821]
 [0.05376344 0.07526882 0.87096774]]


In [7]:
# Error analysis: summarise the test-split confusions for each model.
bert_top, finbert_top = (
    top_confusions(art["predictions"]["test"]) for art in (bert_art, finbert_art)
)

print("BERT top confusions")
display(bert_top)

print("\nFinBERT top confusions")
display(finbert_top)

BERT top confusions


Unnamed: 0,y_true_label,y_pred_label,count
0,neutral,negative,34
1,neutral,positive,22
2,negative,positive,18
3,negative,neutral,17
4,positive,neutral,12
5,positive,negative,11



FinBERT top confusions


Unnamed: 0,y_true_label,y_pred_label,count
0,neutral,negative,54
1,negative,positive,16
2,neutral,positive,16
3,positive,neutral,14
4,positive,negative,10
5,negative,neutral,5


In [8]:
# Decision summary: restate the selection rule, the winner and the scores.
# `winner`, `bert_val_f1`, `finbert_val_f1` and `delta` come from the
# winner-selection cell above; only the test scores are fetched here.
bert_test_f1 = get_primary_score(bert_art, "test")
finbert_test_f1 = get_primary_score(finbert_art, "test")

print("=== Model Selection Summary ===")
# Plain string: no placeholders, so no f-prefix is needed.
print("Selection rule: choose by validation macro_f1 (to avoid test leakage)")
print(f"Winner: {winner}")

print("\nvalidation macro_f1:")
print(f"- BERT:    {bert_val_f1:.4f}")
print(f"- FinBERT: {finbert_val_f1:.4f}")
print(f"- Delta (BERT - FinBERT): {delta:.4f}")

print("\ntest macro_f1 (report only):")
print(f"- BERT:    {bert_test_f1:.4f}")
print(f"- FinBERT: {finbert_test_f1:.4f}")

=== Model Selection Summary ===
Selection rule: choose by validation macro_f1 (to avoid test leakage)
Winner: finbert

validation macro_f1:
- BERT:    0.7416
- FinBERT: 0.7751
- Delta (BERT - FinBERT): -0.0335

test macro_f1 (report only):
- BERT:    0.7522
- FinBERT: 0.7675


In [9]:
# Save final model selection: record which model won, the rule that picked it,
# the scores of both candidates and the configuration they were evaluated with.
selection = {
    "selected_model": {
        "model_key": winner,
        "model_id": HF_MODELS[winner]["model_id"]
    },
    "selection_criteria": {
        "primary_metric": "macro_f1",
        "split": "val",
        "decision_rule": "max",
        # Documents the tie-break applied when `winner` was chosen (`>=` favours
        # BERT). Uses BERT_KEY rather than a hard-coded string so this record
        # cannot drift from the key the comparison actually used.
        "implicit_decision_rule": {
            "type": "prefer_model",
            "model_key": BERT_KEY,
            "when": "score_tie"
        }
    },
    "scores": {
        BERT_KEY: get_split_scores(bert_art),
        FINBERT_KEY: get_split_scores(finbert_art)
    },
    "environment": {
        # Reuse the config loaded (and displayed) earlier in this notebook
        # instead of reading the file a second time.
        "tokenization_config": token_cfg,
        BERT_KEY: bert_art["label_map"],
        FINBERT_KEY: finbert_art["label_map"],
    },
    "metadata": {
        # Timezone-aware local timestamp: the ISO string carries its UTC offset,
        # so the record stays unambiguous when read on another machine.
        "created_at": datetime.now().astimezone().isoformat(),
        "pipeline_stage": "model_selection"
    },
    "notes": "Final model selected based on validation macro F1-score"
}

final_model_selection_path = save_final_model_selection(selection)
print("Saved final model selection to:", final_model_selection_path)

Saved to /home/tl/stock-news-sentiment-bert-finbert/artifacts/results/comparison/final_model_selection.json
Saved final model selection to: /home/tl/stock-news-sentiment-bert-finbert/artifacts/results/comparison/final_model_selection.json


## Summary - Model Comparison and Selection (BERT vs FinBERT)

### Objective
To compare the Transformer models (BERT vs FinBERT) using a consistent evaluation framework, select a final model based on validation performance, and perform lightweight error analysis to support the decision.

### Work Performed
- Built side-by-side comparison tables for key validation and test metrics
- Selected a “winner” model using **validation macro F1** as the primary decision score (test set used for reporting only)
- Performed sanity checks on saved evaluation artifacts (label order alignment, confusion matrix shape, normalization mode)
- Conducted error analysis by extracting the most frequent misclassification pairs from test predictions

### Key Decisions
- Chose the final model using validation macro F1 to avoid test leakage
- Standardized comparison inputs by reusing the same tokenization config and artifact-loading mechanisms for both models
- Used confusion matrices and “top confusions” as interpretability checks, not as primary selection criteria

### Results
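- Selected FinBERT as the final model (“winner”): validation macro F1-score of 0.7751 vs 0.7416 for BERT
- Produced interpretable error patterns (most common label confusions) to highlight model weaknesses; for both models the most frequent test error is neutral predicted as negative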

### Artifacts Produced
- Final model selection:
  - **artifacts/results/comparison/final_model_selection.json**

### Artifacts Used
- BERT artifacts:
  - **artifacts/results/bert-base/**
  - **artifacts/models/best/bert-base/**
- FinBERT artifacts:
  - **artifacts/results/finbert/**
  - **artifacts/models/best/finbert/**
- Shared preprocessing config:
  - **artifacts/preprocessing/tokenization_config.json**
- Split artifacts (per model):
  - Label map: **label_map.json**
  - Best model info: **best_model_info.json**
  - Metrics: **val_metrics.json**, **test_metrics.json**
  - Structured evaluation: **val_evaluation.json**, **test_evaluation.json**
  - Predictions: **val_prediction.csv**, **test_prediction.csv**
  - Training log: **training_log_history.csv**

### Takeaway
This notebook completes the model comparison and selection stage by choosing the final model using validation macro F1-score, supported by artifact sanity checks and lightweight error analysis for interpretability.