In [1]:
from pathlib import Path
import sys

# Project root is taken as the parent of the kernel's working directory
# (assumes the notebook is run from a direct subfolder of the project - TODO confirm).
PROJECT_ROOT = Path.cwd().parent
SRC_DIR = str(PROJECT_ROOT / "src")

# Keep this cell idempotent: re-running it must not stack duplicate entries on
# sys.path, while src/ still ends up first so project modules take priority.
if SRC_DIR in sys.path:
    sys.path.remove(SRC_DIR)
sys.path.insert(0, SRC_DIR)

# Startup checks (importable only once src/ is on sys.path)
from startup_checks import ensure_project_dirs, check_required_artifacts, check_required_final_selection

ensure_project_dirs()
# tokenization_config.json, comparison artifacts and BERT/FinBERT artifacts are required
check_required_artifacts()

# The final model selection artifact must exist before the winner can be loaded
check_required_final_selection()

All required artifacts are found. Notebook is ready
Final model selection artifact found: /home/tl/stock-news-sentiment-bert-finbert/artifacts/results/comparison/final_model_selection.json


In [2]:
from config import BERT_KEY, FINBERT_KEY
from analysis import (
    load_model_artifacts, build_metrics_comparison_table, get_confusion_matrix, 
    top_confusions, classification_report_to_df
)
from artifacts_utils import load_final_model_selection, get_best_model_dir, load_tokenization_config
from inference import prepare_inference_context, predict

from transformers import AutoTokenizer, AutoModelForSequenceClassification

import pandas as pd
import torch

In [3]:
# Read the persisted model-selection record and extract the winner's details
final_selection = load_final_model_selection()

print("Final model selection:")
display(final_selection)

selected_model = final_selection["selected_model"]
selection_criteria = final_selection["selection_criteria"]

winner, winner_id = selected_model["model_key"], selected_model["model_id"]
primary_metric, selection_split = (
    selection_criteria["primary_metric"],
    selection_criteria["split"],
)

# Echo the fields the rest of the notebook relies on
for caption, value in (
    ("\nWinner:", winner),
    ("Winner HF model_id:", winner_id),
    ("Primary metric:", primary_metric),
    ("Selection split:", selection_split),
):
    print(caption, value)

Final model selection:


{'selected_model': {'model_key': 'finbert', 'model_id': 'ProsusAI/finbert'},
 'selection_criteria': {'primary_metric': 'macro_f1',
  'split': 'val',
  'decision_rule': 'max',
  'implicit_decision_rule': {'type': 'prefer_model',
   'model_key': 'bert-base',
   'when': 'score_tie'}},
 'scores': {'bert-base': {'val': {'macro_f1': 0.7415754421031572,
    'weighted_f1': 0.7975689752727669},
   'test': {'macro_f1': 0.7521942203266261,
    'weighted_f1': 0.8079636171969878}},
  'finbert': {'val': {'macro_f1': 0.7751226453612198,
    'weighted_f1': 0.8142125444355713},
   'test': {'macro_f1': 0.7675000443474712,
    'weighted_f1': 0.8122094310469596}}},
 'environment': {'tokenization_config': {'baseline_max_length': 96,
   'max_length': 96,
   'padding': 'max_length',
   'truncation': True,
   'selection_method': 'truncation_efficiency_tradeoff',
   'truncation_threshold_pct': 1.0},
  'bert-base': {'label_order': ['negative', 'neutral', 'positive'],
   'label_to_id': {'negative': 0, 'neutral':


Winner: finbert
Winner HF model_id: ProsusAI/finbert
Primary metric: macro_f1
Selection split: val


In [4]:
# Load the persisted artifacts of both candidate models (BERT first, then FinBERT)
model_keys = (BERT_KEY, FINBERT_KEY)
bert_art, finbert_art = (load_model_artifacts(key) for key in model_keys)

# Keyed lookup used by the comparison tables below
model_artifacts_map = dict(zip(model_keys, (bert_art, finbert_art)))

In [5]:
# Side-by-side metric tables (macro and weighted variants) for both models
metric_keys = [
    "eval_loss",
    "eval_accuracy",
    "eval_macro_f1",
    "eval_weighted_f1",
    "eval_macro_precision",
    "eval_weighted_precision",
    "eval_macro_recall",
    "eval_weighted_recall",
]

# Build the validation table first, then the test table, with identical settings
df_val, df_test = (
    build_metrics_comparison_table(
        model_artifacts_map=model_artifacts_map,
        split=split_name,
        metric_keys=metric_keys,
    )
    for split_name in ("val", "test")
)

# Test metrics are shown for reporting only; selection was made on validation
for heading, table in (
    ("Validation metrics comparison:", df_val),
    ("Test metrics comparison (report only):", df_test),
):
    print(heading)
    display(table)

Validation metrics comparison:


Unnamed: 0,model,split,eval_loss,eval_accuracy,eval_macro_f1,eval_weighted_f1,eval_macro_precision,eval_weighted_precision,eval_macro_recall,eval_weighted_recall
0,bert-base,val,0.48184,0.796233,0.741575,0.797569,0.735577,0.801038,0.749504,0.796233
1,finbert,val,0.42835,0.806507,0.775123,0.814213,0.761411,0.837464,0.811377,0.806507


Test metrics comparison (report only):


Unnamed: 0,model,split,eval_loss,eval_accuracy,eval_macro_f1,eval_weighted_f1,eval_macro_precision,eval_weighted_precision,eval_macro_recall,eval_weighted_recall
0,bert-base,test,0.438863,0.805128,0.752194,0.807964,0.744269,0.814187,0.763485,0.805128
1,finbert,test,0.419923,0.803419,0.7675,0.812209,0.755469,0.83582,0.801047,0.803419


In [6]:
# Per-class performance
# Resolve the winner through the artifacts map so that an unexpected model key
# fails loudly instead of silently falling back to the FinBERT artifacts.
if winner not in model_artifacts_map:
    raise KeyError(
        f"Unknown winner model key {winner!r}; expected one of {sorted(model_artifacts_map)}"
    )
winner_art = model_artifacts_map[winner]

df_winner_test_cr = classification_report_to_df(winner_art, "test")

# Order rows as: class labels in the winner's label order, then the aggregate rows
label_order = winner_art["label_map"]["label_order"]
custom_order = label_order + ["macro avg", "weighted avg"]
df_winner_test_cr["label"] = pd.Categorical(df_winner_test_cr["label"], categories=custom_order, ordered=True)

display(df_winner_test_cr.sort_values(["label"]).reset_index(drop=True))

Unnamed: 0,label,precision,recall,f1,support
0,negative,0.503876,0.755814,0.604651,86.0
1,neutral,0.927481,0.776358,0.845217,313.0
2,positive,0.835052,0.870968,0.852632,186.0
3,macro avg,0.755469,0.801047,0.7675,585.0
4,weighted avg,0.83582,0.803419,0.812209,585.0


In [7]:
# Confusion matrix of the winner on the test split, plus its most frequent mix-ups
cm = get_confusion_matrix(winner_art, "test")
test_eval_metadata = winner_art["evaluation"]["test"]["metadata"]
cm_norm = test_eval_metadata["confusion_matrix_normalize"]

for caption, value in (
    ("Winner CM normalize mode:", cm_norm),
    ("Label order:", label_order),
    ("Confusion matrix:\n", cm),
    ("CM shape:", cm.shape),
):
    print(caption, value)

# Ranked (true label, predicted label) pairs taken from the stored test predictions
test_predictions = winner_art["predictions"]["test"]
df_top = top_confusions(test_predictions)
print("Top confusions (winner, test):")
display(df_top)

Winner CM normalize mode: true
Label order: ['negative', 'neutral', 'positive']
Confusion matrix:
 [[0.75581395 0.05813953 0.18604651]
 [0.17252396 0.77635783 0.05111821]
 [0.05376344 0.07526882 0.87096774]]
CM shape: (3, 3)
Top confusions (winner, test):


Unnamed: 0,y_true_label,y_pred_label,count
0,neutral,negative,54
1,negative,positive,16
2,neutral,positive,16
3,positive,neutral,14
4,positive,negative,10
5,negative,neutral,5


In [8]:
# Restore the winner's tokenizer and classifier from the persisted best-model directory
best_dir = get_best_model_dir(winner)
print("Loading best model from:", best_dir)

tokenizer = AutoTokenizer.from_pretrained(best_dir, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(best_dir)

# Show the label mappings stored in the checkpoint config for a visual sanity check
model_config = model.config
print("Loaded model config id2label:", model_config.id2label)
print("Loaded model config label2id:", model_config.label2id)

Loading best model from: /home/tl/stock-news-sentiment-bert-finbert/artifacts/models/best/finbert
Loaded model config id2label: {0: 'negative', 1: 'neutral', 2: 'positive'}
Loaded model config label2id: {'negative': 0, 'neutral': 1, 'positive': 2}


In [9]:
# Inference on a few hand-written example sentences
token_cfg = load_tokenization_config()
max_length = token_cfg["max_length"]  # reuse the persisted tokenization setting

texts = [
    "The company reported strong earnings and raised guidance.",
    "The outlook is uncertain and investors remain cautious.",
    "The stock is plunged after disappointing results.",
]

# Use the GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

ctx = prepare_inference_context(model=model, device=device, max_length=max_length)

df_inference = predict(texts, tokenizer=tokenizer, model=model, ctx=ctx)

display(df_inference)
print("Using max_length:", max_length)

Unnamed: 0,text,pred_id,pred_label,prob_negative,prob_neutral,prob_positive
0,The company reported strong earnings and raise...,2,positive,0.005459,0.02208,0.972461
1,The outlook is uncertain and investors remain ...,1,neutral,0.313208,0.664312,0.022481
2,The stock is plunged after disappointing results.,0,negative,0.633271,0.353327,0.013402


Using max_length: 96


## Summary - Final Model Inference

### Objective
To load the selected final model from persisted artifacts and demonstrate a clean, reproducible, and deployment-ready inference workflow, using the same preprocessing configuration and label mappings defined during training and model selection.

### Work Performed
- Prepared an explicit `InferenceContext` that separates inference configuration from model logic, encapsulating:
  - execution device (CPU / GPU)
  - maximum token length
  - validated id2label mapping
- Verified inference outputs using sample, unseen input texts

### Key Decisions
- Reused training-time artifacts (tokenization config, label map) to avoid configuration drift

### Results
- Successfully generated predictions and per-class probabilities (`prob_negative`, `prob_neutral`, `prob_positive`) for unseen texts

### Artifacts Used
- Final model selection:
  - **artifacts/results/comparison/final_model_selection.json**
- Best model checkpoint (winner):
  - **artifacts/models/best/{winner_model_key}/** (here: `artifacts/models/best/finbert/`)
- Shared preprocessing config:
  - **artifacts/preprocessing/tokenization_config.json**
- Label mappings:
  - **artifacts/preprocessing/{winner_model_key}/label_map.json** (`{winner_model_key}` is a placeholder for the selected model key)

### Takeaway
This notebook completes the end-to-end pipeline by demonstrating how a trained and selected Transformer model can be safely and consistently used for inference.
The design emphasizes artifact-driven reproducibility.