**Goal**: run multiple experiments, each with its own YAML config, and log each run's metrics to a JSON file.

Cell 1 – imports & helpers

In [None]:
import sys
from pathlib import Path
import json

import yaml
import numpy as np

# Use the parent of the current working directory as the project root and
# append it to sys.path (once) so the `src.*` imports below can be resolved.
# NOTE(review): assumes the notebook is run from a directory exactly one level
# below the project root — confirm.
PROJECT_ROOT = Path().resolve().parents[0]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from src.data.load import load_all_sources, add_broad_category
from src.models.classifier import train_text_classifier
from src.models.evaluation import evaluate_classifier

# Directory that receives one JSON metrics file per experiment. The path is
# relative, so it resolves against the current working directory rather than
# PROJECT_ROOT. It is created up front (including missing parents); an
# already-existing directory is not an error.
EXPERIMENTS_DIR = Path('experiments/results')
EXPERIMENTS_DIR.mkdir(parents=True, exist_ok=True)


Cell 2 – config loader and experiment function

In [None]:
def load_config(path: Path) -> dict:
    """Load a YAML experiment config into a dict.

    Args:
        path: Location of the YAML file. A plain ``str`` is accepted as well.

    Returns:
        The parsed top-level mapping, or ``{}`` when the file is empty.

    Raises:
        ValueError: If the top-level YAML node is not a mapping.
    """
    with Path(path).open('r', encoding='utf-8') as f:
        cfg = yaml.safe_load(f)

    # safe_load yields None for an empty document; honour the `-> dict`
    # contract instead of handing None to callers that index into the config.
    if cfg is None:
        return {}
    if not isinstance(cfg, dict):
        raise ValueError(
            f'Config {path} must contain a mapping at the top level, '
            f'got {type(cfg).__name__}'
        )
    return cfg


def _json_default(value):
    """Convert NumPy scalars and arrays to plain Python values for ``json``."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(
        f'Object of type {type(value).__name__} is not JSON serializable'
    )


def run_experiment(config_path: Path, experiment_name: str) -> dict:
    """Train and evaluate one classifier from a YAML config and save its metrics.

    The config is loaded with ``load_config`` and passed to the project's data
    loading and training helpers. Predictions on the test split are scored with
    ``evaluate_classifier`` and a summary is written to
    ``EXPERIMENTS_DIR / f'{experiment_name}.json'``.

    Args:
        config_path: Path to the YAML experiment config.
        experiment_name: Label for the run; also the stem of the output file.

    Returns:
        The result dict that was written to disk, with the keys
        ``experiment_name``, ``config_path``, ``accuracy``, ``macro_f1``,
        ``labels`` and ``classification_report``.

    Raises:
        ValueError: If the ``min_samples_per_class`` filter removes every row.
    """
    cfg = load_config(config_path)
    print(f'Running experiment: {experiment_name}')
    print(f'Using config: {config_path}')

    df = load_all_sources(cfg)
    df = add_broad_category(df, cfg)

    # (optional) filter to categories with enough samples. `or` also covers a
    # key that is present but left empty in the YAML (parsed as None).
    categories_cfg = cfg.get('categories') or {}
    min_samples = categories_cfg.get('min_samples_per_class') or 0
    if min_samples > 0:
        counts = df['broad_category'].value_counts()
        keep_cats = counts[counts >= min_samples].index.tolist()
        df = df[df['broad_category'].isin(keep_cats)]
        print('Kept categories:', keep_cats)
        if df.empty:
            # Fail here with a clear message rather than deep inside training.
            raise ValueError(
                f'No category has at least {min_samples} samples; '
                f'nothing left to train on for {experiment_name!r}'
            )

    # Only the fitted model and the test split are needed below.
    model, _, X_test, _, y_test, _ = train_text_classifier(df, cfg)
    y_pred = model.predict(X_test)

    metrics = evaluate_classifier(y_test, y_pred)
    print('Accuracy:', metrics['accuracy'])
    print('Macro F1:', metrics['macro_f1'])

    # Save metrics to JSON
    result = {
        'experiment_name': experiment_name,
        'config_path': str(config_path),
        'accuracy': metrics['accuracy'],
        'macro_f1': metrics['macro_f1'],
        'labels': metrics['labels'],
        'classification_report': metrics['classification_report'],
    }

    # Serialize fully before touching the file so a serialization error cannot
    # leave a truncated JSON behind. The `default` hook is defensive: the
    # metrics may hold NumPy scalars/arrays, which json cannot encode natively.
    payload = json.dumps(result, indent=2, default=_json_default)
    out_path = EXPERIMENTS_DIR / f'{experiment_name}.json'
    out_path.write_text(payload, encoding='utf-8')

    return result


Cell 3 – run a few experiments

In [None]:
# (config file, experiment name, optional?)
# The SVM and NB configs only exist if you created them (e.g. model_svm.yaml
# as a copy of base with SVM), so they are skipped when missing instead of
# aborting the whole cell. The base config is required.
experiment_specs = [
    (Path('configs/base.yaml'), 'exp_tfidf_logreg_bart', False),
    (Path('configs/model_svm.yaml'), 'exp_tfidf_svm_bart', True),
    (Path('configs/model_nb.yaml'), 'exp_tfidf_nb_bart', True),
]

results = []
for config_path, experiment_name, optional in experiment_specs:
    if optional and not config_path.is_file():
        print(f'Skipping {experiment_name}: {config_path} not found')
        continue
    results.append(
        run_experiment(config_path, experiment_name=experiment_name)
    )

results


Cell 4 – compare experiments

In [None]:
import pandas as pd

# One row per experiment, keeping only the headline metrics for comparison.
df_results = pd.DataFrame(
    [
        {
            field: run[field]
            for field in ('experiment_name', 'accuracy', 'macro_f1')
        }
        for run in results
    ]
)
df_results


Cell 1 – imports & helpers

Cell 1 – imports & helpers