In [62]:
import os
import json
from glob import glob
from pathlib import Path

In [63]:
TESTSET_DIR_CANDIDATES = ["testsets", "data/testset"]
RESULTS_DIR = "results"
OUTPUT_DIR = "combined_metrics"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Acceptable filename variants
METRICS_CANDIDATES = ["metrics_detail.jsonl", "metrics_details.jsonl"]
FCONS_CANDIDATES   = ["fconsistency_detail.jsonl", "fconsistency_details.jsonl"]

In [64]:
DATASET_KEYWORDS = {
    "truthful": ["truthful", "truthfulqa", "truthful_qa"],
    "fin":      ["finqa", "fin_qa", "fin"],
    "med":      ["medqa", "med_qa", "med"],
    "mixed":    ["mixedqa", "mixed_qa", "mixed"],
}

def list_testset_files():
    files = []
    for d in TESTSET_DIR_CANDIDATES:
        if os.path.isdir(d):
            files.extend(glob(os.path.join(d, "*.jsonl")))
    return sorted(set(files))

def dataset_key_from_testset_stem(stem: str) -> str | None:
    s = stem.lower()
    for key, kws in DATASET_KEYWORDS.items():
        if any(kw in s for kw in kws):
            return key
    return None

def get_ids_from_testset(path):
    ids = set()
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            rid = obj.get("id")
            if rid:
                ids.add(rid)
    return list(ids)

def path_matches_keywords(p: Path, kws: list[str]) -> bool:
    s = str(p.as_posix()).lower()
    return any(kw in s for kw in kws)

def find_first_existing_under(root: Path, candidates: list[str], kws: list[str]) -> list[Path]:
    """
    Return all candidate files under `root` or one-level subdirs whose path contains any keyword in kws.
    We allow multiple matches (e.g., metrics for multiple runs of the same dataset).
    """
    found: list[Path] = []

    # direct files under root that match keywords
    if path_matches_keywords(root, kws):
        for name in candidates:
            p = root / name
            if p.exists():
                found.append(p)

    # one level deeper, only subfolders whose path matches keywords
    for sub in root.iterdir():
        if sub.is_dir() and path_matches_keywords(sub, kws):
            for name in candidates:
                p = sub / name
                if p.exists():
                    found.append(p)

    return found

def read_jsonl_indexed_by_id(path):
    idx = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            rid = obj.get("id")
            if rid:
                idx[rid] = obj
    return idx

def collect_for_model(test_ids, model_dir: Path, dataset_kws: list[str]):
    """
    Only consumes detail files whose path contains dataset_kws.
    Returns dict: id -> { filename: {...}, filename: {...} }
    """
    out = {rid: {} for rid in test_ids}

    metric_files = find_first_existing_under(model_dir, METRICS_CANDIDATES, dataset_kws)
    fcons_files  = find_first_existing_under(model_dir, FCONS_CANDIDATES, dataset_kws)

    # merge across possibly multiple files (e.g., several runs)
    for mfile in metric_files:
        m_idx = read_jsonl_indexed_by_id(mfile)
        for rid in test_ids:
            if rid in m_idx:
                out[rid][mfile.name] = m_idx[rid]

    for ffile in fcons_files:
        f_idx = read_jsonl_indexed_by_id(ffile)
        for rid in test_ids:
            if rid in f_idx:
                out[rid][ffile.name] = f_idx[rid]

    # prune ids that remained empty
    return {rid: payload for rid, payload in out.items() if payload}

def main():
    testset_files = list_testset_files()
    if not testset_files:
        print("No testset files found in:", TESTSET_DIR_CANDIDATES)
        return

    results_root = Path(RESULTS_DIR)
    if not results_root.is_dir():
        print(f"Results dir not found: {RESULTS_DIR}")
        return

    model_dirs = [p for p in results_root.iterdir() if p.is_dir()]
    if not model_dirs:
        print(f"No model folders under {RESULTS_DIR}")
        return

    for testset_path in testset_files:
        tpath = Path(testset_path)
        stem = tpath.stem
        key = dataset_key_from_testset_stem(stem)
        if not key:
            print(f"⏭️  Skipping testset (no dataset key recognized): {tpath}")
            continue

        dataset_kws = DATASET_KEYWORDS[key]
        test_ids = get_ids_from_testset(tpath)
        if not test_ids:
            print(f"[WARN] No IDs in testset: {tpath}")
            continue

        for model_dir in model_dirs:
            model_name = model_dir.name
            merged = collect_for_model(test_ids, model_dir, dataset_kws)
            if not merged:
                print(f"⏭️  Skipping empty: {model_name}_{stem}.json (no {key} matches)")
                continue

            out_path = Path(OUTPUT_DIR) / f"{model_name}_{stem}.json"
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(merged, f, indent=2, ensure_ascii=False)
            print(f"✅ Saved: {out_path}")

if __name__ == "__main__":
    main()

No testset files found in: ['testsets', 'data/testset']


In [None]:
import os
import json
from glob import glob

In [None]:


testsets_dir = 'data/testsets'
results_dir = 'results'
metrics_files = ['metrics_detail.jsonl', 'fconsistency_detail.jsonl']
output_dir = 'output_metrics'
os.makedirs(output_dir, exist_ok=True)

task_map = {
    'truthful_samples': 'truthful_qa',
    'fin_samples': 'fin_qa',
    'med_sample': 'med_qa',
    'mixed_sample': 'mixed_qa',
   
}

testset_files = glob(os.path.join(testsets_dir, '*.jsonl'))
for testset_file in testset_files:
    base = os.path.splitext(os.path.basename(testset_file))[0]
    if base not in task_map:
        continue     # skip anything nonstandard—e.g. refs, subset, etc.
    task_key = task_map[base]
    ids = set()
    with open(testset_file) as f:
        for line in f:
            ids.add(json.loads(line)['id'])
    for model_folder in os.listdir(results_dir):
        if not os.path.isdir(os.path.join(results_dir, model_folder)):
            continue
        if task_key not in model_folder:
            continue   # skip all unrelated result folders
        model_path = os.path.join(results_dir, model_folder)
        id_metrics = {id_: {} for id_ in ids}
        for mf in metrics_files:
            file_path = os.path.join(model_path, mf)
            if not os.path.exists(file_path):
                continue
            with open(file_path) as f:
                for line in f:
                    entry = json.loads(line)
                    entry_id = entry.get('id')
                    if entry_id in id_metrics:
                        id_metrics[entry_id][mf] = entry
        output_path = os.path.join(output_dir, f"{model_folder}__{base}_metrics.json")
        # Only save the file if you actually found metrics
        if any(id_metrics.values()):
            with open(output_path, 'w') as out_file:
                json.dump(id_metrics, out_file, indent=2)
            print(f"Saved: {output_path}")


Saved: output_metrics/gpt-4.1-mini-med_qa__med_sample_metrics.json
Saved: output_metrics/llama3.2:1b-med_qa__med_sample_metrics.json
Saved: output_metrics/llama3:8b-med_qa__med_sample_metrics.json
Saved: output_metrics/llama3.2:1b-fin_qa__fin_samples_metrics.json
Saved: output_metrics/llama3:8b-fin_qa__fin_samples_metrics.json
Saved: output_metrics/gpt-4.1-mini-fin_qa__fin_samples_metrics.json
Saved: output_metrics/gpt-4.1-mini-truthful_qa__truthful_samples_metrics.json
Saved: output_metrics/llama3.2:1b-truthful_qa__truthful_samples_metrics.json
Saved: output_metrics/llama3:8b-truthful_qa__truthful_samples_metrics.json
Saved: output_metrics/gpt-4.1-mini-truthful_qa-highmed__truthful_samples_metrics.json
Saved: output_metrics/llama3.2:1b-mixed_qa__mixed_sample_metrics.json
Saved: output_metrics/gpt-4.1-mini-mixed_qa__mixed_sample_metrics.json
Saved: output_metrics/llama3:8b-mixed_qa__mixed_sample_metrics.json
