# Quantify all -100 chosen_labels

This notebook scans `dpo_batch_debug.jsonl` and measures how often a `chosen_labels` sequence consists entirely of `-100` values.


In [None]:
from pathlib import Path
import json

# Look for the debug dump in the working directory first; if it is not there,
# fall back to the first regular file with that name found by a recursive search.
file_path = Path("dpo_batch_debug.jsonl")
if not file_path.exists():
    matches = [hit for hit in Path(".").rglob("dpo_batch_debug.jsonl") if hit.is_file()]
    if not matches:
        raise FileNotFoundError("dpo_batch_debug.jsonl not found. Set file_path to the correct location.")
    file_path = matches[0]
    # Report the substituted path so the reader knows which file is analysed.
    print(f"Using {file_path}")


In [None]:
# Parse the JSONL dump into `records`: one decoded JSON value per non-blank line.
records = []
# Decode explicitly as UTF-8 (the conventional JSON encoding) so the result does
# not depend on the platform's locale default.
with file_path.open(encoding="utf-8") as handle:
    for line_num, line in enumerate(handle, start=1):
        line = line.strip()
        if not line:
            # Blank lines carry no record; skip them.
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError as exc:
            # Re-raise with the 1-based line number so the bad row is easy to locate.
            raise ValueError(f"Bad JSON on line {line_num}: {exc}") from exc

# Notebook cell output: number of records loaded.
len(records)


In [None]:
from collections import Counter

def iter_label_sequences(labels):
    """Normalize a ``chosen_labels`` value into a list of label sequences.

    Returns an empty list when ``labels`` is ``None`` or not a list. A
    non-empty list whose items are all lists is treated as already being a
    list of sequences and is returned unchanged (same object). Any other list,
    including an empty one, is wrapped as a single sequence.
    """
    if not isinstance(labels, list):
        return []
    is_nested = bool(labels) and all(isinstance(entry, list) for entry in labels)
    return labels if is_nested else [labels]

def is_all_minus_100(seq):
    """Return True only for a non-empty list whose every element equals -100.

    Non-list values and empty lists yield False.
    """
    if not (isinstance(seq, list) and seq):
        return False
    for token in seq:
        if token == -100:
            continue
        # First element that differs from -100 settles the answer.
        return False
    return True

# Tally, over every record, how its `chosen_labels` sequences are classified.
stats = Counter()
for record in records:
    if "chosen_labels" not in record:
        stats["missing_chosen_labels"] += 1
        continue
    sequences = iter_label_sequences(record["chosen_labels"])
    if not sequences:
        # The normalizer returned nothing usable for this record.
        stats["non_list_or_empty"] += 1
        continue
    for seq in sequences:
        # Every sequence counts toward the total, whatever bucket it lands in.
        stats["total_sequences"] += 1
        if not isinstance(seq, list):
            bucket = "non_list_sequence"
        elif not seq:
            bucket = "empty_sequence"
        elif is_all_minus_100(seq):
            bucket = "all_minus_100"
        else:
            bucket = "has_non_minus_100"
        stats[bucket] += 1

# Fraction of all counted sequences that are entirely -100 (0.0 when none were counted).
total = stats["total_sequences"]
all_minus = stats["all_minus_100"]
rate = 0.0
if total:
    rate = all_minus / total

# Notebook cell output: the raw counters and the rate.
stats, rate


In [None]:
# Human-readable summary of the counters, one line per metric.
summary_lines = (
    f"Total sequences: {total}",
    f"All -100: {all_minus} ({rate:.2%})",
    f"Missing chosen_labels: {stats['missing_chosen_labels']}",
    f"Non-list/empty chosen_labels: {stats['non_list_or_empty']}",
    f"Empty sequences: {stats['empty_sequence']}",
)
print(*summary_lines, sep="\n")
