# EZ-DSNER — Unified DS-NER Experiment Notebook

**Pipeline:**  
1. Setup & dependency check  
2. Data preparation (format conversion)  
3. Training & inference (CuPuL + others)  
4. Post-processing  
5. Evaluation & comparison

---
## 1. Setup & Configuration

In [1]:
import sys
print(sys.executable)
print(sys.version)
!pip install seqeval==1.2.2

from seqeval.metrics import classification_report
from transformers import AdamW
print("seqeval: ok")
print("AdamW: ok")

/home/prati/anaconda3/envs/dsner/bin/python
3.7.4 (default, Aug 13 2019, 20:35:49) 
[GCC 7.3.0]
seqeval: ok
AdamW: ok


In [2]:
import os, sys, json, glob

# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# AUTO-DETECT PATHS
# The notebook assumes it lives inside the project root alongside
# dsner_wrapper.py, dsner_data.py, dsner_postprocess.py, and the
# method directories (ATSEN/, BOND/, CuPuL/, etc.).
#
# Expected layout:
#   <project_root>/
#   ├── dsner_wrapper.py
#   ├── dsner_data.py
#   ├── dsner_postprocess.py
#   ├── dsner_jupyter.ipynb        ← this notebook
#   ├── data/
#   │   └── QTL/                   ← (auto-detected)
#   │       ├── train.txt / train_ALL.txt
#   │       ├── test.txt
#   │       ├── types.txt
#   │       └── valid.txt
#   ├── ATSEN/
#   ├── BOND/
#   ├── CuPuL/
#   └── ...
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Detect project root: walk up from notebook location until we find dsner_wrapper.py
def find_project_root(marker='dsner_wrapper.py', max_levels=5):
    """Locate the project root by searching upwards for ``marker``.

    The search starts from the current working directory and, when this code
    runs from a real file (script/module), also from that file's directory.
    For each starting point the directory itself and its ancestors are
    inspected, ``max_levels`` directories in total.

    Args:
        marker: File name whose presence identifies the project root.
        max_levels: Number of directories to inspect per starting point
            (the starting directory counts as the first one).

    Returns:
        The first directory that contains ``marker``; the current working
        directory if none of the inspected directories does.
    """
    candidates = [os.getcwd()]

    # ``__file__`` is not defined inside a Jupyter kernel, so look it up
    # defensively. (Quoting the name, as in abspath('__file__'), only yields
    # a path inside the cwd and adds no new starting point.)
    this_file = globals().get('__file__')
    if this_file:
        file_dir = os.path.dirname(os.path.abspath(this_file))
        if file_dir not in candidates:
            candidates.append(file_dir)

    for start in candidates:
        current = start
        for _ in range(max_levels):
            if os.path.exists(os.path.join(current, marker)):
                return current
            parent = os.path.dirname(current)
            if parent == current:
                # Filesystem root reached; nothing further up to inspect.
                break
            current = parent
    return os.getcwd()  # fallback

def find_data_dir(project_root):
    """Find the dataset directory below ``<project_root>/data``.

    A directory qualifies when it contains ``types.txt``, ``test.txt`` and at
    least one train file (``train.txt`` or ``train_ALL.txt``).

    Args:
        project_root: Directory expected to contain a ``data`` sub-directory.

    Returns:
        Path of the first qualifying directory (``data`` itself is checked
        first, then sub-directories in sorted order), or ``None`` when
        ``data`` is missing or no directory qualifies.
    """
    data_base = os.path.join(project_root, 'data')
    if not os.path.isdir(data_base):
        return None

    required = ('types.txt', 'test.txt')
    train_names = ('train.txt', 'train_ALL.txt')
    for dirpath, dirnames, filenames in os.walk(data_base):
        # os.walk yields sub-directories in arbitrary order; sorting in place
        # makes the pick reproducible when several directories qualify.
        dirnames.sort()
        present = set(filenames)
        has_required = all(name in present for name in required)
        has_train = any(name in present for name in train_names)
        if has_required and has_train:
            return dirpath
    return None

# ── Core settings ──
# The wrapper modules live directly in the project root, so both names
# refer to the same directory.
PROJECT_ROOT = WRAPPER_DIR = find_project_root()
DATA_DIR = find_data_dir(PROJECT_ROOT)
DATASET = "qtl"
GPU = "0"

# Make the wrapper modules importable from this notebook.
if WRAPPER_DIR not in sys.path:
    sys.path.insert(0, WRAPPER_DIR)

# ── Derived paths (None when no data directory was detected) ──
if DATA_DIR:
    GOLD_TEST = os.path.join(DATA_DIR, "test.txt")
    TYPES_FILE = os.path.join(DATA_DIR, "types.txt")
else:
    GOLD_TEST = TYPES_FILE = None

# Predictions are collected in a dedicated folder, created on demand.
PRED_DIR = os.path.join(PROJECT_ROOT, "predictions")
os.makedirs(PRED_DIR, exist_ok=True)

print("\n".join([
    f"PROJECT_ROOT : {PROJECT_ROOT}",
    f"DATA_DIR     : {DATA_DIR}",
    f"WRAPPER_DIR  : {WRAPPER_DIR}",
    f"PRED_DIR     : {PRED_DIR}",
    f"GPU          : {GPU}",
]))

PROJECT_ROOT : /home/prati/A2_Dist_NER_Agro/original_dsner
DATA_DIR     : /home/prati/A2_Dist_NER_Agro/original_dsner/data/QTL
WRAPPER_DIR  : /home/prati/A2_Dist_NER_Agro/original_dsner
PRED_DIR     : /home/prati/A2_Dist_NER_Agro/original_dsner/predictions
GPU          : 0


In [3]:
# Verify everything exists: wrapper modules, data files, method directories.
print("=== Wrapper files ===")
for f in ["dsner_wrapper.py", "dsner_data.py", "dsner_postprocess.py"]:
    path = os.path.join(WRAPPER_DIR, f)
    ok = "✓" if os.path.exists(path) else "✗ MISSING"
    print(f"  {ok} {f}")

print("\n=== Data files ===")
if DATA_DIR:
    for f in sorted(os.listdir(DATA_DIR)):
        fpath = os.path.join(DATA_DIR, f)
        if os.path.isfile(fpath):
            # Count lines inside a context manager so each handle is closed
            # immediately instead of lingering until garbage collection.
            # errors='replace' keeps the count working for files that do not
            # decode cleanly in the default encoding.
            with open(fpath, errors='replace') as fh:
                n = sum(1 for _ in fh)
            print(f"  ✓ {f:35s} {n:>8,} lines")
else:
    print("  ✗ DATA_DIR not found! Check data/ directory.")

print("\n=== Method directories ===")
for m in ["ATSEN", "AutoNER", "BOND", "CuPuL", "DeSERT", "mproto", "RoSTER", "SCDL"]:
    path = os.path.join(PROJECT_ROOT, m)
    ok = "✓" if os.path.isdir(path) else "✗"
    print(f"  {ok} {m}")

=== Wrapper files ===
  ✓ dsner_wrapper.py
  ✓ dsner_data.py
  ✓ dsner_postprocess.py

=== Data files ===
  ✓ README.md                                 18 lines
  ✓ data_processor.py                        185 lines
  ✓ dict_match.py                            247 lines
  ✓ pred_test_balance.txt                 33,267 lines
  ✓ pred_test_high_recall.txt             33,268 lines
  ✓ test.txt                              33,295 lines
  ✓ train.txt                            532,882 lines
  ✓ types.txt                                  2 lines
  ✓ valid.txt                              1,096 lines

=== Method directories ===
  ✓ ATSEN
  ✓ AutoNER
  ✓ BOND
  ✓ CuPuL
  ✓ DeSERT
  ✓ mproto
  ✓ RoSTER
  ✓ SCDL


In [4]:
# Load the entity type names: one per line, blank lines ignored.
with open(TYPES_FILE) as f:
    TYPES = list(filter(None, map(str.strip, f)))
print(f"Entity types: {TYPES}")

Entity types: ['Trait', 'Gene']


---
## 2. Data Preparation

Convert QTL data into the format each method expects.

In [5]:
from dsner_data import DataPreparer, FormatConverter, SUPPORTED_FORMATS, METHOD_FORMATS

# Table 1: every supported on-disk format with its description.
print("Format                 Description")
print("─" * 70)
for fmt_name, fmt_desc in SUPPORTED_FORMATS.items():
    print("  {:20s} {}".format(fmt_name, fmt_desc))

# Table 2: the train/test format each method consumes.
print("\nMethod       Train format         Test format")
print("─" * 55)
for method_name, method_fmts in METHOD_FORMATS.items():
    train_fmt = method_fmts['train']
    test_fmt = method_fmts['test']
    print("  {:10s}   {:18s}   {}".format(method_name, train_fmt, test_fmt))

Format                 Description
──────────────────────────────────────────────────────────────────────
  conll_bio            TOKEN BIO_TAG                  (2-col, standard CoNLL)
  conll_dist           TOKEN O DIST_LABEL             (3-col, distant-labeled train)
  conll_bio_dist       TOKEN BIO_TAG DIST_LABEL       (3-col, bio + distant)
  bond_json            [{"str_words":[], "tags":[]}]  (BOND/ATSEN/SCDL/DeSERT)
  roster               train_text.txt + label file     (RoSTER directory)
  autoner_raw          one token per line, no labels   (AutoNER raw_text.txt)
  autoner_ck           TOKEN TIE_BREAK TYPE            (AutoNER .ck truth file)
  mproto_jsonl         {"tokens":[], "entities":[]}    (MProto, one JSON/line)

Method       Train format         Test format
───────────────────────────────────────────────────────
  BOND         bond_json            bond_json
  ATSEN        bond_json            bond_json
  SCDL         bond_json            bond_json
  DeSERT       bond_jso

In [6]:
# Convert the dataset for every method in one go.
prep = DataPreparer(DATA_DIR, PROJECT_ROOT, DATASET)
results = prep.prepare_all()

print("\nData preparation results:")
print("═" * 60)
for method_name, out_path in results.items():
    # A value whose text starts with "ERROR" marks a failed conversion.
    failed = str(out_path).startswith("ERROR")
    mark = "✗" if failed else "✓"
    print(f"  {mark} {method_name:10s} → {out_path}")

[2026-02-27 13:12:22] INFO dsner_data: Auto-detected format: conll_dist
[2026-02-27 13:12:22] INFO dsner_data: read_conll_dist: 18706 sentences from /home/prati/A2_Dist_NER_Agro/original_dsner/data/QTL/train.txt
[2026-02-27 13:12:22] INFO dsner_data: Auto-detected format: conll_bio
[2026-02-27 13:12:22] INFO dsner_data: read_conll_bio: 1045 sentences from /home/prati/A2_Dist_NER_Agro/original_dsner/data/QTL/test.txt
[2026-02-27 13:12:22] INFO dsner_data: Auto-detected format: conll_bio
[2026-02-27 13:12:22] INFO dsner_data: read_conll_bio: 25 sentences from /home/prati/A2_Dist_NER_Agro/original_dsner/data/QTL/valid.txt
[2026-02-27 13:12:22] INFO dsner_data: DataPreparer: qtl  types=['Trait', 'Gene']  train=18706  test=1045  dev=25
[2026-02-27 13:12:23] INFO dsner_data: write_bond_json: 18706 sents → /home/prati/A2_Dist_NER_Agro/original_dsner/ATSEN/dataset/qtl/qtl_train.json
[2026-02-27 13:12:23] INFO dsner_data: write_bond_json: 1045 sents → /home/prati/A2_Dist_NER_Agro/original_dsner


Data preparation results:
════════════════════════════════════════════════════════════
  ✓ ATSEN      → /home/prati/A2_Dist_NER_Agro/original_dsner/ATSEN/dataset/qtl
  ✓ AutoNER    → /home/prati/A2_Dist_NER_Agro/original_dsner/AutoNER/data/qtl
  ✓ BOND       → /home/prati/A2_Dist_NER_Agro/original_dsner/BOND/dataset/qtl
  ✓ CuPuL      → /home/prati/A2_Dist_NER_Agro/original_dsner/CuPuL/data/qtl
  ✓ DeSERT     → /home/prati/A2_Dist_NER_Agro/original_dsner/DeSERT/dataset/qtl
  ✓ MProto     → /home/prati/A2_Dist_NER_Agro/original_dsner/mproto/data/qtl
  ✓ RoSTER     → /home/prati/A2_Dist_NER_Agro/original_dsner/RoSTER/data/qtl
  ✓ SCDL       → /home/prati/A2_Dist_NER_Agro/original_dsner/SCDL/dataset


In [7]:
# Convert the dataset for CuPuL only and list what was written.
cupul_data_path = prep.prepare_for("CuPuL")
print(f"CuPuL data: {cupul_data_path}")

for entry in sorted(os.listdir(cupul_data_path)):
    entry_path = os.path.join(cupul_data_path, entry)
    n_bytes = os.path.getsize(entry_path)
    print(f"  {entry:25s} {n_bytes:>10,} bytes")

[2026-02-27 13:12:25] INFO dsner_data: write_conll_dist: 18706 sents → /home/prati/A2_Dist_NER_Agro/original_dsner/CuPuL/data/qtl/train.ALL.txt
[2026-02-27 13:12:25] INFO dsner_data: write_conll_bio: 18706 sents → /home/prati/A2_Dist_NER_Agro/original_dsner/CuPuL/data/qtl/train.txt
[2026-02-27 13:12:25] INFO dsner_data: write_conll_bio: 1045 sents → /home/prati/A2_Dist_NER_Agro/original_dsner/CuPuL/data/qtl/test.txt
[2026-02-27 13:12:25] INFO dsner_data: write_conll_bio: 25 sents → /home/prati/A2_Dist_NER_Agro/original_dsner/CuPuL/data/qtl/dev.txt


CuPuL data: /home/prati/A2_Dist_NER_Agro/original_dsner/CuPuL/data/qtl
  dev.txt                        9,077 bytes
  test.txt                     267,134 bytes
  train.ALL.txt              5,092,990 bytes
  train.txt                  4,136,212 bytes


In [8]:
# Peek at CuPuL data
print("=== CuPuL train (first lines with entities) ===")
# Only the first train file that exists is inspected.
for tname in ("train.ALL.txt", "train.txt"):
    tpath = os.path.join(cupul_data_path, tname)
    if not os.path.exists(tpath):
        continue
    shown = 0
    with open(tpath) as f:
        for line in f:
            cols = line.strip().split()
            # A row is shown when its third column is not "0", or when its
            # second column is neither "O" nor "0".
            third_col_hit = len(cols) >= 3 and cols[2] != "0"
            second_col_hit = len(cols) >= 2 and cols[1] not in ("O", "0")
            if third_col_hit or second_col_hit:
                print(f"  {line.strip()}")
                shown += 1
                if shown >= 6:
                    break
    break

print("\n=== CuPuL test.txt (first entities) ===")
test_path = os.path.join(cupul_data_path, "test.txt")
if os.path.exists(test_path):
    shown = 0
    with open(test_path) as f:
        for line in f:
            # Only lines containing a "B-" tag are of interest here.
            if "B-" not in line:
                continue
            print(f"  {line.strip()}")
            shown += 1
            if shown >= 4:
                break

=== CuPuL train (first lines with entities) ===
  shear O 1
  force O 1
  shear O 1
  force O 1
  meat O 1
  tenderness O 1

=== CuPuL test.txt (first entities) ===
  lambing B-Trait
  litter B-Trait
  lambing B-Trait
  lambing B-Trait


### Ad-hoc format conversion

Convert between any pair of formats on the fly:

In [9]:
fc = FormatConverter(types=TYPES)

# No explicit format is passed, so the converter detects it from the file.
sents = fc.read(GOLD_TEST)
n_sents = len(sents)
print(f"Read {n_sents} test sentences")

# Show the first eight tokens/tags of the first sentence as a sanity check.
print(f"Tokens: {sents[0].tokens[:8]}")
print(f"Tags  : {sents[0].bio_tags[:8]}")

# Example conversions (uncomment to run):
# fc.convert(GOLD_TEST, "/tmp/test.json",  dst_fmt="bond_json")
# fc.convert(GOLD_TEST, "/tmp/test.jsonl", dst_fmt="mproto_jsonl")
# fc.convert(GOLD_TEST, "/tmp/test.ck",    dst_fmt="autoner_ck")

[2026-02-27 13:12:25] INFO dsner_data: Auto-detected format: conll_bio
[2026-02-27 13:12:25] INFO dsner_data: read_conll_bio: 1045 sentences from /home/prati/A2_Dist_NER_Agro/original_dsner/data/QTL/test.txt


Read 1045 test sentences
Tokens: ['Association', 'analysis', 'of', 'PLIN2', 'gene', 'polymorphisms', 'and', 'lambing']
Tags  : ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Trait']


---
## 3. Training & Inference

### 3a. CuPuL

In [10]:
from dsner_wrapper import DSNERWrapper, CuPuLConfig, CONFIG_REGISTRY
import dataclasses

In [11]:
# Show the CuPuL config defaults, skipping bookkeeping fields and empty values.
cfg_preview = CuPuLConfig(dataset=DATASET, gpu_ids=GPU)
print("CuPuL config fields:")
print("─" * 55)
hidden_fields = ("extra", "dict_names", "tag2idx")
for field in dataclasses.fields(cfg_preview):
    value = getattr(cfg_preview, field.name)
    if field.name in hidden_fields:
        continue
    if value != "" and value != {}:
        print(f"  {field.name:38s} = {value}")

CuPuL config fields:
───────────────────────────────────────────────────────
  dataset                                = qtl
  gpu_ids                                = 0
  seed                                   = 0
  max_seq_length                         = 128
  train_batch_size                       = 32
  eval_batch_size                        = 32
  learning_rate                          = 2e-05
  num_train_epochs                       = 1
  output_dir                             = output
  pretrained_model                       = roberta-base
  tag_scheme                             = io
  temp_dir                               = temp
  do_train                               = True
  do_eval                                = True
  eval_on                                = test
  train_on                               = train
  loss_type                              = MAE
  gradient_accumulation_steps            = 1
  noise_train_update_interval            = 200
  self_train_update_i

In [12]:
# Hyper-parameters for the CuPuL run, grouped by training phase.
cupul_hparams = dict(
    # ── Model ──
    pretrained_model="roberta-base",
    tag_scheme="io",
    max_seq_length=300,
    # ── Phase 1: Noise-robust training ──
    learning_rate=5e-7,
    num_train_epochs=1,
    drop_other=0.5,
    loss_type="MAE",
    m=20,
    num_models=5,
    # ── Phase 2: Curriculum training ──
    curriculum_train_lr=2e-7,
    curriculum_train_epochs=5,
    curriculum_train_sub_epochs=1,
    # ── Phase 3: Self-training ──
    self_train_lr=5e-7,
    self_train_epochs=5,
    self_train_update_interval=100,
)

# Create the CuPuL wrapper
cupul = DSNERWrapper(
    method="CuPuL",
    project_root=PROJECT_ROOT,
    dataset=DATASET,
    gpu_ids=GPU,
    **cupul_hparams,
)

print(f"Method dir : {cupul.runner.method_dir}")
print(f"Dataset    : {cupul.config.dataset}")
print(f"GPU        : {cupul.config.gpu_ids}")

[2026-02-27 13:12:25] INFO dsner_wrapper: Initialized DSNERWrapper: method=CuPuL, dataset=qtl, gpu=0


Method dir : /home/prati/A2_Dist_NER_Agro/original_dsner/CuPuL
Dataset    : qtl
GPU        : 0


In [13]:
# Preview the exact command, one flag per line, in copy-pasteable shell form.
parts = ["python train.py --do_train --do_eval"] + cupul.runner._build_common_args()
cmd = " ".join(parts)

print("Command preview:")
print("═" * 80)
# The first segment is the program invocation; every later one is a flag.
# Splitting by position (not by a "python" prefix) keeps the leading "--" on
# flags whose name happens to start with "python".
head, *flags = cmd.split(" --")
preview_lines = [head] + [f"    --{flag}" for flag in flags]
# Join with line continuations so the final line has no dangling backslash.
print(" \\\n".join(preview_lines))

Command preview:
════════════════════════════════════════════════════════════════════════════════
python train.py \
    --do_train \
    --do_eval \
    --dataset_name qtl \
    --pretrained_model roberta-base \
    --max_seq_length 300 \
    --tag_scheme io \
    --loss_type MAE \
    --train_batch_size 32 \
    --eval_batch_size 32 \
    --gradient_accumulation_steps 1 \
    --train_lr 5e-07 \
    --curriculum_train_lr 2e-07 \
    --train_epochs 1 \
    --curriculum_train_epochs 5 \
    --curriculum_train_sub_epochs 1 \
    --num_models 5 \
    --warmup_proportion 0.1 \
    --weight_decay 0.01 \
    --drop_other 0.5 \
    --drop_entity 0.1 \
    --seed 0 \
    --self_train_lr 5e-07 \
    --self_train_epochs 5 \
    --student1_lr 1e-05 \
    --student2_lr 1e-05 \
    --entity_threshold 0.8 \
    --ratio 0.1 \
    --m 20 \
    --noise_train_update_interval 200 \
    --self_train_update_interval 100 \


In [14]:
# ╔═══════════════════════════════════════════════════════════════╗
# ║             TRAIN CuPuL — runs when executed                ║
# ╚═══════════════════════════════════════════════════════════════╝

# NOTE(review): train() appears to return the subprocess result instead of
# raising on failure — inspect its returncode before relying on the outputs.
cupul.train()

[2026-02-27 13:12:25] INFO dsner_wrapper: Starting training: CuPuL on qtl
[2026-02-27 13:12:25] INFO dsner_wrapper: Running command:
  cwd=/home/prati/A2_Dist_NER_Agro/original_dsner/CuPuL
  cmd=python train.py --do_train --do_eval --dataset_name qtl --pretrained_model roberta-base --max_seq_length 300 --tag_scheme io --loss_type MAE --train_batch_size 32 --eval_batch_size 32 --gradient_accumulation_steps 1 --train_lr 5e-07 --curriculum_train_lr 2e-07 --train_epochs 1 --curriculum_train_epochs 5 --curriculum_train_sub_epochs 1 --num_models 5 --warmup_proportion 0.1 --weight_decay 0.01 --drop_other 0.5 --drop_entity 0.1 --seed 0 --self_train_lr 5e-07 --self_train_epochs 5 --student1_lr 1e-05 --student2_lr 1e-05 --entity_threshold 0.8 --ratio 0.1 --m 20 --noise_train_update_interval 200 --self_train_update_interval 100
Traceback (most recent call last):
  File "train.py", line 229, in <module>
    main()
  File "train.py", line 210, in main
    classifier = NERClassifier(args)
  File "/h


curriculum_train_epochs 5
curriculum_train_lr 2e-07
curriculum_train_sub_epochs 1
dataset_name qtl
do_eval True
do_train True
drop_entity 0.1
drop_other 0.5
entity_threshold 0.8
eval_batch_size 32
eval_on test
gradient_accumulation_steps 1
loss_type MAE
m 20.0
max_seq_length 300
noise_train_update_interval 200
num_models 5
output_dir ../data/qtl/output
pretrained_model roberta-base
priors [0.0314966102568, 0.0376880632424, 0.0354240324761, 0.015502139428]
ratio 0.1
seed 0
self_train_epochs 5
self_train_lr 5e-07
self_train_update_interval 100
student1_lr 1e-05
student2_lr 1e-05
tag_scheme io
temp_dir ../data/qtl/temp
train_batch_size 32
train_epochs 1
train_lr 5e-07
train_on train
warmup_proportion 0.1
weight_decay 0.01




CompletedProcess(args='python train.py --do_train --do_eval --dataset_name qtl --pretrained_model roberta-base --max_seq_length 300 --tag_scheme io --loss_type MAE --train_batch_size 32 --eval_batch_size 32 --gradient_accumulation_steps 1 --train_lr 5e-07 --curriculum_train_lr 2e-07 --train_epochs 1 --curriculum_train_epochs 5 --curriculum_train_sub_epochs 1 --num_models 5 --warmup_proportion 0.1 --weight_decay 0.01 --drop_other 0.5 --drop_entity 0.1 --seed 0 --self_train_lr 5e-07 --self_train_epochs 5 --student1_lr 1e-05 --student2_lr 1e-05 --entity_threshold 0.8 --ratio 0.1 --m 20 --noise_train_update_interval 200 --self_train_update_interval 100', returncode=1)

In [15]:
# ╔═══════════════════════════════════════════════════════════════╗
# ║           PREDICT CuPuL — runs when executed                ║
# ╚═══════════════════════════════════════════════════════════════╝

# NOTE(review): predict() appears to return the subprocess result instead of
# raising on failure — inspect its returncode before relying on the outputs.
cupul.predict()

[2026-02-27 13:12:26] INFO dsner_wrapper: Starting prediction: CuPuL
[2026-02-27 13:12:26] INFO dsner_wrapper: Running command:
  cwd=/home/prati/A2_Dist_NER_Agro/original_dsner/CuPuL
  cmd=python predict.py --dataset_name qtl --pretrained_model roberta-base --max_seq_length 300 --tag_scheme io --loss_type MAE --train_batch_size 32 --eval_batch_size 32 --gradient_accumulation_steps 1 --train_lr 5e-07 --curriculum_train_lr 2e-07 --train_epochs 1 --curriculum_train_epochs 5 --curriculum_train_sub_epochs 1 --num_models 5 --warmup_proportion 0.1 --weight_decay 0.01 --drop_other 0.5 --drop_entity 0.1 --seed 0 --self_train_lr 5e-07 --self_train_epochs 5 --student1_lr 1e-05 --student2_lr 1e-05 --entity_threshold 0.8 --ratio 0.1 --m 20 --noise_train_update_interval 200 --self_train_update_interval 100
Traceback (most recent call last):
  File "predict.py", line 210, in <module>
    main()
  File "predict.py", line 203, in main
    trainer = NERClassifier(args)
  File "/home/prati/A2_Dist_NER_A


curriculum_train_epochs 5
curriculum_train_lr 2e-07
curriculum_train_sub_epochs 1
dataset_name qtl
do_eval False
do_train False
drop_entity 0.1
drop_other 0.5
entity_threshold 0.8
eval_batch_size 32
eval_on test
gradient_accumulation_steps 1
loss_type MAE
m 20.0
max_seq_length 300
noise_train_update_interval 200
num_models 5
output_dir ../data/qtl/output
pretrained_model roberta-base
priors [0.0314966102568, 0.0376880632424, 0.0354240324761, 0.015502139428]
ratio 0.1
seed 0
self_train_epochs 5
self_train_lr 5e-07
self_train_update_interval 100
student1_lr 1e-05
student2_lr 1e-05
tag_scheme io
temp_dir ../data/qtl/temp
train_batch_size 32
train_epochs 1
train_lr 5e-07
train_on train
warmup_proportion 0.1
weight_decay 0.01




CompletedProcess(args='python predict.py --dataset_name qtl --pretrained_model roberta-base --max_seq_length 300 --tag_scheme io --loss_type MAE --train_batch_size 32 --eval_batch_size 32 --gradient_accumulation_steps 1 --train_lr 5e-07 --curriculum_train_lr 2e-07 --train_epochs 1 --curriculum_train_epochs 5 --curriculum_train_sub_epochs 1 --num_models 5 --warmup_proportion 0.1 --weight_decay 0.01 --drop_other 0.5 --drop_entity 0.1 --seed 0 --self_train_lr 5e-07 --self_train_epochs 5 --student1_lr 1e-05 --student2_lr 1e-05 --entity_threshold 0.8 --ratio 0.1 --m 20 --noise_train_update_interval 200 --self_train_update_interval 100', returncode=1)

In [16]:
# ╔═══════════════════════════════════════════════════════════════╗
# ║          EVALUATE CuPuL — runs when executed                ║
# ╚═══════════════════════════════════════════════════════════════╝

# NOTE(review): evaluate() appears to return the subprocess result; a zero
# returncode alone does not show that a trained model was evaluated — confirm
# that training succeeded first.
cupul.evaluate()

[2026-02-27 13:12:28] INFO dsner_wrapper: Starting evaluation: CuPuL on qtl
[2026-02-27 13:12:28] INFO dsner_wrapper: Running command:
  cwd=/home/prati/A2_Dist_NER_Agro/original_dsner/CuPuL
  cmd=python train.py --do_eval --dataset_name qtl --pretrained_model roberta-base --max_seq_length 300 --tag_scheme io --loss_type MAE --train_batch_size 32 --eval_batch_size 32 --gradient_accumulation_steps 1 --train_lr 5e-07 --curriculum_train_lr 2e-07 --train_epochs 1 --curriculum_train_epochs 5 --curriculum_train_sub_epochs 1 --num_models 5 --warmup_proportion 0.1 --weight_decay 0.01 --drop_other 0.5 --drop_entity 0.1 --seed 0 --self_train_lr 5e-07 --self_train_epochs 5 --student1_lr 1e-05 --student2_lr 1e-05 --entity_threshold 0.8 --ratio 0.1 --m 20 --noise_train_update_interval 200 --self_train_update_interval 100



curriculum_train_epochs 5
curriculum_train_lr 2e-07
curriculum_train_sub_epochs 1
dataset_name qtl
do_eval True
do_train False
drop_entity 0.1
drop_other 0.5
entity_threshold 0.8
eval_batch_size 32
eval_on test
gradient_accumulation_steps 1
loss_type MAE
m 20.0
max_seq_length 300
noise_train_update_interval 200
num_models 5
output_dir ../data/qtl/output
pretrained_model roberta-base
priors [0.0314966102568, 0.0376880632424, 0.0354240324761, 0.015502139428]
ratio 0.1
seed 0
self_train_epochs 5
self_train_lr 5e-07
self_train_update_interval 100
student1_lr 1e-05
student2_lr 1e-05
tag_scheme io
temp_dir ../data/qtl/temp
train_batch_size 32
train_epochs 1
train_lr 5e-07
train_on train
warmup_proportion 0.1
weight_decay 0.01




CompletedProcess(args='python train.py --do_eval --dataset_name qtl --pretrained_model roberta-base --max_seq_length 300 --tag_scheme io --loss_type MAE --train_batch_size 32 --eval_batch_size 32 --gradient_accumulation_steps 1 --train_lr 5e-07 --curriculum_train_lr 2e-07 --train_epochs 1 --curriculum_train_epochs 5 --curriculum_train_sub_epochs 1 --num_models 5 --warmup_proportion 0.1 --weight_decay 0.01 --drop_other 0.5 --drop_entity 0.1 --seed 0 --self_train_lr 5e-07 --self_train_epochs 5 --student1_lr 1e-05 --student2_lr 1e-05 --entity_threshold 0.8 --ratio 0.1 --m 20 --noise_train_update_interval 200 --self_train_update_interval 100', returncode=0)

### 3b. BOND

In [17]:
# bond = DSNERWrapper(
#     method="BOND",
#     project_root=PROJECT_ROOT,
#     dataset=DATASET,
#     gpu_ids=GPU,
#     model_type="roberta",
#     model_name_or_path="roberta-base",
#     learning_rate=5e-5,
#     num_train_epochs=3,
#     max_seq_length=128,
#     self_training=True,
#     mt=1,
#     mt_updatefreq=5,
# )
# bond.train()

### 3c. RoSTER

In [18]:
# roster = DSNERWrapper(
#     method="RoSTER",
#     project_root=PROJECT_ROOT,
#     dataset=DATASET,
#     gpu_ids=GPU,
#     pretrained_model="roberta-base",
#     tag_scheme="io",
#     max_seq_length=300,
#     noise_train_lr=3e-5,
#     noise_train_epochs=3,
#     ensemble_train_lr=1e-5,
#     ensemble_train_epochs=2,
#     self_train_lr=5e-7,
#     self_train_epochs=5,
#     q=0.7,
#     tau=0.7,
#     seed=30,
# )
# roster.train()

### 3d. ATSEN

In [19]:
# atsen = DSNERWrapper(
#     method="ATSEN",
#     project_root=PROJECT_ROOT,
#     dataset=DATASET,
#     gpu_ids=GPU,
#     learning_rate=1e-5,
#     warmup_steps=200,
#     begin_epoch=1,
#     period=6000,
#     threshold=0.9,
#     num_train_epochs=50,
#     al=0.8,
#     bate=1.0,
# )
# atsen.train()

### 3e. DeSERT

In [20]:
# desert = DSNERWrapper(
#     method="DeSERT",
#     project_root=PROJECT_ROOT,
#     dataset=DATASET,
#     gpu_ids=GPU,
#     threshold=0.9,
#     num_train_epochs=50,
# )
# desert.train()

### 3f. SCDL

In [21]:
# scdl = DSNERWrapper(
#     method="SCDL",
#     project_root=PROJECT_ROOT,
#     dataset=DATASET,
#     gpu_ids=GPU,
#     learning_rate=2e-5,
#     begin_epoch=6,
#     period=3200,
#     num_train_epochs=50,
# )
# scdl.train()

### 3g. MProto

In [22]:
# mproto = DSNERWrapper(
#     method="MProto",
#     project_root=PROJECT_ROOT,
#     dataset=DATASET,
#     gpu_ids=GPU,
#     config_path="cfg/qtl/mproto/train.toml",
# )
# mproto.train()

### 3h. AutoNER

In [23]:
# autoner = DSNERWrapper(
#     method="AutoNER",
#     project_root=PROJECT_ROOT,
#     dataset=DATASET,
#     model_name=DATASET,
#     hid_dim=300,
#     word_dim=200,
#     num_train_epochs=50,
#     learning_rate=0.05,
#     optimizer="SGD",
# )
# autoner.train()
# autoner.evaluate()

---
## 4. Post-Processing


| Rule | What it does |
|------|--------|
|`span_consistency` | All case-insensitive matches of entity spans → entity |
|`prep_bridging` | [ent] of [ent] → bridge the preposition |
|`abbrev_resolution` | "body weight (BW)" → propagate labels |
|`pos_filtering` | Demote singleton non-noun predictions |

In [24]:
from dsner_postprocess import (
    PostProcessor,
    collect_predictions,
    evaluate_predictions,
    read_predictions,
    write_predictions,
    collect_from_conll_pred,
    AVAILABLE_RULES,
    DEFAULT_RULE_ORDER,
)

# List the rule names alphabetically, then the default application order.
available_rule_names = sorted(AVAILABLE_RULES)
print("Available rules:", available_rule_names)
default_chain = ' → '.join(DEFAULT_RULE_ORDER)
print(f"Default order  : {default_chain}")

Available rules: ['abbrev_resolution', 'pos_filtering', 'prep_bridging', 'span_consistency']
Default order  : span_consistency → prep_bridging → span_consistency → abbrev_resolution → span_consistency → pos_filtering


### 4a. Post-process an existing prediction file

If you already have a prediction file in `TOKEN GOLD_TAG PRED_INT` format (e.g. `pred_test_high_recall.txt`):

In [25]:
# Auto-detect existing prediction files (pred_*.txt) in the data directory.
pred_files = []
if DATA_DIR:
    # os.listdir returns entries in arbitrary order; sorting keeps the [i]
    # indices printed below stable between runs and machines.
    pred_files = [
        os.path.join(DATA_DIR, fname)
        for fname in sorted(os.listdir(DATA_DIR))
        if fname.startswith("pred_") and fname.endswith(".txt")
    ]

if pred_files:
    print("Found prediction files:")
    for i, pf in enumerate(pred_files):
        # Close each handle right after counting; errors='replace' keeps the
        # count working for files that do not decode cleanly.
        with open(pf, errors='replace') as fh:
            n = sum(1 for _ in fh)
        print(f"  [{i}] {os.path.basename(pf):40s} {n:>8,} lines")
else:
    print("No prediction files found in data directory.")
    print("Run training first, or place a prediction file in:", DATA_DIR)

Found prediction files:
  [0] pred_test_balance.txt                      33,267 lines
  [1] pred_test_high_recall.txt                  33,268 lines


In [26]:
# ── Pick a prediction file to post-process ──
# Option 1: Use one of the auto-detected files
# PRED_FILE = pred_files[0]  # e.g. pred_test_high_recall.txt

# Option 2: Explicit path
# PRED_FILE = os.path.join(DATA_DIR, "pred_test_high_recall.txt")

# Uncomment one of the above, then run the cells below
print("Set PRED_FILE and uncomment the cells below")

Set PRED_FILE and uncomment the cells below


In [27]:
# ── Evaluate BEFORE post-processing ──

# pp = PostProcessor(PRED_FILE)
# print(f"Loaded: {pp.summary()}")
# print()
# results_before = pp.evaluate(TYPES)
# print(f"Before (strict):  P={results_before['strict_precision']:.4f}  R={results_before['strict_recall']:.4f}  F1={results_before['strict_f1']:.4f}")
# print(f"Before (relaxed):  P={results_before['relaxed_precision']:.4f}  R={results_before['relaxed_recall']:.4f}  F1={results_before['relaxed_f1']:.4f}")

In [28]:
# ── Apply ALL rules ──

# pp.run_all()

In [29]:
# ── Evaluate AFTER ──

# results_after = pp.evaluate(TYPES)
# print(f"After (strict):  P={results_after['strict_precision']:.4f}  R={results_after['strict_recall']:.4f}  F1={results_after['strict_f1']:.4f}")
# print(f"After (relaxed):  P={results_after['relaxed_precision']:.4f}  R={results_after['relaxed_recall']:.4f}  F1={results_after['relaxed_f1']:.4f}")
# delta_s = results_after['strict_f1'] - results_before['strict_f1']
# delta_r = results_after['relaxed_f1'] - results_before['relaxed_f1']
# print(f"F1 Δ strict={'+' if delta_s >= 0 else ''}{delta_s:.4f}  relaxed={'+' if delta_r >= 0 else ''}{delta_r:.4f}")

In [30]:
# ── Save ──

# output_file = os.path.join(PRED_DIR, "pred_final.txt")
# pp.save(output_file)
# print(f"Saved: {output_file}")

### 4b. Collect predictions from a trained method

In [31]:
# After training a method, collect its predictions into unified format:

# unified_file = collect_predictions(
#     method="CuPuL",              # or "BOND", "RoSTER", etc.
#     project_root=PROJECT_ROOT,
#     dataset=DATASET,
#     gold_file=GOLD_TEST,
#     types=TYPES,
#     output=os.path.join(PRED_DIR, "cupul_unified.txt"),
# )

# # Then post-process
# pp = PostProcessor(unified_file)
# pp.run_all()
# pp.evaluate(TYPES)
# pp.save(os.path.join(PRED_DIR, "cupul_final.txt"))

### 4c. Custom rule selection

In [32]:
# Apply only specific rules:
# pp2 = PostProcessor(PRED_FILE)
# pp2.run(rules=["prep_bridging", "span_consistency"])
# pp2.evaluate(TYPES)

# Custom prepositions / noun POS tags:
# pp3 = PostProcessor(PRED_FILE,
#     prepositions={"of", "in", "for", "to", "with"},
#     noun_tags={"NN", "NNS", "NNP", "NNPS"},
# )
# pp3.run_all()
# pp3.evaluate(TYPES)

---
## 5. Batch Comparison

In [33]:
def run_method(method_name, wrapper_kwargs, pred_file=None):
    """Full pipeline: train → predict → collect → postprocess → evaluate.

    Args:
        method_name: Method identifier. Forwarded to ``DSNERWrapper`` and
            ``collect_predictions``; its lower-cased form is used in the
            output file names.
        wrapper_kwargs: Extra keyword arguments forwarded to ``DSNERWrapper``.
            ``None`` is treated as "no extra arguments".
        pred_file: Optional value forwarded to ``collect_predictions`` as
            ``pred_file``.

    Returns:
        dict: On success, ``"method"``, ``"status" == "ok"`` and the
        strict/relaxed precision, recall and F1 before (``raw_*``) and after
        (``pp_*``) post-processing. On any failure, only ``"method"`` and a
        ``"status"`` string starting with ``"FAILED: "``; the traceback is
        printed rather than raised so a batch of methods can keep running.
    """
    import traceback

    print(f"\n{'═' * 60}")
    print(f"  Running: {method_name}")
    print(f"{'═' * 60}")

    try:
        w = DSNERWrapper(
            method=method_name,
            project_root=PROJECT_ROOT,
            dataset=DATASET,
            gpu_ids=GPU,
            **(wrapper_kwargs or {}),
        )
        w.train()
        try:
            w.predict()
        except Exception as predict_err:
            # predict() stays best-effort (collection is still attempted
            # below), but the failure is reported instead of being silently
            # discarded so a broken prediction step is visible in the log.
            print(f"  [warn] {method_name}: predict() failed ({predict_err!r}); "
                  "continuing with collect_predictions()")
            traceback.print_exc()

        # Build both output paths from one stem instead of string-replacing
        # "_unified" in the full path, which would also rewrite a matching
        # substring inside PRED_DIR.
        stem = f"{method_name.lower()}_{DATASET}"
        out = os.path.join(PRED_DIR, f"{stem}_unified.txt")
        final_out = os.path.join(PRED_DIR, f"{stem}_final.txt")

        unified = collect_predictions(
            method=method_name, project_root=PROJECT_ROOT, dataset=DATASET,
            gold_file=GOLD_TEST, pred_file=pred_file, types=TYPES, output=out,
        )

        # Score the collected predictions before any post-processing ...
        raw = evaluate_predictions(read_predictions(unified), TYPES)
        # ... then apply every rule, score again, and save the result.
        pp = PostProcessor(unified)
        pp.run_all()
        pp_res = pp.evaluate(TYPES)
        pp.save(final_out)

        return {
            "method": method_name,
            "raw_strict_P": raw["strict_precision"], "raw_strict_R": raw["strict_recall"], "raw_strict_F1": raw["strict_f1"],
            "pp_strict_P": pp_res["strict_precision"], "pp_strict_R": pp_res["strict_recall"], "pp_strict_F1": pp_res["strict_f1"],
            "raw_relaxed_P": raw["relaxed_precision"], "raw_relaxed_R": raw["relaxed_recall"], "raw_relaxed_F1": raw["relaxed_f1"],
            "pp_relaxed_P": pp_res["relaxed_precision"], "pp_relaxed_R": pp_res["relaxed_recall"], "pp_relaxed_F1": pp_res["relaxed_f1"],
            "status": "ok",
        }
    except Exception as e:
        # Top-level boundary for one method: report and return a failure row.
        traceback.print_exc()
        return {"method": method_name, "status": f"FAILED: {e}"}

print("run_method() defined")

run_method() defined


In [34]:
# ── Uncomment methods to compare ──

# all_results = []

# all_results.append(run_method("CuPuL", {
#     "pretrained_model": "roberta-base", "tag_scheme": "io", "max_seq_length": 300,
#     "learning_rate": 5e-7, "num_train_epochs": 1, "drop_other": 0.5,
#     "loss_type": "MAE", "m": 20,
#     "curriculum_train_lr": 2e-7, "curriculum_train_epochs": 5,
#     "self_train_lr": 5e-7, "self_train_epochs": 5,
# }))

# all_results.append(run_method("RoSTER", {
#     "pretrained_model": "roberta-base", "tag_scheme": "io", "max_seq_length": 300,
#     "noise_train_lr": 3e-5, "noise_train_epochs": 3,
#     "ensemble_train_lr": 1e-5, "self_train_lr": 5e-7, "self_train_epochs": 5,
#     "seed": 30,
# }))

# all_results.append(run_method("BOND", {
#     "model_type": "roberta", "model_name_or_path": "roberta-base",
#     "learning_rate": 5e-5, "num_train_epochs": 3, "self_training": True, "mt": 1,
# }))

# all_results.append(run_method("ATSEN", {
#     "learning_rate": 1e-5, "num_train_epochs": 50,
#     "begin_epoch": 1, "period": 6000, "threshold": 0.9, "al": 0.8, "bate": 1.0,
# }))

# all_results.append(run_method("DeSERT", {"threshold": 0.9, "num_train_epochs": 50}))

# all_results.append(run_method("SCDL", {
#     "learning_rate": 2e-5, "begin_epoch": 6, "period": 3200, "num_train_epochs": 50,
# }))

In [35]:
# ── Display comparison table ──

# import pandas as pd
# ok_results = [r for r in all_results if r.get("status") == "ok"]
# if ok_results:
#     df = pd.DataFrame(ok_results)
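#     cols = ["method",
#             "raw_strict_P", "raw_strict_R", "raw_strict_F1",
#             "pp_strict_P", "pp_strict_R", "pp_strict_F1",
#             "raw_relaxed_P", "raw_relaxed_R", "raw_relaxed_F1",
#             "pp_relaxed_P", "pp_relaxed_R", "pp_relaxed_F1"]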
#     df = df[[c for c in cols if c in df.columns]].sort_values("pp_relaxed_F1", ascending=False)
#     print(df.to_string(index=False, float_format="%.4f"))

---
## 6. Utilities

In [36]:
# Print one line per registered method with the number of fields on its config
for name, config_cls in CONFIG_REGISTRY.items():
    config = config_cls(dataset=DATASET, gpu_ids=GPU)
    field_count = len(dataclasses.fields(config))
    print(f"  {name:10s}  {field_count:3d} config fields")

  ATSEN        32 config fields
  AutoNER      35 config fields
  BOND         57 config fields
  CuPuL        39 config fields
  DeSERT       34 config fields
  MProto       15 config fields
  RoSTER       31 config fields
  SCDL         30 config fields


In [37]:
# Dump every config field (except "extra") of one method as "name = value"
METHOD_TO_INSPECT = "CuPuL"  # change to any method

inspected = CONFIG_REGISTRY[METHOD_TO_INSPECT](dataset=DATASET, gpu_ids=GPU)

print(f"{METHOD_TO_INSPECT} config:")
print("─" * 55)
for field in dataclasses.fields(inspected):
    value = getattr(inspected, field.name)
    if field.name == "extra":
        # The "extra" field is deliberately left out of the listing.
        continue
    print(f"  {field.name:38s} = {value}")

CuPuL config:
───────────────────────────────────────────────────────
  dataset                                = qtl
  gpu_ids                                = 0
  seed                                   = 0
  max_seq_length                         = 128
  train_batch_size                       = 32
  eval_batch_size                        = 32
  learning_rate                          = 2e-05
  num_train_epochs                       = 1
  output_dir                             = output
  pretrained_model                       = roberta-base
  tag_scheme                             = io
  temp_dir                               = temp
  do_train                               = True
  do_eval                                = True
  eval_on                                = test
  train_on                               = train
  loss_type                              = MAE
  gradient_accumulation_steps            = 1
  noise_train_update_interval            = 200
  self_train_update_interval

In [38]:
# Quick evaluate any 3-column prediction file
# tokens = read_predictions("/path/to/pred.txt")
# evaluate_predictions(tokens, TYPES)

---
## Quick Reference

```
project_root/
├── dsner_wrapper.py          # 8 method configs + runners
├── dsner_data.py             # 8 format converters
├── dsner_postprocess.py      # 5 post-processing rules + eval
├── dsner_jupyter.ipynb       # this notebook
├── data/QTL/                 # input data
│   ├── train.txt / train_ALL.txt
│   ├── test.txt
│   ├── valid.txt
│   └── types.txt
├── predictions/              # output (auto-created)
├── ATSEN/  BOND/  CuPuL/  DeSERT/
├── RoSTER/  SCDL/  AutoNER/  mproto/
└── ...

Pipeline:
  data/QTL/ → DataPreparer → method-specific format
           → DSNERWrapper.train()
           → DSNERWrapper.predict()
           → collect_predictions()
           → PostProcessor.run_all()
           → P / R / F1
```