In [1]:
pip install transformers datasets torch sentence-transformers scikit-learn seqeval pandas


Active code page: 1252
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install torch transformers pandas tqdm


Active code page: 1252
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install nltk

Active code page: 1252
Note: you may need to restart the kernel to use updated packages.


In [6]:
import os
import json
import nltk
from transformers import pipeline

# -------------------------------
# 1. Setup directories
# -------------------------------
DATA_DIR = r"C:\Users\satya\Downloads\Miimansa Problem\Assignment\data\CADEC.v2"
OUTPUT_DIR = r"C:\Users\satya\Downloads\Miimansa Problem\Assignment\outputs"

TASK2_DIR = os.path.join(OUTPUT_DIR, "task2")
os.makedirs(TASK2_DIR, exist_ok=True)

filename = "ARTHROTEC.1"
sample_text_file = os.path.join(DATA_DIR, "text", f"{filename}.txt")

with open(sample_text_file, "r", encoding="utf-8") as f:
    text = f.read().strip()

print("Sample Text:\n", text[:200], "...")

# -------------------------------
# 2. Load Biomedical NER model
# -------------------------------
ner = pipeline("ner", model="d4data/biomedical-ner-all", aggregation_strategy="simple")

# -------------------------------
# 3. Run NER model
# -------------------------------
results = ner(text)

print("\nRaw NER Output (first few):")
for r in results[:5]:
    print(r)

# -------------------------------
# 4. Label Mapping (to CADEC labels)
# -------------------------------
label_map = {
    "Sign_symptom": "ADR",
    "Detailed_description": "Symptom",
    "Disease_disorder": "Disease",
    "Drug": "Drug",
    "Therapeutic_procedure": "Symptom",
    "Dosage": "Symptom",
    "Duration": "Symptom",
    "Frequency": "Symptom",
    "Lab_value": "Symptom",
    # fallback
}

# -------------------------------
# 5. Convert NER to BIO format
# -------------------------------
nltk.download("punkt")
tokens = nltk.word_tokenize(text)

bio_tags = []
for token in tokens:
    label = "O"
    for r in results:
        if token in r["word"]:   # simple overlap check
            mapped_label = label_map.get(r["entity_group"], "O")
            label = "B-" + mapped_label
            break
    bio_tags.append((token, label))

print("\nBIO Format (first 20 tokens):")
for token, label in bio_tags[:20]:
    print(token, "→", label)

# -------------------------------
# 6. Convert to CADEC-style annotations
# -------------------------------
annotations = []
tid = 1
for r in results:
    start, end = r["start"], r["end"]
    entity_text = text[start:end].lower()
    mapped_label = label_map.get(r["entity_group"], "O")
    if mapped_label == "O":
        continue  # skip unmapped
    annotations.append((f"T{tid}", mapped_label, f"{start} {end}", entity_text))
    tid += 1

print("\nCADEC-style Annotations (mapped):")
for ann in annotations:
    print(ann)

# -------------------------------
# 7. Save predictions in CADEC format (.pred.ann)
# -------------------------------
output_ann = os.path.join(TASK2_DIR, f"{filename}.pred.ann")
with open(output_ann, "w", encoding="utf-8") as f:
    for tid, label, span, entity_text in annotations:
        f.write(f"{tid}\t{label} {span}\t{entity_text}\n")

print(f"\n✅ Saved predictions to: {output_ann}")

# -------------------------------
# 8. Save predictions also as JSON
# -------------------------------
output_json = os.path.join(TASK2_DIR, f"{filename}_predictions.json")
with open(output_json, "w", encoding="utf-8") as f:
    json.dump(
        [{"id": tid, "label": label, "span": span, "text": entity_text} 
         for tid, label, span, entity_text in annotations],
        f,
        indent=2
    )

print(f"✅ Predictions also saved to: {output_json}")


Sample Text:
 I feel a bit drowsy & have a little blurred vision, so far no gastric problems.
I've been on Arthrotec 50 for over 10 years on and off, only taking it when I needed it.
Due to my arthritis getting pro ...


Device set to use cpu



Raw NER Output (first few):
{'entity_group': 'Sign_symptom', 'score': np.float32(0.99989974), 'word': 'dr', 'start': 13, 'end': 15}
{'entity_group': 'Sign_symptom', 'score': np.float32(0.9951048), 'word': '##owsy', 'start': 15, 'end': 19}
{'entity_group': 'Sign_symptom', 'score': np.float32(0.7793145), 'word': 'blurred', 'start': 36, 'end': 43}
{'entity_group': 'Detailed_description', 'score': np.float32(0.96931857), 'word': 'art', 'start': 93, 'end': 96}
{'entity_group': 'Therapeutic_procedure', 'score': np.float32(0.54796815), 'word': '##hrotec', 'start': 96, 'end': 102}

BIO Format (first 20 tokens):
I → O
feel → O
a → B-Symptom
bit → O
drowsy → O
& → O
have → B-Symptom
a → B-Symptom
little → O
blurred → B-ADR
vision → O
, → O
so → O
far → O
no → O
gastric → O
problems → O
. → O
I → O
've → O

CADEC-style Annotations (mapped):
('T1', 'ADR', '13 15', 'dr')
('T2', 'ADR', '15 19', 'owsy')
('T3', 'ADR', '36 43', 'blurred')
('T4', 'Symptom', '93 96', 'art')
('T5', 'Symptom', '96 102', '

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\satya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
