In [1]:
pip install transformers datasets torch sentence-transformers scikit-learn seqeval pandas


Active code page: 1252
Note: you may need to restart the kernel to use updated packages.


In [2]:
import sys, os
sys.path.append(os.path.abspath(".."))

from utils.data_loader import get_all_files, load_annotation_file


In [3]:
# Correct base path for your dataset
DATA_DIR = r"C:\Users\satya\Downloads\Miimansa Problem\Assignment\data\CADEC.v2"


In [4]:
import glob, os

# Base dataset path
DATA_DIR = r"C:\Users\satya\Downloads\Miimansa Problem\Assignment\data\CADEC.v2"

# Look inside original folder for .ann files
files = glob.glob(os.path.join(DATA_DIR, "original", "*.ann"))
print("Total original files found:", len(files))
print("First 5 files:", files[:5])


Total original files found: 1250
First 5 files: ['C:\\Users\\satya\\Downloads\\Miimansa Problem\\Assignment\\data\\CADEC.v2\\original\\ARTHROTEC.1.ann', 'C:\\Users\\satya\\Downloads\\Miimansa Problem\\Assignment\\data\\CADEC.v2\\original\\ARTHROTEC.10.ann', 'C:\\Users\\satya\\Downloads\\Miimansa Problem\\Assignment\\data\\CADEC.v2\\original\\ARTHROTEC.100.ann', 'C:\\Users\\satya\\Downloads\\Miimansa Problem\\Assignment\\data\\CADEC.v2\\original\\ARTHROTEC.101.ann', 'C:\\Users\\satya\\Downloads\\Miimansa Problem\\Assignment\\data\\CADEC.v2\\original\\ARTHROTEC.102.ann']


In [5]:
import importlib
import utils.data_loader as dl

# Force reload the updated file
importlib.reload(dl)

from utils.data_loader import get_all_files, load_annotation_file


In [7]:
sample_file = r"C:\Users\satya\Downloads\Miimansa Problem\Assignment\data\CADEC.v2\original\ARTHROTEC.1.ann"

with open(sample_file, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        print(i, repr(line))
        if i > 10:
            break


0 'T1\tADR 9 19\tbit drowsy\n'
1 '#1\tAnnotatorNotes T1\tDrowsy\n'
2 'T2\tADR 29 50\tlittle blurred vision\n'
3 '#2\tAnnotatorNotes T2\tBlurred Vision\n'
4 'T3\tDrug 93 102\tArthrotec\n'
5 'T5\tDisease 179 188\tarthritis\n'
6 'T6\tSymptom 260 265\tagony\n'
7 'T4\tADR 62 78\tgastric problems\n'
8 'T7\tSymptom 412 417\tpains\n'
9 'T8\tADR 437 453\tfeel a bit weird\n'
10 '#8\tAnnotatorNotes T7\tImplies a previous symptom of pain.\n'


In [8]:
with open(sample_file, "r", encoding="utf-8") as f:
    for line in f:
        print(repr(line.split("\t")))


['T1', 'ADR 9 19', 'bit drowsy\n']
['#1', 'AnnotatorNotes T1', 'Drowsy\n']
['T2', 'ADR 29 50', 'little blurred vision\n']
['#2', 'AnnotatorNotes T2', 'Blurred Vision\n']
['T3', 'Drug 93 102', 'Arthrotec\n']
['T5', 'Disease 179 188', 'arthritis\n']
['T6', 'Symptom 260 265', 'agony\n']
['T4', 'ADR 62 78', 'gastric problems\n']
['T7', 'Symptom 412 417', 'pains\n']
['T8', 'ADR 437 453', 'feel a bit weird\n']
['#8', 'AnnotatorNotes T7', 'Implies a previous symptom of pain.\n']


In [13]:
DATA_DIR = r"C:\Users\satya\Downloads\Miimansa Problem\Assignment\data\CADEC.v2"
sample_file = os.path.join(DATA_DIR, "original", "ARTHROTEC.1.ann")
parsed = load_annotation_file(sample_file)
print("Parsed:", parsed)


Parsed: [('T1', 'ADR', '9 19', 'bit drowsy'), ('T2', 'ADR', '29 50', 'little blurred vision'), ('T3', 'Drug', '93 102', 'Arthrotec'), ('T5', 'Disease', '179 188', 'arthritis'), ('T6', 'Symptom', '260 265', 'agony'), ('T4', 'ADR', '62 78', 'gastric problems'), ('T7', 'Symptom', '412 417', 'pains'), ('T8', 'ADR', '437 453', 'feel a bit weird')]


In [14]:
import os
import json
import nltk
from transformers import pipeline

# -------------------------------
# Setup paths
# -------------------------------
DATA_DIR = r"C:\Users\satya\Downloads\Miimansa Problem\Assignment\data\CADEC.v2"
OUTPUT_DIR = r"C:\Users\satya\Downloads\Miimansa Problem\Assignment\outputs\task2"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Example file to test
filename = "ARTHROTEC.1"
text_file = os.path.join(DATA_DIR, "text", f"{filename}.txt")

with open(text_file, "r", encoding="utf-8") as f:
    text = f.read().strip()

print("Sample text snippet:\n", text[:200], "...")

# -------------------------------
# Load Biomedical NER model
# -------------------------------
ner = pipeline("ner", model="d4data/biomedical-ner-all", aggregation_strategy="simple")

results = ner(text)

print("\nRaw model predictions (first few):")
for r in results[:5]:
    print(r)

# -------------------------------
# Map HuggingFace labels to CADEC labels
# -------------------------------
label_map = {
    "Sign_symptom": "ADR",
    "Disease_disorder": "Disease",
    "Drug": "Drug",
    "Detailed_description": "Symptom",
    "Therapeutic_procedure": "Symptom",
    "Dosage": "Symptom",
    "Duration": "Symptom",
    "Frequency": "Symptom",
    "Lab_value": "Symptom",
}

# -------------------------------
# Convert predictions to CADEC-style .ann format
# -------------------------------
annotations = []
tid = 1
for r in results:
    start, end = r["start"], r["end"]
    ent_text = text[start:end].lower()
    mapped_label = label_map.get(r["entity_group"], None)
    if not mapped_label:
        continue
    annotations.append((f"T{tid}", mapped_label, f"{start} {end}", ent_text))
    tid += 1

print("\nCADEC-style predictions (sample):")
for ann in annotations[:10]:
    print(ann)

# -------------------------------
# Save outputs
# -------------------------------
# Save .pred.ann
pred_file = os.path.join(OUTPUT_DIR, f"{filename}.pred.ann")
with open(pred_file, "w", encoding="utf-8") as f:
    for tid, label, span, ent_text in annotations:
        f.write(f"{tid}\t{label} {span}\t{ent_text}\n")

# Save JSON
pred_json = os.path.join(OUTPUT_DIR, f"{filename}_predictions.json")
with open(pred_json, "w", encoding="utf-8") as f:
    json.dump(
        [{"id": tid, "label": label, "span": span, "text": ent_text}
         for tid, label, span, ent_text in annotations],
        f,
        indent=2
    )

print(f"\n✅ Saved predictions:\n- {pred_file}\n- {pred_json}")


  from .autonotebook import tqdm as notebook_tqdm


Sample text snippet:
 I feel a bit drowsy & have a little blurred vision, so far no gastric problems.
I've been on Arthrotec 50 for over 10 years on and off, only taking it when I needed it.
Due to my arthritis getting pro ...


Device set to use cpu



Raw model predictions (first few):
{'entity_group': 'Sign_symptom', 'score': np.float32(0.99989974), 'word': 'dr', 'start': 13, 'end': 15}
{'entity_group': 'Sign_symptom', 'score': np.float32(0.9951048), 'word': '##owsy', 'start': 15, 'end': 19}
{'entity_group': 'Sign_symptom', 'score': np.float32(0.7793145), 'word': 'blurred', 'start': 36, 'end': 43}
{'entity_group': 'Detailed_description', 'score': np.float32(0.96931857), 'word': 'art', 'start': 93, 'end': 96}
{'entity_group': 'Therapeutic_procedure', 'score': np.float32(0.54796815), 'word': '##hrotec', 'start': 96, 'end': 102}

CADEC-style predictions (sample):
('T1', 'ADR', '13 15', 'dr')
('T2', 'ADR', '15 19', 'owsy')
('T3', 'ADR', '36 43', 'blurred')
('T4', 'Symptom', '93 96', 'art')
('T5', 'Symptom', '96 102', 'hrotec')
('T6', 'Symptom', '103 105', '50')
('T7', 'Symptom', '110 123', 'over 10 years')
('T8', 'Disease', '179 188', 'arthritis')
('T9', 'Symptom', '286 288', '75')
('T10', 'Symptom', '289 300', 'twice a day')

✅ Saved