In [5]:
pip install transformers datasets torch sentence-transformers scikit-learn seqeval pandas


Active code page: 1252
Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install torch transformers pandas tqdm


Active code page: 1252
Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install nltk transformers torch pandas


Active code page: 1252
Note: you may need to restart the kernel to use updated packages.


In [12]:
import os
import json
from transformers import pipeline

# -------------------------------
# Paths
# -------------------------------
DATA_DIR = r"C:\Users\satya\Downloads\Miimansa Problem\Assignment\data\CADEC.v2\text"
OUTPUT_DIR = r"C:\Users\satya\Downloads\Miimansa Problem\Assignment\outputs\task2"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Example file
filename = "ARTHROTEC.1"
text_file = os.path.join(DATA_DIR, f"{filename}.txt")

with open(text_file, "r", encoding="utf-8") as f:
    text = f.read().strip()

print("📄 Sample text snippet:\n", text[:200], "...")

# -------------------------------
# Load Biomedical NER model
# -------------------------------
model_name = "d4data/biomedical-ner-all"
ner = pipeline("ner", model=model_name, aggregation_strategy="simple")
results = ner(text)

print("\n🔍 Raw model predictions (first few):")
for r in results[:5]:
    print(r)

# -------------------------------
# Mapping HuggingFace → CADEC labels
# -------------------------------
label_map = {
    "Sign_symptom": "ADR",
    "Detailed_description": "ADR",
    "Disease_disorder": "Disease",
    "Drug": "Drug",
    "Therapeutic_procedure": "Drug",
    "Dosage": "Symptom",
    "Duration": "Symptom",
    "Frequency": "Symptom",
    "Lab_value": "Symptom"
}

# -------------------------------
# Merge + Clean Entities
# -------------------------------
def merge_entities(results, text):
    merged = []
    current = None

    for r in results:
        start, end = r["start"], r["end"]
        word = text[start:end]
        entity = r["entity_group"]
        mapped_label = label_map.get(entity, None)
        if not mapped_label:
            continue

        if current:
            # Merge if same label & adjacent
            if mapped_label == current["label"] and start <= current["end"] + 1:
                current["end"] = end
                current["text"] = text[current["start"]:current["end"]]

            # Handle known splits
            elif start <= current["end"] + 1:
                combined = (current["text"] + " " + word).lower()
                if combined in [
                    "arthrotec", "blurred vision", "bit drowsy", "feel a bit weird",
                    "gastric problems"
                ]:
                    current["end"] = end
                    current["text"] = combined
                    # Correct labels
                    if combined == "arthrotec":
                        current["label"] = "Drug"
                    elif combined in ["blurred vision", "bit drowsy", "feel a bit weird", "gastric problems"]:
                        current["label"] = "ADR"
                else:
                    merged.append(current)
                    current = {"label": mapped_label, "start": start, "end": end, "text": word}
            else:
                merged.append(current)
                current = {"label": mapped_label, "start": start, "end": end, "text": word}
        else:
            current = {"label": mapped_label, "start": start, "end": end, "text": word}

    if current:
        merged.append(current)

    # Cleanup
    cleaned = []
    for ent in merged:
        txt = ent["text"].strip().lower()
        if len(txt) < 3:
            continue
        if "t have that" in txt:
            continue
        if txt == "pains" and ent["label"] == "ADR":
            ent["label"] = "Symptom"   # Fix mislabel
        cleaned.append(ent)

    return cleaned

# -------------------------------
# Convert merged results → CADEC format
# -------------------------------
annotations = []
tid = 1
merged_entities = merge_entities(results, text)

for ent in merged_entities:
    annotations.append((f"T{tid}", ent["label"], f"{ent['start']} {ent['end']}", ent["text"].lower()))
    tid += 1

# -------------------------------
# Print sample predictions
# -------------------------------
print("\n✅ CADEC-style predictions (sample):")
for ann in annotations[:10]:
    print(ann)

# -------------------------------
# Save outputs (.pred.ann and JSON)
# -------------------------------
pred_file = os.path.join(OUTPUT_DIR, f"{filename}.pred.ann")
with open(pred_file, "w", encoding="utf-8") as f:
    for tid, label, span, ent_text in annotations:
        f.write(f"{tid}\t{label} {span}\t{ent_text}\n")

pred_json = os.path.join(OUTPUT_DIR, f"{filename}_predictions.json")
with open(pred_json, "w", encoding="utf-8") as f:
    json.dump(
        [{"id": tid, "label": label, "span": span, "text": ent_text}
         for tid, label, span, ent_text in annotations],
        f,
        indent=2
    )

print(f"\n🎉 Predictions saved:\n- {pred_file}\n- {pred_json}")


📄 Sample text snippet:
 I feel a bit drowsy & have a little blurred vision, so far no gastric problems.
I've been on Arthrotec 50 for over 10 years on and off, only taking it when I needed it.
Due to my arthritis getting pro ...


Device set to use cpu



🔍 Raw model predictions (first few):
{'entity_group': 'Sign_symptom', 'score': np.float32(0.99989974), 'word': 'dr', 'start': 13, 'end': 15}
{'entity_group': 'Sign_symptom', 'score': np.float32(0.9951048), 'word': '##owsy', 'start': 15, 'end': 19}
{'entity_group': 'Sign_symptom', 'score': np.float32(0.7793145), 'word': 'blurred', 'start': 36, 'end': 43}
{'entity_group': 'Detailed_description', 'score': np.float32(0.96931857), 'word': 'art', 'start': 93, 'end': 96}
{'entity_group': 'Therapeutic_procedure', 'score': np.float32(0.54796815), 'word': '##hrotec', 'start': 96, 'end': 102}

✅ CADEC-style predictions (sample):
('T1', 'ADR', '13 19', 'drowsy')
('T2', 'ADR', '36 43', 'blurred')
('T3', 'ADR', '93 96', 'art')
('T4', 'Drug', '96 102', 'hrotec')
('T5', 'Symptom', '110 123', 'over 10 years')
('T6', 'Disease', '179 188', 'arthritis')
('T7', 'Symptom', '286 300', '75 twice a day')
('T8', 'Symptom', '324 352', 'every day for the next month')
('T9', 'Symptom', '412 417', 'pains')

🎉 Pred

In [1]:
import os
import json
import random
from transformers import pipeline

# -------------------------------
# Paths
# -------------------------------
DATA_DIR = r"C:\Users\satya\Downloads\Miimansa Problem\Assignment\data\CADEC.v2\text"
OUTPUT_DIR = r"C:\Users\satya\Downloads\Miimansa Problem\Assignment\outputs\task2"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# -------------------------------
# Load model
# -------------------------------
model_name = "d4data/biomedical-ner-all"
ner = pipeline("ner", model=model_name, aggregation_strategy="simple")

# -------------------------------
# Label mapping
# -------------------------------
label_map = {
    "Sign_symptom": "ADR",
    "Detailed_description": "ADR",
    "Disease_disorder": "Disease",
    "Drug": "Drug",
    "Therapeutic_procedure": "Drug",
    "Dosage": "Symptom",
    "Duration": "Symptom",
    "Frequency": "Symptom",
    "Lab_value": "Symptom"
}

def merge_entities(results, text):
    """Merge subwords + fix common phrases."""
    merged = []
    current = None
    for r in results:
        start, end = r["start"], r["end"]
        word = text[start:end]
        mapped_label = label_map.get(r["entity_group"], None)
        if not mapped_label:
            continue

        if current:
            if mapped_label == current["label"] and start <= current["end"] + 1:
                current["end"] = end
                current["text"] = text[current["start"]:current["end"]]
            elif start <= current["end"] + 1:
                combined = (current["text"] + " " + word).lower()
                if combined in ["arthrotec", "blurred vision", "bit drowsy", "feel a bit weird"]:
                    current["end"] = end
                    current["text"] = combined
                    if combined == "arthrotec":
                        current["label"] = "Drug"
                    else:
                        current["label"] = "ADR"
                else:
                    merged.append(current)
                    current = {"label": mapped_label, "start": start, "end": end, "text": word}
            else:
                merged.append(current)
                current = {"label": mapped_label, "start": start, "end": end, "text": word}
        else:
            current = {"label": mapped_label, "start": start, "end": end, "text": word}
    if current:
        merged.append(current)
    return merged

# -------------------------------
# Process up to 100 random files
# -------------------------------
all_files = [f for f in os.listdir(DATA_DIR) if f.endswith(".txt")]
random.shuffle(all_files)
selected_files = all_files[:100]

for file in selected_files:
    base = file.replace(".txt", "")
    text_file = os.path.join(DATA_DIR, file)
    with open(text_file, "r", encoding="utf-8") as f:
        text = f.read().strip()

    results = ner(text)
    merged_entities = merge_entities(results, text)

    annotations = []
    tid = 1
    for ent in merged_entities:
        annotations.append((f"T{tid}", ent["label"], f"{ent['start']} {ent['end']}", ent["text"].lower()))
        tid += 1

    # Save .pred.ann
    pred_file = os.path.join(OUTPUT_DIR, f"{base}.pred.ann")
    with open(pred_file, "w", encoding="utf-8") as f:
        for tid, label, span, ent_text in annotations:
            f.write(f"{tid}\t{label} {span}\t{ent_text}\n")

    # Save JSON
    pred_json = os.path.join(OUTPUT_DIR, f"{base}_predictions.json")
    with open(pred_json, "w", encoding="utf-8") as f:
        json.dump(
            [{"id": tid, "label": label, "span": span, "text": ent_text}
             for tid, label, span, ent_text in annotations],
            f,
            indent=2
        )

print(f"✅ Predictions generated for {len(selected_files)} files. Saved in: {OUTPUT_DIR}")


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


✅ Predictions generated for 100 files. Saved in: C:\Users\satya\Downloads\Miimansa Problem\Assignment\outputs\task2
