In [1]:
import os

# The OpenRouter API key is read from the environment so the secret never
# lives in the notebook/source. Export OPENROUTER_API_KEY before running.
# The empty-string fallback keeps this cell runnable when the variable is
# unset; the API call will then fail with an authentication error.
# NOTE(review): a literal key was previously committed on this line; treat it
# as compromised and rotate it.
API_KEY = os.environ.get("OPENROUTER_API_KEY", "")

# OpenRouter model identifier used for BIO tagging.
MODEL = "deepseek/deepseek-r1-distill-llama-70b:free"

# CADEC Forum Post Entity Labeling with LLM
**Objective**
Using a suitable Large Language Model (LLM), this workflow automates the annotation of medical forum posts from the CADEC dataset. It consists of two main steps:

# Step 1: BIO Tagging (Token-Level Labeling)
For each word/token in a forum post, label it using the BIO format with the following entity classes:
- **B-ADR, I-ADR**: Adverse Drug Reactions
- **B-Drug, I-Drug**: Medication Names
- **B-Disease, I-Disease**: Diagnosed Conditions
- **B-Symptom, I-Symptom**: General Symptoms
- **O**: Outside (not part of any entity)

# Step 2: Convert BIO Output to CADEC .ann Format


In [2]:
import requests
import textwrap
import re
import os
import random
from difflib import SequenceMatcher

# OpenRouter settings read by call_deepseek_api below.
# NOTE: the self-assignment is a no-op; it only works because API_KEY was
# already defined in the first cell (In [1]). Configure the key there.
API_KEY = API_KEY
MODEL = "deepseek/deepseek-r1-distill-llama-70b:free"  # same model id as the first cell


# Prompt formatting function
def generate_bio_prompt(text: str) -> str:
    """Build the few-shot BIO-tagging prompt for one forum post.

    The prompt consists of the task instructions, six worked examples
    (an input sentence followed by one ``<token> <tag>`` line per token),
    and finally the stripped *text* to be labelled.

    Args:
        text: Raw forum post; leading/trailing whitespace is removed.

    Returns:
        The complete prompt string, ending with ``"Output:\\n"``.
    """
    # Each demonstration is (sentence, tagged line, tagged line, ...).
    demonstrations = (
        (
            "I had Chest muscle pains and Pins and needles before taking lovastatin.",
            "I O",
            "had O",
            "Chest B-ADR",
            "muscle I-ADR",
            "pains I-ADR",
            "and O",
            "Pins B-ADR",
            "and I-ADR",
            "needles I-ADR",
            "before O",
            "taking O",
            "lovastatin B-Drug",
            ". O",
        ),
        (
            "Meloxicam gave me minor period bleeding and some limp in my left leg.",
            "Meloxicam B-Drug",
            "gave O",
            "me O",
            "minor B-ADR",
            "period I-ADR",
            "bleeding I-ADR",
            "and O",
            "some O",
            "limp B-Symptom",
            "in O",
            "my O",
            "left O",
            "leg O",
            ". O",
        ),
        (
            "Taking Arthrotec for arthritis, but it causes gastric problems and a bit drowsy feeling.",
            "Taking O",
            "Arthrotec B-Drug",
            "for O",
            "arthritis B-Disease",
            ", O",
            "but O",
            "it O",
            "causes O",
            "gastric B-ADR",
            "problems I-ADR",
            "and O",
            "a O",
            "bit B-ADR",
            "drowsy I-ADR",
            "feeling I-ADR",
            ". O",
        ),
        (
            "Arthrotec caused heavy vaginal bleeding and serious pain.",
            "Arthrotec B-Drug",
            "caused O",
            "heavy B-ADR",
            "vaginal I-ADR",
            "bleeding I-ADR",
            "and O",
            "serious B-Symptom",
            "pain I-Symptom",
            ". O",
        ),
        (
            "I feel a bit weird after taking Diclofenac and Misoprostol.",
            "I O",
            "feel O",
            "a B-ADR",
            "bit I-ADR",
            "weird I-ADR",
            "after O",
            "taking O",
            "Diclofenac B-Drug",
            "and I-Drug",
            "Misoprostol I-Drug",
            ". O",
        ),
        (
            "Experienced severe arthritis symptoms in hip joints and muscle cramps.",
            "Experienced O",
            "severe B-ADR",
            "arthritis I-ADR",
            "symptoms I-ADR",
            "in I-ADR",
            "hip I-ADR",
            "joints I-ADR",
            "and O",
            "muscle B-ADR",
            "cramps I-ADR",
            ". O",
        ),
    )

    # Render every demonstration as "Input/Output" followed by a blank line.
    pieces = ["Examples:\n"]
    for sentence, *tagged_lines in demonstrations:
        pieces.append("Input:\n" + sentence + "\n" + "Output:\n")
        pieces.extend(tagged + "\n" for tagged in tagged_lines)
        pieces.append("\n")
    examples = "".join(pieces)

    instructions = "".join(
        [
            "You are a medical NER assistant.\n",
            "Label each word in a forum post using BIO format with the following labels:\n",
            "- ADR: Adverse Drug Reaction (side effects caused by drugs)\n",
            "- Drug: Names of medications\n",
            "- Symptom: General signs or subjective conditions (not necessarily drug-induced)\n",
            "- Disease: Diagnosed medical conditions\n\n",
            "For each token (word or punctuation), output:\n",
            "<token> <BIO-label>\n",
            "Use 'O' for tokens outside any entity.\n\n",
        ]
    )

    post = text.strip()
    return (
        instructions
        + examples
        + "\n"
        + "Now label the following text:\n"
        + post
        + "\n\n"
        + "Output:\n"
    )

 

# Call OpenRouter API with DeepSeek
def call_deepseek_api(prompt: str, timeout: float = 120.0):
    """Send *prompt* to the OpenRouter chat-completions endpoint.

    Args:
        prompt: Text sent as the single user message.
        timeout: Seconds passed to ``requests.post`` as its ``timeout`` so a
            stalled connection cannot block the pipeline forever.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.

    Raises:
        RuntimeError: If the HTTP status is not 200, or a 200 response does
            not contain a text completion in the expected place.
        requests.RequestException: On network failures, including timeouts.
    """
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://chat.openai.com/",
        "X-Title": "cadec-bio-labeling",
    }

    data = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    }

    response = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers=headers,
        json=data,
        timeout=timeout,
    )

    if response.status_code != 200:
        raise RuntimeError(f"API Error {response.status_code}: {response.text}")

    # A 200 response is not guaranteed to carry a completion (the body may be
    # an error object or have null content), so validate before using it.
    try:
        reply = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as err:
        raise RuntimeError(f"Unexpected API response: {response.text}") from err
    if not isinstance(reply, str):
        raise RuntimeError(f"Unexpected API response: {response.text}")

    return reply.strip()

def convert_bio_to_cadec(text: str, bio_tagged_output: str) -> list:
    """Convert token-level BIO tags into CADEC-style ``.ann`` lines.

    The model's ``<token> <tag>`` lines are aligned against a regex
    tokenization of *text* so that each entity can be reported with character
    offsets into the original post.

    Args:
        text: The original forum post.
        bio_tagged_output: Model output, one ``<token> <tag>`` pair per line.
            Lines without a whitespace-separated tag are ignored.

    Returns:
        A list of strings of the form ``"T<i>\\t<label> <start> <end>\\t<span>"``,
        numbered from 1 in text order. Newlines and tabs inside the span text
        are replaced by spaces so every annotation stays on one line.
    """
    # Tokenize the text (word runs or single punctuation marks) with offsets.
    token_pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE)
    tokens_with_spans = [(m.group(), m.start(), m.end()) for m in token_pattern.finditer(text)]
    token_texts = [tok for tok, _, _ in tokens_with_spans]

    # Parse BIO output. Splitting on any whitespace also accepts tab-separated
    # or multi-space lines, which a plain " " split would drop or mis-parse.
    bio_tuples = []
    for raw_line in bio_tagged_output.splitlines():
        parts = raw_line.strip().rsplit(None, 1)
        if len(parts) == 2:
            bio_tuples.append((parts[0], parts[1]))
    bio_token_texts = [tok for tok, _ in bio_tuples]

    # Align model tokens with text tokens. autojunk is disabled because its
    # "popular element" heuristic (active for sequences of 200+ items) would
    # treat frequent tokens such as "." or "and" as junk on long posts.
    matcher = SequenceMatcher(None, bio_token_texts, token_texts, autojunk=False)
    aligned = []
    for op, i1, i2, j1, j2 in matcher.get_opcodes():
        if op == "equal":
            for bi, tj in zip(range(i1, i2), range(j1, j2)):
                aligned.append((tokens_with_spans[tj], bio_tuples[bi][1]))
        else:
            # Case-insensitive fallback inside a mismatched region. next_j
            # keeps the pairing left-to-right and one-to-one, so a text token
            # is never reused and offsets never run backwards.
            next_j = j1
            for bi in range(i1, i2):
                wanted = bio_token_texts[bi].lower()
                for tj in range(next_j, j2):
                    if token_texts[tj].lower() == wanted:
                        aligned.append((tokens_with_spans[tj], bio_tuples[bi][1]))
                        next_j = tj + 1
                        break

    # Fold B-/I- runs into entity spans.
    entities = []
    current = None
    for (_tok, start, end), label in aligned:
        if label.startswith("B-"):
            if current:
                entities.append(current)
            current = {"label": label[2:], "start": start, "end": end}
        elif label.startswith("I-") and current and current["label"] == label[2:]:
            current["end"] = end
        else:
            # "O", an I- tag that does not continue the open entity, or an
            # unrecognised tag: close whatever entity is open.
            if current:
                entities.append(current)
            current = None
    if current:
        entities.append(current)

    # Merge same-label spans that overlap or are at most two characters apart.
    merged = []
    for ent in entities:
        if merged and ent["label"] == merged[-1]["label"] and ent["start"] - merged[-1]["end"] <= 2:
            merged[-1]["end"] = ent["end"]
        else:
            merged.append(ent)

    # Format the output lines using the true text slice for each span.
    output = []
    for i, ent in enumerate(merged, 1):
        # A raw newline or tab in the span would break the tab-separated,
        # one-annotation-per-line format; replacing each with one space keeps
        # the text length consistent with the offsets.
        span_text = re.sub(r"[\r\n\t]", " ", text[ent["start"]:ent["end"]])
        output.append(f"T{i}\t{ent['label']} {ent['start']} {ent['end']}\t{span_text}")
    return output

def save_bio_output(bio_output, original_file_path, bio_dir):
    """Write the raw BIO-tagged model output next to its source file name.

    Args:
        bio_output: BIO-tagged text returned by the model.
        original_file_path: Path of the source post; only its base name
            (without extension) is used to name the output file.
        bio_dir: Directory that receives ``<base name>.txt``. It is created
            if it does not exist, so the function also works when called on
            its own.
    """
    base_name = os.path.splitext(os.path.basename(original_file_path))[0]

    os.makedirs(bio_dir, exist_ok=True)
    bio_file_path = os.path.join(bio_dir, f"{base_name}.txt")

    with open(bio_file_path, "w", encoding="utf-8") as f:
        f.write(bio_output + "\n")

    # The check mark matches the recorded notebook output and the emoji style
    # of the other status messages in this file.
    print(f"✅ BIO output saved to: {bio_file_path}")

def save_cadec_annotation(cadec_format_output, original_file_path, output_dir):
    """Write annotation lines to ``<output_dir>/<base name>.ann``.

    Args:
        cadec_format_output: Iterable of annotation lines without trailing
            newlines; each is written on its own line.
        original_file_path: Path of the source post; only its base name
            (without extension) is used to name the output file.
        output_dir: Directory that receives the ``.ann`` file. It is created
            if it does not exist.
    """
    base_name = os.path.splitext(os.path.basename(original_file_path))[0]

    os.makedirs(output_dir, exist_ok=True)
    ann_file_path = os.path.join(output_dir, f"{base_name}.ann")

    with open(ann_file_path, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in cadec_format_output)

    # The original f-string had lost its opening quote (a syntax error); the
    # message is restored to match the recorded notebook output.
    print(f"✅ Annotation saved to: {ann_file_path}")

def process_directory(input_dir="cadec/text", bio_dir="bioTag", output_dir="predicted", num_files=1):
    """Run the tagging pipeline over ``.txt`` posts found in *input_dir*.

    For each selected post: build the prompt, query the model, save the raw
    BIO output into *bio_dir*, convert it to annotation lines and save those
    into *output_dir*. A post whose API call raises is reported and skipped.

    Args:
        input_dir: Directory containing the forum posts as ``.txt`` files.
        bio_dir: Destination for raw BIO outputs (created if missing).
        output_dir: Destination for ``.ann`` files (created if missing).
        num_files: Upper bound on posts processed; when more are available,
            a random sample of this size is taken.
    """
    # Make sure both destinations exist before any work starts.
    for destination in (bio_dir, output_dir):
        os.makedirs(destination, exist_ok=True)

    # Collect candidate posts.
    candidates = []
    for entry in os.listdir(input_dir):
        if entry.endswith(".txt"):
            candidates.append(entry)

    # Down-sample only when there are more posts than requested.
    if num_files < len(candidates):
        candidates = random.sample(candidates, num_files)

    for file_name in candidates:
        file_path = os.path.join(input_dir, file_name)
        print(f"\n📂 Processing file: {file_path}")

        with open(file_path, 'r', encoding='utf-8') as handle:
            forum_post = handle.read()

        # Ask the model for BIO tags; a failed call skips just this post.
        prompt = generate_bio_prompt(forum_post)
        try:
            bio_result = call_deepseek_api(prompt)
        except Exception as e:
            print(f"❌ Error processing {file_name}: {e}")
            continue

        # Persist the raw tags, then the derived annotation lines.
        save_bio_output(bio_result, file_path, bio_dir)
        annotation_lines = convert_bio_to_cadec(forum_post, bio_result)
        save_cadec_annotation(annotation_lines, file_path, output_dir)

# Script entry point: run the pipeline with its default directories and file
# count when this file is executed directly rather than imported.
if __name__ == "__main__":
    process_directory()


📂 Processing file: cadec/text\ARTHROTEC.124.txt
✅ BIO output saved to: bioTag\ARTHROTEC.124.txt
✅ Annotation saved to: predicted\ARTHROTEC.124.ann
