In [None]:
!pip install transformers datasets seqeval accelerate gliner -q

In [None]:
from datasets import load_dataset
from gliner import GLiNER
from tqdm import tqdm

# --- Source data ---
# Train split of the clinical-notes dataset; only its 'note' column is read below.
dataset = load_dataset("AGBonnet/augmented-clinical-notes", split="train")

# --- Teacher model ---
# GLiNER is loaded pretrained and used as-is to propose entity spans.
teacher_model = GLiNER.from_pretrained("urchade/gliner_medium-v2.1")

# Entity types requested from the teacher.
labels_to_extract = ["symptom", "disease", "medication"]

# --- Synthetic NER dataset ---
# Notes of the first 500 rows; each one is paired with the teacher's predictions.
tagged_samples = []
texts = dataset.select(range(500))["note"]

print("Auto-labeling data... this might take 2-3 minutes on GPU.")

for text in tqdm(texts):
    # Predicted entities for this note, restricted to the requested labels.
    entities = teacher_model.predict_entities(text, labels_to_extract)
    # Keep the raw note next to its entities for the BIO-labelling step.
    tagged_samples.append(dict(text=text, entities=entities))

print(f"Successfully labeled {len(tagged_samples)} notes!")

In [None]:
from transformers import AutoTokenizer
import numpy as np

# Checkpoint name shared by the tokenizer built here and the model loaded
# from the same name in the training cell.
model_checkpoint = "dmis-lab/biobert-v1.1"
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
)

def create_bio_labels_from_gliner(sample, tok=None, max_length=512):
    """Tokenize one auto-labelled note and build token-level B/I/O labels.

    Args:
        sample: Mapping with "text" (the note) and "entities" (a list of
            mappings that carry character offsets under "start" and "end").
        tok: Optional tokenizer callable. Defaults to the notebook-level
            ``tokenizer``. It is called with ``return_offsets_mapping=True``
            and must return "input_ids", "attention_mask" and
            "offset_mapping".
        max_length: Length every sequence is truncated / padded to.

    Returns:
        dict with "input_ids", "attention_mask" and "labels", all of the same
        length. Label ids: 0 = O, 1 = B-ENTITY, 2 = I-ENTITY, and -100 for
        special and padding tokens so they can be excluded from the loss.
    """
    ignore_index = -100  # label id conventionally skipped by the token-classification loss

    active_tokenizer = tokenizer if tok is None else tok
    encoded = active_tokenizer(
        sample["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_offsets_mapping=True,
    )
    offsets = encoded["offset_mapping"]

    # Special and padding tokens are reported with a (0, 0) offset. Masking
    # them keeps the (many) padding positions from being trained as "O".
    labels = [
        ignore_index if (o_start == 0 and o_end == 0) else 0
        for o_start, o_end in offsets
    ]

    for ent in sample["entities"] or []:
        start_char = ent["start"]
        end_char = ent["end"]

        # Overlap test instead of exact boundary equality: an entity whose
        # start/end falls inside a word piece, or whose tail was truncated
        # away, still gets its visible tokens tagged.
        first_token_seen = False
        for idx, (o_start, o_end) in enumerate(offsets):
            if labels[idx] == ignore_index:
                continue
            if o_start < end_char and o_end > start_char:
                labels[idx] = 2 if first_token_seen else 1
                first_token_seen = True

    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": labels,
    }

# Convert our list to a Hugging Face Dataset object
from datasets import Dataset
# Wrap the auto-labelled samples in a Dataset, then run the BIO-labelling
# function over every row to add input_ids / attention_mask / labels.
synthetic_dataset = Dataset.from_list(tagged_samples)
final_dataset = synthetic_dataset.map(
    create_bio_labels_from_gliner,
)

In [None]:
import os
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

# Local import: torch is only needed here to detect whether a CUDA device exists.
import torch

# 1. Disable WANDB (report_to="none" below switches reporting off as well)
os.environ["WANDB_DISABLED"] = "true"

# 2. Setup Model
# label2id and num_labels are derived from id2label so the three cannot drift apart.
id2label = {0: "O", 1: "B-ENTITY", 2: "I-ENTITY"}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# 3. Training Args
args = TrainingArguments(
    "biobert-gliner-finetuned",
    eval_strategy="no",
    learning_rate=5e-5,             # Slightly higher LR to help it learn faster
    per_device_train_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=2,             # keep only the newest checkpoints instead of one per epoch
    logging_steps=10,
    report_to="none",
    fp16=torch.cuda.is_available(), # mixed precision only when a CUDA device is present
    load_best_model_at_end=False,
)

# The Trainer wraps the model object created above. Building a new Trainer
# does NOT reset weights; re-run the from_pretrained call for a fresh start.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=final_dataset,
    data_collator=DataCollatorForTokenClassification(tokenizer),
)

trainer.train()

In [None]:
from transformers import pipeline

# Build a token-classification pipeline around the in-memory model and
# tokenizer (nothing is reloaded from disk here).
classifier = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# Smoke test on a single hand-written clinical sentence.
text = "Patient denies fever but complains of severe migraine and nausea."
results = classifier(text)

print(results)

In [None]:
# Section marker written to the cell output; the next cell switches from the
# NER model to the CSV-based (pandas) part of the notebook.
print("Phase 2")

In [None]:
import pandas as pd
import numpy as np

# ==========================================
# STEP 1: LOAD & CLEAN DATA (FIXED)
# ==========================================
import re  # local import: only used to escape the keyword list below

print("Loading and Cleaning Synthea Data...")

conditions = pd.read_csv("conditions.csv")
observations = pd.read_csv("observations.csv")

# Rename columns so both tables share PATIENT_ID and have self-describing names
conditions = conditions.rename(columns={'DESCRIPTION': 'CONDITION', 'PATIENT': 'PATIENT_ID'})
observations = observations.rename(columns={'DESCRIPTION': 'TEST_NAME', 'PATIENT': 'PATIENT_ID'})

test_names = observations['TEST_NAME']

# --- RULE 1: REMOVE QUESTIONS ---
# Literal match on '?'; na=False keeps rows with a missing name from raising here.
is_question = test_names.str.contains('?', regex=False, na=False)

# --- RULE 2: LENGTH FILTER ---
# Keep names shorter than 50 characters (longer ones are surveys/questions).
# Missing names compare as False and are dropped at this step.
is_short = test_names.str.len() < 50

# --- RULE 3: THE "LAB LOOK" FILTER ---
# Keep rows that contain a typical lab word OR a unit bracket '['.
lab_keywords = [
    'Panel', 'Blood', 'Urine', 'Glucose', 'Hemoglobin', 'Creatinine',
    'Cholesterol', 'X-ray', 'CT', 'Scan', 'Culture', 'Count', 'Ratio',
    'GFR', 'Triglycerides', 'Electrolytes', 'Bilirubin', 'Protein',
    'Metabolic', 'Lipid', 'CBC', 'T3', 'T4', 'TSH'
]
# Whole-word matching: without the \b anchors a case-insensitive 'CT' also
# matches inside words such as "fraction" or "activity".
pattern_keep = r'\b(?:' + '|'.join(re.escape(word) for word in lab_keywords) + r')\b'
has_lab_keyword = test_names.str.contains(pattern_keep, case=False, regex=True, na=False)
has_unit_bracket = test_names.str.contains('[', regex=False, na=False)

observations = observations[~is_question & is_short & (has_lab_keyword | has_unit_bracket)]

print(f"Data Cleaned. Tracking {observations['TEST_NAME'].nunique()} unique CLINICAL tests.")

# ==========================================
# STEP 2: TRAIN PROBABILITIES
# ==========================================
print("Training Probabilistic Model...")

# Work at patient level: one row per (patient, condition) and one per
# (patient, test). Joining the raw tables instead multiplies every condition
# row by every observation row of the patient, so counts measure observation
# volume rather than how many patients with the condition had the test.
condition_patients = conditions[['PATIENT_ID', 'CONDITION']].drop_duplicates()
test_patients = observations[['PATIENT_ID', 'TEST_NAME']].drop_duplicates()

merged = pd.merge(condition_patients, test_patients, on='PATIENT_ID')

# count = number of distinct patients with this condition who had this test
knowledge_base = merged.groupby(['CONDITION', 'TEST_NAME']).size().reset_index(name='count')

# total_cases = number of distinct patients with the condition (taken from the
# conditions table, so patients without any retained test still count)
condition_counts = condition_patients['CONDITION'].value_counts().to_dict()
knowledge_base['total_cases'] = knowledge_base['CONDITION'].map(condition_counts)
knowledge_base['probability'] = knowledge_base['count'] / knowledge_base['total_cases']

# Every (condition, test) count is a subset of the condition's patients.
assert knowledge_base['probability'].between(0, 1).all(), "probability outside [0, 1]"

# Sort: conditions alphabetically, most likely tests first within each condition
knowledge_base = knowledge_base.sort_values(['CONDITION', 'probability'], ascending=[True, False])

print("Model Retrained!")

# ==========================================
# STEP 3: INFERENCE FUNCTION
# ==========================================
def get_test_recommendations(symptoms_list, top_n=5, kb=None):
    """Rank tests for the conditions whose name contains any given term.

    Args:
        symptoms_list: Iterable of search terms (a single string is treated as
            a one-element list). Each term is matched literally and
            case-insensitively against the CONDITION column.
        top_n: Maximum number of tests to return.
        kb: Optional DataFrame with CONDITION, TEST_NAME and probability
            columns. Defaults to the notebook-level ``knowledge_base``.

    Returns:
        pandas Series indexed by TEST_NAME holding, per test, the highest
        probability over all matched conditions, sorted descending and cut to
        ``top_n`` entries. An empty Series is returned when nothing matches,
        so callers can always iterate with ``.items()``.
    """
    table = knowledge_base if kb is None else kb

    # A bare string would otherwise be iterated character by character.
    if isinstance(symptoms_list, str):
        symptoms_list = [symptoms_list]

    print(f"\nAnalyzing Symptoms: {symptoms_list}")

    matched_frames = []
    for symptom in symptoms_list:
        # A blank term is contained in every condition name; skip it.
        if not isinstance(symptom, str) or not symptom.strip():
            continue
        # regex=False: terms such as "sepsis (" are plain text, not patterns.
        hits = table[table['CONDITION'].str.contains(symptom, case=False, na=False, regex=False)]
        if not hits.empty:
            matched_frames.append(hits)

    if not matched_frames:
        return pd.Series(
            [],
            index=pd.Index([], name='TEST_NAME', dtype='object'),
            dtype='float64',
            name='probability',
        )

    # Single concat instead of growing a frame inside the loop.
    recommendations = pd.concat(matched_frames)

    # Use MAX probability across symptoms
    final_ranking = recommendations.groupby('TEST_NAME')['probability'].max().sort_values(ascending=False)
    return final_ranking.head(top_n)

# ==========================================
# STEP 4: DEMO
# ==========================================
print("\nTop 5 Common Conditions in your Dataset:")
# Count each patient once per condition; counting raw merged rows would rank
# conditions by how many observation rows their patients have.
patients_per_condition = (
    merged.drop_duplicates(['PATIENT_ID', 'CONDITION'])['CONDITION'].value_counts()
)
print(patients_per_condition.head(5))

# 1. Count the total rows (The height of the filtered observations table)
final_row_count = len(observations)

# 2. Count the unique tests (The number of distinct labels)
unique_test_count = observations['TEST_NAME'].nunique()

print("--- FILTERING REPORT ---")
print(f"Final rows (Total Volume): {final_row_count}")
print(f"Unique Tests (Distinct Types): {unique_test_count}")


def _print_diagnostic_path(symptoms, title):
    """Print the recommended tests for ``symptoms`` under a titled heading.

    Returns whatever get_test_recommendations returned so the caller can keep
    it. An empty result prints a short notice instead of failing.
    """
    results = get_test_recommendations(symptoms)
    print(f"\n--- DIAGNOSTIC PATH FOR {title} ---")
    if len(results) == 0:
        print("(no matching condition found in the knowledge base)")
        return results
    for test, prob in results.items():
        print(f"[ ] {test} (Confidence: {prob:.1%})")
    return results


# Test 1: Diabetes
symptoms_2 = ['diabetes']
results_2 = _print_diagnostic_path(symptoms_2, "DIABETES")

# Test 2: Hypertension
symptoms_3 = ['hypertension']
results_3 = _print_diagnostic_path(symptoms_3, "HYPERTENSION")

# Test 3: Pneumonia
symptoms_4 = ['pneumonia']
results_4 = _print_diagnostic_path(symptoms_4, "PNEUMONIA")