In [1]:
import ast
import json
import os
import zipfile
from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm

# Make sure the output locations for the processed data and the model exist
for output_dir in ("../data/classifier_v2", "../models/condition_classifier_v2"):
    os.makedirs(output_dir, exist_ok=True)

### Load Raw Data and Metadata

In [2]:
# Load conditions metadata.
# The encoding is pinned to UTF-8 instead of the platform default: the evidence
# metadata contains accented French text (e.g. 'déchirante'), which would be
# mis-decoded on systems whose default encoding is not UTF-8.
with open("../data/ddxplus/release_conditions.json", 'r', encoding='utf-8') as f:
    conditions_data = json.load(f)
print(f"Conditions loaded: {len(conditions_data)}")

# Load evidences (symptoms) metadata
with open("../data/ddxplus/release_evidences.json", 'r', encoding='utf-8') as f:
    evidences_data = json.load(f)
print(f"Evidences loaded: {len(evidences_data)}")

# Load raw patient data from all splits
def load_raw_data(zip_path, split_name):
    """Read the patient CSV stored inside a zip archive into a DataFrame.

    The first ``.csv`` member of the archive is used. If no member carries a
    ``.csv`` extension, the first regular file is read instead, which matches
    the previous behaviour for single-file archives.

    Args:
        zip_path: Path to the zip archive holding the split's CSV file.
        split_name: Label for the split; used only in the progress messages.

    Returns:
        pandas.DataFrame with the parsed CSV contents.

    Raises:
        ValueError: If the archive contains no regular files.
    """
    print(f"\nLoading {split_name} data...")
    with zipfile.ZipFile(zip_path, 'r') as z:
        # Ignore directory entries and macOS archiver metadata so that the
        # member order inside the archive cannot make us read the wrong entry.
        members = [
            name for name in z.namelist()
            if not name.endswith('/') and not name.startswith('__MACOSX/')
        ]
        if not members:
            raise ValueError(f"No files found in archive: {zip_path}")

        csv_members = [name for name in members if name.lower().endswith('.csv')]
        target = csv_members[0] if csv_members else members[0]

        with z.open(target) as f:
            df = pd.read_csv(f)
    print(f"   Records: {len(df):,}")
    return df

# Read the three released splits in train / validation / test order.
# Each entry pairs an archive path with the label shown in the progress output.
split_archives = [
    ("../data/ddxplus/release_train_patients.zip", "train"),
    ("../data/ddxplus/release_validate_patients.zip", "validation"),
    ("../data/ddxplus/release_test_patients.zip", "test"),
]
train_df, val_df, test_df = [
    load_raw_data(archive_path, split_label)
    for archive_path, split_label in split_archives
]

print("DATA LOADED SUCCESSFULLY")
total_records = sum(len(frame) for frame in (train_df, val_df, test_df))
print(f"Total records: {total_records:,}")

Conditions loaded: 49
Evidences loaded: 223

Loading train data...
   Records: 1,025,602

Loading validation data...
   Records: 132,448

Loading test data...
   Records: 134,529
DATA LOADED SUCCESSFULLY
Total records: 1,292,579


### Explore Evidence Structure

In [4]:
print("EXPLORING EVIDENCE STRUCTURE")

# Look at a sample patient's evidences
sample_evidences_raw = train_df.iloc[0]['EVIDENCES']
print("Sample raw EVIDENCES string:")
print(sample_evidences_raw[:200] + "...")

# Parse it. The column holds the string form of a Python list, so
# ast.literal_eval is used: it only accepts literals and, unlike eval(),
# never executes text that comes from the data file.
sample_evidences = ast.literal_eval(sample_evidences_raw)
print(f"\nNumber of evidences for this patient: {len(sample_evidences)}")

print("\nSample evidences (first 10):")
for ev in sample_evidences[:10]:
    print(f"   {ev}")

# Understand the evidence format
print("EVIDENCE FORMAT ANALYSIS")

# Categorize evidence formats by the presence of the '_@_' value separator
simple_evidences = [ev for ev in sample_evidences if '_@_' not in ev]   # E_XX (binary yes/no)
valued_evidences = [ev for ev in sample_evidences if '_@_' in ev]       # E_XX_@_V_YY (has specific value)

print(f"Simple evidences (binary): {len(simple_evidences)}")
print(f"Valued evidences (with values): {len(valued_evidences)}")

print("\nExamples of valued evidences:")
for ev in valued_evidences[:5]:
    # The part before the separator is the key into the evidence metadata
    base_code = ev.split('_@_')[0]

    # Get evidence info; codes missing from the metadata are silently skipped
    if base_code in evidences_data:
        ev_info = evidences_data[base_code]
        question = ev_info.get('question_en', 'N/A')
        value_meaning = ev_info.get('value_meaning', {})

        print(f"\n   Code: {ev}")
        print(f"   Question: {question}...")
        print(f"   Value meanings: {value_meaning}")

EXPLORING EVIDENCE STRUCTURE
Sample raw EVIDENCES string:
['E_48', 'E_50', 'E_53', 'E_54_@_V_161', 'E_54_@_V_183', 'E_55_@_V_89', 'E_55_@_V_108', 'E_55_@_V_167', 'E_56_@_4', 'E_57_@_V_123', 'E_58_@_3', 'E_59_@_3', 'E_77', 'E_79', 'E_91', 'E_97', 'E_201', 'E_...

Number of evidences for this patient: 19

Sample evidences (first 10):
   E_48
   E_50
   E_53
   E_54_@_V_161
   E_54_@_V_183
   E_55_@_V_89
   E_55_@_V_108
   E_55_@_V_167
   E_56_@_4
   E_57_@_V_123
EVIDENCE FORMAT ANALYSIS
Simple evidences (binary): 9
Valued evidences (with values): 10

Examples of valued evidences:

   Code: E_54_@_V_161
   Question: Characterize your pain:...
   Value meanings: {'V_11': {'fr': 'NA', 'en': 'NA'}, 'V_71': {'fr': 'déchirante', 'en': 'heartbreaking'}, 'V_112': {'fr': 'lancinante', 'en': 'haunting'}, 'V_154': {'fr': 'pénible', 'en': 'tedious'}, 'V_161': {'fr': 'sensible', 'en': 'sensitive'}, 'V_179': {'fr': 'un coup de couteau', 'en': 'a knife stroke'}, 'V_180': {'fr': 'un tiraillement', 'en':

### Define Text Processing Function

In [6]:
def get_evidence_text(evidence_code, evidences_data):
    """Turn one evidence code into a short English phrase.

    Args:
        evidence_code: Either a bare code or a code followed by '_@_' and a
            value; only the first value segment after the separator is used.
        evidences_data: Mapping from base evidence code to its metadata dict
            ('question_en' and, optionally, 'value_meaning').

    Returns:
        The question rewritten as a statement (at most 80 characters),
        followed by ": <value>" when the code carries a value. All-digit
        values are rendered as "<value>/10"; other values are replaced by
        their English meaning when one is listed, otherwise the raw value code
        is kept. Returns None when the base code is not in evidences_data.
    """
    # Separate the base code from the optional value segment
    pieces = evidence_code.split('_@_')
    base_code = pieces[0]
    value_code = pieces[1] if len(pieces) > 1 else None

    ev_info = evidences_data.get(base_code) if base_code in evidences_data else None
    if base_code not in evidences_data:
        return None

    # Rewrite the question as a statement; the rules are applied in order
    rewrites = (
        ("Do you have ", "has "),
        ("Have you ", "has "),
        ("Are you ", "is "),
        ("Is there ", "has "),
        ("Do you ", ""),
        ("Did you ", ""),
        ("?", ""),
        (":", ""),
    )
    statement = ev_info.get('question_en', '')
    for old, new in rewrites:
        statement = statement.replace(old, new)

    # Trim surrounding whitespace, then cap the length at 80 characters
    statement = statement.strip()[:80]

    # Codes without a value are described by the statement alone
    if value_code is None:
        return statement

    value_meaning = ev_info.get('value_meaning', {})

    if value_code.isdigit():
        # Numeric values (e.g., E_56_@_4) are shown against a scale of 10
        rendered = f"{value_code}/10"
    elif value_code in value_meaning:
        # Categorical values (e.g., V_161) use their English meaning
        rendered = value_meaning[value_code].get('en', value_code)
    else:
        rendered = value_code

    return f"{statement}: {rendered}"


def create_patient_text_v2(row, evidences_data):
    """Build a one-line English description of a patient record.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            'AGE', 'SEX' and 'EVIDENCES'. 'EVIDENCES' is expected to be the
            string form of a list of evidence codes.
        evidences_data: Evidence metadata keyed by base evidence code; passed
            through to get_evidence_text.

    Returns:
        The description string, or None when 'EVIDENCES' cannot be parsed into
        a list/tuple/set or when none of its codes produces any text.

    Raises:
        KeyError: If the record lacks 'AGE', 'SEX' or 'EVIDENCES'.
    """
    # Get demographics
    age = row['AGE']
    # Any value other than 'M' is rendered as Female
    sex = 'Male' if row['SEX'] == 'M' else 'Female'

    # Parse evidences. literal_eval only accepts Python literals, so text from
    # the data file is never executed, and only parse failures are treated as
    # "skip this record" (a bare except would also hide interrupts and typos).
    raw_evidences = row['EVIDENCES']
    try:
        evidences = ast.literal_eval(raw_evidences)
    except (ValueError, SyntaxError, TypeError):
        return None

    # A scalar or string literal is not a usable evidence list
    if not isinstance(evidences, (list, tuple, set)):
        return None

    # Convert all evidences to text, dropping empty results and duplicates
    # while keeping first-seen order; the set makes the membership test O(1).
    symptom_texts = []
    seen_texts = set()
    for ev in evidences:
        text = get_evidence_text(ev, evidences_data)
        if text and text not in seen_texts:
            seen_texts.add(text)
            symptom_texts.append(text)

    if not symptom_texts:
        return None

    # Create full patient description
    symptoms_joined = "; ".join(symptom_texts)
    patient_text = f"Patient is a {age} year old {sex} presenting with: {symptoms_joined}"

    return patient_text


# Smoke-test the text builder on the first training record
print("TESTING IMPROVED TEXT PROCESSING")
test_row = train_df.iloc[0]
test_text = create_patient_text_v2(test_row, evidences_data)

# Echo the source fields next to the generated text for a visual comparison
for field_label, column in (("Age", "AGE"), ("Sex", "SEX"), ("Condition", "PATHOLOGY")):
    print(f"{field_label}: {test_row[column]}")
print(f"\nGenerated text ({len(test_text)} chars):")
print(test_text)

# Spot-check a few records further into the training split
print("Additional samples:")
for i in (100, 500, 1000):
    row = train_df.iloc[i]
    text = create_patient_text_v2(row, evidences_data)
    print(f"\nSample {i}:")
    print(f"   Condition: {row['PATHOLOGY']}")
    print(f"   Text length: {len(text)} chars")
    print(f"   Preview: {text[:150]}...")

TESTING IMPROVED TEXT PROCESSING
Age: 18
Sex: M
Condition: URTI

Generated text (787 chars):
Patient is a 18 year old Male presenting with: live with 4 or more people; has had significantly increased sweating; has pain somewhere, related to your reason for consulting; Characterize your pain: sensitive; Characterize your pain: heavy; feel pain somewhere: forehead; feel pain somewhere: cheek(R); feel pain somewhere: temple(L); How intense is the pain: 4/10; Does the pain radiate to another location: nowhere; How precisely is the pain located: 3/10; How fast did the pain appear: 3/10; has a cough that produces colored or more abundant sputum than usual; smoke cigarettes; has a fever (either felt or measured with a thermometer); has a sore throat; has a cough; has traveled out of the country in the last 4 weeks: N; is exposed to secondhand cigarette smoke on a daily basis
Additional samples:

Sample 100:
   Condition: Influenza
   Text length: 1155 chars
   Preview: Patient is a 48 year ol

### Process All Data and Check Uniqueness

In [7]:
def process_dataframe(df, evidences_data, split_name, progress_every=200000):
    """Convert every row of a patient DataFrame into a text/label record.

    Args:
        df: DataFrame with 'AGE', 'SEX', 'PATHOLOGY' and 'EVIDENCES' columns.
        evidences_data: Evidence metadata passed to create_patient_text_v2.
        split_name: Label for the split; used only in the progress messages.
        progress_every: Print a progress line after this many rows. A falsy
            value disables progress output.

    Returns:
        List of dicts with keys 'text', 'condition', 'age' and 'sex', one per
        row for which create_patient_text_v2 returned text. Rows that yield
        None are counted as skipped and left out.
    """
    print(f"\nProcessing {split_name} data ({len(df):,} records)...")

    processed = []
    skipped = 0

    # Progress is counted by row position rather than by index label, so it
    # stays correct when the DataFrame index is not a default 0..n-1 range
    # (for example after filtering, shuffling or with non-integer labels).
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        text = create_patient_text_v2(row, evidences_data)

        if text is None:
            skipped += 1
        else:
            processed.append({
                'text': text,
                'condition': row['PATHOLOGY'],
                'age': int(row['AGE']),
                'sex': row['SEX']
            })

        if progress_every and position % progress_every == 0:
            print(f"   Processed {position:,} records...")

    print(f"   Complete: {len(processed):,} valid, {skipped:,} skipped")

    return processed

# Process all splits
train_processed = process_dataframe(train_df, evidences_data, "train")
val_processed = process_dataframe(val_df, evidences_data, "validation")
test_processed = process_dataframe(test_df, evidences_data, "test")

# Check uniqueness: reduce each split to its set of distinct generated texts
print("UNIQUENESS CHECK")
train_texts = {item['text'] for item in train_processed}
val_texts = {item['text'] for item in val_processed}
test_texts = {item['text'] for item in test_processed}

# One report line per split: record count, distinct texts and their share
print()
for split_label, split_records, split_texts in (
    ("Train:", train_processed, train_texts),
    ("Val:", val_processed, val_texts),
    ("Test:", test_processed, test_texts),
):
    unique_share = len(split_texts) / len(split_records) * 100
    print(f"{split_label:<6} {len(split_records):,} total, {len(split_texts):,} unique ({unique_share:.2f}%)")

# Check train-test overlap: distinct texts shared between train and each held-out split
train_test_overlap = train_texts & test_texts
train_val_overlap = train_texts & val_texts

print(
    f"\nTrain-Test overlap: {len(train_test_overlap):,} texts "
    f"({len(train_test_overlap)/len(test_texts)*100:.2f}% of test)"
)
print(
    f"Train-Val overlap:  {len(train_val_overlap):,} texts "
    f"({len(train_val_overlap)/len(val_texts)*100:.2f}% of val)"
)


Processing train data (1,025,602 records)...
   Processed 200,000 records...
   Processed 400,000 records...
   Processed 600,000 records...
   Processed 800,000 records...
   Processed 1,000,000 records...
   Complete: 1,025,602 valid, 0 skipped

Processing validation data (132,448 records)...
   Complete: 132,448 valid, 0 skipped

Processing test data (134,529 records)...
   Complete: 134,529 valid, 0 skipped
UNIQUENESS CHECK

Train: 1,025,602 total, 995,779 unique (97.09%)
Val:   132,448 total, 132,232 unique (99.84%)
Test:  134,529 total, 134,259 unique (99.80%)

Train-Test overlap: 4,820 texts (3.59% of test)
Train-Val overlap:  4,403 texts (3.33% of val)


#### Analysis: v1 vs v2 Comparison

| Metric | v1 (Before) | v2 (Now) | Improvement |
|:-------|:------------|:---------|:------------|
| Train unique | ~80% | 97.09% | +17 percentage points |
| Test unique | ~80% | 99.80% | +20 percentage points |
| Train-Test overlap | 99.35% | 3.59% | **95.76 percentage points less overlap** |
| Clean test samples | 875 | ~129,709 | **148x more samples** |

#### Key Findings

| Finding | Meaning |
|:--------|:--------|
| 97-99% unique texts | Including all symptoms + values + age + sex created diverse texts |
| 3.59% overlap | Acceptable - some patients genuinely have identical symptoms |
| ~130K clean test samples | Sufficient data for statistically reliable evaluation |

#### Why This Matters

- **v1 Problem:** 99.35% of test data overlapped with training data, causing data leakage
- **v2 Solution:** Only 3.59% of the unique test texts also appear in the training set, so the model is evaluated almost entirely on unseen data
- **Result:** We can now trust our accuracy metrics as genuine model performance

### Save Processed Data

In [8]:
# Create label mappings from the conditions present in the training split;
# ids follow the alphabetical order of the condition names.
all_conditions = sorted({item['condition'] for item in train_processed})
label2id = {condition: idx for idx, condition in enumerate(all_conditions)}
id2label = dict(enumerate(all_conditions))

print("LABEL MAPPINGS")
print(f"Number of classes: {len(label2id)}")

# Save processed data
print("SAVING PROCESSED DATA")

train_path = "../data/classifier_v2/train.json"
val_path = "../data/classifier_v2/val.json"
test_path = "../data/classifier_v2/test.json"

# Write the splits in train / validation / test order
for split_path, split_records in (
    (train_path, train_processed),
    (val_path, val_processed),
    (test_path, test_processed),
):
    with open(split_path, 'w') as f:
        json.dump(split_records, f)
    print(f"Saved: {split_path}")

# Save label mappings. JSON object keys are always strings, so the integer ids
# of id2label are written as "0", "1", ... and need converting back on load.
for mapping_path, mapping in (
    ("../data/classifier_v2/label2id.json", label2id),
    ("../data/classifier_v2/id2label.json", id2label),
):
    with open(mapping_path, 'w') as f:
        json.dump(mapping, f, indent=2)
    print(f"Saved: {mapping_path}")

# Verify file sizes
print("FILE SIZES")

files = [
    train_path,
    val_path,
    test_path,
    "../data/classifier_v2/label2id.json",
    "../data/classifier_v2/id2label.json",
]

bytes_per_mb = 1024 * 1024
total_size = 0
for filepath in files:
    size = os.path.getsize(filepath) / bytes_per_mb
    total_size += size
    print(f"   {os.path.basename(filepath):<20} {size:>8.2f} MB")

print(f"   {'Total:':<20} {total_size:>8.2f} MB")

LABEL MAPPINGS
Number of classes: 49
SAVING PROCESSED DATA
Saved: ../data/classifier_v2/train.json
Saved: ../data/classifier_v2/val.json
Saved: ../data/classifier_v2/test.json
Saved: ../data/classifier_v2/label2id.json
Saved: ../data/classifier_v2/id2label.json
FILE SIZES
   train.json             983.30 MB
   val.json               129.12 MB
   test.json              130.84 MB
   label2id.json            0.00 MB
   id2label.json            0.00 MB
   Total:                1243.27 MB
