## **Data Preparation & Geometry Definition**

**A. Collect All 66 Conditons we want to map to ICD code**

In [1]:
import pandas as pd

# Replace this path with the actual location of your metadata file
METADATA_PATH = './data/coarse_labeled_metadata_with_labels.csv' 

df = pd.read_csv(METADATA_PATH)
condition_list = sorted(df['condition'].astype(str).str.strip().unique().tolist())

print(f"--- Unique Conditions ({len(condition_list)}) ---")
print(condition_list)

--- Unique Conditions (66) ---
['Abrasion, scrape, or scab', 'Abscess', 'Acne', 'Actinic Keratosis', 'Acute and chronic dermatitis', 'Acute dermatitis, NOS', 'Allergic Contact Dermatitis', 'Basal Cell Carcinoma', 'Bullous Pemphigoid', 'CD - Contact dermatitis', 'Cellulitis', 'Chronic dermatitis, NOS', 'Cutaneous lupus', 'Dermatofibroma', 'Drug Rash', 'Ecthyma', 'Eczema', 'Erythema multiforme', 'Folliculitis', 'Granuloma annulare', 'Herpes Simplex', 'Herpes Zoster', 'Hyperpigmentation', 'Hypersensitivity', 'Impetigo', 'Infected eczema', 'Inflicted skin lesions', 'Insect Bite', 'Intertrigo', 'Irritant Contact Dermatitis', "Kaposi's sarcoma of skin", 'Keratosis pilaris', 'Leukocytoclastic Vasculitis', 'Lichen Simplex Chronicus', 'Lichen nitidus', 'Lichen planus/lichenoid eruption', 'Melanoma', 'Melasma', 'Miliaria', 'Molluscum Contagiosum', 'O/E - ecchymoses present', 'Perioral Dermatitis', 'Petechiae', 'Photodermatitis', 'Pigmented purpuric eruption', 'Pityriasis rosea', 'Pityriasis rubr

**B. Mapping 66 Conditions to ICD codes**

In [3]:
import pandas as pd
from pathlib import Path
from typing import Dict, List
import json
import os # Needed for os.makedirs

# --- CONFIGURATION ---
ICD_FILE_PATH = './data/icd10-jan2026.csv'

# 1. THE 66 CONDITIONS (Provided by User)
USER_CONDITIONS_LIST = [
    'Abrasion, scrape, or scab', 'Abscess', 'Acne', 'Actinic Keratosis', 'Acute and chronic dermatitis', 
    'Acute dermatitis, NOS', 'Allergic Contact Dermatitis', 'Basal Cell Carcinoma', 'Bullous Pemphigoid', 
    'CD - Contact dermatitis', 'Cellulitis', 'Chronic dermatitis, NOS', 'Cutaneous lupus', 'Dermatofibroma', 
    'Drug Rash', 'Ecthyma', 'Eczema', 'Erythema multiforme', 'Folliculitis', 'Granuloma annulare', 
    'Herpes Simplex', 'Herpes Zoster', 'Hyperpigmentation', 'Hypersensitivity', 'Impetigo', 
    'Infected eczema', 'Inflicted skin lesions', 'Insect Bite', 'Intertrigo', 'Irritant Contact Dermatitis', 
    "Kaposi's sarcoma of skin", 'Keratosis pilaris', 'Leukocytoclastic Vasculitis', 'Lichen Simplex Chronicus', 
    'Lichen nitidus', 'Lichen planus/lichenoid eruption', 'Melanoma', 'Melasma', 'Miliaria', 
    'Molluscum Contagiosum', 'O/E - ecchymoses present', 'Perioral Dermatitis', 'Petechiae', 
    'Photodermatitis', 'Pigmented purpuric eruption', 'Pityriasis rosea', 'Pityriasis rubra pilaris', 
    'Post-Inflammatory hyperpigmentation', 'Prurigo nodularis', 'Psoriasis', 'Purpura', 'Rosacea', 
    'SCC/SCCIS', 'Scabies', 'Scar Condition', 'Seborrheic Dermatitis', 'Skin infection', 
    'Stasis Dermatitis', 'Superficial wound of body region', 'Tinea', 'Tinea Versicolor', 
    'Urticaria', 'Verruca vulgaris', 'Viral Exanthem', 'Vitiligo', 'Xerosis'
]


def find_icd_codes_for_conditions(conditions: List[str], icd_file_path: str) -> Dict[str, str]:
    """
    Automates the search for the most general ICD-10 code corresponding to each condition.
    
    Args:
        conditions: List of dermatology condition names.
        icd_file_path: Path to the ICD-10 CSV file.
        
    Returns:
        Dictionary mapping condition name to formatted ICD code.
    """
    try:
        icd_df = pd.read_csv(icd_file_path)
    except FileNotFoundError:
        print(f"ERROR: ICD file not found at {icd_file_path}.")
        return {}

    # Rename columns and prepare text for searching
    icd_df.columns = ['CODE', 'SHORT_DESC', 'LONG_DESC', 'NF_EXCL', 'UNNAMED']
    icd_df['SEARCH_DESC'] = icd_df['LONG_DESC'].astype(str).str.lower().str.replace(r'[^\w\s]', '', regex=True)
    icd_df['DESC_LEN'] = icd_df['LONG_DESC'].str.len()
    
    user_icd_map = {}

    for condition in conditions:
        condition_lower = condition.lower()
        
        # Priority 1: Simple Containment Search
        matches = icd_df[icd_df['SEARCH_DESC'].str.contains(condition_lower, na=False)].copy()
        
        # Priority 2: Fallback for complex/abbreviated terms (like 'CD - Contact dermatitis')
        if matches.empty:
            condition_simple = condition_lower.split(' - ')[0].split(',')[0].strip()
            matches = icd_df[icd_df['SEARCH_DESC'].str.contains(condition_simple, na=False)].copy()

        if not matches.empty:
            # Heuristic: Prioritize shortest description (most general ICD code)
            best_match = matches.sort_values(by=['DESC_LEN', 'CODE'], ascending=[True, True]).iloc[0]
            icd_code = best_match['CODE']
            
            # Format code to standard L30.9
            if len(icd_code) > 3 and '.' not in icd_code:
                formatted_code = icd_code[:3] + '.' + icd_code[3:]
            else:
                formatted_code = icd_code
                
            user_icd_map[condition] = formatted_code
        else:
            user_icd_map[condition] = 'NOT_FOUND'

    print(f"Successfully mapped {len(user_icd_map)} conditions.")
    print(f"Entries requiring manual clinical review: {list(user_icd_map.values()).count('NOT_FOUND')}")
    
    return user_icd_map

# --- EXECUTION ---
# This dictionary contains the auto-mapped ICD codes for your 66 conditions.
# You must review this dictionary before calculating the D-Matrix.
USER_ICD_MAP = find_icd_codes_for_conditions(USER_CONDITIONS_LIST, ICD_FILE_PATH)

print("\n--- AUTO-GENERATED ICD MAPPING (66 Conditions) ---")
print(USER_ICD_MAP)

# 2. Create the Condition-to-Index Map
# Sort the conditions alphabetically to ensure consistent indices (0 to 65)
sorted_conditions = sorted(USER_ICD_MAP.keys())
CONDITION_IDX_MAP = {condition: i for i, condition in enumerate(sorted_conditions)}
OUTPUT_MAP_PATH = Path('./data/condition_idx_map.json') # Define the output path

# 3. Create the output directory if it doesn't exist
os.makedirs(OUTPUT_MAP_PATH.parent, exist_ok=True)

# 4. Save the map as a JSON file
with open(OUTPUT_MAP_PATH, 'w') as f:
    json.dump(CONDITION_IDX_MAP, f, indent=4)

print("--- Final Artifacts Generated ---")
print(f"Successfully created Condition-Index Map for {len(CONDITION_IDX_MAP)} conditions.")
print(f"Map saved to: {OUTPUT_MAP_PATH}")

# [Optional] Print sample to show structure
print("\nSample Map:", dict(list(CONDITION_IDX_MAP.items())[:5]))

Successfully mapped 66 conditions.
Entries requiring manual clinical review: 22

--- AUTO-GENERATED ICD MAPPING (66 Conditions) ---
{'Abrasion, scrape, or scab': 'K03.1', 'Abscess': 'K61.0', 'Acne': 'L70.8', 'Actinic Keratosis': 'L57.0', 'Acute and chronic dermatitis': 'NOT_FOUND', 'Acute dermatitis, NOS': 'NOT_FOUND', 'Allergic Contact Dermatitis': 'L23.4', 'Basal Cell Carcinoma': 'C44.510', 'Bullous Pemphigoid': 'L12.0', 'CD - Contact dermatitis': 'Q93.52', 'Cellulitis': 'L03.211', 'Chronic dermatitis, NOS': 'NOT_FOUND', 'Cutaneous lupus': 'L93.1', 'Dermatofibroma': 'NOT_FOUND', 'Drug Rash': 'D72.12', 'Ecthyma': 'NOT_FOUND', 'Eczema': 'L20.82', 'Erythema multiforme': 'L51.8', 'Folliculitis': 'L66.2', 'Granuloma annulare': 'L92.0', 'Herpes Simplex': 'B00.82', 'Herpes Zoster': 'B02.39', 'Hyperpigmentation': 'L81.4', 'Hypersensitivity': 'M31.0', 'Impetigo': 'L01.09', 'Infected eczema': 'NOT_FOUND', 'Inflicted skin lesions': 'NOT_FOUND', 'Insect Bite': 'S00.561S', 'Intertrigo': 'L30.4', 

**C. Code: Calculate the Dissimilarity Matrix ($\mathbf{D}$)**

In [2]:
import pandas as pd
import numpy as np
import torch
from typing import Dict, List

# --- DEPENDENCY: USER_ICD_MAP must be defined (see above) ---

def calculate_icd_dissimilarity_matrix(icd_map: Dict[str, str]):
    """
    Creates the N x N Disssimilarity Matrix D based on ICD code prefix length.
    D_i,j is close to 0 (similar) or 1 (dissimilar).
    """
    conditions = sorted(icd_map.keys())
    N = len(conditions)
    D = np.eye(N, dtype=np.float32) # Start with 0 distance on diagonal

    for i in range(N):
        for j in range(i + 1, N):
            code_i = icd_map[conditions[i]].replace('.', '')
            code_j = icd_map[conditions[j]].replace('.', '')
            
            # Find the longest common prefix (LCP)
            lcp_length = 0
            for k in range(min(len(code_i), len(code_j))):
                if code_i[k] == code_j[k]:
                    lcp_length += 1
                else:
                    break
            
            # Distance is scaled by LCP length (deeper shared prefix = smaller distance)
            # Max possible LCP length is limited by the shorter code.
            max_len = min(len(code_i), len(code_j))
            
            # Dissimilarity: 1 - (Normalized Similarity)
            # We use 1 - LCP_length / Max_len
            dissimilarity = 1.0 - (lcp_length / max_len)
            
            D[i, j] = dissimilarity
            D[j, i] = dissimilarity

    # Create Index Mapping for later use
    condition_to_idx = {cond: i for i, cond in enumerate(conditions)}
    
    # Return PyTorch Tensor
    return torch.tensor(D, dtype=torch.float32), condition_to_idx

# --- Execution ---
ICD_MATRIX_TENSOR, CONDITION_IDX_MAP = calculate_icd_dissimilarity_matrix(USER_ICD_MAP)
torch.save(ICD_MATRIX_TENSOR, './data/icd_dissimilarity_matrix.pt')
print("ICD Matrix size:", ICD_MATRIX_TENSOR.shape)

ICD Matrix size: torch.Size([66, 66])
