## Setting up

In [1]:
!pip install pymysql sqlalchemy pandas dotenv tqdm

Collecting pymysql
  Downloading PyMySQL-1.1.1-py3-none-any.whl.metadata (4.4 kB)
Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting python-dotenv (from dotenv)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading PyMySQL-1.1.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.0/45.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, pymysql, dotenv
Successfully installed dotenv-0.9.9 pymysql-1.1.1 python-dotenv-1.1.0


In [176]:
from dotenv import load_dotenv
import os
from sqlalchemy import create_engine, text, event

load_dotenv(override=True)

def validate_env_vars(required: dict, context: str = ""):
    """Fail fast when a required environment variable is unset or empty.

    Args:
        required: Mapping of environment-variable name to the value read for it.
        context: Label shown in square brackets at the start of the error message.

    Raises:
        ValueError: For the first entry (in mapping order) whose value is falsy.
    """
    # Sentinel distinguishes "nothing missing" from any possible key value.
    nothing_missing = object()
    first_missing = next(
        (name for name, current in required.items() if not current),
        nothing_missing,
    )
    if first_missing is not nothing_missing:
        raise ValueError(f"[{context}] The environment variable {first_missing} is not defined!")

def _mysql_url(user, password, host, port, database):
    """Return a ``mysql+pymysql`` connection URL string with every component escaped.

    Interpolating credentials directly into an f-string produces an unparsable URL
    as soon as the password contains characters such as ``@``, ``/`` or ``:``.
    ``URL.create`` takes the raw components and handles the escaping when rendered.

    Args:
        user: Database user name.
        password: Database password (raw, unescaped).
        host: Database host name.
        port: Port number, as an int or a numeric string.
        database: Schema name to connect to.

    Returns:
        The rendered URL as ``str`` (password included), suitable for ``create_engine``.

    Raises:
        ValueError: If ``port`` is not numeric.
    """
    # Local import keeps this block self-contained; SQLAlchemy is already a dependency.
    from sqlalchemy.engine import URL

    return URL.create(
        drivername="mysql+pymysql",
        username=user,
        password=password,
        host=host,
        port=int(port),
        database=database,
    ).render_as_string(hide_password=False)


# Production environment variables
PRODUCTION_DB_USER = os.getenv("CIEL_PROD_DB_USER")
PRODUCTION_DB_PASSWORD = os.getenv("CIEL_PROD_DB_PASS")
PRODUCTION_DB_HOST = os.getenv("CIEL_PROD_DB_HOST")
PRODUCTION_DB_PORT = os.getenv("CIEL_PROD_DB_PORT")
PRODUCTION_DB_NAME = os.getenv("CIEL_PROD_DB_NAME")
PRODUCTION_ANALYTICS_DB_NAME = os.getenv("CIEL_PROD_ANALYTICS_DB_NAME")

# Validate before building URLs so a missing value is reported by name
validate_env_vars({
    "CIEL_PROD_DB_USER": PRODUCTION_DB_USER,
    "CIEL_PROD_DB_PASS": PRODUCTION_DB_PASSWORD,
    "CIEL_PROD_DB_HOST": PRODUCTION_DB_HOST,
    "CIEL_PROD_DB_PORT": PRODUCTION_DB_PORT,
    "CIEL_PROD_DB_NAME": PRODUCTION_DB_NAME,
    "CIEL_PROD_ANALYTICS_DB_NAME": PRODUCTION_ANALYTICS_DB_NAME,
}, context="PRODUCTION")

# Both production URLs share credentials/host/port and differ only in the schema name
PRODUCTION_DATABASE_URL = _mysql_url(
    PRODUCTION_DB_USER, PRODUCTION_DB_PASSWORD, PRODUCTION_DB_HOST,
    PRODUCTION_DB_PORT, PRODUCTION_DB_NAME,
)
PRODUCTION_ANALYTICS_DATABASE_URL = _mysql_url(
    PRODUCTION_DB_USER, PRODUCTION_DB_PASSWORD, PRODUCTION_DB_HOST,
    PRODUCTION_DB_PORT, PRODUCTION_ANALYTICS_DB_NAME,
)

# Development environment variables (corrected)
DEVELOPMENT_DB_USER = os.getenv("CIEL_DEV_DB_USER")
DEVELOPMENT_DB_PASSWORD = os.getenv("CIEL_DEV_DB_PASS")
DEVELOPMENT_DB_HOST = os.getenv("CIEL_DEV_DB_HOST")
DEVELOPMENT_DB_PORT = os.getenv("CIEL_DEV_DB_PORT")
DEVELOPMENT_DB_NAME = os.getenv("CIEL_DEV_DB_NAME")

validate_env_vars({
    "CIEL_DEV_DB_USER": DEVELOPMENT_DB_USER,
    "CIEL_DEV_DB_PASS": DEVELOPMENT_DB_PASSWORD,
    "CIEL_DEV_DB_HOST": DEVELOPMENT_DB_HOST,
    "CIEL_DEV_DB_PORT": DEVELOPMENT_DB_PORT,
    "CIEL_DEV_DB_NAME": DEVELOPMENT_DB_NAME,
}, context="DEVELOPMENT")

DEVELOPMENT_DATABASE_URL = _mysql_url(
    DEVELOPMENT_DB_USER, DEVELOPMENT_DB_PASSWORD, DEVELOPMENT_DB_HOST,
    DEVELOPMENT_DB_PORT, DEVELOPMENT_DB_NAME,
)

# Create SQLAlchemy engines
# Builds one Engine per database URL; any failure is printed and then re-raised so
# the cell still stops with the original exception.
# NOTE(review): create_engine() is generally lazy and does not open a connection at
# this point, so the success message only confirms the Engine objects were built —
# connectivity is actually exercised by the test_connection() calls further down. Confirm.
# NOTE(review): only development_engine sets pool_pre_ping=True; confirm whether the
# production engines should use it as well.
try:
    print("🔄 Creating database connections...")
    production_engine = create_engine(PRODUCTION_DATABASE_URL)
    production_analytics_engine = create_engine(PRODUCTION_ANALYTICS_DATABASE_URL)
    development_engine = create_engine(DEVELOPMENT_DATABASE_URL, pool_pre_ping=True)
    print("✅ Database connections successfully established.")
except Exception as e:
    print(f"❌ Error creating database engines: {e}")
    raise

# Function to test database connectivity
def test_connection(engine, db_name, db_port):
    """Open a connection on ``engine`` and report which schema it is attached to.

    Runs ``SELECT DATABASE()`` and prints a success line containing ``db_name``,
    ``db_port`` and the schema name returned by the server. Any exception is caught
    and reported as a failure line instead of being raised, so one unreachable
    database does not stop the remaining checks.

    Args:
        engine: SQLAlchemy engine to test.
        db_name: Name shown in the printed messages.
        db_port: Port shown in the success message.
    """
    try:
        with engine.connect() as connection:
            # scalar() returns the first column of the first row (None when there is no
            # row), which avoids indexing into a possibly missing fetchone() result.
            current_db = connection.execute(text("SELECT DATABASE();")).scalar()
            print(f"✅ Successfully connected to {db_name}:{db_port}: {current_db}")
    except Exception as e:
        print(f"❌ Failed to connect to {db_name}: {e}")

# Test all connections
test_connection(development_engine, DEVELOPMENT_DB_NAME, DEVELOPMENT_DB_PORT)
test_connection(production_engine, PRODUCTION_DB_NAME, PRODUCTION_DB_PORT)
test_connection(production_analytics_engine, PRODUCTION_ANALYTICS_DB_NAME, PRODUCTION_DB_PORT)

# Event listener to block write operations
@event.listens_for(production_engine, "before_cursor_execute")
def prevent_writes(conn, cursor, statement, parameters, context, executemany):
    """Abort statements whose text begins with a data- or schema-changing keyword.

    Registered as a ``before_cursor_execute`` listener on ``production_engine``.
    The check is a case-insensitive prefix match on the whitespace-stripped statement.

    Raises:
        Exception: When the statement starts with one of the blocked keywords; the
            message names the first token of the statement in upper case.
    """
    # NOTE(review): the listener is attached to production_engine only;
    # production_analytics_engine (aliased to `engine` right after this block) is not
    # covered — confirm that is intended.
    blocked_prefixes = ("insert", "update", "delete", "create", "drop", "alter", "truncate", "replace")
    normalized = statement.strip().lower()
    if not normalized.startswith(blocked_prefixes):
        return
    verb = statement.split()[0].upper()
    raise Exception(f"❌ [READ-ONLY] Write operation blocked: `{verb}`")

# Create alias
engine = production_analytics_engine

🔄 Creating database connections...
✅ Database connections successfully established.
<sqlalchemy.engine.cursor.CursorResult object at 0x7b9e4ebf7380>
✅ Successfully connected to snapshot_20250405:3306: snapshot_20250405
<sqlalchemy.engine.cursor.CursorResult object at 0x7b9e4ebf73f0>
✅ Successfully connected to openmrs:3306: openmrs
<sqlalchemy.engine.cursor.CursorResult object at 0x7b9e4ebf7150>
✅ Successfully connected to analytics:3306: analytics


## Generate Dataset V1

In [None]:
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
import os

# 📂 Carregar variáveis de ambiente
load_dotenv(override=True)

# 📦 Read only what is needed to connect to the analytics database
DB_USER = os.getenv("CIEL_PROD_DB_USER")
DB_PASS = os.getenv("CIEL_PROD_DB_PASS")
DB_HOST = os.getenv("CIEL_PROD_DB_HOST")
DB_PORT = os.getenv("CIEL_PROD_DB_PORT")
DB_NAME = os.getenv("CIEL_PROD_ANALYTICS_DB_NAME")

# 🛡️ Basic validation: stop at the first unset or empty variable
for var_name, value in {
    "CIEL_PROD_DB_USER": DB_USER,
    "CIEL_PROD_DB_PASS": DB_PASS,
    "CIEL_PROD_DB_HOST": DB_HOST,
    "CIEL_PROD_DB_PORT": DB_PORT,
    "CIEL_PROD_ANALYTICS_DB_NAME": DB_NAME,
}.items():
    if not value:
        raise ValueError(f"Environment variable {var_name} is missing!")

# 🛜 Create the SQLAlchemy connection
# URL.create() receives the raw components and escapes them when rendered, so a
# password containing '@', '/' or ':' no longer produces an unparsable URL.
from sqlalchemy.engine import URL

DATABASE_URL = URL.create(
    drivername="mysql+pymysql",
    username=DB_USER,
    password=DB_PASS,
    host=DB_HOST,
    port=int(DB_PORT),
    database=DB_NAME,
).render_as_string(hide_password=False)

engine = create_engine(DATABASE_URL, pool_pre_ping=True)

# ✅ Testar conexão
def test_connection(engine, db_name=None, db_port=None):
    """Run ``SELECT DATABASE()`` on ``engine`` and print the schema name it returns.

    Args:
        engine: SQLAlchemy engine to test.
        db_name: Unused. Accepted so that three-argument calls written for the
            ``test_connection`` defined in the setup cell keep working after this
            cell rebinds the name.
        db_port: Unused. Accepted for the same reason as ``db_name``.

    Raises:
        Whatever ``engine.connect()`` or ``execute()`` raises; nothing is caught here.
    """
    with engine.connect() as conn:
        result = conn.execute(text("SELECT DATABASE();")).scalar()
        print(f"✅ Connected to database: {result}")

test_connection(engine)

In [None]:
import pandas as pd
import random
import json
from tqdm import tqdm
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
import os

# 📂 1. Carregar variáveis de ambiente e conectar no banco
load_dotenv(override=True)

DB_USER = os.getenv("CIEL_PROD_DB_USER")
DB_PASS = os.getenv("CIEL_PROD_DB_PASS")
DB_HOST = os.getenv("CIEL_PROD_DB_HOST")
DB_PORT = os.getenv("CIEL_PROD_DB_PORT")
DB_NAME = os.getenv("CIEL_PROD_ANALYTICS_DB_NAME")

# Fail fast with the variable's name instead of building a URL that contains the
# text "None" and only failing later, at connect time.
for var_name, value in {
    "CIEL_PROD_DB_USER": DB_USER,
    "CIEL_PROD_DB_PASS": DB_PASS,
    "CIEL_PROD_DB_HOST": DB_HOST,
    "CIEL_PROD_DB_PORT": DB_PORT,
    "CIEL_PROD_ANALYTICS_DB_NAME": DB_NAME,
}.items():
    if not value:
        raise ValueError(f"Environment variable {var_name} is missing!")

# URL.create() escapes each component when rendered, so special characters in the
# password ('@', '/', ':') cannot corrupt the URL.
from sqlalchemy.engine import URL

DATABASE_URL = URL.create(
    drivername="mysql+pymysql",
    username=DB_USER,
    password=DB_PASS,
    host=DB_HOST,
    port=int(DB_PORT),
    database=DB_NAME,
).render_as_string(hide_password=False)
engine = create_engine(DATABASE_URL, pool_pre_ping=True)

# 🎯 2. Definir instruções possíveis
instructions_stem = [
    "Provide the ICD-11 code for the clinical concept. Include extensions or cluster codes if required for full specificity.",
    "Find the ICD-11 code corresponding to the clinical concept. Use extensions or clusters if necessary.",
    "Map the clinical concept to its ICD-11 code. Add extensions or cluster codes if needed.",
    "Assign the ICD-11 code to the following concept, considering extensions or cluster codes where appropriate."
]

instructions_extension = [
    "Provide the ICD-11 extension code corresponding to the specified extension concept.",
    "Identify the ICD-11 extension code for the following extension concept.",
    "Find the corresponding ICD-11 extension code for this modifier."
]

# 📦 3. Inicializar dataset
dataset = []

# 📚 4. Ler e processar SimpleTabulation-ICD-11-MMS-en.xlsx
file_path = '../assets/SimpleTabulation-ICD-11-MMS-en.xlsx'
df = pd.read_excel(file_path)

df = df[
    (df['isLeaf'] == True) &
    (df['Code'].notnull()) &
    (df['ClassKind'] == 'category')
]

print(f"📄 Processando {len(df)} linhas da planilha...")

# A dedicated, seeded generator makes the instruction phrasing chosen for each row
# identical on every run, so the exported dataset is reproducible.
ICDAPI_INSTRUCTION_SEED = 42
icdapi_rng = random.Random(ICDAPI_INSTRUCTION_SEED)

for _, row in tqdm(df.iterrows(), total=len(df)):
    code = row['Code']
    # Titles in the tabulation carry leading dashes/spaces; strip them off
    title = str(row['Title']).lstrip('-–— ').strip()
    chapter = row['ChapterNo']

    # Rows of chapter 'X' whose code starts with 'X' are extension codes;
    # every other leaf category is treated as a stem code.
    if code.startswith('X') and chapter == 'X':
        category = 'source:icdapi>extension'
        instruction = icdapi_rng.choice(instructions_extension)
    else:
        category = 'source:icdapi>same-as+stem'
        instruction = icdapi_rng.choice(instructions_stem)

    entry = {
        "instruction": instruction,
        "input": title,
        "output": f"SAME-AS {code}",
        "category": category
    }

    dataset.append(entry)

# 🛜 5. Ler e processar dados da view vw_ciel_diagnosis_n_findings_to_icd11
query = text("""
    SELECT concept_id, fsn, map_type, icd11_code
    FROM analytics.vw_ciel_diagnosis_n_findings_to_icd11
""")

with engine.connect() as conn:
    results = conn.execute(query).mappings().all()

print(f"🛢️ Processando {len(results)} linhas da view...")

# Relationship label used by the view -> slug used inside category names
_CIEL_MAP_TYPE_SLUGS = {
    "SAME-AS": "same-as",
    "NARROWER-THAN": "narrower-than",
    "BROADER-THAN": "broader-than",
}


def _ciel_category(map_type, icd11_code):
    """Return the dataset category for one CIEL mapping, or None when it is excluded.

    The category is derived from the mapping relationship and from the shape of the
    ICD-11 expression: '&' marks an extension, '/' marks a cluster.

    * no '/'  -> ``stem`` / ``stem+extension`` (one '&') / ``stem+extensions`` (several '&'),
      for SAME-AS, NARROWER-THAN and BROADER-THAN;
    * with '/' -> ``cluster`` or ``cluster+extension``, kept for SAME-AS only.

    Any other relationship label yields None.
    """
    slug = _CIEL_MAP_TYPE_SLUGS.get(map_type)
    if slug is None:
        return None

    extension_count = icd11_code.count('&')

    if '/' in icd11_code:
        # Cluster expressions are only kept for exact (SAME-AS) mappings
        if map_type != "SAME-AS":
            return None
        shape = 'cluster+extension' if extension_count else 'cluster'
    elif extension_count == 0:
        shape = 'stem'
    elif extension_count == 1:
        shape = 'stem+extension'
    else:
        shape = 'stem+extensions'

    return f'source:ciel>{slug}+{shape}'


# Seeded generator so the instruction chosen for each example is reproducible
CIEL_INSTRUCTION_SEED = 42
ciel_rng = random.Random(CIEL_INSTRUCTION_SEED)

for row in tqdm(results):
    map_type = row['map_type']
    icd11_code = row['icd11_code']

    # Rows without a code cannot be mapped (and would crash on .startswith);
    # codes starting with 'X' are extension-only and are skipped.
    if not icd11_code or icd11_code.startswith('X'):
        continue

    category = _ciel_category(map_type, icd11_code)
    if category is None:
        continue

    entry = {
        "instruction": ciel_rng.choice(instructions_stem),
        "input": row['fsn'],
        "output": f"{map_type} {icd11_code}",
        "category": category
    }

    dataset.append(entry)

# 💾 6. Exportar JSONL unificado
output_path = '../assets/ciel-icd11-unified.jsonl'
with open(output_path, 'w', encoding='utf-8') as f:
    for item in dataset:
        json.dump(item, f, ensure_ascii=False)
        f.write('\n')

print(f"✅ Dataset final salvo em {output_path} com {len(dataset)} exemplos.")

### Testing (QA)

In [180]:
import pandas as pd
from collections import Counter
import json

# 📂 Caminho para o seu arquivo
file_path = '../assets/ciel-icd11-unified.jsonl'

# 📦 Carregar dataset
data = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line))

# 🧠 Converter para DataFrame
df = pd.DataFrame(data)

# 📊 Contar categorias
category_counts = Counter(df['category'])
instruction_counts = Counter(df['instruction'])

# 📋 Mostrar resumo
print(f"🔢 Total de linhas: {len(df)}")
print(f"\n📚 Categorias únicas: {len(category_counts)}")
for cat, count in category_counts.items():
    print(f"  • {cat}: {count} exemplos")

print(f"\n📝 Instruções únicas: {len(instruction_counts)}")
for instr, count in instruction_counts.items():
    print(f"  • {instr[:50]}... ({count} exemplos)")

🔢 Total de linhas: 51838

📚 Categorias únicas: 13
  • source:icdapi>same-as+stem: 16019 exemplos
  • source:icdapi>extension: 14947 exemplos
  • source:ciel>broader-than+stem: 1136 exemplos
  • source:ciel>same-as+stem: 4965 exemplos
  • source:ciel>narrower-than+stem: 8578 exemplos
  • source:ciel>narrower-than+stem+extension: 2910 exemplos
  • source:ciel>same-as+stem+extension: 2283 exemplos
  • source:ciel>broader-than+stem+extension: 337 exemplos
  • source:ciel>same-as+cluster: 280 exemplos
  • source:ciel>same-as+cluster+extension: 115 exemplos
  • source:ciel>narrower-than+stem+extensions: 62 exemplos
  • source:ciel>same-as+stem+extensions: 198 exemplos
  • source:ciel>broader-than+stem+extensions: 8 exemplos

📝 Instruções únicas: 7
  • Map the clinical concept to its ICD-11 code. Add e... (9302 exemplos)
  • Assign the ICD-11 code to the following concept, c... (9238 exemplos)
  • Find the ICD-11 code corresponding to the clinical... (9203 exemplos)
  • Provide the ICD-11 cod

In [None]:
import random
import json
from collections import defaultdict

# 📂 Carregar o dataset
input_path = '../assets/ciel-icd11-unified.jsonl'
with open(input_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]

# 🗂️ Separar base icdapi e ciel
base_data = []
ciel_data = defaultdict(list)  # agora agrupamos ciel por categoria

for item in data:
    if item['category'].startswith('source:icdapi'):
        base_data.append(item)
    else:
        ciel_data[item['category']].append(item)

# 📦 Prepare the train and validation lists
# Every icdapi example goes to training; only CIEL examples are split.
train_data = base_data.copy()
val_data = []

# 🎯 Split each CIEL category 9:1 (stratified by category)
# A dedicated, seeded generator makes the shuffle — and therefore the split —
# reproducible across kernel restarts.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

for category, items in ciel_data.items():
    split_rng.shuffle(items)
    # NOTE(review): int() truncates, so a category with a single example gets
    # split_idx == 0 and lands entirely in validation — confirm that is acceptable.
    split_idx = int(len(items) * 0.9)
    train_data.extend(items[:split_idx])
    val_data.extend(items[split_idx:])

# 💾 Salvar
with open('../assets/ciel-icd11-train.jsonl', 'w', encoding='utf-8') as f:
    for item in train_data:
        json.dump(item, f, ensure_ascii=False)
        f.write('\n')

with open('../assets/ciel-icd11-validation.jsonl', 'w', encoding='utf-8') as f:
    for item in val_data:
        json.dump(item, f, ensure_ascii=False)
        f.write('\n')

print(f"✅ Treino: {len(train_data)} exemplos")
print(f"✅ Validação: {len(val_data)} exemplos")
print(f"🔵 Base icdapi no treino: {len(base_data)} exemplos")
for cat, items in ciel_data.items():
    print(f"📚 Categoria {cat}: {len(items)} exemplos (9:1 aplicado)")

axolotl.yml
```yml
base_model: mistralai/Ministral-8B-Instruct-2410
model_type: mistral
tokenizer_type: mistral

load_in_8bit: false
load_in_4bit: true
bnb_4bit_compute_dtype: bfloat16
bnb_4bit_use_double_quant: true
bnb_4bit_quant_type: nf4
trust_remote_code: true

# Training data source and the record fields to read from it.
datasets:
  # NOTE(review): none of the notebook cells shown above writes
  # ../assets/ciel-icd11.jsonl — they write ciel-icd11-unified.jsonl,
  # ciel-icd11-train.jsonl and ciel-icd11-validation.jsonl. Confirm which file
  # this entry should point to.
  - path: ../assets/ciel-icd11.jsonl
    # NOTE(review): records carry instruction/input/output fields; verify against the
    # axolotl dataset-format documentation that `type: completion` honours the
    # field_* keys below.
    type: completion
    field_instruction: instruction
    field_input: input
    field_output: output
    field_category: category
    # NOTE(review): a 10% hold-out is requested here and again by val_set_size further
    # down, while a separate validation file is also generated — confirm the intended
    # evaluation set.
    eval_split: 0.1

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./axolotl-out-ministral-8b-ciel
logging_steps: 20
save_steps: 200
save_total_limit: 2
early_stopping: true
early_stopping_patience: 3

adapter: lora
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - gate_proj
  - up_proj
  - down_proj

sequence_len: 4096
sample_packing: true
train_on_inputs: false
group_by_length: true

optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 2e-4
warmup_steps: 50

gradient_checkpointing: true
gradient_accumulation_steps: 4
micro_batch_size: 16
num_epochs: 3

weight_decay: 0.01
bf16: true
tf32: true
flash_attention: true
```

## Generate Dataset V2

- [ ] Create Source database - create auxiliary tables on postgres from ICD API
- [ ] Create Source database - Create vector database in Qdrant
- [ ] Prompt Engineering - Add context to input using top-7 retrieval  
  1. Query vector database for candidate matches  
  2. Check cosine distance of matches — if greater than 0.3, post-coordination is usually needed (to study more about it)  
  3. Evaluate metadata: for each detected stem code, filter top-3 matching codes from its recommended post-coordination options  
- [ ] Prompt Engineering - Add reasoning line to the output  
- [ ] Weighting - Add more weight to FSN terms and non-SAME-AS entries to balance the dataset samples
- [ ] Add paraphrases with half the weight


### Preparing JSON for sending to vectorization

In [None]:
import pandas as pd
import json
from tqdm.notebook import tqdm

# 1. Carregar dataframes
# icd11_df já deve estar carregado com colunas: id, title, code, is_leaf, entity_id_residual

print("Carregando icd11_mms_en_name...")
names_df = pd.read_sql(
    "SELECT icd11_mms_en_id, name, name_type FROM analytics.icd11_mms_en_name",
    engine
)

print("Carregando icd11_mms_en_postcoordination...")
postcoord_df = pd.read_sql("""
    SELECT icd11_mms_en_id, code, code_type, title
    FROM analytics.icd11_mms_en_postcoordination
""", engine)

# 2. Juntar is_leaf, entity_id_residual para cada code
print("Criando dataset postcoord_with_more...")
postcoord_with_more = (
    postcoord_df
    .merge(
        icd11_df[['code', 'is_leaf', 'entity_id_residual']],
        on='code',
        how='left'
    )
)

# 3. Agrupar postcoordinação por stem_id para acesso rápido
postcoord_grouped = postcoord_with_more.groupby('icd11_mms_en_id')

# 4. Montar lista de objetos JSON
vector_input = []
for _, row in tqdm(icd11_df.iterrows(), total=len(icd11_df), desc="Gerando JSON"):
    cid = row['id']
    title = row['title']
    code = row['code']
    is_leaf = bool(row['is_leaf'])
    entity_id_residual = row['entity_id_residual']
    
    if not code:
        code_type = "foundation"
    elif str(code).startswith("X"):
        code_type = "extension"
    else:
        code_type = "stem"

    metadata = {
        "code": code,
        "entity_id_residual": entity_id_residual,
        "code_type": code_type,
        "name_type": "fsn",
        "is_leaf": is_leaf
    }

    opts = postcoord_grouped.get_group(cid) if cid in postcoord_grouped.groups else pd.DataFrame()

    metadata["leaf_postcoordination_options"] = [
        {
            "code": opt_row['code'],
            "code_type": opt_row['code_type'],
            "title": opt_row['title'],
            "is_leaf": bool(opt_row['is_leaf']),
            "entity_id_residual": opt_row['entity_id_residual']
        }
        for _, opt_row in opts.iterrows() if pd.notnull(opt_row['code'])
    ]

    vector_input.append({
        "concept_name": title,
        "metadata": metadata
    })

# 5. Salvar em JSON
with open("icd11_vector_input.json", "w", encoding="utf-8") as f:
    json.dump(vector_input, f, ensure_ascii=False, indent=2)

print(f"✅ Gerado icd11_vector_input.json com {len(vector_input)} conceitos.")

### Generating the preliminary dataset

In [13]:
import pandas as pd
import json
from tqdm.notebook import tqdm

# --- 1. Carregar os capítulos ---
icd11_chapters = {
    "01": "Certain infectious or parasitic diseases",
    "02": "Neoplasms",
    "03": "Diseases of the blood or blood-forming organs",
    "04": "Diseases of the immune system",
    "05": "Endocrine, nutritional or metabolic diseases",
    "06": "Mental, behavioural or neurodevelopmental disorders",
    "07": "Sleep-wake disorders",
    "08": "Diseases of the nervous system",
    "09": "Diseases of the visual system",
    "10": "Diseases of the ear or mastoid process",
    "11": "Diseases of the circulatory system",
    "12": "Diseases of the respiratory system",
    "13": "Diseases of the digestive system",
    "14": "Diseases of the skin",
    "15": "Diseases of the musculoskeletal system or connective tissue",
    "16": "Diseases of the genitourinary system",
    "17": "Conditions related to sexual health",
    "18": "Pregnancy, childbirth or the puerperium",
    "19": "Certain conditions originating in the perinatal period",
    "20": "Developmental anomalies",
    "21": "Symptoms, signs or clinical findings, not elsewhere classified",
    "22": "Injury, poisoning or certain other consequences of external causes",
    "23": "External causes of morbidity or mortality",
    "24": "Factors influencing health status or contact with health services",
    "25": "Codes for special purposes",
    "26": "Supplementary Chapter Traditional Medicine Conditions",
    "X":  "Extension Codes"
}

# --- 2. Carregar DataFrames necessários ---
print("Catalogando nomes...")
names_df = pd.read_sql(
    "SELECT icd11_mms_en_id, name, name_type FROM analytics.icd11_mms_en_name",
    engine
)

print("Catalogando opções de pós coordenação...")
postcoord_df = pd.read_sql("""
    SELECT icd11_mms_en_id, code, code_type, title, required
    FROM analytics.icd11_mms_en_postcoordination
""", engine)

print("Catalogando parentesco...")
# flat_hierarchy: Para buscar parents em cadeia
flat = pd.read_csv("../assets/icd_flat_hierarchy.csv", dtype=str).fillna("")
flat = flat[['entity_id', 'code', 'title', 'parent_entity_id']]  # garantir colunas corretas

print("Catalogando blocos/categorias...")
# --- 3. Preprocessar mapeamentos auxiliares para blocks e parents ---
# Lookup para blocks a partir de icd11_df
block_id_to_title = dict(zip(icd11_df['block_id'], icd11_df['title']))

# Para blocks (groupings): extrai todos os groupingN de cada linha
def get_blocks(row):
    """Collect the block (grouping) entries recorded on one ``icd11_df`` row.

    Reads the columns ``grouping1`` … ``grouping5`` of ``row`` and, for every value
    that is neither null nor empty, resolves its title through the
    ``block_id_to_title`` lookup built from ``icd11_df`` just before this function.

    Args:
        row: A row of ``icd11_df`` (anything supporting ``.get(column_name)``).

    Returns:
        A list of ``{"code": ..., "title": ...}`` dicts in grouping order; ``title``
        is None when the block id is not present in the lookup.
    """
    blocks = []
    for n in range(1, 6):
        grp_code = row.get(f'grouping{n}')
        if pd.notnull(grp_code) and grp_code:
            # Dictionary lookup instead of filtering the whole DataFrame for every
            # grouping of every row.
            # NOTE(review): if a block_id occurs more than once in icd11_df, the dict
            # holds the last title whereas a DataFrame scan would return the first —
            # confirm block ids are unique.
            blocks.append({
                "code": grp_code,
                "title": block_id_to_title.get(grp_code)
            })
    return blocks

def get_entity_id_base(entity_id_residual):
    """Return the part of ``entity_id_residual`` that precedes the first '/'.

    Examples:
        '12345678/other' -> '12345678'
        '9876543'        -> '9876543'

    A null value (None / NaN) yields an empty string.
    """
    if not pd.isnull(entity_id_residual):
        base, _, _ = str(entity_id_residual).partition('/')
        return base
    return ""
    
# Parents (ancestors): walk up the flat hierarchy one parent link at a time
def get_parents(entity_id_residual, hierarchy=None):
    """List the ancestors of an entity, nearest parent first.

    Starting from the part of ``entity_id_residual`` before the first '/', follows
    ``parent_entity_id`` links through the hierarchy table until an entity has no
    parent, is not present in the table, or a previously visited entity is reached
    again (which guards against cyclic data looping forever).

    Args:
        entity_id_residual: Entity id, optionally followed by '/<residual>'.
        hierarchy: DataFrame with the columns ``entity_id``, ``code``, ``title`` and
            ``parent_entity_id``. Defaults to the module-level ``flat`` table.

    Returns:
        A list of dicts with the keys ``code``, ``title`` and ``entity_id_residual``
        (the latter holds the ancestor's entity id). A parent id that has no row of
        its own in the table contributes no entry.
    """
    table = flat if hierarchy is None else hierarchy
    parents = []
    eid = str(entity_id_residual).split('/')[0]
    visited = {eid}
    while True:
        row = table.loc[table['entity_id'] == eid]
        if row.empty:
            break
        parent_id = row['parent_entity_id'].values[0]
        # Stop at the root (empty parent) and on self-references or cycles
        if not parent_id or parent_id in visited:
            break
        # Look up the direct parent's own row to read its code and title
        parent_row = table.loc[table['entity_id'] == parent_id]
        if not parent_row.empty:
            parents.append({
                "code": parent_row['code'].values[0],
                "title": parent_row['title'].values[0],
                "entity_id_residual": parent_id
            })
        visited.add(parent_id)
        eid = parent_id  # climb to the next ancestor
    return parents

print("Adicionando dados ao dataset de pós coordenação...")
# --- 4. Group post-coordination options by concept ---
# Left-join every post-coordination option with the is_leaf / entity_id_residual of
# the icd11_df row that has the same code; options without a match keep nulls there.
# NOTE(review): icd11_df is not defined in this cell — it has to exist in the kernel
# already for this to run.
# NOTE(review): if icd11_df contains repeated 'code' values that can match an option,
# the merge will repeat that option row — confirm those codes are unique.
postcoord_with_more = (
    postcoord_df
    .merge(
        icd11_df[['code', 'is_leaf', 'entity_id_residual']],
        on='code',
        how='left'
    )
)
# Grouped by the owning concept's id (icd11_mms_en_id)
postcoord_grouped = postcoord_with_more.groupby('icd11_mms_en_id')

# --- 5. Names: map id → synonyms (lista) ---
print("Catalogando criando mapa de sinônimos...")
synonyms_map = names_df[names_df['name_type'] == 'synonym'] \
    .groupby('icd11_mms_en_id')['name'].apply(list).to_dict()

Catalogando nomes...
Catalogando opções de pós coordenação...
Catalogando parentesco...
Catalogando blocos/categorias...
Adicionando dados ao dataset de pós coordenação...
Catalogando criando mapa de sinônimos...


In [14]:
from tqdm.notebook import tqdm

# --- 6. Build the final JSON records ---
# One record per concept title (FSN) plus one record per synonym, all sharing the
# same metadata except for `name_type`.

vector_input = []
for _, row in tqdm(icd11_df.iterrows(), total=len(icd11_df), desc="Gerando JSON"):
    cid = row['id']
    title = row['title']
    code = row['code']
    is_leaf = bool(row['is_leaf'])
    entity_id_residual = row['entity_id_residual']
    chapter_no = row.get('chapter_no')

    # code_type: no code -> foundation entity, leading 'X' -> extension, else stem.
    # pd.isnull() is tested first because NaN is truthy and would otherwise be
    # classified as "stem".
    if pd.isnull(code) or not code:
        code_type = "foundation"
    elif str(code).startswith("X"):
        code_type = "extension"
    else:
        code_type = "stem"

    # chapters: chapter_no may list several chapters separated by ',', ';' or spaces
    chapters = []
    if pd.notnull(chapter_no):
        for ch in str(chapter_no).replace(',', ';').replace(' ', ';').split(';'):
            ch = ch.strip()
            if ch in icd11_chapters:
                chapters.append({
                    "code": ch,
                    "title": icd11_chapters[ch]
                })

    # blocks
    blocks = get_blocks(row)

    # parents
    parents = get_parents(get_entity_id_base(entity_id_residual))

    # Post-coordination options of this concept. The non-null codes are taken straight
    # from the column instead of iterating the option rows one by one, which matters
    # for concepts that have thousands of options.
    if cid in postcoord_grouped.groups:
        opts = postcoord_grouped.get_group(cid)
        leaf_related_codes = opts['code'].dropna().tolist()
        has_required_postcoordination = opts['required'].any() if 'required' in opts else False
    else:
        leaf_related_codes = []
        has_required_postcoordination = False

    # --- 6a. Main FSN record ---
    metadata_fsn = {
        "code": code,
        "entity_id_residual": entity_id_residual,
        "code_type": code_type,
        "name_type": "fsn",
        "is_leaf": is_leaf,
        "chapter": chapters,
        "blocks": blocks,
        "parents": parents,
        "has_required_postcoordination": bool(has_required_postcoordination),
        "leaf_related_codes": leaf_related_codes
    }
    vector_input.append({
        "concept_name": title,
        "metadata": metadata_fsn
    })

    # --- 6b. Synonym records (one per synonym) ---
    synonyms = synonyms_map.get(cid, [])
    for syn in synonyms:
        # Shallow copy: the nested lists are shared with the FSN record, which is fine
        # because only name_type is changed before serialisation.
        metadata_syn = metadata_fsn.copy()
        metadata_syn["name_type"] = "synonym"
        vector_input.append({
            "concept_name": syn,
            "metadata": metadata_syn
        })

# --- 7. Salvar em JSON ---
with open("icd11_vector_input.json", "w", encoding="utf-8") as f:
    json.dump(vector_input, f, ensure_ascii=False, indent=2)

print(f"✅ Gerado icd11_vector_input.json com {len(vector_input)} conceitos.")

Gerando JSON:   0%|          | 0/36782 [00:00<?, ?it/s]

✅ Gerado icd11_vector_input.json com 125885 conceitos.


### Testing the preliminary dataset (QA)

In [15]:
import json
from collections import Counter

# 1. Carregar JSON
with open("icd11_vector_input.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# 2. Quantos conceitos is_leaf == False
not_leaf = [d for d in data if not d['metadata'].get('is_leaf', True)]
print(f"Total de conceitos is_leaf == False: {len(not_leaf)}")

# 3. Contagem geral por tipo do conceito principal
root_type_counts = Counter(d['metadata'].get('code_type', 'MISSING') for d in data)
print("\nDistribuição de code_type (nível raiz dos conceitos):")
for k, v in root_type_counts.items():
    print(f"{k}: {v}")

# 4. Verificar inconsistência entre code_type e o prefixo do code
mismatch = [
    d for d in data
    if d['metadata'].get('code_type') == 'stem' and str(d['metadata'].get('code', '')).startswith('X')
]
print(f"\nTotal de códigos marcados como 'stem' mas começam com 'X': {len(mismatch)}")
print("\nExemplos:")
for item in mismatch[:10]:
    print(f"{item['metadata']['code']}: {item['concept_name']}")

# 5. Top 10 conceitos com mais opções de pós-coordenação (agora via leaf_related_codes)
postcoord_counts = [(d['concept_name'], len(d['metadata'].get('leaf_related_codes', [])))
                    for d in data]
top_10_postcoord = sorted(postcoord_counts, key=lambda x: x[1], reverse=True)[:10]
print("\nTop 10 conceitos com mais opções de pós-coordenação:")
for name, count in top_10_postcoord:
    print(f"{name}: {count} opções")

# 6. Quantos conceitos têm has_required_postcoordination == True
required_count = sum(1 for d in data if d['metadata'].get('has_required_postcoordination', False))
print(f"\nTotal de conceitos com has_required_postcoordination == True: {required_count}")

# 7. Contar o total de sinônimos (name_type == "synonym")
synonym_count = sum(1 for d in data if d['metadata'].get('name_type') == 'synonym')
print(f"\nTotal de sinônimos: {synonym_count}")

# 8. Conceitos com mais de um parent
multi_parent = [(d['concept_name'], len(d['metadata'].get('parents', [])))
                for d in data if len(d['metadata'].get('parents', [])) > 1]
multi_parent_sorted = sorted(multi_parent, key=lambda x: x[1], reverse=True)

print(f"\nTotal de conceitos com mais de um parent: {len(multi_parent)}")
print("Top 10 conceitos com mais parents:")
for name, count in multi_parent_sorted[:10]:
    print(f"{name}: {count} parents")

Total de conceitos is_leaf == False: 7244

Distribuição de code_type (nível raiz dos conceitos):
foundation: 1443
stem: 101352
extension: 23090

Total de códigos marcados como 'stem' mas começam com 'X': 0

Exemplos:

Top 10 conceitos com mais opções de pós-coordenação:
Intentional self-harm by water transport injury event with water vessel not damaged, disabled or destroyed causing submersion or drowning: 8152 opções
Intentional self-harm by water transport injury event with water vessel not damaged, disabled or destroyed causing other injury: 8152 opções
Assault by causing a fall or jump on same level or from less than 1 metre: 8014 opções
Assault by causing a fall on ice or snow: 8014 opções
Assault by causing a fall on other specified same level: 8014 opções
Assault by causing a fall on pedestrian conveyance: 8014 opções
Assault by causing a fall on unspecified same level: 8014 opções
Assault by causing a fall or jump from a height of 1 metre or more: 8014 opções
Assault by causing

In [9]:
import json  # imported here so the cell also runs on a fresh kernel
from pprint import pprint

# The context manager closes the file handle as soon as the JSON is loaded
with open("icd11_vector_input.json", "r", encoding="utf-8") as f:
    vector_input = json.load(f)

# Select 3 items with code_type == 'extension'
ext_items = [item for item in vector_input if item['metadata'].get('code_type') == 'extension'][:3]
pprint(ext_items)

[{'concept_name': 'None',
  'metadata': {'blocks': [],
               'chapter': [{'code': 'X', 'title': 'Extension Codes'}],
               'code': 'XS8H',
               'code_type': 'extension',
               'entity_id_residual': '761934509',
               'has_required_postcoordination': False,
               'is_leaf': True,
               'leaf_related_codes': [],
               'name_type': 'fsn',
               'parents': [{'code': '',
                            'entity_id_residual': '1726150948',
                            'title': 'Mild Moderate Severe Scale Value'},
                           {'code': '',
                            'entity_id_residual': '146631584',
                            'title': 'Generic Severity Scale Value'},
                           {'code': '',
                            'entity_id_residual': '815889539',
                            'title': 'Severity Scale Value'},
                           {'code': 'X',
                            'ent

### Creating Alpaca format dataset

#### 1. Settings

In [4]:
!pip install qdrant-client[fastembed] torch torchvision transformers sentence-transformers

Collecting torch
  Downloading torch-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting torchvision
  Downloading torchvision-0.22.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting qdrant-client[fastembed]
  Downloading qdrant_client-1.14.2-py3-none-any.whl.metadata (10 kB)
Collecting fastembed==0.6.1 (from qdrant-client[fastembed])
  Downloading fastembed-0.6.1-py3-none-any.whl.metadata (10 kB)
Collecting grpcio>=1.41.0 (from qdrant-client[fastembed])
  Downloading grpcio-1.71.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting httpx>=0.20.0 (from httpx[http2]>=0.20.0->qdrant-client[fastembed])
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant-client

In [1]:
import torch
from sentence_transformers import SentenceTransformer

# Qdrant collection name -> Hugging Face identifier of the model used to embed it.
col_name_map = dict(
    icd11_concepts_minilm="sentence-transformers/all-MiniLM-L6-v2",
    icd11_concepts_mpnet="sentence-transformers/all-mpnet-base-v2",
    icd11_concepts_biobert="pritamdeka/S-BioBert-snli-multinli-stsb",
)

# --- CONFIGS ---
QDRANT_HOST = "qdrant.filipelopes.me"
QDRANT_PORT = 80
COLLECTION_NAME = "icd11_concepts_mpnet"
# The embedding model is derived from the collection so the two cannot drift apart.
EMBEDDING_MODEL = col_name_map[COLLECTION_NAME]

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model = SentenceTransformer(EMBEDDING_MODEL, device=device)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [2]:
"""
Dataset v2 – ICD-11 mapping in Alpaca format
===========================================

• Obtém conceitos (FSN + sinônimos) do JSON vetorial.
• Busca candidatos no Qdrant (top-7) + dados relacionais.
• Gera contexto, linha de raciocínio (<scratchpad>) e saída final.
• Produz JSONL pronto para fine-tuning.

Pré-requisitos:
---------------
pip install qdrant-client tqdm sqlalchemy pymysql python-dotenv
Variáveis de ambiente: DB_*, QDRANT_HOST, QDRANT_PORT
"""

import os, json, random, copy
from collections import defaultdict
from dotenv import load_dotenv
from tqdm import tqdm
from qdrant_client import QdrantClient
from qdrant_client.http.models import SearchRequest, Filter
from sqlalchemy import create_engine, text

# ╭───────────────────────── 1. CONNECTIONS ─────────────────────────╮
load_dotenv(override=True)

# The SQLAlchemy engine was already created in an earlier cell.

# The cell header lists QDRANT_HOST / QDRANT_PORT as environment variables, so
# honour them when set; unset or empty values fall back to the defaults that
# were previously hard-coded here.
QDRANT_HOST = os.getenv("QDRANT_HOST") or "qdrant.filipelopes.me"
QDRANT_PORT = int(os.getenv("QDRANT_PORT") or 80)

qdrant = QdrantClient(
    host=QDRANT_HOST,
    port=QDRANT_PORT
)

#### 2. Helper functions

In [174]:
import re

# ╭───────────────────────── 2. FUNÇÕES DE APOIO ─────────────────────────╮

def dedup_code_results(results, prefer_name_type="fsn"):
    """
    Collapse search hits so that every code appears at most once.

    The first hit seen for a code is kept, unless a later hit for the same code
    has ``name_type == prefer_name_type`` while the kept one does not; in that
    case the later hit takes over the slot of the first one. Codes are compared
    after stripping whitespace, and a missing/None code counts as ``''``.

    Args:
        results: iterable of objects exposing a ``payload`` mapping.
        prefer_name_type: ``name_type`` value that wins a tie for a code.

    Returns:
        List of the surviving hits, ordered by first appearance of each code.
    """
    best_by_code = {}
    for hit in results:
        payload = hit.payload
        key = (payload.get('code') or '').strip()
        kept = best_by_code.setdefault(key, hit)
        if kept is hit:
            # First occurrence of this code: nothing to compare against yet.
            continue
        hit_is_preferred = payload.get('name_type') == prefer_name_type
        kept_is_preferred = kept.payload.get('name_type') == prefer_name_type
        # When both (or neither) are preferred the earlier hit stays.
        if hit_is_preferred and not kept_is_preferred:
            best_by_code[key] = hit

    return [*best_by_code.values()]

def search_candidates(text: str, top_k: int = 7, min_score: float = 0.8):
    """
    Query Qdrant for stem-code candidates matching ``text``.

    The query is restricted to points whose ``code_type`` is ``"stem"``. Hits
    are de-duplicated by code (preferring ``fsn`` entries, see
    ``dedup_code_results``) and then filtered by score. Because both steps
    happen after the ``top_k`` limit, fewer than ``top_k`` items may come back.

    Args:
        text: concept text to embed and search for.
        top_k: maximum number of points requested from Qdrant.
        min_score: only hits whose score is strictly greater than this value
            are returned (default 0.8, the previously hard-coded threshold).

    Returns:
        List of ``(payload, score)`` tuples.
    """
    res = qdrant.query_points(
        collection_name=COLLECTION_NAME,
        query=get_embedding(text),
        limit=top_k,
        with_payload=True,
        query_filter=Filter(
            must=[
                {"key": "code_type", "match": {"value": "stem"}}
            ]
        ),
    )
    # Drop duplicate codes before applying the score threshold.
    deduped = dedup_code_results(res.points)
    return [(r.payload, r.score) for r in deduped if r.score > min_score]


def needs_postcoord(code:str) -> bool:
    """Return True when ``code`` contains an ``&`` or a ``/`` character."""
    return any(marker in code for marker in "&/")


def fetch_postcoord_options(concept: str, candidates: list[dict]) -> list[dict]:
    """
    Return post-coordination options for ``concept`` taken from Qdrant.

    The codes listed under ``leaf_related_codes`` in the candidates' payloads
    are collected, and Qdrant is searched semantically for ``concept`` among
    the points whose ``code`` is one of them.

    Args:
        concept: concept text used as the semantic query.
        candidates: ``(payload, score)`` pairs; only the payload's
            ``leaf_related_codes`` entry is read. A missing or ``None`` entry
            is treated as empty.

    Returns:
        Up to 10 dicts of the form ``{"code": ..., "title": ...}``; an empty
        list when no candidate carries related codes.
    """
    # Ordered de-duplication: gives the filter a plain list (not a set) with a
    # deterministic order.
    related_codes = list(dict.fromkeys(
        code
        for payload, _ in candidates
        for code in (payload.get("leaf_related_codes") or [])
    ))

    # Nothing to match against: skip the embedding and the network round-trip.
    if not related_codes:
        return []

    res = qdrant.query_points(
        collection_name=COLLECTION_NAME,
        query=get_embedding(concept),
        limit=10,
        with_payload=True,
        query_filter=Filter(
            must=[
                {"key": "code", "match": {"any": related_codes}}
            ]
        ),
    )
    # `limit=10` already caps the result, so no extra slicing is needed.
    return [{"code": r.payload["code"], "title": r.payload["concept_name"]} for r in res.points]

def get_title_by_code(code: str) -> str:
    """
    Look up the ``concept_name`` of the ``fsn`` entry stored for ``code``.

    This is a filter-only Qdrant query (no query vector). When no point
    matches, or the matched payload has no ``concept_name``, the code itself
    is returned.
    """
    fsn_for_code = Filter(must=[{"key": "code", "match": {"value": code}}, {"key": "name_type", "match": {"value": "fsn"}}])
    hits = qdrant.query_points(
        collection_name=COLLECTION_NAME,
        with_payload=True,
        limit=1,
        query_filter=fsn_for_code,
    ).points
    return hits[0].payload.get("concept_name", code) if hits else code
    
def build_input_context(final_title: str, final_code: str, top_k: int = 10) -> str:
    """
    Build the prompt context ("input" field) for one training example.

    The context always starts with the concept title. When the candidate
    search returns at least two stem codes and at least one of them occurs
    (as a substring) in ``final_code``, the candidates are listed. If
    ``final_code`` is post-coordinated (contains ``&`` or ``/``), a list of
    post-coordination options follows; every code of ``final_code`` after the
    first part is guaranteed to be in that list.

    Args:
        final_title: clinical concept text to map.
        final_code: expected ICD-11 code (possibly a ``&`` / ``/`` expression).
        top_k: not read by this function; kept so existing callers that pass
            it keep working.

    Returns:
        The context as a newline-joined string.
    """
    max_post_opts = 10  # cap on listed options so the prompt stays short

    # ── 1. Candidate stem codes ─────────────────────────────────────────
    lines = [f"Clinical concept to map: <input>{final_title}</input>"]
    candidates = search_candidates(final_title)

    # search_candidates already applied the score threshold; here we also
    # require that one of the candidate codes is contained in final_code.
    has_expected_stem = any(payload['code'] in final_code for payload, _ in candidates)
    if len(candidates) < 2 or not has_expected_stem:
        return "\n".join(lines)  # title only when the candidates are not usable

    lines.append("Relevant matched stem codes found:")
    for payload, _ in candidates:
        lines.append(f"- {payload['code']} - {payload['concept_name']}")

    if not needs_postcoord(final_code):
        return "\n".join(lines)

    # ── 2. Suggested post-coordination ──────────────────────────────────
    # a) de-duplicate the options returned by the search
    seen_ext = set()
    options = []
    for opt in fetch_postcoord_options(final_title, candidates):
        opt_code = opt["code"].strip()
        if opt_code not in seen_ext:
            seen_ext.add(opt_code)
            options.append(opt)

    # b) make sure EVERY code of final_code (after the main stem) is listed
    required = set()
    for part in re.split(r'[&/]', final_code)[1:]:
        part = part.strip()
        if not part:
            continue
        required.add(part)
        if part not in seen_ext:
            options.append({"code": part, "title": get_title_by_code(part)})
            seen_ext.add(part)

    # c) cap the list without ever dropping a required code: required entries
    #    are always kept, the others only fill the remaining budget. (A plain
    #    `[:10]` slice could cut required codes, which are appended last.)
    optional_budget = max(max_post_opts - len(required), 0)
    shown = []
    for opt in options:
        if opt["code"].strip() in required:
            shown.append(opt)
        elif optional_budget > 0:
            shown.append(opt)
            optional_budget -= 1

    if shown:
        lines.append("Post-coordination options:")
        lines.extend(f"- {opt['code']} ({opt['title']})" for opt in shown)

    return "\n".join(lines)



def build_scratchpad(
    code: str,
    map_type: str
):
    """
    Build the English ``<scratchpad>`` block with step-by-step reasoning for
    mapping a concept to the ICD-11 expression ``code``.

    ``code`` is split on ``/`` and ``&``; the first part is treated as the
    primary stem code and every following part as an additional stem code or
    an extension, according to the ``code_type`` stored in Qdrant. Parts that
    are not found in Qdrant contribute no sentence.

    Args:
        code: final ICD-11 code, optionally post-coordinated with ``&`` / ``/``.
        map_type: ``"SAME-AS"``, ``"NARROWER-THAN"`` or ``"BROADER-THAN"``;
            any other value adds no closing sentence.

    Returns:
        The scratchpad text wrapped in ``<scratchpad>`` tags. A lone extension
        code (starts with ``X``, no ``&`` or ``/``) gets a fixed one-line
        scratchpad and triggers no Qdrant query.
    """
    import re

    # A lone extension code gets the fixed message below.
    if code.startswith('X') and '&' not in code and '/' not in code:
        return "<scratchpad>Extension codes do not require a scratchpad explanation.</scratchpad>"

    # 1. Split the code into its parts (stems and extensions), keeping order.
    code_parts = re.split(r'[/&]', code)

    # 2. Filter-only Qdrant lookup of the fsn entries of those parts (not a semantic search).
    res = qdrant.query_points(
        collection_name=COLLECTION_NAME,
        with_payload=True,
        limit=len(code_parts) + 5,
        query_filter=Filter(
            must=[
                {"key": "code", "match": {"any": code_parts}},
                {"key": "name_type", "match": {"value": "fsn"}}
            ]
        ),
    )

    # 3. Opening sentence of the reasoning.
    pad = [
        "We are verifying the stem code that is most related to the concept. "
        "Ideally, it should not be more specific (i.e., contain elements not present in the original concept to be mapped), "
        "nor less specific (i.e., miss elements present in the concept to be mapped but not represented in the suggested stem code)."
    ]

    # 4. Ordinal labels handed out to stem codes in order of appearance.
    stem_code_order = ["primary", "secondary", "tertiary", "quaternary"]

    # 5. The post-coordination explanation is emitted at most once.
    needed_postcoordination = "&" in code or "/" in code

    # 6. Handle each part once, using the first Qdrant point that carries it.
    for idx, part in enumerate(code_parts):
        r = next((p for p in res.points if p.payload.get("code") == part), None)
        if r is None:
            # Part not found in Qdrant: it simply adds nothing to the reasoning.
            continue

        payload = r.payload
        title = payload.get('concept_name', '')

        if idx == 0:  # first part = primary stem code
            chapter = payload.get('chapter', [])
            if chapter:
                pad.append(
                    f"The closest chapter is <chapter>{chapter[0]['code']} - {chapter[0]['title']}</chapter>."
                )
            for level, block in enumerate(payload.get("blocks", []), start=1):
                # Keep only the text after the last '-' of the block code.
                block_code = block['code'].split('-')[-1]
                if level == 1:
                    pad.append(f"The nearest block is <blockL{level}>{block_code} - {block['title']}</blockL{level}>.")
                else:
                    pad.append(f"I found another block within this that is related to the topic: <blockL{level}>{block_code} - {block['title']}</blockL{level}>.")
            # The label list is still untouched here, so this is always "primary".
            label = stem_code_order.pop(0)
            # The payload just fetched already holds the fsn title; only fall
            # back to a separate lookup when it is missing.
            stem_title = title or get_title_by_code(part)
            pad.append(f"Selected {label} stem code: {part} - {stem_title}.")
            continue

        if needed_postcoordination:
            # NOTE(review): the flag is read from the payload of this
            # non-primary part, not from the primary stem; confirm that this
            # is the intended source for "this concept".
            has_required = payload.get("has_required_postcoordination", False)
            pad.append(
                "Evaluating whether the term requires post-coordination: according to ICD-11, this concept "
                + ("REQUIRES" if has_required else "does NOT require")
                + " post-coordination."
            )
            pad.append(
                "Semantically, we check if the stem code found fully covers the concept to be mapped. "
                "If not, we search for stem codes or extension codes that complement the specificity of the concept."
            )
            needed_postcoordination = False

        code_type = payload.get("code_type")
        if code_type == "extension":
            # Name the dimension the extension adds, taken from a fixed
            # position counted from the end of the parents list.
            parents = payload.get('parents', [])
            parent_title = ""
            if len(parents) >= 5:
                parent_title = parents[-5]['title']
            elif len(parents) >= 4:
                parent_title = parents[-4]['title']
            if parent_title:
                suffix = f" in order to include the dimension of {parent_title} to its meaning."
            else:
                # Close the sentence instead of leaving a dangling space.
                suffix = "."
            pad.append(
                f"After analysis, it is necessary to add the extension \"{part} - {title}\" to the main stem code{suffix}"
            )
        elif code_type == "stem":
            if stem_code_order:
                label = stem_code_order.pop(0)
                pad.append(f"After review, I believe a {label} stem code \"{part} - {title}\" is necessary to complement the meaning and specificity of the concept.")
            else:
                pad.append(f"After review, I believe the stem code {part} is necessary to complement the concept.")

    # 7. Closing remark: single code vs. post-coordinated expression.
    if len(code_parts) == 1:
        pad.append(
            "After analysis, the following stem code is sufficient to represent the concept."
        )
    else:
        pad.append(
            "By combining secondary codes with a slash “/” and extensions with “&”, I need to assess whether, after post-coordination, we have a final concept that is fully equivalent to the input concept."
        )

    # 8. State the mapping type.
    if map_type == "SAME-AS":
        pad.append(
            "The final concept covers all the specificity of the concept to be mapped, therefore here is the final code with the mapping type:"
        )
    elif map_type == "NARROWER-THAN":
        pad.append("The input concept is more specific than the final coded concept, therefore here is the final code with the mapping type:")
    elif map_type == "BROADER-THAN":
        pad.append("The input concept is less specific than the final coded concept, therefore here is the final code with the mapping type:")

    # 9. Return the formatted block.
    return "<scratchpad>\n" + "\n".join(pad) + "\n</scratchpad>"


def compute_weight(code, name_type, category):
    """
    Compute the training weight of an example.

    Starts from 0.7 for a ``"synonym"``, 0.5 for a ``"paraphrase"`` and 1.0
    for any other ``name_type``; multiplies by 1.2 when ``category`` contains
    "ciel" (case-insensitive) and by 1.5 when ``code`` contains ``&`` or
    ``/``. The result is rounded to two decimals.
    """
    # Synonyms and paraphrases are down-weighted.
    weight = 0.7 if name_type == "synonym" else 0.5 if name_type == "paraphrase" else 1.0

    # Examples sourced from CIEL count more.
    if "ciel" in category.lower():
        weight *= 1.2

    # Post-coordinated codes count more.
    if any(sep in code for sep in ("&", "/")):
        weight *= 1.5

    return round(weight, 2)


def get_embedding(text: str):
    """Encode ``text`` with the notebook-level ``model`` (normalized embeddings) and return the vector as a plain Python list."""
    return model.encode(text, normalize_embeddings=True, convert_to_numpy=True).tolist()

In [27]:
from pprint import pprint

# amostra de conteúdo do icd11_vector_input.json
def load_vector_input(file_path: str):
    """Read the UTF-8 JSON file at ``file_path`` and return its parsed content."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

pprint(load_vector_input("icd11_vector_input.json")[:2])  # Usage example: show the first two records

[{'concept_name': 'Certain infectious or parasitic diseases',
  'metadata': {'blocks': [],
               'chapter': [{'code': '01',
                            'title': 'Certain infectious or parasitic '
                                     'diseases'}],
               'code': None,
               'code_type': 'foundation',
               'entity_id_residual': '1435254666',
               'has_required_postcoordination': False,
               'is_leaf': False,
               'leaf_related_codes': [],
               'name_type': 'fsn',
               'parents': [{'code': '',
                            'entity_id_residual': '455013390',
                            'title': 'ICD Category'},
                           {'code': '',
                            'entity_id_residual': '448895267',
                            'title': 'ICD Entity'}]}},
 {'concept_name': 'Gastroenteritis or colitis of infectious origin',
  'metadata': {'blocks': [],
               'chapter': [{'code': '01',
   

In [4]:
# 🎯 2. Define the candidate instruction prompts
# Prompt variants for stem-code examples; the main loop picks one at random per example.
instructions_stem = [
    "Provide the ICD-11 code for the clinical concept. Include extensions or secondary stem codes if required for full specificity.",
    "Find the ICD-11 code corresponding to the clinical concept. Use extensions or secondary stem codes if necessary.",
    "Map the clinical concept to its ICD-11 code. Add extensions or secondary stem codes if needed.",
    "Assign the ICD-11 code to the following concept, considering extensions or secondary stem codes where appropriate."
]

# Prompt variants for extension-code examples; also picked at random by the main loop.
instructions_extension = [
    "Provide the ICD-11 extension code corresponding to the specified extension concept.",
    "Identify the ICD-11 extension code for the following extension concept.",
    "Find the corresponding ICD-11 extension code for this modifier."
]

#### 3. Main Loop

In [177]:
import pandas as pd

# ╭───────────────────────── 3. MAIN LOOP ─────────────────────────╮

DATASET_SEED = 42   # seeds both the row shuffling and the instruction choice
N_SAMPLES = None    # rows taken per source; None = full run, small int = quick dry run

random.seed(DATASET_SEED)

_MAP_TYPE_SLUGS = {
    "SAME-AS": "same-as",
    "NARROWER-THAN": "narrower-than",
    "BROADER-THAN": "broader-than",
}


def _ciel_category(icd11_code, map_type):
    """Return the dataset category of a CIEL mapping, or None when the row must be skipped.

    * a lone extension code (starts with 'X', no '&' or '/') -> 'source:ciel>extension'
    * no '/' : '<map type>+stem', '+stem+extension' (one '&') or '+stem+extensions' (several '&')
    * with '/': cluster categories, emitted for SAME-AS mappings only
    * unknown map types -> None
    """
    has_amp = '&' in icd11_code
    has_slash = '/' in icd11_code

    # Previously this label was assigned and then immediately overwritten by
    # the stem branch, so extension rows kept the extension instruction but
    # were filed under a '+stem' category.
    if icd11_code.startswith('X') and not (has_amp or has_slash):
        return 'source:ciel>extension'

    slug = _MAP_TYPE_SLUGS.get(map_type)
    if slug is None:
        return None
    if has_slash:
        if map_type != "SAME-AS":
            return None
        return 'source:ciel>same-as+cluster+extension' if has_amp else 'source:ciel>same-as+cluster'
    if not has_amp:
        return f'source:ciel>{slug}+stem'
    if icd11_code.count('&') == 1:
        return f'source:ciel>{slug}+stem+extension'
    return f'source:ciel>{slug}+stem+extensions'


def _make_entry(instruction, title, code, map_type, category):
    """Assemble one Alpaca record; both sources share this exact output layout."""
    context = build_input_context(final_title=title, final_code=code)
    scratchpad = build_scratchpad(code=code, map_type=map_type)
    return {
        "instruction": instruction,
        "input": context,
        # No leading newline or indentation: the scratchpad starts the output.
        "output": f"{scratchpad}\n<map_type>{map_type}</map_type> <code>{code}</code> <category>{category}</category>",
        "category": category,
        "weight": compute_weight(code=code, name_type="fsn", category=category),
    }


def _take_rows(df, n):
    """Return ``df`` unchanged when ``n`` is None, otherwise a reproducible sample of at most ``n`` rows."""
    if n is None:
        return df
    return df.sample(n=min(n, len(df)), random_state=DATASET_SEED)


# ICD-11 MMS entries already in the database (table icd11_mms_en).
# `engine` is expected to exist from an earlier cell.
query_icd_source_mappings = text("""
    SELECT code, title, is_leaf, class_kind, chapter_no FROM analytics.icd11_mms_en
""")
df_icd_source_mappings = pd.read_sql(query_icd_source_mappings, engine)
df_icd_source_mappings = df_icd_source_mappings[
    (df_icd_source_mappings['is_leaf']) &
    (df_icd_source_mappings['code'].notnull()) &
    (df_icd_source_mappings['class_kind'] == 'category')
].sample(frac=1, random_state=DATASET_SEED).reset_index(drop=True)  # shuffle

# CIEL -> ICD-11 mappings already in the database.
query_ciel_source_mappings = text("""
    SELECT concept_id, fsn, map_type, icd11_code
    FROM analytics.vw_ciel_diagnosis_n_findings_to_icd11
""")
df_ciel_source_mappings = pd.read_sql(query_ciel_source_mappings, engine)
df_ciel_source_mappings = df_ciel_source_mappings.sample(frac=1, random_state=DATASET_SEED).reset_index(drop=True)  # shuffle

# Rows actually processed (the whole frames unless N_SAMPLES is set).
df_icd_sample = _take_rows(df_icd_source_mappings, N_SAMPLES)
df_ciel_sample = _take_rows(df_ciel_source_mappings, N_SAMPLES)

alpaca_entries = []

# Step 1: ICD-11 MMS entries (always SAME-AS)
for _, row in tqdm(df_icd_sample.iterrows(), total=len(df_icd_sample), desc="Processando mapeamentos ICD-11"):
    code = row['code']
    title = str(row['title']).lstrip('-–— ').strip()

    # Extension code vs. stem code
    if code.startswith('X') and row['chapter_no'] == 'X':
        category = 'source:icdapi>extension'
        instruction = random.choice(instructions_extension)
    else:
        category = 'source:icdapi>same-as+stem'
        instruction = random.choice(instructions_stem)

    alpaca_entries.append(_make_entry(instruction, title, code, "SAME-AS", category))

# Step 2: CIEL mappings
for _, row in tqdm(df_ciel_sample.iterrows(), total=len(df_ciel_sample), desc="Processando mapeamentos CIEL"):
    fsn = str(row['fsn']).lstrip('-–— ').strip()
    map_type = row['map_type']
    icd11_code = row['icd11_code']

    category = _ciel_category(icd11_code, map_type)
    if category is None:
        continue

    instruction_pool = instructions_extension if category == 'source:ciel>extension' else instructions_stem
    alpaca_entries.append(_make_entry(random.choice(instruction_pool), fsn, icd11_code, map_type, category))

# ╭───────────────────────── 4. EXPORT ─────────────────────────╮
out_path = "../assets/icd11_alpaca_v2.jsonl"
os.makedirs(os.path.dirname(out_path), exist_ok=True)  # the assets folder may not exist yet
with open(out_path, "w", encoding="utf-8") as f:
    for e in alpaca_entries:
        json.dump(e, f, ensure_ascii=False)
        f.write("\n")

print(f"✅ Dataset v2 salvo em {out_path} com {len(alpaca_entries)} exemplos.")

Processando mapeamentos ICD-11: 100%|██████████| 30966/30966 [51:21<00:00, 10.05it/s]  
Processando mapeamentos CIEL: 100%|██████████| 24366/24366 [43:23<00:00,  9.36it/s]  


✅ Dataset v2 salvo em ../assets/icd11_alpaca_v2.jsonl com 54789 exemplos.


In [None]:
import json
import random
from collections import Counter

jsonl_path = "../assets/icd11_alpaca_v2.jsonl"

# Load the whole dataset, one JSON record per line
data = []
with open(jsonl_path, "r", encoding="utf-8") as f:
    for raw_line in f:
        data.append(json.loads(raw_line))

print(f"Total de exemplos: {len(data)}")

# Draw up to `sample_n` random examples without repetition
sample_n = 40
random.seed(36)  # reproducibility (optional)
sampled = random.sample(data, min(sample_n, len(data)))

# (label, key) pairs printed for each sampled example, in this order
_fields_to_show = (
    ("Category", "category"),
    ("Instruction", "instruction"),
    ("Input", "input"),
    ("Output", "output"),
    ("Weight", "weight"),
)
for number, example in enumerate(sampled, start=1):
    print(f"\n--- Exemplo {number} ---")
    for label, key in _fields_to_show:
        print(f"{label}: {example.get(key)}")
    print("")
    print("-" * 40)
    print("")

# Distribution by category
cat_counts = Counter()
for record in data:
    cat_counts[record['category']] += 1
print("\nDistribuição por categoria:")
for cat, count in cat_counts.items():
    print(f"{cat}: {count}")

In [62]:
# Debug: search for a concept and print code, concept_name and name_type, grouped by code
from collections import defaultdict

concept = "Benign vascular neoplasms of infancy or childhood, unspecified"
top_k = 20

vector = get_embedding(concept)
stem_only = Filter(
    must=[
        {"key": "code_type", "match": {"value": "stem"}}
    ]
)
res = qdrant.query_points(
    collection_name=COLLECTION_NAME,
    query=vector,
    limit=top_k,
    with_payload=True,
    query_filter=stem_only,
)

# Group the hits by code, keeping (concept_name, name_type) pairs
grouped = defaultdict(list)
for point in res.points:
    hit_payload = point.payload
    grouped[hit_payload.get('code')].append(
        (hit_payload.get('concept_name'), hit_payload.get('name_type'))
    )

print(f"Resultados agrupados por code para: {concept}\n")
for code, values in grouped.items():
    print(f"Code: {code} (total: {len(values)})")
    for position, (name, name_type) in enumerate(values, 1):
        print(f"   {position:02d}. {name_type}: {name}")
    print("-" * 40)

# Quick summary of the name_types found per code
print("\nResumo por code:")
for code, values in grouped.items():
    distinct_types = sorted({nt for _, nt in values})
    nt_types = ", ".join(distinct_types)
    print(f"- {code}: {len(values)} resultados ({nt_types})")

Resultados agrupados por code para: Benign vascular neoplasms of infancy or childhood, unspecified

Code: 2E81.2Z (total: 2)
   01. fsn: Benign vascular neoplasms of infancy or childhood, unspecified
   02. synonym: Benign vascular neoplasms of infancy or childhood
----------------------------------------
Code: 2E81.2 (total: 1)
   01. fsn: Benign vascular neoplasms of infancy or childhood
----------------------------------------
Code: 2E81.2Y (total: 1)
   01. fsn: Other specified benign vascular neoplasms of infancy or childhood
----------------------------------------
Code: 2E81.Z (total: 2)
   01. fsn: Benign vascular neoplasms, unspecified
   02. synonym: Benign vascular neoplasms
----------------------------------------
Code: 2E81 (total: 1)
   01. fsn: Benign vascular neoplasms
----------------------------------------
Code: 2E81.Y (total: 1)
   01. fsn: Other specified benign vascular neoplasms
----------------------------------------
Code: 2F2Y (total: 1)
   01. synonym: Benign

In [149]:
# Test cell: check whether "XT5R" is indexed in Qdrant

# 1) Import (in case it is not in the notebook namespace yet)
from qdrant_client.models import Filter

# 2) Filter-only lookup of the exact code "XT5R"
xt5r_filter = Filter(
    must=[
        {"key": "code", "match": {"value": "XT5R"}}
    ]
)
res_xt5r = qdrant.query_points(
    collection_name=COLLECTION_NAME,
    with_payload=True,
    limit=1,
    query_filter=xt5r_filter,
)

# 3) Show the result
xt5r_points = res_xt5r.points
print("Encontrou XT5R no Qdrant?", bool(xt5r_points))
if xt5r_points:
    print("\nPayload retornado para ‘XT5R’:")
    print(xt5r_points[0].payload)

Encontrou XT5R no Qdrant? True

Payload retornado para ‘XT5R’:
{'concept_name': 'Acute', 'code': 'XT5R', 'entity_id_residual': '786106375', 'code_type': 'extension', 'name_type': 'fsn', 'is_leaf': True, 'chapter': [{'code': 'X', 'title': 'Extension Codes'}], 'blocks': [], 'parents': [{'code': '', 'title': 'Acute-Subacute-Chronic Scale Value', 'entity_id_residual': '943864406'}, {'code': '', 'title': 'Course', 'entity_id_residual': '2082570273'}, {'code': '', 'title': 'Course of the Condition', 'entity_id_residual': '1770621312'}, {'code': '', 'title': 'Temporality', 'entity_id_residual': '614922797'}, {'code': 'X', 'title': 'Extension Codes', 'entity_id_residual': '979408586'}, {'code': '', 'title': 'ICD Category', 'entity_id_residual': '455013390'}, {'code': '', 'title': 'ICD Entity', 'entity_id_residual': '448895267'}], 'has_required_postcoordination': False, 'leaf_related_codes': []}


#### 4. Testing (QA)

In [178]:
import json
from collections import Counter
from pathlib import Path

# Path to the augmented dataset
FILE_PATH = Path("icd11_alpaca_v2_augmented_paraphrases.jsonl")

# Counters: valid records overall and per category
total = 0
categories = Counter()

# Read the file line by line
with FILE_PATH.open("r", encoding="utf-8") as f:
    # line_no is the physical line in the file, so an error message points at
    # the right place even after blank lines or earlier malformed lines.
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue
        try:
            item = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Erro na linha {line_no}: {e}")
            continue
        categories[item.get("category", "undefined")] += 1
        total += 1

# Show the result
print(f"\n📊 Total de linhas: {total}")
print("📂 Divisão por categoria:")
for cat, count in categories.most_common():
    print(f"  {cat}: {count}")


📊 Total de linhas: 168414
📂 Divisão por categoria:
  source:icdapi>synonym>same-as+stem: 82673
  source:icdapi>same-as+stem: 16019
  source:icdapi>extension: 14947
  source:ciel>paraphrase>narrower-than+stem: 10206
  source:ciel>narrower-than+stem: 9830
  source:ciel>paraphrase>same-as+stem: 5329
  source:ciel>same-as+stem: 5165
  source:icdapi>synonym>extension: 4987
  source:ciel>paraphrase>narrower-than+stem+extension: 4156
  source:ciel>narrower-than+stem+extension: 3420
  source:ciel>paraphrase>same-as+stem+extension: 3390
  source:ciel>same-as+stem+extension: 2866
  source:ciel>paraphrase>broader-than+stem: 1355
  source:ciel>broader-than+stem: 1280
  source:ciel>paraphrase>broader-than+stem+extension: 525
  source:ciel>broader-than+stem+extension: 438
  source:ciel>paraphrase>same-as+stem+extensions: 376
  source:ciel>paraphrase>same-as+cluster: 353
  source:ciel>same-as+cluster: 316
  source:ciel>same-as+stem+extensions: 282
  source:ciel>paraphrase>same-as+cluster+extension: 

In [179]:
import json
import random
from pathlib import Path

# ── 1. Parameters and paths ────────────────────────────────────────────
INPUT_FILE  = Path("icd11_alpaca_v2_augmented_paraphrases.jsonl")
OUTPUT_DIR  = Path(".")  # can be pointed at another folder
SEED        = 1234       # for reproducibility
TRAIN_PCT   = 0.80
VAL_PCT     = 0.10
TEST_PCT    = 0.10

# The test split is "whatever is left", so TEST_PCT is only honoured when the
# three fractions add up to 1; fail fast instead of silently ignoring it.
if abs(TRAIN_PCT + VAL_PCT + TEST_PCT - 1.0) > 1e-9:
    raise ValueError("TRAIN_PCT + VAL_PCT + TEST_PCT must sum to 1.0")

random.seed(SEED)

# ── 2. Load and shuffle all lines ─────────────────────────────────────
all_lines = []
n_skipped = 0
with INPUT_FILE.open("r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            # keep the raw line, but only if it parses as JSON
            json.loads(line)
        except json.JSONDecodeError:
            # malformed lines are left out of every split, but counted
            n_skipped += 1
            continue
        all_lines.append(line)

if n_skipped:
    print(f"Linhas malformadas ignoradas: {n_skipped}")

# NOTE(review): the shuffle works on individual lines. Category names such as
# "...>paraphrase>..." and "...>synonym>..." suggest several phrasings of the
# same concept exist, which may then land in different splits; confirm this
# is acceptable for evaluation.
random.shuffle(all_lines)

# ── 3. Cut points of the three splits ─────────────────────────────────
N = len(all_lines)
n_train = int(N * TRAIN_PCT)
n_val   = int(N * VAL_PCT)
# n_test is the remainder
n_test  = N - n_train - n_val

train_lines      = all_lines[:n_train]
validation_lines = all_lines[n_train : n_train + n_val]
test_lines       = all_lines[n_train + n_val :]
assert len(test_lines) == n_test, "split sizes do not add up"

print(f"Total de linhas lidas: {N}")
print(f"  → Train:      {len(train_lines)}")
print(f"  → Validation: {len(validation_lines)}")
print(f"  → Test:       {len(test_lines)}")

# ── 4. Save to three JSONL files ───────────────────────────────────────
def dump_jsonl(lines, path: Path):
    """Write ``lines`` to ``path``, one per line, creating parent folders as needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as out_f:
        for ln in lines:
            out_f.write(ln + "\n")

dump_jsonl(train_lines,      OUTPUT_DIR / "train.jsonl")
dump_jsonl(validation_lines, OUTPUT_DIR / "validation.jsonl")
dump_jsonl(test_lines,       OUTPUT_DIR / "test.jsonl")

print("Arquivos gerados em:", OUTPUT_DIR)

Total de linhas lidas: 168414
  → Train:      134731
  → Validation: 16841
  → Test:       16842
Arquivos gerados em: .
