In [1]:
import pandas as pd
import wfdb
import os

In [4]:
# Guardar caminho para os ficheiros do estudo "A large scale 12-lead electrocardiogram database for arrhythmia study"
base_path = "../data/WFDBRecords"

# Carregar paths dos registos
with open(os.path.join(base_path, "RECORDS.txt")) as f:
    record_paths = [line.strip() for line in f if line.strip()]

dados = []

# Percorrer todas as subpastas (ex: 01, 02, ..., 10)
for dir_raiz in os.listdir(base_path):
    print(f"Loading: {dir_raiz} ...")
    raiz_path = os.path.join(base_path, dir_raiz)

    if not os.path.isdir(raiz_path):
        continue

    # Percorrer cada sub-subpasta (ex: 010, 011, ...)
    for subpasta in os.listdir(raiz_path):
        sub_path = os.path.join(raiz_path, subpasta)

        records_file = os.path.join(sub_path, "RECORDS")
        if not os.path.exists(records_file):
            continue

        with open(records_file) as f:
            registos = [linha.strip() for linha in f if linha.strip()]

        for nome_registo in registos:
            full_path = os.path.join(sub_path, nome_registo)

            try:
                record = wfdb.rdrecord(full_path)
                sinal = record.p_signal[:, 0]  # canal 0
                dados.append({
                    "record_id": nome_registo,
                    "signal": sinal.tolist()
                })
            except Exception as e:
                print(f"Erro ao ler {full_path}: {e}")

# Criar DataFrame
df_ecg = pd.DataFrame(dados)
print(df_ecg.head())
print(f"Total de registos carregados: {len(df_ecg)}")

Loading: 01 ...
Erro ao ler WFDBRecords\01\019\JS01052: time data '/' does not match format '%d/%m/%Y'
Loading: 02 ...
Loading: 03 ...
Loading: 04 ...
Loading: 05 ...
Loading: 06 ...
Loading: 07 ...
Loading: 08 ...
Loading: 09 ...
Loading: 10 ...
Loading: 11 ...
Loading: 12 ...
Loading: 13 ...
Loading: 14 ...
Loading: 15 ...
Loading: 16 ...
Loading: 17 ...
Loading: 18 ...
Loading: 19 ...
Loading: 20 ...
Loading: 21 ...
Loading: 22 ...
Loading: 23 ...
Erro ao ler WFDBRecords\23\236\JS23074: list index out of range
Loading: 24 ...
Loading: 25 ...
Loading: 26 ...
Loading: 27 ...
Loading: 28 ...
Loading: 29 ...
Loading: 30 ...
Loading: 31 ...
Loading: 32 ...
Loading: 33 ...
Loading: 34 ...
Loading: 35 ...
Loading: 36 ...
Loading: 37 ...
Loading: 38 ...
Loading: 39 ...
Loading: 40 ...
Loading: 41 ...
Loading: 42 ...
Loading: 43 ...
Loading: 44 ...
Loading: 45 ...
Loading: 46 ...
Loading: ConditionNames_SNOMED-CT.csv ...
Loading: LICENSE.txt ...
Loading: RECORDS.txt ...
Loading: SHA256SUMS.t

In [5]:
# save pickle
df_ecg.to_pickle("data/processed/df_import.pkl")

In [7]:
# 1. Ler o mapeamento SNOMED → acrónimo
df_map = pd.read_csv("../data/WFDBRecords/ConditionNames_SNOMED-CT.csv")
snomed_to_acronym = dict(zip(df_map["Snomed_CT"].astype(str), df_map["Acronym Name"]))

# 2. Inicializar colunas
idades = []
sexos = []
diagnosticos_acronyms = []

# 3. Extrair info de cada header
# Pré-processamento: criar dicionário {record_id: path_para_o_arquivo}
hea_paths = {}
for root, dirs, files in os.walk(base_path):
    for file in files:
        if file.endswith(".hea"):
            record_id = file.replace(".hea", "")
            hea_paths[record_id] = os.path.join(root, record_id)

for idx, row in df_ecg.iterrows():
    record_id = row["record_id"]
    path = hea_paths.get(record_id)

    if path:
        try:
            header = wfdb.rdheader(path)
            com = {c.split(":")[0].strip(): c.split(":")[1].strip() for c in header.comments if ":" in c}

            idades.append(int(com.get("Age", -1)))
            sexos.append(com.get("Sex", None))

            dx_codes = com.get("Dx", "").split(",") if "Dx" in com else []
            acronyms = [snomed_to_acronym.get(code.strip(), f"UNKNOWN_{code.strip()}") for code in dx_codes]
            diagnosticos_acronyms.append(acronyms)

        except Exception as e:
            print(f"Erro ao ler header de {record_id}: {e}")
            idades.append(None)
            sexos.append(None)
            diagnosticos_acronyms.append([])
    else:
        idades.append(None)
        sexos.append(None)
        diagnosticos_acronyms.append([])

# 4. Adicionar ao DataFrame
df_ecg["age"] = idades
df_ecg["sex"] = sexos
df_ecg["diagnosticos"] = diagnosticos_acronyms

# Mostrar os primeiros
print(df_ecg[["record_id", "age", "sex", "diagnosticos"]].head())

print(df_ecg.describe())

Erro ao ler header de JS10867: invalid literal for int() with base 10: 'NaN'
Erro ao ler header de JS11507: invalid literal for int() with base 10: 'NaN'
Erro ao ler header de JS12543: invalid literal for int() with base 10: 'NaN'
Erro ao ler header de JS12571: invalid literal for int() with base 10: 'NaN'
Erro ao ler header de JS12576: invalid literal for int() with base 10: 'NaN'
Erro ao ler header de JS12609: invalid literal for int() with base 10: 'NaN'
Erro ao ler header de JS13024: invalid literal for int() with base 10: 'NaN'
Erro ao ler header de JS13504: invalid literal for int() with base 10: 'NaN'
Erro ao ler header de JS13505: invalid literal for int() with base 10: 'NaN'
Erro ao ler header de JS13575: invalid literal for int() with base 10: 'NaN'
Erro ao ler header de JS13583: invalid literal for int() with base 10: 'NaN'
Erro ao ler header de JS13645: invalid literal for int() with base 10: 'NaN'
Erro ao ler header de JS13646: invalid literal for int() with base 10: 'NaN'

In [10]:
# Confirmar casos diagnostico unknown

unknown_cases = df_ecg[df_ecg["diagnosticos"].apply(lambda x: any("UNKNOWN" in d for d in x))]
print("Casos com diagnósticos desconhecidos:")
print(unknown_cases[["record_id", "diagnosticos"]])
# lista de codigos de diagnostico desconhecidos
unknown_codes = set()
for row in unknown_cases["diagnosticos"]:
    for code in row:
        if "UNKNOWN" in code:
            unknown_codes.add(code)

print("Códigos desconhecidos:")
print(unknown_codes)

Casos com diagnósticos desconhecidos:
      record_id                        diagnosticos
9       JS00011              [SB, UNKNOWN_55827005]
12      JS00014         [SB, ABI, UNKNOWN_55827005]
23      JS00025  [SB, LFBBB, UNKNOWN_55827005, PWC]
34      JS00038       [AFIB, UNKNOWN_55827005, VPB]
39      JS00043       [AFIB, IVB, UNKNOWN_55827005]
...         ...                                 ...
45145   JS45547                 [UNKNOWN_425856008]
45146   JS45548                 [UNKNOWN_425856008]
45147   JS45549                 [UNKNOWN_425856008]
45148   JS45550           [AVRT, UNKNOWN_425856008]
45149   JS45551                 [UNKNOWN_106068003]

[13152 rows x 2 columns]
Códigos desconhecidos:
{'UNKNOWN_164896001', 'UNKNOWN_418818005', 'UNKNOWN_426627000', 'UNKNOWN_445118002', 'UNKNOWN_251170000', 'UNKNOWN_251223006', 'UNKNOWN_6374002', 'UNKNOWN_733534002', 'UNKNOWN_251120003', 'UNKNOWN_445211001', 'UNKNOWN_67741000119109', 'UNKNOWN_713427006', 'UNKNOWN_251166008', 'UNKNOWN_426

In [11]:
# Guardar o df como pickle para poder carregar mais tarde pelo ficheiro
df_ecg.to_pickle("data/processed/df_data.pkl")