In [20]:
import pandas as pd

df = pd.read_pickle("../data/processed/df_peaks_fp.pkl")
print(df.head())
# Filtrar casos com menos de 4 picos R
print(f"Número de amostras pré filtragem: {len(df)}")
df = df[df["r_peaks"].apply(lambda x: len(x) >= 4)].copy()
print(f"Número de amostras após filtragem: {len(df)}")



  record_id                                             signal   age  \
0   JS00001  [-0.254, -0.254, -0.254, -0.254, -0.264, -0.27...  85.0   
1   JS00002  [-0.01, -0.024, -0.02, 0.01, 0.01, -0.029, -0....  59.0   
2   JS00004  [0.195, 0.195, 0.195, 0.195, 0.176, 0.166, 0.1...  66.0   
3   JS00005  [0.005, 0.005, -0.015, -0.005, -0.005, -0.049,...  73.0   
4   JS00006  [-0.029, -0.029, -0.029, -0.029, -0.034, -0.03...  46.0   

        diagnosticos                                            r_peaks  \
0  [AFIB, RBBB, TWC]                                  [345, 2167, 4063]   
1          [SB, TWC]    [541, 1112, 1680, 2279, 2853, 3449, 4013, 4604]   
2               [SB]  [404, 973, 1522, 2086, 2645, 3197, 3768, 4339,...   
3   [AF, STDD, STTC]  [160, 343, 527, 719, 907, 1091, 1275, 1457, 16...   
4               [SB]  [221, 759, 1287, 1797, 2340, 2877, 3404, 3918,...   

     rr_std     rr_cv     pnn50  male  
0  0.074000  0.019903  0.500000     1  
1  0.026766  0.023057  0.571429     

In [11]:
from scipy.stats import skew, kurtosis
import numpy as np

fs = 500  # sampling rate
rr_metrics = []

for r_peaks in df["r_peaks"]:
    rr = np.diff(r_peaks) / fs
    rr_std = np.std(rr)
    rr_mean = np.mean(rr)
    rr_cv = rr_std / rr_mean if rr_mean > 0 else None
    pnn50 = np.sum(np.abs(np.diff(rr)) > 0.05) / len(rr)

    rmssd = np.sqrt(np.mean(np.diff(rr) ** 2)) if len(rr) >= 2 else None
    skewness = skew(rr) if len(rr) >= 3 else None
    kurt = kurtosis(rr) if len(rr) >= 3 else None

    rr_metrics.append({
        "rr_std": rr_std,
        "rr_cv": rr_cv,
        "pnn50": pnn50,
        "rmssd": rmssd,
        "skewness": skewness,
        "kurtosis": kurt
    })

  skewness = skew(rr) if len(rr) >= 3 else None
  kurt = kurtosis(rr) if len(rr) >= 3 else None


In [12]:
# Adicionar ao df
df = pd.concat([df, pd.DataFrame(rr_metrics)], axis=1)
print(df.head())


  record_id                                             signal   age  \
1   JS00002  [-0.01, -0.024, -0.02, 0.01, 0.01, -0.029, -0....  59.0   
2   JS00004  [0.195, 0.195, 0.195, 0.195, 0.176, 0.166, 0.1...  66.0   
3   JS00005  [0.005, 0.005, -0.015, -0.005, -0.005, -0.049,...  73.0   
4   JS00006  [-0.029, -0.029, -0.029, -0.029, -0.034, -0.03...  46.0   
5   JS00007  [-0.063, -0.029, -0.044, -0.044, -0.024, -0.04...  80.0   

       diagnosticos                                            r_peaks  \
1         [SB, TWC]    [541, 1112, 1680, 2279, 2853, 3449, 4013, 4604]   
2              [SB]  [404, 973, 1522, 2086, 2645, 3197, 3768, 4339,...   
3  [AF, STDD, STTC]  [160, 343, 527, 719, 907, 1091, 1275, 1457, 16...   
4              [SB]  [221, 759, 1287, 1797, 2340, 2877, 3404, 3918,...   
5       [AFIB, TWC]  [35, 233, 618, 860, 1059, 1372, 1716, 2022, 24...   

     rr_std     rr_cv     pnn50  male    rr_std     rr_cv   pnn50     rmssd  \
1  0.026766  0.023057  0.571429   0.0  0.01

In [16]:
from sklearn.model_selection import train_test_split

print('df', df.shape)

df_ml = df[["rr_std", "rr_cv", "pnn50", "rmssd", "skewness", "kurtosis", "diagnosticos"]].copy()
df_ml["tem_afib"] = df_ml["diagnosticos"].apply(lambda x: "AFIB" in str(x)).astype(int)
df_ml = df_ml.dropna(subset=["rr_std", "rr_cv", "pnn50"])
print("Amostras com métricas válidas:", len(df_ml))

# Print casos com e sem AFIB
print("Casos com AFIB:", df_ml["tem_afib"].sum())
print("Casos sem AFIB:", len(df_ml) - df_ml["tem_afib"].sum())

X = df_ml[["rr_std", "rr_cv", "pnn50", "rmssd", "skewness", "kurtosis"]]
y = df_ml["tem_afib"]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)



df (44942, 15)
Amostras com métricas válidas: 39794
Casos com AFIB: 1633
Casos sem AFIB: 38161


In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

modelos = {
    "Logistic Regression": LogisticRegression(class_weight='balanced'),
    "SVM": SVC(class_weight='balanced', probability=True),
    "Random Forest": RandomForestClassifier(class_weight='balanced'),
    "XGBoost": XGBClassifier(eval_metric='logloss')
}
for nome, modelo in modelos.items():
    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_test)
    print(f"{nome}\n", classification_report(y_test, y_pred))

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values