In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, f1_score, precision_recall_curve, auc, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [22]:
# Read the sample dataset from disk.
# The 'Unnamed: 0' column is left in place: later cells read it
# (train_index_file and the low_idx lookup both reference it).
base_path = "heg_sample_data.csv"
heg_meta = pd.read_csv(base_path)

# Rows whose outcome equals 1, taken from the frame exactly as loaded.
data_class1 = heg_meta.loc[heg_meta['outcome'].eq(1)]

In [23]:
# Working frame for the probability calculation: everything in heg_meta
# except the collection-timestamp delta columns. drop() returns a new frame,
# so heg_meta itself is not modified.
# Column names are not sanitised here; the model is later fit on NumPy arrays.
drop_columns = [
    'delta_collect_timestamp_t-t1',
    'delta_collect_timestamp_t1-t2',
    'delta_collect_timestamp_t2-t3',
    'delta_collect_timestamp_t3-t4',
]
data = heg_meta.drop(columns=drop_columns)

data

Unnamed: 0.1,Unnamed: 0,days_from_entrance,age,document.sexo,UTI,document.freq_cardiaca(t),document.freq_cardiaca(t-1),document.freq_cardiaca(t-2),document.freq_cardiaca(t-3),document.freq_cardiaca(t-4),...,delta_document.pa_sistolica_t3-t4,delta_document.sat_o2_t-t1,delta_document.sat_o2_t1-t2,delta_document.sat_o2_t2-t3,delta_document.sat_o2_t3-t4,delta_document.temperatura_t-t1,delta_document.temperatura_t1-t2,delta_document.temperatura_t2-t3,delta_document.temperatura_t3-t4,outcome
0,0,0.481160,-0.430375,1.0,0.0,-0.130331,0.337252,0.349323,0.368394,0.258938,...,0.107772,0.202479,0.194332,0.432788,0.012866,-0.144525,-0.024481,0.152412,0.007067,0.0
1,1,1.837695,0.230175,1.0,0.0,1.451289,1.601233,1.473310,1.576802,1.397064,...,-0.021933,0.060615,0.114810,0.672236,-1.299337,-0.144525,-0.024481,0.152412,0.007067,0.0
2,2,0.226810,-0.719366,1.0,0.0,0.301020,0.403377,0.349323,0.368394,0.258938,...,0.139667,0.060615,0.114810,0.672236,-1.299337,-0.144525,-0.024481,0.152412,0.007067,0.0
3,3,-0.366674,1.716413,1.0,0.0,0.684443,0.823398,0.701822,0.778084,0.820968,...,-0.101163,0.060615,0.114810,0.672236,-1.299337,-0.144525,-0.024481,0.152412,0.007067,0.0
4,4,-0.366674,1.262285,0.0,0.0,0.492732,-0.537812,-0.415571,-0.812491,-0.569145,...,-0.021933,0.060615,0.114810,0.672236,-1.299337,-0.144525,-0.024481,0.152412,0.007067,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13647,13647,-0.281891,0.643019,1.0,0.0,-1.184744,-0.732271,-0.954112,-0.239590,-0.256581,...,0.029118,1.636497,-0.833785,0.424839,0.011498,0.584695,-0.350259,-0.725431,-0.005497,0.0
13648,13648,0.142026,1.097147,0.0,0.0,-0.274114,-0.148895,0.112743,1.411926,-0.256581,...,-0.676939,0.004101,0.829661,-0.001954,0.439924,-0.444251,-0.350259,0.012286,0.657215,0.0
13649,13649,-0.281891,0.106322,0.0,0.0,-0.513754,-1.218417,-0.165567,0.704134,0.812900,...,0.272103,0.412200,-0.002062,-2.135920,-0.034930,0.422230,0.526740,-0.214704,-0.985154,0.0
13650,13650,-0.281891,1.386138,1.0,0.0,-0.178259,0.094178,0.205513,0.043527,-0.165369,...,0.240936,-0.812096,-0.417923,-0.001954,0.011498,-1.310733,-0.240634,1.657963,-0.336854,0.0


In [24]:
from xgboost import XGBClassifier

# Classifier whose predicted probabilities are used further down to rank
# rows; hyper-parameters are fixed and the random state is seeded.
xgb_model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=5,
    colsample_bytree=0.8,
    random_state=42,
)

# Select the target by name instead of assuming it is the last column, which
# matches how the scoring cell builds its features (drop(columns='outcome')).
# NOTE(review): X still contains the 'Unnamed: 0' row-identifier column, so
# the model is trained on it. Excluding it looks desirable, but the scoring
# cell must be changed in lockstep -- confirm before touching.
X = data.drop(columns='outcome')
y = data['outcome']

# Rebuild features + label as one frame via NumPy; the concatenation coerces
# every column to a single common dtype and resets the index to 0..n-1.
train_dataset = np.concatenate([X, y.values.reshape(-1, 1)], axis=1)
train_dataset = pd.DataFrame(train_dataset, columns=X.columns.tolist() + ['outcome'])

# Index labels of the rows whose outcome is 0.0
train_data_indices = train_dataset[train_dataset['outcome'] == 0.0].index

In [25]:
# One-column frame holding the 'Unnamed: 0' value of every row in `data`
train_index_file = data.loc[:, ['Unnamed: 0']].copy()
train_index_file

Unnamed: 0.1,Unnamed: 0
0,0
1,1
2,2
3,3
4,4
...,...
13647,13647
13648,13648
13649,13649
13650,13650


In [26]:
# Store each row's index label in an explicit 'train_index' column
train_dataset = train_dataset.assign(train_index=train_dataset.index)
train_dataset

Unnamed: 0.1,Unnamed: 0,days_from_entrance,age,document.sexo,UTI,document.freq_cardiaca(t),document.freq_cardiaca(t-1),document.freq_cardiaca(t-2),document.freq_cardiaca(t-3),document.freq_cardiaca(t-4),...,delta_document.sat_o2_t-t1,delta_document.sat_o2_t1-t2,delta_document.sat_o2_t2-t3,delta_document.sat_o2_t3-t4,delta_document.temperatura_t-t1,delta_document.temperatura_t1-t2,delta_document.temperatura_t2-t3,delta_document.temperatura_t3-t4,outcome,train_index
0,0.0,0.481160,-0.430375,1.0,0.0,-0.130331,0.337252,0.349323,0.368394,0.258938,...,0.202479,0.194332,0.432788,0.012866,-0.144525,-0.024481,0.152412,0.007067,0.0,0
1,1.0,1.837695,0.230175,1.0,0.0,1.451289,1.601233,1.473310,1.576802,1.397064,...,0.060615,0.114810,0.672236,-1.299337,-0.144525,-0.024481,0.152412,0.007067,0.0,1
2,2.0,0.226810,-0.719366,1.0,0.0,0.301020,0.403377,0.349323,0.368394,0.258938,...,0.060615,0.114810,0.672236,-1.299337,-0.144525,-0.024481,0.152412,0.007067,0.0,2
3,3.0,-0.366674,1.716413,1.0,0.0,0.684443,0.823398,0.701822,0.778084,0.820968,...,0.060615,0.114810,0.672236,-1.299337,-0.144525,-0.024481,0.152412,0.007067,0.0,3
4,4.0,-0.366674,1.262285,0.0,0.0,0.492732,-0.537812,-0.415571,-0.812491,-0.569145,...,0.060615,0.114810,0.672236,-1.299337,-0.144525,-0.024481,0.152412,0.007067,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13647,13647.0,-0.281891,0.643019,1.0,0.0,-1.184744,-0.732271,-0.954112,-0.239590,-0.256581,...,1.636497,-0.833785,0.424839,0.011498,0.584695,-0.350259,-0.725431,-0.005497,0.0,13647
13648,13648.0,0.142026,1.097147,0.0,0.0,-0.274114,-0.148895,0.112743,1.411926,-0.256581,...,0.004101,0.829661,-0.001954,0.439924,-0.444251,-0.350259,0.012286,0.657215,0.0,13648
13649,13649.0,-0.281891,0.106322,0.0,0.0,-0.513754,-1.218417,-0.165567,0.704134,0.812900,...,0.412200,-0.002062,-2.135920,-0.034930,0.422230,0.526740,-0.214704,-0.985154,0.0,13649
13650,13650.0,-0.281891,1.386138,1.0,0.0,-0.178259,0.094178,0.205513,0.043527,-0.165369,...,-0.812096,-0.417923,-0.001954,0.011498,-1.310733,-0.240634,1.657963,-0.336854,0.0,13650


In [27]:
# Convert features and target to plain NumPy arrays before fitting.
# np.asarray passes existing arrays through unchanged, so this cell can be
# re-run safely (calling DataFrame-only attributes on the already-converted
# X would raise AttributeError). Column labels are discarded by the
# conversion, so there is no point in rewriting them first.
X = np.asarray(X)
y = np.asarray(y)

# Fit the model. The evaluation set is the training data itself, so it does
# not measure generalisation.
xgb_model.fit(
    X, y,
    eval_set=[(X, y)],
    verbose=False,
)

In [28]:
# Rank rows by the model's predicted probability for one class
def get_highest_probability_points(model, data, original_data, target_class):
    """Order the rows of ``original_data`` by predicted probability, highest first.

    Parameters
    ----------
    model
        Object exposing ``predict_proba``; it is called on ``data.to_numpy()``.
    data
        Feature frame passed to the model (must provide ``to_numpy``).
    original_data
        Frame the ranked rows are taken from, by position; expected to be
        row-aligned with ``data``.
    target_class
        Column of the ``predict_proba`` output to rank by.

    Returns
    -------
    tuple
        ``(ranked, order)`` where ``ranked`` is a copy of ``original_data``
        reordered by descending probability with an added ``'Probability'``
        column (original index labels are kept), and ``order`` is the array
        of positional row indices in that same order.
    """
    # Score on a NumPy array and keep only the requested class column
    class_proba = model.predict_proba(data.to_numpy())[:, target_class]

    # Ascending argsort reversed gives positions from highest to lowest
    order = np.argsort(class_proba)[::-1]

    # Positional selection yields a new frame; attach the matching scores
    ranked = original_data.iloc[order].assign(Probability=class_proba[order])
    return ranked, order


In [29]:
# Main logic: score every outcome == 0.0 row with the fitted model
train_data = data[data['outcome'] == 0.0]
train_data_others = train_data.drop(columns='outcome')

# Rank those rows by column 0 of predict_proba (presumably the probability of
# class 0.0 -- confirm against xgb_model.classes_). No placeholder frame is
# created first: the call below always assigns highest_df.
highest_df, highest_indices = get_highest_probability_points(
    xgb_model, train_data_others, train_data, target_class=0
)

# highest_indices are positions within train_data, not labels. Reset the
# ranked frame to a 0..n-1 index and keep the original labels in 'Index'.
highest_df = highest_df.reset_index(drop=True)
highest_df['Index'] = train_data.index[highest_indices]

highest_df

Unnamed: 0.1,Unnamed: 0,days_from_entrance,age,document.sexo,UTI,document.freq_cardiaca(t),document.freq_cardiaca(t-1),document.freq_cardiaca(t-2),document.freq_cardiaca(t-3),document.freq_cardiaca(t-4),...,delta_document.sat_o2_t1-t2,delta_document.sat_o2_t2-t3,delta_document.sat_o2_t3-t4,delta_document.temperatura_t-t1,delta_document.temperatura_t1-t2,delta_document.temperatura_t2-t3,delta_document.temperatura_t3-t4,outcome,Probability,Index
0,7739,-0.281891,-1.132210,1.0,0.0,-0.369970,-0.878114,-0.683442,-0.812491,-0.569145,...,0.902141,-0.312279,0.303399,-0.119321,0.020588,0.029547,0.006474,0.0,0.994375,7739
1,12528,-0.366674,-1.132210,0.0,0.0,-0.369970,-0.829500,-0.583032,-0.522707,-0.912853,...,-0.002062,-0.001954,0.303399,-0.985802,0.855615,-0.895673,-0.173730,0.0,0.994249,12528
2,7551,-0.366674,-1.132210,1.0,0.0,0.157237,-0.586427,-0.415571,-0.812491,-0.569145,...,1.158744,-0.312279,0.303399,0.259765,0.700270,-0.003301,0.059689,0.0,0.994171,7551
3,13036,-0.197107,-1.173494,0.0,0.0,-0.657537,-0.343353,-1.278807,-1.041755,-0.484610,...,-0.833785,-0.001954,0.439924,1.776107,-0.021385,-0.157956,-0.557758,0.0,0.994104,13036
4,12233,-0.366674,-1.173494,1.0,0.0,-0.561681,-1.072573,-0.026412,0.002472,-0.135812,...,0.413800,0.019769,-0.003620,0.747160,-0.350259,0.229388,0.007067,0.0,0.994003,12233
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11965,12896,-0.197107,0.808157,1.0,0.0,2.745341,1.552618,1.550678,2.025346,1.841289,...,-0.002062,0.851632,0.439924,-0.119321,0.307490,0.182529,0.160181,0.0,0.217863,12896
11966,3758,0.650727,0.973294,0.0,0.0,1.259577,0.483096,0.530208,0.373830,0.746748,...,0.194332,0.432788,0.012866,-0.144525,-0.024481,0.152412,0.007067,0.0,0.209338,3758
11967,10903,-0.197107,0.808157,0.0,0.0,0.540659,0.872013,0.762133,0.232272,0.199478,...,-0.002062,-0.001954,0.011498,-3.747711,3.322175,-3.108825,0.381085,0.0,0.201364,10903
11968,12416,-0.112324,1.138432,0.0,0.0,2.170207,2.767984,2.385608,1.647857,1.567654,...,-0.002062,-0.428747,0.011498,1.451176,-1.775383,-0.157956,-0.005497,0.0,0.185691,12416


In [30]:
# Keep the rows whose predicted probability is above the threshold
PROBABILITY_THRESHOLD = 0.95
lowest = highest_df.loc[highest_df['Probability'] > PROBABILITY_THRESHOLD, 'Index']

# Retain, in ranking order, only the labels that also occur in
# train_index_file['Unnamed: 0']. A single vectorised membership test
# replaces a full scan of train_index_file per candidate row.
# NOTE(review): the values are then used as index labels in .loc below, which
# presumes 'Unnamed: 0' coincides with the frame's index labels -- confirm.
known_ids = train_index_file['Unnamed: 0']
low_idx = lowest[lowest.isin(known_ids)].tolist()

temporal = train_data.loc[low_idx]

temporal['outcome'].value_counts()

outcome
0.0    7253
Name: count, dtype: int64

In [31]:
# Show the column labels of the selected outcome == 0.0 rows
temporal.columns

Index(['Unnamed: 0', 'days_from_entrance', 'age', 'document.sexo', 'UTI',
       'document.freq_cardiaca(t)', 'document.freq_cardiaca(t-1)',
       'document.freq_cardiaca(t-2)', 'document.freq_cardiaca(t-3)',
       'document.freq_cardiaca(t-4)', 'document.freq_respiratoria(t)',
       'document.freq_respiratoria(t-1)', 'document.freq_respiratoria(t-2)',
       'document.freq_respiratoria(t-3)', 'document.freq_respiratoria(t-4)',
       'document.glicemia_capilar(t)', 'document.glicemia_capilar(t-1)',
       'document.glicemia_capilar(t-2)', 'document.glicemia_capilar(t-3)',
       'document.glicemia_capilar(t-4)', 'document.pa_diastolica(t)',
       'document.pa_diastolica(t-1)', 'document.pa_diastolica(t-2)',
       'document.pa_diastolica(t-3)', 'document.pa_diastolica(t-4)',
       'document.pa_sistolica(t)', 'document.pa_sistolica(t-1)',
       'document.pa_sistolica(t-2)', 'document.pa_sistolica(t-3)',
       'document.pa_sistolica(t-4)', 'document.sat_o2(t)',
       'document.s

In [32]:
# data_class1 was sliced from the frame as loaded, so it still carries the
# delta_collect_timestamp_* columns that were dropped from the frame temporal
# comes from. Concatenating as-is would keep those columns as NaN for every
# row of temporal and populated for every row of data_class1, which encodes
# the label. Align both parts to temporal's columns first.
train_total = pd.concat([temporal, data_class1[temporal.columns]])
train_total = train_total.drop(columns=['Unnamed: 0'], errors='ignore')

# Shuffle reproducibly and renumber the rows
train_total = train_total.sample(frac=1, random_state=42).reset_index(drop=True)

train_total.to_csv("XGBoost_undersampling.csv", index=False)