In [1]:
import pandas as pd
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the train/test extracts for the selected percentage, impute missing
# values with statistics taken from the training split, and one-hot encode.

per = 1

DATA_DIR = "/home/watercar99/project/MIMIC-III/Data Extract/Tabular Data/FINAL Data Extract/DATA/train_test_data/"

# Columns removed from both splits right after loading.
DROPPED_COLS = ["SUBJECT_ID", "HADM_ID", "DOA", "ETHNICITY", "TLOS", "LOS"]

# Columns imputed with the training mode and then one-hot encoded.
CATEGORICAL_COLS = ['GENDER', 'Ventilator', 'Anisocytosis', 'Macrocytes', 'Poikilocytosis',
                    'Bacteria', 'Bilirubin', 'Urine Appearance', 'Urine Color']


def _load_split(split_name):
    """Read the CSV of one split ("train" or "test") and drop DROPPED_COLS."""
    csv_path = DATA_DIR + "Final_" + split_name + "_data(" + str(per) + "%)_down.csv"
    return pd.read_csv(csv_path).drop(columns=DROPPED_COLS)


def _impute_and_encode(frame, cat_fill, num_fill):
    """Fill missing values, one-hot encode CATEGORICAL_COLS, drop columns still containing NaN."""
    filled = frame.copy()
    # Categorical columns are filled with the per-column mode.
    filled[CATEGORICAL_COLS] = filled[CATEGORICAL_COLS].fillna(cat_fill)
    # Every remaining column is filled with its median.
    filled = filled.fillna(num_fill)
    encoded = pd.get_dummies(filled, columns=CATEGORICAL_COLS)
    # Any column that still has a missing value is removed.
    return encoded.dropna(axis=1)


train_data = _load_split("train")
test_data = _load_split("test")

# Mode and median are computed on the training split only and reused for the test split.
mode_values = train_data[CATEGORICAL_COLS].mode().iloc[0]
median_values = train_data.drop(columns=CATEGORICAL_COLS).median()

train_data = _impute_and_encode(train_data, mode_values, median_values)
test_data = _impute_and_encode(test_data, mode_values, median_values)

In [3]:
# Make train and test share exactly the same columns, in the same order.
# The two splits were one-hot encoded separately, so each may lack dummy
# columns that the other one has.
target = "y"

missing_columns_in_test = set(train_data.columns) - set(test_data.columns)
missing_columns_in_train = set(test_data.columns) - set(train_data.columns)

# Add test-only dummy columns to train as all-zero columns. Iterating in
# sorted order keeps the resulting column order deterministic between runs
# (set iteration order is not).
for col in sorted(missing_columns_in_train):
    train_data[col] = 0

# Reindex test on the training columns: columns missing from test are created
# and filled with 0, and the column ORDER now matches train. The estimator
# matches features by position/name, so a differing order would feed the
# wrong feature into each coefficient (or raise, depending on the version).
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

# Features and target
X_train = train_data.drop(columns=target)
y_train = train_data[target]
X_test = test_data.drop(columns=target)

assert list(X_train.columns) == list(X_test.columns), "train/test feature columns are not aligned"

print("train shape\n",X_train.shape,"\n")
print("test shape\n",X_test.shape,"\n")

features = [f for f in train_data.columns if f != target]

print("train value\n",train_data['y'].value_counts())
print("test value\n",test_data['y'].value_counts())

train shape
 (978, 98) 

train shape
 (457, 98) 

train value
 1    489
0    489
Name: y, dtype: int64
test value
 0    341
1    116
Name: y, dtype: int64


In [4]:
# Optuna objective for the logistic-regression hyperparameter search
def objective(trial):
    """Return the mean cross-validated F1 score for one sampled value of ``c``.

    ``c`` is drawn on a log scale from [1e-8, 10.0] and passed as ``C`` to
    ``LogisticRegression`` (with ``random_state=42``). The model is scored
    with 5-fold cross-validation on the module-level ``X_train`` / ``y_train``.
    """
    sampled_c = trial.suggest_float("c", 1e-8, 10.0, log=True)
    candidate = LogisticRegression(C=sampled_c, random_state=42)

    # One F1 value per fold; their average is what Optuna maximises.
    fold_f1 = cross_val_score(candidate, X_train, y_train, cv=5, scoring='f1')
    return fold_f1.mean()

In [None]:
# Already imported in the first cell; kept so this cell also runs on its own.
from sklearn.metrics import roc_auc_score

# Per-run results, one entry per repetition of the search.
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []
y_pred_list = []
model_list = []
auroc_list = []

N_RUNS = 10      # independent Optuna studies
N_TRIALS = 30    # trials per study

y_test = test_data['y']

for run_idx in range(N_RUNS):
    # Seed the sampler with the run index: every run explores a different
    # trial sequence, yet re-executing the notebook reproduces the same runs.
    sampler = optuna.samplers.TPESampler(seed=run_idx)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=N_TRIALS, gc_after_trial=True)

    # Best hyperparameter found by this study
    best_c = study.best_params["c"]

    # Refit on the full training set with the best C
    best_model = LogisticRegression(C=best_c, random_state=42)
    best_model.fit(X_train, y_train)

    # Evaluate on the held-out test set
    y_pred = best_model.predict(X_test)
    y_score = best_model.predict_proba(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    # Column 1 of predict_proba is the probability of the positive class.
    auroc = roc_auc_score(y_test, y_score[:,1])

    # Metrics are stored as percentages rounded to two decimals.
    rounded_accuracy = round(accuracy * 100, 2)
    rounded_precision = round(precision * 100, 2)
    rounded_recall = round(recall * 100, 2)
    rounded_f1 = round(f1 * 100, 2)
    rounded_auroc = round(auroc * 100, 2)

    accuracy_list.append(rounded_accuracy)
    precision_list.append(rounded_precision)
    recall_list.append(rounded_recall)
    f1_list.append(rounded_f1)
    y_pred_list.append(y_pred)
    auroc_list.append(rounded_auroc)
    # Keep the fitted model so a given run can be inspected later.
    model_list.append(best_model)

    print("Test Accuracy:", rounded_accuracy)
    print("Precision:", rounded_precision)
    print("Recall:", rounded_recall)
    print("F1-score:", rounded_f1)
    print("AUROC:", rounded_auroc)

[32m[I 2023-10-07 02:11:52,668][0m A new study created in memory with name: no-name-fbfb3d80-dc03-4ac5-a45d-d84f748b5f3c[0m
[32m[I 2023-10-07 02:11:54,205][0m Trial 0 finished with value: 0.629434151643817 and parameters: {'c': 7.492303373262485}. Best is trial 0 with value: 0.629434151643817.[0m
[32m[I 2023-10-07 02:11:55,283][0m Trial 1 finished with value: 0.6315063152339307 and parameters: {'c': 3.342812165737714}. Best is trial 1 with value: 0.6315063152339307.[0m
[32m[I 2023-10-07 02:11:56,166][0m Trial 2 finished with value: 0.6051761858213471 and parameters: {'c': 9.392769400881427e-06}. Best is trial 1 with value: 0.6315063152339307.[0m
[32m[I 2023-10-07 02:11:58,143][0m Trial 3 finished with value: 0.6231841406063177 and parameters: {'c': 0.0025795838707744203}. Best is trial 1 with value: 0.6315063152339307.[0m
[32m[I 2023-10-07 02:11:59,349][0m Trial 4 finished with value: 0.6293349939416991 and parameters: {'c': 0.00010969083580106145}. Best is trial 1 wit

In [None]:
# Gather the per-run metric lists and predictions under their column names.
data = dict(
    [
        ('Accuracy', accuracy_list),
        ('Precision', precision_list),
        ('Recall', recall_list),
        ('F1 Score', f1_list),
        ('AUROC', auroc_list),
        ('Predictions', y_pred_list),
    ]
)

# Convert the dictionary into a DataFrame (one row per run) and display it.
df = pd.DataFrame(data)
df

### DataFrame을 CSV 파일로 저장

In [None]:
# Write the results table to CSV without the index column.
output_path = '~/project/MIMIC-III/Model/Output/LR/LR_' + str(per) + '%_output.csv'
df.to_csv(output_path, index=False)

In [None]:
# Index (position in y_pred_list) of the run to visualise
idx = 1

y_true = test_data['y']
# Use the SAME run for the confusion matrix and the classification report.
selected_pred = y_pred_list[idx]
# Use the fitted model of that run when model_list holds one; otherwise fall
# back to best_model, which is the model of the LAST run — in that case the
# PR curve below does not correspond to run `idx`.
selected_model = model_list[idx] if len(model_list) > idx else best_model

cm = confusion_matrix(y_true, selected_pred)

# Confusion matrix heatmap
cm_df = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
plt.figure(figsize=(6, 4))
sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.show()

# Classification report for the selected run
print("Classification Report:")
print(classification_report(y_true, selected_pred))

# Precision-recall curve from the positive-class probabilities
precision, recall, _ = precision_recall_curve(y_true, selected_model.predict_proba(X_test)[:, 1])
# Trapezoidal area under the PR curve. This is not scikit-learn's
# "average precision", so the title labels it as PR AUC.
average_precision = auc(recall, precision)
plt.figure()
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title(f'Precision-Recall Curve (PR AUC = {average_precision:.2f})')
plt.show()


In [None]:
# LogisticRegression exposes no `feature_importances_` attribute; its learned
# weights live in `coef_`, with shape (1, n_features) for a binary target.
coefficients = best_model.coef_[0]

# Rank features by coefficient magnitude.
# NOTE(review): magnitudes are only comparable across features if the inputs
# share a common scale — no scaling step is visible in this notebook; confirm.
feature_importance = abs(coefficients)

# Indices of the 10 largest magnitudes, largest first
top_10_indices = feature_importance.argsort()[-10:][::-1]

# Coefficients follow the column order the model was fit on, i.e. X_train.
# A dedicated loop variable avoids overwriting `idx` used by the plotting cell.
for feat_idx in top_10_indices:
    print(f"Feature: {X_train.columns[feat_idx]}, Importance: {feature_importance[feat_idx]}, Coefficient: {coefficients[feat_idx]}")