In [None]:
import pandas as pd
import optuna
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the train/test CSV files whose names carry the `per` percentage tag,
# drop the columns this notebook does not model on, and one-hot encode GENDER.

per = 50

train_data = pd.read_csv(f"~/project/MIMIC-III/Data/FINAL/Final Data_Split/train_data({per}%)_down.csv")
test_data = pd.read_csv(f"~/project/MIMIC-III/Data/FINAL/Final Data_Split/test_data({per}%)_down.csv")

# Same preprocessing for both splits: remove the unused columns, then expand
# GENDER into dummy columns.
train_data = pd.get_dummies(
    train_data.drop(columns=["SUBJECT_ID", "HADM_ID", "DOA", "TLOS", "ETHNICITY"]),
    columns=["GENDER"],
)
test_data = pd.get_dummies(
    test_data.drop(columns=["SUBJECT_ID", "HADM_ID", "DOA", "TLOS", "ETHNICITY"]),
    columns=["GENDER"],
)

In [None]:
# Make train_data and test_data share the same columns: compute the columns
# that exist on only one side (set differences in both directions).
missing_columns_in_test = set(train_data.columns) - set(test_data.columns)
missing_columns_in_train = set(test_data.columns) - set(train_data.columns)

# Add the train-only columns to test_data, filled with 0. Iterating in sorted
# order keeps the resulting column order stable between runs (set iteration
# order is arbitrary).
for col in sorted(missing_columns_in_test, key=str):
    test_data[col] = 0

# Add the test-only columns to train_data, filled with 0.
for col in sorted(missing_columns_in_train, key=str):
    train_data[col] = 0

# Both frames now hold the same set of columns, but not necessarily in the
# same order. Reorder test_data to match train_data, because scikit-learn
# estimators fitted on a DataFrame expect the same feature order at predict time.
test_data = test_data[train_data.columns]
    
####################################################################
# Down Sampling (disabled: every line below is commented out)
# Check the label ratio of the train data
# train_labels = train_data['y']
# label_counts = train_labels.value_counts()

# # Split the train data by label so the classes can be balanced 1:1
# class_0_data = train_data[train_data['y'] == 0]
# class_1_data = train_data[train_data['y'] == 1]

# # From the label-0 rows, keep only as many samples as the smaller class has
# num_samples = min(label_counts[0], label_counts[1])
# class_0_data = class_0_data.sample(num_samples, random_state=42)

# # From the label-1 rows, keep only as many samples as the smaller class has
# class_1_data = class_1_data.sample(num_samples, random_state=42)

# # Concatenate the selected rows to build the final train data
# train_data = pd.concat([class_0_data, class_1_data])
####################################################################


# Split the frames into features and target.
target = "y"

X_train = train_data.drop(target, axis=1)
y_train = train_data[target]
X_test = test_data.drop(target, axis=1)

print("train shape\n",X_train.shape,"\n")
# Label corrected: this line reports the test matrix, not the train matrix.
print("test shape\n",X_test.shape,"\n")

# Names of every column except the target.
features = [f for f in train_data.columns if f != target]

# Class distribution of each split.
print("train value\n",train_data[target].value_counts())
print("test value\n",test_data[target].value_counts())

In [None]:
def objective(trial):
    """Optuna objective: mean cross-validated F1 of an SVC on the training set.

    Args:
        trial: Optuna trial used to sample ``c`` (log-scaled, between 1e-8 and
            10.0) and ``kernel`` ("linear" or "rbf").

    Returns:
        The mean of the ``cv=3`` F1 scores obtained with ``cross_val_score`` on
        the module-level ``X_train`` / ``y_train``.
    """
    c = trial.suggest_float("c", 1e-8, 10.0, log=True)
    kernel = trial.suggest_categorical("kernel", ["linear", "rbf"])

    # probability=True is intentionally not set here: F1 scoring only calls
    # predict(), while enabling probability estimates makes every fit run an
    # additional internal 5-fold cross-validation.
    model = SVC(C=c, kernel=kernel, decision_function_shape="ovo", random_state=42)

    # Evaluate the candidate with cross-validation on the training data only.
    scores = cross_val_score(model, X_train, y_train, cv=3, scoring='f1')
    return scores.mean()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import optuna
import random

# Result containers: each list receives one entry per tuning iteration.
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []
auroc_list = []
y_pred_list = []

# Ground-truth test labels, looked up once instead of on every metric call.
y_test = test_data['y']

# A named loop variable is used instead of `_`: the index is read below, and
# rebinding `_` in a notebook interferes with IPython's last-output variable.
for iteration in range(2):
    # Optuna study setup. A distinct, fixed sampler seed per iteration keeps the
    # runs different from each other while making the proposed hyperparameters
    # repeatable. Trials still run in parallel (n_jobs=-1), so exact
    # reproducibility is not guaranteed.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42 + iteration),
    )
    study.optimize(objective, n_trials=2, gc_after_trial=True, n_jobs=-1)

    # Best hyperparameters found by this study
    best_trial = study.best_trial
    best_c = best_trial.params["c"]
    best_kernel = best_trial.params["kernel"]

    # Refit on the full training set; probability=True is required here
    # because predict_proba() feeds the AUROC below.
    best_model = SVC(C=best_c, kernel=best_kernel, random_state=42 , probability=True)
    best_model.fit(X_train, y_train)

    # Evaluate on the test data
    y_pred = best_model.predict(X_test)
    y_score = best_model.predict_proba(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    # Column 1 of predict_proba is the probability of the second class.
    auroc = roc_auc_score(y_test, y_score[:, 1])

    # Metrics are stored as percentages rounded to two decimals.
    rounded_accuracy = round(accuracy * 100, 2)
    rounded_precision = round(precision * 100, 2)
    rounded_recall = round(recall * 100, 2)
    rounded_f1 = round(f1 * 100, 2)
    rounded_auroc = round(auroc * 100, 2)

    # Append this iteration's metrics and predictions
    accuracy_list.append(rounded_accuracy)
    precision_list.append(rounded_precision)
    recall_list.append(rounded_recall)
    f1_list.append(rounded_f1)
    auroc_list.append(rounded_auroc)
    y_pred_list.append(y_pred)

    print(f"Iteration {iteration + 1} Results:")
    print("Best Trial Parameters:")
    print("c:", best_c)
    print("kernel:", best_kernel)
    print("Test Accuracy:", rounded_accuracy)
    print("Precision:", rounded_precision)
    print("Recall:", rounded_recall)
    print("F1-score:", rounded_f1)
    print("AUROC:", rounded_auroc)
    print("")

# Averages over all iterations (of the rounded percentages)
print("Mean Accuracy:", sum(accuracy_list) / len(accuracy_list))
print("Mean Precision:", sum(precision_list) / len(precision_list))
print("Mean Recall:", sum(recall_list) / len(recall_list))
print("Mean F1-score:", sum(f1_list) / len(f1_list))
print("Mean AUROC:", sum(auroc_list) / len(auroc_list))

# Print the predictions of every iteration
for i, y_pred in enumerate(y_pred_list):
    print(f"Iteration {i + 1} Predictions:")
    print(y_pred)


In [None]:

# Collect the per-iteration metrics and predictions, one row per iteration.
data = {
    'Accuracy': accuracy_list,
    'Precision': precision_list,
    'Recall': recall_list,
    'F1 Score': f1_list,
    'AUROC':auroc_list,
    # Store predictions as plain Python lists. An ndarray cell is written to
    # CSV through its string form, which NumPy abbreviates with "..." for long
    # arrays, so the saved predictions would be incomplete.
    'Predictions': [pd.Series(pred).tolist() for pred in y_pred_list],
}

# Convert the dictionary to a DataFrame
df = pd.DataFrame(data)
df

In [None]:
# Save the DataFrame as a CSV file. The percentage tag is taken from `per` so
# the output name always matches the data that was loaded; the trailing `_4`
# is presumably the run number (confirm before re-running).
df.to_csv(f'~/project/MIMIC-III/Model/Output/SVM({per}%)_4.csv', index=False)  # index=False: do not write the row index


### DataFrame을 CSV 파일로 저장

In [None]:
# Disabled: read the five SVM(50%)_1 ... _5 result files and concatenate them
# into a single frame with a fresh index.
# df1 = pd.read_csv("~/project/MIMIC-III/Model/Output/SVM(50%)_1.csv")
# df2 = pd.read_csv("~/project/MIMIC-III/Model/Output/SVM(50%)_2.csv")
# df3 = pd.read_csv("~/project/MIMIC-III/Model/Output/SVM(50%)_3.csv")
# df4 = pd.read_csv("~/project/MIMIC-III/Model/Output/SVM(50%)_4.csv")
# df5 = pd.read_csv("~/project/MIMIC-III/Model/Output/SVM(50%)_5.csv")
# merged_df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)
# merged_df


In [None]:
# merged_df.to_csv('~/project/MIMIC-III/Model/Output/SVM/SVM(50%).csv', index=False)  # index=False: do not write the row index
