# 다음 안내를 반드시 따라주시기 바랍니다.
- 메일로 할당받은 GPU 번호를 반드시 입력바랍니다.
- 새로 만드시는 Jupyter 파일에도 아래의 셀(GPU 지정 코드, 메모리 제한 코드)을 복사하여 실행해 주시기 바랍니다.

In [None]:
# Select the GPU

import os

# Limit TensorFlow's log level; set before tensorflow is imported below.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Replace the placeholder string below with the GPU number you were assigned by e-mail.
os.environ['CUDA_VISIBLE_DEVICES'] = "메일로 할당받은 GPU 번호"
## Example)   os.environ['CUDA_VISIBLE_DEVICES']="100"


# Cap the GPU memory
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Configure a single virtual device on the first visible GPU with memory_limit=16225.
        memory_cap = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=16225)
        tf.config.experimental.set_virtual_device_configuration(gpus[0], [memory_cap])
    except RuntimeError as e:
        # Report the problem and continue with the default configuration.
        print(e)


In [None]:
## 1. Data input

import pandas as pd
from pandas import DataFrame
import numpy as np
import glob
import pickle
from PIL import Image
from time import time

# Tabular data
datathon_raw = pd.read_csv('dataset/datathon_train/datathon_train.csv', sep=',', encoding='utf-8-sig')

# Image metadata table
img_meta_raw = pd.read_csv('dataset/datathon_train/datathon_train_metadata.csv', sep=',')

# Image files (one pickle per image).
# glob.glob() returns paths in arbitrary order, and img_raw is indexed by position
# further down, so sort the paths to make that order reproducible between runs.
img_raw = sorted(glob.glob('dataset/datathon_train/datathon_train_image/*.pkl'))


In [None]:
## 2. Table preprocessing

# Locate the rows whose OUTCOME equals 2 and remove them.
outcome_2 = datathon_raw.loc[datathon_raw['OUTCOME'] == 2].index
datathon = datathon_raw.drop(index=outcome_2)

# Rename columns and recode the age-band labels.
# NOTE(review): '고형암(C' has no closing parenthesis, unlike '혈액암(D)'. rename()
# silently ignores keys that match no column, so confirm this is the real header.
column_aliases = {
    '초기산소\n요구도': 'init_oxy',
    'ESRD\n(HD여부)': 'ESRD',
    '고형암(C': 'solidcancer',
    '혈액암(D)': 'bloodcancer',
    '천식': 'asthma',
    'eGFR-Schwartz(소아)': 'eGFR-Schwartz',
}
datathon = datathon.rename(columns=column_aliases)

# The replacement is applied to every cell of the table, not only to the age column.
age_labels = {
    '20대': '20_age',
    '30대': '30_age',
    '40대': '40_age',
    '50대': '50_age',
    '60대': '60_age',
    '70대': '70_age',
    '80대': '80_age',
    '90세 이상': '90_age',
}
datathon = datathon.replace(age_labels)

# One-hot encoding: indicator columns are appended to the right of the table.
sex_feature = pd.get_dummies(datathon['sex'])
datathon = pd.concat((datathon, sex_feature), axis='columns')

age_feature = pd.get_dummies(datathon['age'])
datathon = pd.concat((datathon, age_feature), axis='columns')

# Columns to exclude: the two encoded source columns plus the listed lab columns.
excluded_columns = [
    'sex', 'age',
    'Myelocyte', 'Metamyelo', 'Band.neut.', 'Blast', 'Promyelo',
    'Imm.cell', 'Imm.lympho', 'Imm.mono', 'AtypicalLc', 'Plas.cell',
    'Other', 'Normoblast', 'LUC',
]
datathon = datathon.drop(columns=excluded_columns)


In [None]:
## 3. Image preprocessing

# 3.1. Build a table with one image row per subject
img_meta = img_meta_raw.sort_values(by=["fid", "fid_idx"], ascending=[True, True]).reset_index()

# After the sort above, keeping the last duplicate keeps the largest fid_idx per fid.
img_meta = img_meta.drop_duplicates(['fid'], keep='last')

img_meta = img_meta.rename(columns={'fid': 'id'})
img_table = pd.merge(img_meta, datathon, how='inner', on='id')
img_table["adress"] = img_table['file_name'].apply(lambda x: "/save/datathon_cxr_2.0/images/" + str(x))
img_id = img_table.loc[:, 'id']
img_adress = img_table.loc[:, "adress"]


def _load_pickled_image(path):
    """Unpickle and return the object stored at path, closing the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# 3.2. Resize the images
# Reference size: the median height and width, scaled by 0.2.
start = time()
n_images = len(img_adress)

# NOTE(review): row i of img_table is paired with img_raw[i] purely by position; the
# file_name/adress columns are never used to locate the pickle. Confirm that the order
# of img_raw really matches the row order of img_table.
if len(img_raw) < n_images:
    raise ValueError(
        "img_table has {} rows but only {} image files were found".format(n_images, len(img_raw))
    )

# First pass: collect the shapes only, so the reference size can be computed.
shapes = [_load_pickled_image(path).shape for path in img_raw[:n_images]]
img_y = sorted(shape[0] for shape in shapes)
img_x = sorted(shape[1] for shape in shapes)
median = len(img_y) // 2
med_y, med_x = img_y[median], img_x[median]
max_y, max_x = int(med_y * 0.2), int(med_x * 0.2)

# Second pass: resize every image to the reference size.
img_arr = np.zeros((n_images, max_y, max_x))
for i, path in enumerate(img_raw[:n_images]):
    img = Image.fromarray(_load_pickled_image(path))
    # PIL takes the target size as (width, height).
    img_s = img.resize((max_x, max_y), Image.LANCZOS)
    img_arr[i] = np.array(img_s)
print("동작 시간 : {} 분".format(round((time() - start) / 60, 2)))

In [None]:
## 4. Data split

from sklearn.model_selection import train_test_split

# Columns removed from img_table before the feature and label frames are built.
# Index.difference also returns the remaining column names in sorted order.
_dropped_columns = ['index', 'StD', 'SrD', 'fid_idx', 'file_name', 'adress']
_kept_columns = img_table.columns.difference(_dropped_columns)
img_table = img_table[_kept_columns]

# x: every remaining column except OUTCOME (so 'id' is still present);
# y: the id together with the OUTCOME label.
_feature_columns = img_table.columns.difference(['OUTCOME'])
x = img_table[_feature_columns]
y = img_table[['id', 'OUTCOME']]

# split
test_size = 0.2

def split_image(data):
    """Return the images belonging to the rows of data, in data's own row order.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        A subset of the image table. Its index labels are used as integer
        positions into the module-level array img_arr.

    Returns
    -------
    numpy.ndarray
        A new array whose k-th entry is img_arr[data.index[k]].
    """
    # Keep the row order of data. Sorting the index here would pair each image
    # with the label of a different row after a shuffled train/test split.
    positions = np.asarray(data.index, dtype=np.intp)
    # Integer-array indexing copies, so the result does not alias img_arr.
    return img_arr[positions]

# Stratified split on OUTCOME with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=test_size,
    random_state=123,
    stratify=y["OUTCOME"],
)
img_train, img_test = split_image(y_train), split_image(y_test)

# Fill nulls with the training-set column means; the test set also uses the
# training means, taken from x_train after it has been filled.
x_train = x_train.fillna(x_train.mean())
x_test = x_test.fillna(x_train.mean())

In [None]:
## 5. Image model
## 5.1 model : ResNet50
from tensorflow import keras
from keras.models import Model
from keras.layers import Dense, Input, Activation, GlobalAveragePooling2D, Flatten
from keras.applications.resnet_v2 import ResNet50V2

start = time()
# Backbone built without pretrained weights (weights=None) and without its classifier head.
# `classes` only applies when include_top=True, so it is not passed here.
backbone = ResNet50V2(include_top=False, weights=None, input_shape=(max_y, max_x, 1))

# The intermediate tensors get their own names so that the feature table `x`
# created in the data-split step is not overwritten by a Keras tensor.
features = GlobalAveragePooling2D()(backbone.output)
features = Flatten()(features)
features = Dense(64, activation='relu', name='relu')(features)
# NOTE(review): the head has 10 softmax units; confirm this matches the number of
# OUTCOME classes, since these outputs are later used as classifier features.
outputs = Dense(10, activation='softmax', name='softmax')(features)
model = Model(backbone.input, outputs)
print("동작 시간 : {} 분".format(round((time() - start) / 60, 2)))

# Model compile
start = time()
model.compile(optimizer='adam',  # how weights and biases are updated
              loss='sparse_categorical_crossentropy',  # how the error is measured
              metrics=['accuracy'])  # metric reported alongside the loss
print("동작 시간 : {} 분".format(round((time() - start) / 60, 2)))

In [None]:
# model summary
# Show the summary of the image model assembled in the previous cell.
model.summary()

In [None]:
## 5. model - Image
## 5.2. run

# model fitting
start = time()
model.fit(img_train, y_train.loc[:, "OUTCOME"], epochs=10, batch_size=32, verbose=1)
print("model fitting 동작 시간 : {} 분\n".format(round((time() - start) / 60, 2)))

# model evaluate
start = time()
loss, acc = model.evaluate(img_test, y_test.loc[:, "OUTCOME"], verbose=1)
print("model evaluate 동작 시간 : {} 분\n".format(round((time() - start) / 60, 2)))


def _append_image_outputs(table, outputs):
    """Return table with the network outputs appended as extra columns.

    table gets a fresh 0..n-1 index (the old one is discarded) so that it lines
    up row by row with outputs, which is a plain 2-D array.
    """
    # String column names: a frame mixing str and int column names is rejected
    # by newer scikit-learn versions when it is passed to an estimator.
    output_columns = ["img_out_{}".format(k) for k in range(outputs.shape[1])]
    output_frame = pd.DataFrame(outputs, columns=output_columns)
    # drop=True: otherwise the old row index would become an 'index' feature column.
    return pd.concat([table.reset_index(drop=True), output_frame], axis=1)


# model predict
start = time()
af_img_train = _append_image_outputs(x_train, model.predict(img_train))
af_img_train_rmid = af_img_train[af_img_train.columns.difference(['id'])]

af_img_test = _append_image_outputs(x_test, model.predict(img_test))
af_img_test_rmid = af_img_test[af_img_test.columns.difference(['id'])]
print("model predict 동작 시간 : {} 분\n".format(round((time() - start) / 60, 2)))

In [None]:
## 6. Final classifier

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn import metrics

def pred_eval(y_test, y_pred, model):
    """Print accuracy, precision, recall, F1 and AUC for one set of predictions.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels; they are also what the AUC is computed from.
    model : label printed on the first line to identify the classifier.
    """
    print("Model : ", model)
    labelled_scorers = (
        ('정확도(Accuracy)  : ', accuracy_score),
        ('정밀도(Precision) : ', precision_score),
        ('재현율(Recall)    : ', recall_score),
        ('F1 score         : ', f1_score),
    )
    for label, scorer in labelled_scorers:
        print(label, round(scorer(y_test, y_pred), 5))
    # The last line carries an extra "\n" argument to leave a gap after the report.
    print('AUC              : ', round(roc_auc_score(y_test, y_pred), 5), "\n")


def rf_default(x_train, y_train, x_test, y_test):
    """Fit a RandomForestClassifier with default settings and report test metrics.

    Returns a tuple (predictions for x_test, fitted feature_importances_).
    The printed duration covers fitting, prediction and the metric report.
    """
    t0 = time()
    forest = RandomForestClassifier()
    forest.fit(x_train, y_train)
    predictions = forest.predict(x_test)
    importances = forest.feature_importances_
    pred_eval(y_test, predictions, "Random Forest, default")
    elapsed_minutes = round((time() - t0) / 60, 2)
    print("model fit(Random Forest, default) : {} 분\n".format(elapsed_minutes))
    return predictions, importances

def xg_default(x_train, y_train, x_test, y_test):
    """Fit an XGBClassifier with default settings and report test metrics.

    Returns a tuple (predictions for x_test, fitted feature_importances_).
    The printed duration covers fitting, prediction and the metric report.
    """
    t0 = time()
    booster = XGBClassifier()
    booster.fit(x_train, y_train)
    predictions = booster.predict(x_test)
    importances = booster.feature_importances_
    pred_eval(y_test, predictions, "XGBoost, default")
    elapsed_minutes = round((time() - t0) / 60, 2)
    print("model fit(XGBoost, default) : {} 분\n".format(elapsed_minutes))
    return predictions, importances


In [None]:
## 7. Result

import seaborn as sns
import matplotlib.pyplot as plt

def feature_impo(feature_importance, X_train, model):
    """Plot feature importances as a bar chart, largest importance first.

    Parameters
    ----------
    feature_importance : importance values, one per column of X_train.
    X_train : frame whose column names label the bars (its values are not read).
    model : text prefixed to the plot title.

    Returns None; the figure is shown with plt.show().
    """
    ranking = pd.DataFrame({
        "feature_name": np.array(X_train.columns),
        "feature_importance": np.array(feature_importance),
    }).sort_values(by=['feature_importance'], ascending=False)

    plt.figure(figsize=(10, 20))
    # Importance goes on the x axis, feature name on the y axis.
    sns.barplot(x=ranking['feature_importance'], y=ranking["feature_name"])
    plt.title(model + " feature importance")
    plt.show()
    return None

def feature_impo_df(feature_importance, X_train):
    """Return every feature with its importance, sorted from most to least important.

    Parameters
    ----------
    feature_importance : importance values, one per column of X_train.
    X_train : frame whose column names are used as feature names.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature' and 'importance', one row per column of X_train.
    """
    ranked = pd.Series(feature_importance, index=X_train.columns).sort_values(ascending=False)
    # Deliberately not sliced by len(X_train): that is the number of rows, not of
    # features, and would drop features whenever there are fewer rows than columns.
    feature = ranked.reset_index()
    feature.columns = ['feature', 'importance']
    return feature

# confusion matrix
def confu(y_test, y_pred, model):
    """Draw a heatmap of the confusion matrix of y_pred against y_test['OUTCOME'].

    Parameters
    ----------
    y_test : DataFrame containing an 'OUTCOME' column with the true labels.
    y_pred : predicted labels, in the same row order as y_test.
    model : text prefixed to the plot title.
    """
    # The old index is dropped so both columns line up by position.
    actual = y_test.loc[:, ['OUTCOME']].reset_index(drop=True)
    predicted = pd.DataFrame(y_pred, columns=['예측값'])
    result = pd.concat([actual, predicted], axis=1)

    cm = pd.DataFrame(confusion_matrix(result['OUTCOME'], result['예측값']))
    sns.heatmap(cm, annot=True, fmt='g')
    plt.title(model + " Confusion Metrix")


In [None]:
## Print the results
## model result : image+table
# Features are the table columns plus the image-model outputs (without 'id');
# the target is the OUTCOME column.
X_train, X_test = af_img_train_rmid, af_img_test_rmid
Y_train, Y_test = y_train['OUTCOME'], y_test['OUTCOME']

y_pred_rf, feature_importance_rf = rf_default(X_train, Y_train, X_test, Y_test)
y_pred_xg, feature_importance_xg = xg_default(X_train, Y_train, X_test, Y_test)


In [None]:
# feature importance
# Plot the importances of both classifiers; feature_impo reads only the column
# names of X_test. It returns nothing, so rfimpo and xgimpo are both None.
rfimpo=feature_impo(feature_importance_rf, X_test, "Random Forest")
xgimpo=feature_impo(feature_importance_xg, X_test, "XGBoost")

In [None]:
# feature importance_table
# Random Forest importances as a table sorted in descending order; left as a bare
# expression so the notebook displays the returned frame.
feature_impo_df(feature_importance_rf, X_test)

In [None]:
# feature importance_table
# XGBoost importances as a table sorted in descending order; left as a bare
# expression so the notebook displays the returned frame.
feature_impo_df(feature_importance_xg, X_test)

In [None]:
# Confusion matrix for the Random Forest predictions.
# confu selects the 'OUTCOME' column, so the label Series is passed as a one-column frame.
confu(Y_test.to_frame(), y_pred_rf, "Random Forest")

In [None]:
# Confusion matrix for the XGBoost predictions.
# confu selects the 'OUTCOME' column, so the label Series is passed as a one-column frame.
confu(Y_test.to_frame(), y_pred_xg, "XGBoost")