In [3]:
import os
import random
import pickle
import torch
import joblib
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset, DataLoader
from torch import optim
import pandas as pd
import numpy as np 
import math
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler,Normalizer
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier,RandomForestClassifier,BaggingClassifier
from sklearn.model_selection import train_test_split
import sklearn.svm as svm
from sklearn.svm import SVC 
from sklearn.metrics import confusion_matrix
from sklearn.datasets import make_classification

#pd.set_option('display.max_columns', None)
train_data= pd.read_excel('train.xlsx',engine='openpyxl')
test_data = pd.read_excel('test.xlsx',engine='openpyxl')

In [4]:
train_data

Unnamed: 0,Patient number,Prediction day,성별,나이,입원일자,수술일자,퇴원일자,신장(cm),체중(kg),BMI,...,JP_Amy_Lt_POD#1,JP_Amy_Lt_POD#2,JP_Amy_Lt_POD#3,JP_Amy_Lt_POD#5,JP_Lt_color change,JP_Lip_Lt_POD#2,JP_Lip_Lt_POD#3,JP_Lip_Lt_POD#5,DSL,onset
0,1,,1,53,2019-01-01 00:00:00,2019-01-02 00:00:00,2019-01-08 00:00:00,177.8,54.1,17.11,...,*,*,*,*,*,*,*,*,*,*
1,2,,1,62,2019-01-01 00:00:00,2019-01-02 00:00:00,2019-01-08 00:00:00,173.9,72.6,24.01,...,*,*,*,*,*,*,*,*,*,*
2,3,,0,70,2019-01-02 00:00:00,2019-01-03 00:00:00,2019-01-09 00:00:00,157.2,73.4,29.7,...,*,*,*,*,*,*,*,*,*,*
3,4,,1,72,2019-01-03 00:00:00,2019-01-04 00:00:00,2019-01-23 00:00:00,168.9,71.5,25.06,...,*,*,*,*,*,*,*,*,*,*
4,5,,1,53,2019-01-06 00:00:00,2019-01-07 00:00:00,2019-01-15 00:00:00,165.1,76,27.88,...,*,*,*,*,*,*,*,*,*,*
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,225,,1,68,2020-12-16 00:00:00,2020-12-17 00:00:00,2020-12-24 00:00:00,164.9,66.9,24.6,...,*,*,*,*,*,*,*,*,*,*
541,226,,0,63,2020-12-22 00:00:00,2020-12-23 00:00:00,2020-12-29 00:00:00,149,47.7,21.49,...,*,*,*,*,*,*,*,*,*,*
542,227,,1,68,2020-12-23 00:00:00,2020-12-24 00:00:00,2020-12-30 00:00:00,166.1,71.4,25.88,...,*,*,*,*,*,*,*,*,*,*
543,228,,0,59,2020-12-28 00:00:00,2020-12-30 00:00:00,2021-01-08 00:00:00,154.7,47.8,19.97,...,*,*,*,*,*,*,*,*,*,*


In [5]:

def pre_processing(train_data):
    
    # 필요없는 column 제거
    ## train시 모두 *인 컬럼 :  '타장기원발암', 'JP_Lt_color change'
    ## Clipping , 수술 전 EUS 0.2 , 수술 전 PET-CT , TNM ,CEA(ng/mL) ,CA19-9(U/mL) cTNM, post ESD 의사쌤이 안넣어도 된다함.  
    ## 입원일자, 퇴원일자 없애고 , 수술일자만 카테고리 나눠서 쓸것임. 
    train_data=train_data.drop(['JP_Lt_color change','입원일자','퇴원일자','Patient number','타장기원발암','Clipping ',
                                '수술 전 EUS','수술 전 PET-CT','TNM','CEA(ng/mL)','CA19-9(U/mL)','cTNM',
                                'post ESD'],axis =1 ) 
    ## 'DSL'에 언젠가 걸릴지 예측하는 문제 -> onset은 언제걸렷는지 안중요해서 필요없을듯
    if 'onset' in list(train_data.columns):
      train_data=train_data.drop(['onset'],axis =1 )
    
    
    # 필요한 column 추가
    ## margin(p)', 'margin(d)'는 그 사람의 병변의 갯수만큼 값을 가지고 있음. 따라서 병변2,병변3 처럼 margin2,margin3 추가해줌. 
    train_data.rename(columns ={'margin(p)':'marginP','margin(d)':'marginD'},inplace = True)

    train_data['marginP2'] = train_data.marginP.str.split('/').str[1]
    train_data['marginP3'] = train_data.marginP.str.split('/').str[2]
    train_data['marginP2']=train_data['marginP2'].fillna('*')
    train_data['marginP3']=train_data['marginP3'].fillna('*')

    train_data['marginD2'] = train_data.marginD.str.split('/').str[1]
    train_data['marginD3'] = train_data.marginD.str.split('/').str[2]
    train_data['marginD2']=train_data['marginD2'].fillna('*')
    train_data['marginD3']=train_data['marginD3'].fillna('*')

    #결측치 너무 많은 row 제거(70% 이상) : DSL 안 걸린 사람 7명 없어짐  
    train_data = train_data.loc[(train_data =='*').mean(axis=1)<0.7]
    train_data=train_data.reset_index(drop=True)

    
    # '수술일자' -> 수술일자 날씨 카테고리로 변경(봄0,여름1,가을2,겨울3)
    for i in range(len(train_data)):
        if train_data.loc[i,'수술일자'] != '*' :
          train_data.loc[i,'수술일자']= int(train_data.loc[i,'수술일자'].strftime("%m"))
        else:
          train_data.loc[i,'수술일자']= 7

        if (train_data.loc[i,'수술일자'] ==3) or (train_data.loc[i,'수술일자'] ==4) or (train_data.loc[i,'수술일자'] ==5):
            train_data.loc[i,'수술일자'] = 0
        elif train_data.loc[i,'수술일자'] ==6 or (train_data.loc[i,'수술일자'] ==7) or (train_data.loc[i,'수술일자'] ==8):
            train_data.loc[i,'수술일자'] = 1

        elif train_data.loc[i,'수술일자'] ==9 or (train_data.loc[i,'수술일자'] ==10) or (train_data.loc[i,'수술일자'] ==11):
            train_data.loc[i,'수술일자'] = 2

        elif train_data.loc[i,'수술일자'] ==12 or (train_data.loc[i,'수술일자'] ==1) or (train_data.loc[i,'수술일자'] ==2):
            train_data.loc[i,'수술일자'] = 3
        
        # 위암 병소수 : 병변의 갯수임. 따라서 0은 오타고, 병변의 갯수 만큼(대부분 1) 채워넣으면 됨->  0인 애들,결측치 * 인애들 다 1로 바꿈 
        if (train_data.loc[i,'위암병소수'] == '*') or (train_data.loc[i,'위암병소수'] == 0) : 
            train_data.loc[i,'위암병소수'] = 1
        
        # '병변1 AGC 분류' 와 '병변1 EGC 분류' 둘 중 하나는 꼭 있어야 되는 feature임. (육안상 진행성 위암인지 조기위암인지) 
        #   -> 둘다 결측치라면 현미경으로 관측한 정보 'AGC 분류'와 'EGC 분류'로 채워넣음.  
        if (train_data.loc[i,'병변1 AGC 분류'] == '*') and (train_data.loc[i,'병변1 EGC 분류'] == '*') :
            train_data.loc[i,'병변1 AGC 분류'] = train_data.loc[i,'AGC 분류'] 
            train_data.loc[i,'병변1 EGC 분류'] = train_data.loc[i,'EGC 분류']

        # 'tubular 위치' 컬럼과 '병변1 tubular'의 카테고리를 같게 통일해줌 . (ex. 0: upper)
        if (train_data.loc[i,'tubular 위치'] == 1) :
            train_data.loc[i,'tubular 위치'] = 0
        elif (train_data.loc[i,'tubular 위치'] == 2) :
          train_data.loc[i,'tubular 위치'] = 1
        elif (train_data.loc[i,'tubular 위치'] == 3) or (train_data.loc[i,'tubular 위치'] == 4) or (train_data.loc[i,'tubular 위치'] == 5) or (train_data.loc[i,'tubular 위치'] == 6) :
          train_data.loc[i,'tubular 위치'] = 2
        elif (train_data.loc[i,'tubular 위치'] == 7) :
          train_data.loc[i,'tubular 위치'] = 3
        elif len(str(train_data.loc[i,'tubular 위치'])) != 1 :
          train_data.loc[i,'tubular 위치'] = 2 

        # 현미경으로 관측한 정보 'tubular 위치'와 'circular 위치'로  '병변1 tubular'와 '병변1 circular'를 채워줌 
        if (train_data.loc[i,'병변1 tubular'] == '*') :
          train_data.loc[i,'병변1 tubular'] = train_data.loc[i,'tubular 위치']
        else : train_data.loc[i,'병변1 tubular'] = int(train_data.loc[i,'병변1 tubular'])
        if (train_data.loc[i,'병변1 circular'] == '*') :
          train_data.loc[i,'병변1 circular'] = train_data.loc[i,'circular 위치']

        # 위에 marginP2,marginD2,marginD3,marginD3는 업데이트 했었음. marginP 업데이트(값 1개만 남게)
        if type(train_data.loc[i,'marginP']) == str:
          if train_data.loc[i,'marginP'] != '*' :
            train_data.loc[i,'marginP']= float(train_data.loc[i,'marginP'].split('/')[0])
        if type(train_data.loc[i,'marginD']) == str:
          if train_data.loc[i,'marginD'] != '*' :
            train_data.loc[i,'marginD']= float(train_data.loc[i,'marginD'].split('/')[0])

        # '*'이 결측치가 아니라 '없음'이라는 카테고리인 경울 -> 이미 있는 '없음 카테고리' 0으로 바꿔줌  
        if (train_data.loc[i,'가족암병력'] == '*') :
          train_data.loc[i,'가족암병력'] = 0
        if (train_data.loc[i,'DSL'] == '*') :
          train_data.loc[i,'DSL'] = 0   

        # 0이 아니라 결측값인 애들 -> '*'로 바꿔줌    
        if (train_data.loc[i,'AGC 분류'] == 0) :
          train_data.loc[i,'AGC 분류'] = '*'
        if (train_data.loc[i,'EGC 분류'] == 0) :
          train_data.loc[i,'EGC 분류'] = '*'   
    
    ################# Numerical data #################
    # Numerical data imputation -> 평균으로 대체
    Ordinal_data = ['ASA score','LN dissection ','위암병소수' ] 
    Numerical_data = ['나이','신장(cm)', '체중(kg)','BMI','수술시간(min)','출혈량(mL)','GAS OUT','SD start',
                      '병변1 크기(Cm)','marginP','marginD','구득 림프절수','전이 림프절수'] +Ordinal_data
    
    for numeric in Numerical_data :
      train_data[numeric] =  train_data[numeric].replace('*',np.NaN)
      train_data[numeric] = pd.to_numeric(train_data[numeric])
      train_nu_mean=train_data[numeric].mean()
      train_data[numeric]= train_data[numeric].fillna(train_data[numeric].mean())
    ## bmi 는 (체중*10000) / (신장**2)
    train_data['BMI'] = (train_data['체중(kg)']*10000)/(train_data['신장(cm)']**2)

    # numerical data 인데*이 결측치가 아니라 측정 안한거: '*'을 0으로 바꾸면 됨. 

    star_is_zero_numeric = [ '병변2 크기(Cm)', '병변3 크기(Cm)','marginP2','marginD2','marginP3','marginD3',
                               'WBC_Pre', 'WBC_Post', 'WBC_POD#1', 'WBC_POD#2', 'WBC_POD#3', 'WBC_POD#5', 'WBC_POD#7',
                               'Hb_Pre', 'Hb_Post', 'Hb_POD#1', 'Hb_POD#2', 'Hb_POD#3', 'Hb_POD#5', 'Hb_POD#7', 
                               'AST_Pre', 'AST_Post', 'AST_POD#1', 'AST_POD#2', 'AST_POD#3', 'AST_POD#5', 'AST_POD#7', 
                               'ALT_Pre', 'ALT_Post', 'ALT_POD#1', 'ALT_POD#2', 'ALT_POD#3', 'ALT_POD#5', 'ALT_POD#7', 
                               'CRP_Pre', 'CRP_Post', 'CRP_POD#1', 'CRP_POD#2', 'CRP_POD#3', 'CRP_POD#5', 'CRP_POD#7', 
                               'JP_Amy_Rt_POD#1', 'JP_Amy_Rt_POD#2', 'JP_Amy_Rt_POD#3', 'JP_Amy_Rt_POD#5', 
                               'JP_Rt_color change', 'JP_Lip_Rt_POD#1', 'JP_Lip_Rt_POD#2', 'JP_Lip_Rt_POD#3', 'JP_Lip_Rt_POD#5', 
                               'JP_Amy_Lt_POD#1', 'JP_Amy_Lt_POD#2', 'JP_Amy_Lt_POD#3', 'JP_Amy_Lt_POD#5', 
                               'JP_Lip_Lt_POD#2', 'JP_Lip_Lt_POD#3', 'JP_Lip_Lt_POD#5']
    for i in range(len(train_data)):
      for value in star_is_zero_numeric :
        if (train_data.loc[i,value] == '*') :
          train_data.loc[i,value] = 0      
    
    ################# Categorical data #################
    
    ## '*'을 그대로 놔둬야 하는 애들 -> '*'이라는 category를 주고 imputation 안함. 
    star_is_category = ['AGC 분류','EGC 분류','tubular 위치','circular 위치','병변1 AGC 분류','병변1 EGC 분류',
                    '병변1 tubular', '병변1 circular','병변2 AGC 분류','병변2 EGC 분류',
                    '병변2 tubular', '병변2 circular','병변3 AGC 분류','병변3 EGC 분류',
                    '병변3 tubular', '병변3 circular' ]


    ## imputation 해야 되는 애들 

    ###  결측치가 10 % 미만인 컬럼들 -> 빈도수 가장 많은 걸로
    train_data = train_data.rename(columns={'ESD/EMR': 'ESDEMR','정규/응급': '정규응급'})
    Categorical_data_under10 =['성별','수술일자','smoking','가족암병력','HTN','DM','Dyslipidemia',
                       '심혈관질환','뇌혈관질환','신장질환','호흡기질환','기타 기저질환','복부 수술력','복부수술력 중 위관련',
                       '기타 수술력','수술 전   심초음파','수술 전   폐기능검사','수술 전    흉부CT검사',
                       'ESDEMR','정규응급','수술명','개복전환','문합법','합병절제                   ',
                       'adhesion','invasion','radicality','혈관변이',
                       '수혈']
    binbin =[]
    for bin in Categorical_data_under10:
      binbin.append(train_data[bin].value_counts().idxmax())

    for i in range(len(train_data)):
      for bin in Categorical_data_under10:
        if train_data.loc[i,bin] == '*' :
            train_data.loc[i,bin] = train_data[bin].value_counts().idxmax()
    
    ### 결측치가 10 % 이상인 컬럼들 -> random sample imputation :원래의 분포를 유지하도록 결측치를 imputation  
    Categorical_data_over10 =['depth','Stage','WHO classification','WHO 세포 분화도',"Lauren's classification",
                            'Ming classification','lymphatics invasion','vascular invasion','perineural invasion',
                            'additional findings']    
    for category10 in Categorical_data_over10 :
      train_data['_random'] = train_data[category10]
      train_data.loc[train_data[category10]!='*'][category10]
      temp = (train_data.loc[train_data[category10]!='*'][category10].sample(  (train_data[category10]=='*').sum()  ,replace =True ))
      temp.index = train_data[lambda x:  (x[category10]=='*')].index # index 부여 
      train_data.loc[(train_data[category10]=='*'), '_random'] = temp
      train_data[category10] = train_data['_random']
      train_data=train_data.drop(['_random'],axis =1 )

    ## Onehotencoding
    CATEGROCAL = star_is_category+Categorical_data_under10+Categorical_data_over10
    train_data[CATEGROCAL]= train_data[CATEGROCAL].astype(str)
    train_data= pd.get_dummies(train_data, columns = CATEGROCAL)
    
    # EGC 분류,circular 위치,가족암병력, WHO classification, WHO 세포분화도,Lauren's classification,Ming classification 처럼
    # 한 사람이 값을 2개 씩 가질 수 있는 컬럼 -> 원핫 인코딩하면 '가족암병력_2,3' 이런 컬럼이 추가됨. 이거 다시 나눠줌.  
    remo =[]
    for i in range(len(train_data)):
      for col in train_data.columns:
        if (',' in col) :
          remo.append(col)
          if (train_data.loc[i,col] == 1) :
            splitted_col = col.split(',')
            train_data.loc[i,splitted_col[0]] = 1
            for j in range(1,len(splitted_col)):
              train_data.loc[i,(splitted_col[0].split('_'))[0]+'_'+splitted_col[j] ] = 1
        if ('/' in col) :
          remo.append(col)
          if (train_data.loc[i,col] == 1) :
            splitted_col = col.split('/')
            train_data.loc[i,splitted_col[0]] = 1     
            for j in range(1,len(splitted_col)):
              train_data.loc[i,(splitted_col[0].split('_'))[0]+'_'+splitted_col[j] ] = 1
    a = list(set(remo))
    train_data= train_data.drop(a,axis=1)    
    
    train_data = train_data.fillna(0)
    train_data = train_data.astype('float')

    return train_data,temp,train_nu_mean,binbin
 

In [6]:
def preprocess_test(test_data):
  # 필요없는 column 제거
  ## test시 모두 *인 컬럼 :  '타장기원발암', 'JP_Lt_color change'
  ## Clipping , 수술 전 EUS 0.2 , 수술 전 PET-CT , TNM ,CEA(ng/mL) ,CA19-9(U/mL) cTNM, post ESD 의사쌤이 안넣어도 된다함.  
  ## 입원일자, 퇴원일자 없애고 , 수술일자만 카테고리 나눠서 쓸것임. 
  test_data=test_data.drop(['JP_Lt_color change','입원일자','퇴원일자','Patient number','타장기원발암','Clipping ',
                              '수술 전 EUS','수술 전 PET-CT','TNM','CEA(ng/mL)','CA19-9(U/mL)','cTNM',
                              'post ESD'],axis =1 ) 
  ## 'DSL'에 언젠가 걸릴지 예측하는 문제 -> onset은 언제걸렷는지 안중요해서 필요없을듯
  if 'onset' in list(test_data.columns):
    test_data=test_data.drop(['onset'],axis =1 )


  # 필요한 column 추가
  ## margin(p)', 'margin(d)'는 그 사람의 병변의 갯수만큼 값을 가지고 있음. 따라서 병변2,병변3 처럼 margin2,margin3 추가해줌. 
  test_data.rename(columns ={'margin(p)':'marginP','margin(d)':'marginD'},inplace = True)

  test_data['marginP2'] = test_data.marginP.str.split('/').str[1]
  test_data['marginP3'] = test_data.marginP.str.split('/').str[2]
  test_data['marginP2']=test_data['marginP2'].fillna('*')
  test_data['marginP3']=test_data['marginP3'].fillna('*')

  test_data['marginD2'] = test_data.marginD.str.split('/').str[1]
  test_data['marginD3'] = test_data.marginD.str.split('/').str[2]
  test_data['marginD2']=test_data['marginD2'].fillna('*')
  test_data['marginD3']=test_data['marginD3'].fillna('*')

  #결측치 너무 많은 row 제거(70% 이상) : DSL 안 걸린 사람 7명 없어짐  
  test_data = test_data.loc[(test_data =='*').mean(axis=1)<0.7]
  test_data=test_data.reset_index(drop=True)


  # '수술일자' -> 수술일자 날씨 카테고리로 변경(봄0,여름1,가을2,겨울3)
  for i in range(len(test_data)):
      if test_data.loc[i,'수술일자'] != '*' :
        test_data.loc[i,'수술일자']= int(test_data.loc[i,'수술일자'].strftime("%m"))
      else:
        test_data.loc[i,'수술일자']= 7

      if (test_data.loc[i,'수술일자'] ==3) or (test_data.loc[i,'수술일자'] ==4) or (test_data.loc[i,'수술일자'] ==5):
          test_data.loc[i,'수술일자'] = 0
      elif test_data.loc[i,'수술일자'] ==6 or (test_data.loc[i,'수술일자'] ==7) or (test_data.loc[i,'수술일자'] ==8):
          test_data.loc[i,'수술일자'] = 1

      elif test_data.loc[i,'수술일자'] ==9 or (test_data.loc[i,'수술일자'] ==10) or (test_data.loc[i,'수술일자'] ==11):
          test_data.loc[i,'수술일자'] = 2

      elif test_data.loc[i,'수술일자'] ==12 or (test_data.loc[i,'수술일자'] ==1) or (test_data.loc[i,'수술일자'] ==2):
          test_data.loc[i,'수술일자'] = 3
      
      # 위암 병소수 : 병변의 갯수임. 따라서 0은 오타고, 병변의 갯수 만큼(대부분 1) 채워넣으면 됨->  0인 애들,결측치 * 인애들 다 1로 바꿈 
      if (test_data.loc[i,'위암병소수'] == '*') or (test_data.loc[i,'위암병소수'] == 0) : 
          test_data.loc[i,'위암병소수'] = 1
      
      # '병변1 AGC 분류' 와 '병변1 EGC 분류' 둘 중 하나는 꼭 있어야 되는 feature임. (육안상 진행성 위암인지 조기위암인지) 
      #   -> 둘다 결측치라면 현미경으로 관측한 정보 'AGC 분류'와 'EGC 분류'로 채워넣음.  
      if (test_data.loc[i,'병변1 AGC 분류'] == '*') and (test_data.loc[i,'병변1 EGC 분류'] == '*') :
          test_data.loc[i,'병변1 AGC 분류'] = test_data.loc[i,'AGC 분류'] 
          test_data.loc[i,'병변1 EGC 분류'] = test_data.loc[i,'EGC 분류']

      # 'tubular 위치' 컬럼과 '병변1 tubular'의 카테고리를 같게 통일해줌 . (ex. 0: upper)
      if (test_data.loc[i,'tubular 위치'] == 1) :
          test_data.loc[i,'tubular 위치'] = 0
      elif (test_data.loc[i,'tubular 위치'] == 2) :
        test_data.loc[i,'tubular 위치'] = 1
      elif (test_data.loc[i,'tubular 위치'] == 3) or (test_data.loc[i,'tubular 위치'] == 4) or (test_data.loc[i,'tubular 위치'] == 5) or (test_data.loc[i,'tubular 위치'] == 6) :
        test_data.loc[i,'tubular 위치'] = 2
      elif (test_data.loc[i,'tubular 위치'] == 7) :
        test_data.loc[i,'tubular 위치'] = 3
      elif len(str(test_data.loc[i,'tubular 위치'])) != 1 :
        test_data.loc[i,'tubular 위치'] = 2 

      # 현미경으로 관측한 정보 'tubular 위치'와 'circular 위치'로  '병변1 tubular'와 '병변1 circular'를 채워줌 
      if (test_data.loc[i,'병변1 tubular'] == '*') :
        test_data.loc[i,'병변1 tubular'] = test_data.loc[i,'tubular 위치']
      else : test_data.loc[i,'병변1 tubular'] = int(test_data.loc[i,'병변1 tubular'])
      if (test_data.loc[i,'병변1 circular'] == '*') :
        test_data.loc[i,'병변1 circular'] = test_data.loc[i,'circular 위치']

      # 위에 marginP2,marginD2,marginD3,marginD3는 업데이트 했었음. marginP 업데이트(값 1개만 남게)
      if type(test_data.loc[i,'marginP']) == str:
        if test_data.loc[i,'marginP'] != '*' :
          test_data.loc[i,'marginP']= float(test_data.loc[i,'marginP'].split('/')[0])
      if type(test_data.loc[i,'marginD']) == str:
        if test_data.loc[i,'marginD'] != '*' :
          test_data.loc[i,'marginD']= float(test_data.loc[i,'marginD'].split('/')[0])

      # '*'이 결측치가 아니라 '없음'이라는 카테고리인 경울 -> 이미 있는 '없음 카테고리' 0으로 바꿔줌  
      if (test_data.loc[i,'가족암병력'] == '*') :
        test_data.loc[i,'가족암병력'] = 0
      if (test_data.loc[i,'DSL'] == '*') :
        test_data.loc[i,'DSL'] = 0   

      # 0이 아니라 결측값인 애들 -> '*'로 바꿔줌    
      if (test_data.loc[i,'AGC 분류'] == 0) :
        test_data.loc[i,'AGC 분류'] = '*'
      if (test_data.loc[i,'EGC 분류'] == 0) :
        test_data.loc[i,'EGC 분류'] = '*'   

  ################# Numerical data #################
  # Numerical data imputation -> 평균으로 대체
  Ordinal_data = ['ASA score','LN dissection ','위암병소수' ] 
  Numerical_data = ['나이','신장(cm)', '체중(kg)','BMI','수술시간(min)','출혈량(mL)','GAS OUT','SD start',
                    '병변1 크기(Cm)','marginP','marginD','구득 림프절수','전이 림프절수'] +Ordinal_data

  for numeric in Numerical_data :
    test_data[numeric] =  test_data[numeric].replace('*',np.NaN)
    test_data[numeric] = pd.to_numeric(test_data[numeric])
    
    test_data[numeric]= test_data[numeric].fillna(train_nu_mean)
  ## bmi 는 (체중*10000) / (신장**2)
  test_data['BMI'] = (test_data['체중(kg)']*10000)/(test_data['신장(cm)']**2)

  # numerical data 인데*이 결측치가 아니라 측정 안한거: '*'을 0으로 바꾸면 됨. 

  star_is_zero_numeric = [ '병변2 크기(Cm)', '병변3 크기(Cm)','marginP2','marginD2','marginP3','marginD3',
                              'WBC_Pre', 'WBC_Post', 'WBC_POD#1', 'WBC_POD#2', 'WBC_POD#3', 'WBC_POD#5', 'WBC_POD#7',
                              'Hb_Pre', 'Hb_Post', 'Hb_POD#1', 'Hb_POD#2', 'Hb_POD#3', 'Hb_POD#5', 'Hb_POD#7', 
                              'AST_Pre', 'AST_Post', 'AST_POD#1', 'AST_POD#2', 'AST_POD#3', 'AST_POD#5', 'AST_POD#7', 
                              'ALT_Pre', 'ALT_Post', 'ALT_POD#1', 'ALT_POD#2', 'ALT_POD#3', 'ALT_POD#5', 'ALT_POD#7', 
                              'CRP_Pre', 'CRP_Post', 'CRP_POD#1', 'CRP_POD#2', 'CRP_POD#3', 'CRP_POD#5', 'CRP_POD#7', 
                              'JP_Amy_Rt_POD#1', 'JP_Amy_Rt_POD#2', 'JP_Amy_Rt_POD#3', 'JP_Amy_Rt_POD#5', 
                              'JP_Rt_color change', 'JP_Lip_Rt_POD#1', 'JP_Lip_Rt_POD#2', 'JP_Lip_Rt_POD#3', 'JP_Lip_Rt_POD#5', 
                              'JP_Amy_Lt_POD#1', 'JP_Amy_Lt_POD#2', 'JP_Amy_Lt_POD#3', 'JP_Amy_Lt_POD#5', 
                              'JP_Lip_Lt_POD#2', 'JP_Lip_Lt_POD#3', 'JP_Lip_Lt_POD#5']
  for i in range(len(test_data)):
    for value in star_is_zero_numeric :
      if (test_data.loc[i,value] == '*') :
        test_data.loc[i,value] = 0      

  ################# Categorical data #################

  ## '*'을 그대로 놔둬야 하는 애들 -> '*'이라는 category를 주고 imputation 안함. 
  star_is_category = ['AGC 분류','EGC 분류','tubular 위치','circular 위치','병변1 AGC 분류','병변1 EGC 분류',
                  '병변1 tubular', '병변1 circular','병변2 AGC 분류','병변2 EGC 분류',
                  '병변2 tubular', '병변2 circular','병변3 AGC 분류','병변3 EGC 분류',
                  '병변3 tubular', '병변3 circular' ]


  ## imputation 해야 되는 애들 

  ###  결측치가 10 % 미만인 컬럼들 -> 빈도수 가장 많은 걸로
  test_data = test_data.rename(columns={'ESD/EMR': 'ESDEMR','정규/응급': '정규응급'})
  Categorical_data_under10 =['성별','수술일자','smoking','가족암병력','HTN','DM','Dyslipidemia',
                      '심혈관질환','뇌혈관질환','신장질환','호흡기질환','기타 기저질환','복부 수술력','복부수술력 중 위관련',
                      '기타 수술력','수술 전   심초음파','수술 전   폐기능검사','수술 전    흉부CT검사',
                      'ESDEMR','정규응급','수술명','개복전환','문합법','합병절제                   ',
                      'adhesion','invasion','radicality','혈관변이',
                      '수혈']
  for i in range(len(test_data)):
    for j,bin in enumerate(Categorical_data_under10):
      if test_data.loc[i,bin] == '*' :
          test_data.loc[i,bin] = binbin[j]

  ### 결측치가 10 % 이상인 컬럼들 -> random sample imputation :원래의 분포를 유지하도록 결측치를 imputation  
  Categorical_data_over10 =['depth','Stage','WHO classification','WHO 세포 분화도',"Lauren's classification",
                          'Ming classification','lymphatics invasion','vascular invasion','perineural invasion',
                          'additional findings']    
  for category10 in Categorical_data_over10 :
    test_data['_random'] = test_data[category10]
    test_data.loc[test_data[category10]!='*'][category10]
    
    test_data.loc[(test_data[category10]=='*'), '_random'] = temp
    test_data[category10] = test_data['_random']
    test_data=test_data.drop(['_random'],axis =1 )

  test_data = test_data.fillna(0)

  ## Onehotencoding
  CATEGROCAL = star_is_category+Categorical_data_under10+Categorical_data_over10
  test_data[CATEGROCAL]= test_data[CATEGROCAL].astype(str)
  test_data= pd.get_dummies(test_data, columns = CATEGROCAL)

  # EGC 분류,circular 위치,가족암병력, WHO classification, WHO 세포분화도,Lauren's classification,Ming classification 처럼
  # 한 사람이 값을 2개 씩 가질 수 있는 컬럼 -> 원핫 인코딩하면 '가족암병력_2,3' 이런 컬럼이 추가됨. 이거 다시 나눠줌.  
  remo =[]
  for i in range(len(test_data)):
    for col in test_data.columns:
      if (',' in col) :
        remo.append(col)
        if (test_data.loc[i,col] == 1) :
          splitted_col = col.split(',')
          test_data.loc[i,splitted_col[0]] = 1
          for j in range(1,len(splitted_col)):
            test_data.loc[i,(splitted_col[0].split('_'))[0]+'_'+splitted_col[j] ] = 1
      if ('/' in col) :
        remo.append(col)
        if (test_data.loc[i,col] == 1) :
          splitted_col = col.split('/')
          test_data.loc[i,splitted_col[0]] = 1     
          for j in range(1,len(splitted_col)):
            test_data.loc[i,(splitted_col[0].split('_'))[0]+'_'+splitted_col[j] ] = 1
  a = list(set(remo))
  test_data= test_data.drop(a,axis=1)    

  test_data = test_data.fillna(0)
  test_data = test_data.astype('float')

  return test_data

In [7]:
mask = 0
def columns_process(df1,df2):
  
  train_col = df1.columns
  test_col = df2.columns

  # train에 없는 컬럼
  train_no =[]
  for col in test_col :
    if col not in train_col :
      train_no.append(col)

  # test에 없는 컬럼    
  test_no =[]
  for col in train_col :
    if col not in test_col :
      test_no.append(col)

  df1[train_no]= mask
  df2[test_no] =mask

  df2 = df2[df1.columns]
  return df1, df2

def compute_ndcg(y_prob, y_true, k=10):
    # K=10, K=50, K=100
    assert len(y_prob) >= k
    assert len(y_true) >= k

    relevance = y_true == 1
    n_target = relevance.sum()
    rank = np.argsort(y_prob)[::-1]

    dcg = [relevance[rank[i]] / (math.log2(i + 2)) for i in range(k)]
    idcg = [(i < n_target) / (math.log2(i + 2)) for i in range(k)]

    return sum(dcg) / sum(idcg)

def prediction_day_augmentation(DATA) : 
  train_0 =DATA.copy()
  
  train_0['Prediction day'] = 0
  
  train_0[['WBC_POD#1', 'WBC_POD#2', 'WBC_POD#3','WBC_POD#5', 'WBC_POD#7', 
          'Hb_POD#1', 'Hb_POD#2','Hb_POD#3', 'Hb_POD#5', 'Hb_POD#7', 
          'AST_POD#1','AST_POD#2', 'AST_POD#3', 'AST_POD#5', 'AST_POD#7',
        'ALT_POD#1', 'ALT_POD#2', 'ALT_POD#3', 'ALT_POD#5','ALT_POD#7', 
        'CRP_POD#1', 'CRP_POD#2','CRP_POD#3', 'CRP_POD#5', 'CRP_POD#7', 
        'JP_Amy_Rt_POD#1','JP_Amy_Rt_POD#2', 'JP_Amy_Rt_POD#3', 'JP_Amy_Rt_POD#5',
        'JP_Rt_color change', 
        'JP_Lip_Rt_POD#1', 'JP_Lip_Rt_POD#2','JP_Lip_Rt_POD#3', 'JP_Lip_Rt_POD#5', 
        'JP_Amy_Lt_POD#1','JP_Amy_Lt_POD#2', 'JP_Amy_Lt_POD#3', 'JP_Amy_Lt_POD#5',
        'JP_Lip_Lt_POD#2', 'JP_Lip_Lt_POD#3', 'JP_Lip_Lt_POD#5']] = mask
  
  train_1 =DATA.copy()
  
  train_1['Prediction day'] = 1
  train_1[['WBC_POD#2', 'WBC_POD#3','WBC_POD#5', 'WBC_POD#7', 
          'Hb_POD#2','Hb_POD#3', 'Hb_POD#5', 'Hb_POD#7', 
          'AST_POD#2', 'AST_POD#3', 'AST_POD#5', 'AST_POD#7',
        'ALT_POD#2', 'ALT_POD#3', 'ALT_POD#5','ALT_POD#7', 
        'CRP_POD#2','CRP_POD#3', 'CRP_POD#5', 'CRP_POD#7', 
        'JP_Amy_Rt_POD#2', 'JP_Amy_Rt_POD#3', 'JP_Amy_Rt_POD#5',
        
        'JP_Lip_Rt_POD#2','JP_Lip_Rt_POD#3', 'JP_Lip_Rt_POD#5', 
        'JP_Amy_Lt_POD#2', 'JP_Amy_Lt_POD#3', 'JP_Amy_Lt_POD#5',
        'JP_Lip_Lt_POD#2', 'JP_Lip_Lt_POD#3', 'JP_Lip_Lt_POD#5']] =mask

  train_2 =DATA.copy()
  
  train_2['Prediction day'] = 2
  train_2[[ 'WBC_POD#3','WBC_POD#5', 'WBC_POD#7', 
        'Hb_POD#3', 'Hb_POD#5', 'Hb_POD#7', 
          'AST_POD#3', 'AST_POD#5', 'AST_POD#7',
          'ALT_POD#3', 'ALT_POD#5','ALT_POD#7',  
        'CRP_POD#3', 'CRP_POD#5', 'CRP_POD#7', 
          'JP_Amy_Rt_POD#3', 'JP_Amy_Rt_POD#5',

        'JP_Lip_Rt_POD#3', 'JP_Lip_Rt_POD#5',
        'JP_Amy_Lt_POD#3', 'JP_Amy_Lt_POD#5',
        'JP_Lip_Lt_POD#3', 'JP_Lip_Lt_POD#5']] = mask
  train_3 =DATA.copy()
  
  train_3['Prediction day'] = 3
  train_3[[ 'WBC_POD#5', 'WBC_POD#7', 
          'Hb_POD#5', 'Hb_POD#7', 
          'AST_POD#5', 'AST_POD#7',
          'ALT_POD#5','ALT_POD#7',  
          'CRP_POD#5', 'CRP_POD#7', 
          'JP_Amy_Rt_POD#5',
        'JP_Rt_color change',
        'JP_Lip_Rt_POD#5',
        'JP_Amy_Lt_POD#5',
        'JP_Lip_Lt_POD#5']] = mask
  train_5 =DATA.copy()
  
  train_5['Prediction day'] = 5
  train_5[[  'WBC_POD#7', 
          'Hb_POD#7', 
          'AST_POD#7',
          'ALT_POD#7',  
          'CRP_POD#7']] = mask
  for i in range(len(train_5)):
    
    if train_5.loc[i,'JP_Rt_color change'] == 6 :

      train_5.loc[i,'JP_Rt_color change'] = mask
  
  train_7 =DATA.copy()
  
  train_7['Prediction day'] = 7
  
  DATA_AUG = pd.concat([train_0,train_1,train_2,train_3,train_5,train_7],ignore_index=True)
  
  return DATA_AUG

class CustomDataset(Dataset):
    def __init__(self):
        
        self.X_train = X_train
        self.y_train = list(y_train)
        self.length = len(X_train)

    def __getitem__(self, index):
        x = torch.FloatTensor(np.array(self.X_train[index]))
        y = torch.FloatTensor(np.array(self.y_train[index]))
        #print(x.shape)
        #print(y.shape)
        return x, y

    def __len__(self):
        return self.length



class BinaryClassification(nn.Module):
    def __init__(self):
        super(BinaryClassification, self).__init__()
        # Number of input features is 12.
        self.layer_1 = nn.Linear(X_val.shape[1], 64) 
        self.layer_2 = nn.Linear(64, 64)
        self.layer_3 = nn.Linear(64, 32)
        self.layer_out = nn.Linear(32, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)
        self.batchnorm3 = nn.BatchNorm1d(32)
        self.Sigmoid = nn.Sigmoid()
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.relu(self.layer_3(x))
        x = self.batchnorm3(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        x= self.Sigmoid(x)
        return x

In [8]:
def basic_test(k) :
  gb = GradientBoostingClassifier(random_state =0)
  mlp = MLPClassifier(hidden_layer_sizes=(100,100), 
                        activation='tanh', solver='lbfgs',
                        batch_size='auto', learning_rate='adaptive', alpha=0.0001,max_iter=1000)
  svm_model =svm.SVC(kernel = 'rbf',C=8,gamma =0.1)
  rf = RandomForestClassifier(n_estimators=1000, random_state=0)
  xgb_wrapper = XGBClassifier(n_estimators=400, learning_rate=0.1,max_depth=3)

  mlpp = MLPClassifier(hidden_layer_sizes=(10,10), activation='relu',solver='adam', 
                      alpha=0.01, batch_size=64,learning_rate_init=0.01, max_iter=100)

  clf = BaggingClassifier(base_estimator=mlpp, n_estimators=30, random_state=0)

  #models = [gb,mlp,svm_model,rf,xgb_wrapper,clf]
  #model_name = ["gb","mlp","svm_model","rf","xgb_wrapper","clf"]
  models = [gb,mlp,svm_model,rf,xgb_wrapper]
  model_name = ["gb","mlp","svm_model","rf","xgb_wrapper"]

  for i in range(len(models)):
    models[i].fit(X_train,y_train)
    y_pred = models[i].predict(X_val)
    
    print(" ")
    print(model_name[i],": {:.5f}".format(np.mean(y_pred == y_val))) # 예측 정확도
    cf = confusion_matrix(y_val, y_pred)
    print(cf)
    print("---------")
    joblib.dump(models[i], model_name[i]+f'_{k}.pkl')
    

In [9]:
def onset_masking(data_not_masked) :
    data=data_not_masked.copy()
    train_data_onset_masking=train_data.drop(['JP_Lt_color change','입원일자','퇴원일자','Patient number','타장기원발암','Clipping ',
                                    '수술 전 EUS','수술 전 PET-CT','TNM','CEA(ng/mL)','CA19-9(U/mL)','cTNM',
                                    'post ESD'],axis =1 )  
    train_data_onset_masking = train_data_onset_masking.loc[(train_data_onset_masking =='*').mean(axis=1)<0.7]
    train_data_onset_masking=train_data_onset_masking.reset_index(drop=True)
    list_a =[0,1,2,3,4,5,6,7,8,9,10,11,12]
    for i in range(len(train_data_onset_masking)):
        if train_data_onset_masking.loc[i,'onset']== '*':
            train_data_onset_masking.loc[i,'onset']=random.choice(list_a)
    train_data_onset_masking['Prediction day']= train_data_onset_masking['onset']-1

    data['Prediction day']=train_data_onset_masking['Prediction day']
    
    for i in range(len(data)):
        if data.loc[i,'Prediction day'] <=0:
            data.loc[i,'Prediction day'] =0
            data.loc[i,['WBC_POD#1', 'WBC_POD#2', 'WBC_POD#3','WBC_POD#5', 'WBC_POD#7',
                              'Hb_POD#1', 'Hb_POD#2','Hb_POD#3', 'Hb_POD#5', 'Hb_POD#7',
                              'AST_POD#1','AST_POD#2', 'AST_POD#3', 'AST_POD#5', 'AST_POD#7',
                              'ALT_POD#1', 'ALT_POD#2', 'ALT_POD#3', 'ALT_POD#5','ALT_POD#7', 
                              'CRP_POD#1', 'CRP_POD#2','CRP_POD#3', 'CRP_POD#5', 'CRP_POD#7', 
                              'JP_Amy_Rt_POD#1','JP_Amy_Rt_POD#2', 'JP_Amy_Rt_POD#3', 'JP_Amy_Rt_POD#5',
                              'JP_Lip_Rt_POD#1', 'JP_Lip_Rt_POD#2','JP_Lip_Rt_POD#3', 'JP_Lip_Rt_POD#5', 
                              'JP_Amy_Lt_POD#1','JP_Amy_Lt_POD#2', 'JP_Amy_Lt_POD#3', 'JP_Amy_Lt_POD#5',
                              'JP_Lip_Lt_POD#2', 'JP_Lip_Lt_POD#3', 'JP_Lip_Lt_POD#5']] = 0

        elif data.loc[i,'Prediction day'] == 1:
            data.loc[i,['WBC_POD#2', 'WBC_POD#3','WBC_POD#5', 'WBC_POD#7',
                              'Hb_POD#2','Hb_POD#3', 'Hb_POD#5', 'Hb_POD#7',
                              'AST_POD#2', 'AST_POD#3', 'AST_POD#5', 'AST_POD#7',
                              'ALT_POD#2', 'ALT_POD#3', 'ALT_POD#5','ALT_POD#7', 
                              'CRP_POD#2','CRP_POD#3', 'CRP_POD#5', 'CRP_POD#7', 
                              'JP_Amy_Rt_POD#2', 'JP_Amy_Rt_POD#3', 'JP_Amy_Rt_POD#5',
                              'JP_Lip_Rt_POD#2','JP_Lip_Rt_POD#3', 'JP_Lip_Rt_POD#5', 
                              'JP_Amy_Lt_POD#2', 'JP_Amy_Lt_POD#3', 'JP_Amy_Lt_POD#5',
                              'JP_Lip_Lt_POD#2', 'JP_Lip_Lt_POD#3', 'JP_Lip_Lt_POD#5']] = 0
        elif data.loc[i,'Prediction day'] == 2 :
            data.loc[i,['WBC_POD#3','WBC_POD#5', 'WBC_POD#7',
                              'Hb_POD#3', 'Hb_POD#5', 'Hb_POD#7',
                              'AST_POD#3', 'AST_POD#5', 'AST_POD#7',
                              'ALT_POD#3', 'ALT_POD#5','ALT_POD#7', 
                              'CRP_POD#3', 'CRP_POD#5', 'CRP_POD#7', 
                              'JP_Amy_Rt_POD#3', 'JP_Amy_Rt_POD#5',
                              'JP_Lip_Rt_POD#3', 'JP_Lip_Rt_POD#5', 
                              'JP_Amy_Lt_POD#3', 'JP_Amy_Lt_POD#5',
                              'JP_Lip_Lt_POD#3', 'JP_Lip_Lt_POD#5']] = 0
        elif (data.loc[i,'Prediction day'] == 3) or (data.loc[i,'Prediction day'] == 4):
            data.loc[i,['WBC_POD#5', 'WBC_POD#7',
                              'Hb_POD#5', 'Hb_POD#7',
                              'AST_POD#5', 'AST_POD#7',
                              'ALT_POD#5','ALT_POD#7', 
                              'CRP_POD#5', 'CRP_POD#7', 
                              'JP_Amy_Rt_POD#5',
                              'JP_Lip_Rt_POD#5', 
                              'JP_Amy_Lt_POD#5',
                              'JP_Lip_Lt_POD#5']] = 0

        elif (data.loc[i,'Prediction day'] == 5) or (data.loc[i,'Prediction day'] == 6):
            data.loc[i,['WBC_POD#7',
                              'Hb_POD#7',
                              'AST_POD#7',
                              'ALT_POD#7', 
                              'CRP_POD#7']] = 0
        elif data.loc[i,'Prediction day'] >=7:
            data.loc[i,'Prediction day'] =7

        if data.loc[i,'JP_Rt_color change'] == 4 : 
            if (data.loc[i,'Prediction day'] < 4) :
              data.loc[i,'JP_Rt_color change'] = 0
        elif data.loc[i,'JP_Rt_color change'] == 6 : 
            if (data.loc[i,'Prediction day'] < 6) :
              data.loc[i,'JP_Rt_color change'] = 0
    
    return data


In [10]:
#######################
def train_test(k):
  train_dataset = CustomDataset()
  train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, drop_last=True)

  device = "cuda" if torch.cuda.is_available() else "cpu"
  model = BinaryClassification().to(device)
  criterion = nn.BCELoss().to(device)
  optimizer = optim.SGD(model.parameters(), lr=0.0001)

  model.train()
  cost_list = []
  ppp=0
  best_acc =0
  best_score = -1
  min_cost = 100
  for epoch in range(2000):
      cost = 0.0
      
      for x, y in train_dataloader:
          print(x.shape)
          #print(y.shape)
          x = x.squeeze().to(device)
          #print("ddd",x.shape)
          y = y.to(device)

          output = model(x).squeeze()
          #print(output)
          #print(output.shape)
          #print(y.shape)
          loss = criterion(output, y.squeeze())

          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
          #print("dd")
          cost += loss

      cost = cost / len(train_dataloader)
      if (epoch + 1) % 100 == 0:
          print(f"Epoch : {epoch+1:4d}, Cost : {cost:.3f}")
      
      with torch.no_grad():
          model.eval()
          inputs = torch.FloatTensor(
              [np.array(X_val)]
          ).to(device)

          outputs = model(inputs.squeeze()).squeeze()
      
      acc  = np.mean(list(torch.round(outputs)) == y_val)
      score = sum([compute_ndcg(np.array(outputs), y_val, i) for i in [10, 50, 100]])
      
      if (score > best_score) and (min_cost >cost) and (best_acc < acc ):
            best_score = score
            min_cost =cost
            best_acc = acc
            torch.save(model.state_dict(), f'Newfold_{k}.pt')
  model = BinaryClassification()
  path = f'Newfold_{k}.pt'
  model.load_state_dict(torch.load(path))
  with torch.no_grad():
          model.eval()
          inputs = torch.FloatTensor(
              [np.array(X_val)]
          ).to(device)

          outputs = model(inputs.squeeze()).squeeze()          
  cf = confusion_matrix(y_val, list(torch.round(outputs)))

  print("MY MODEL accuracy: {:.5f}".format(np.mean(list(torch.round(outputs)) == y_val))) # 예측 정확도
  print(cf)
  
  return outputs

train_data_origin,temp,train_nu_mean,binbin= pre_processing(train_data)
test_data = preprocess_test(test_data)

train_data_origin, test_data = columns_process(train_data_origin,test_data)

SEED = 14
data = train_data_origin.drop("DSL", axis=1)
y = train_data_origin['DSL']
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

y_prob_list =[]
y_val_list =[]
scores =[]
for k, (train_index, val_index) in enumerate(skf.split(data, y)): # 5번
  print(k,"-fold")
  X_train = prediction_day_augmentation(train_data_origin.iloc[train_index,:].reset_index(drop= True))
  y_train = X_train["DSL"]
  X_train = X_train.drop("DSL",axis=1)

  X_val = onset_masking(train_data_origin).iloc[val_index,:].reset_index(drop= True)
  y_val = X_val["DSL"]
  X_val = X_val.drop("DSL",axis=1)

  test_data_xdsl = test_data.drop("DSL", axis=1)

  scaler = StandardScaler()
  X_train = scaler.fit_transform(X_train)
  X_val  = scaler.transform(X_val)
  X_test  = scaler.transform(test_data_xdsl)
  
  basic_test(k)
  
  y_prob = train_test(k)

  score = sum([compute_ndcg(np.array(y_prob), y_val, i) for i in [10, 50, 100]])
  scores.append(score)
  #print(score)
  print("-------------------------------------------------")
  print(" ")
  y_prob_list.append(np.array(y_prob))
  y_val_list.append(y_val)

print("MY MODEL 최종 SCORE는" ,scores)

0 -fold
 
gb : 0.97222
[[105   0]
 [  3   0]]
---------
 
mlp : 0.97222
[[105   0]
 [  3   0]]
---------
 
svm_model : 0.97222
[[105   0]
 [  3   0]]
---------
 
rf : 0.97222
[[105   0]
 [  3   0]]
---------
 
xgb_wrapper : 0.97222
[[105   0]
 [  3   0]]
---------
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Siz



torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size([64, 307])
torch.Size

KeyboardInterrupt: 

In [None]:
fold_0 = np.array(pd.DataFrame({'y_true' : np.array(y_val_list[0]),'y_prob' : y_prob_list[0]})).tolist()
fold_1 = np.array(pd.DataFrame({'y_true' : np.array(y_val_list[1]),'y_prob' : y_prob_list[1]})).tolist()
fold_2 = np.array(pd.DataFrame({'y_true' : np.array(y_val_list[2]),'y_prob' : y_prob_list[2]})).tolist()
fold_3 = np.array(pd.DataFrame({'y_true' : np.array(y_val_list[3]),'y_prob' : y_prob_list[3]})).tolist()
fold_4 = np.array(pd.DataFrame({'y_true' : np.array(y_val_list[4]),'y_prob' : y_prob_list[4]})).tolist()
save_filename = 'fold_0.txt'
www = open(save_filename, 'w')
for k in fold_0 :
    www.write(str(k).strip("[""]") + '\n')
www.close()

save_filename = 'fold_1.txt'
www = open(save_filename, 'w')
for k in fold_1 :
    www.write(str(k).strip("[""]") + '\n')
www.close()

save_filename = 'fold_2.txt'
www = open(save_filename, 'w')
for k in fold_2 :
    www.write(str(k).strip("[""]") + '\n')
www.close()

save_filename = 'fold_3.txt'
www = open(save_filename, 'w')
for k in fold_3 :
    www.write(str(k).strip("[""]") + '\n')
www.close()

save_filename = 'fold_4.txt'
www = open(save_filename, 'w')
for k in fold_4 :
    www.write(str(k).strip("[""]") + '\n')
www.close()

In [None]:
model = joblib.load('/content/mlp_4.pkl')
for k, (train_index, val_index) in enumerate(skf.split(data, y)): # 5번
  print(k,"-fold")
  X_train = prediction_day_augmentation(train_data_origin.iloc[train_index,:].reset_index(drop= True))
  y_train = X_train["DSL"]
  X_train = X_train.drop("DSL",axis=1)

  X_val = train_data_origin.iloc[val_index,:].reset_index(drop= True)
  y_val = X_val["DSL"]
  X_val = X_val.drop("DSL",axis=1)

  test_data_xdsl = test_data.drop("DSL", axis=1)

  scaler = StandardScaler()
  X_train = scaler.fit_transform(X_train)
  X_val  = scaler.transform(X_val)
  X_test  = scaler.transform(test_data_xdsl)

  y_pred = model.predict(X_val)
  y_prob = model.predict_proba(X_val)[:,1]
  
  cf = confusion_matrix(y_val, y_pred)
  print(cf)
  score = sum([compute_ndcg(np.array(y_prob), y_val, i) for i in [10, 50, 100]])
  print(score)
  
  print("-------------------------------------------------")
  print(" ")

모델 언제까지 학습해 언제저장해 val로 

In [None]:
model