In [13]:
import keras
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

from model.model_gen import Deep_Model1, Deep_Model2

In [2]:
train_x_name = "preprocess/MLSMOTE_x_train.csv"
val_x_name = "preprocess/val_preprocess.csv"
train_y_name = "preprocess/MLSMOTE_y_train.csv"
val_y_name = "preprocess/en_val_y.csv"

In [3]:
raw_dataframe = pd.read_csv(train_x_name) #판다스이용 csv파일 로딩
raw_dataframe.info() # 데이터 정보 출력
del raw_dataframe['cst_id_di'] # 첫째열와 행 제거
x_train = raw_dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 654948 entries, 0 to 654947
Columns: 227 entries, cst_id_di to VAR227
dtypes: float64(226), int64(1)
memory usage: 1.1 GB


In [4]:
raw_dataframe = pd.read_csv(val_x_name) #판다스이용 csv파일 로딩
raw_dataframe.info() # 데이터 정보 출력
del raw_dataframe['cst_id_di'] # 첫째열와 행 제거
x_valid = raw_dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28543 entries, 0 to 28542
Columns: 227 entries, cst_id_di to VAR227
dtypes: float64(198), int64(29)
memory usage: 49.4 MB


In [5]:
raw_dataframe = pd.read_csv(train_y_name) #판다스이용 csv파일 로딩
raw_dataframe.info() # 데이터 정보 출력
del raw_dataframe['cst_id_di'] # 첫째열와 행 제거
y_train = raw_dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 654948 entries, 0 to 654947
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   cst_id_di     654948 non-null  int64
 1   MRC_ID_DI_0   654948 non-null  int64
 2   MRC_ID_DI_1   654948 non-null  int64
 3   MRC_ID_DI_2   654948 non-null  int64
 4   MRC_ID_DI_3   654948 non-null  int64
 5   MRC_ID_DI_4   654948 non-null  int64
 6   MRC_ID_DI_5   654948 non-null  int64
 7   MRC_ID_DI_6   654948 non-null  int64
 8   MRC_ID_DI_7   654948 non-null  int64
 9   MRC_ID_DI_8   654948 non-null  int64
 10  MRC_ID_DI_9   654948 non-null  int64
 11  MRC_ID_DI_10  654948 non-null  int64
dtypes: int64(12)
memory usage: 60.0 MB


In [6]:
raw_dataframe = pd.read_csv(val_y_name) #판다스이용 csv파일 로딩
raw_dataframe.info() # 데이터 정보 출력
del raw_dataframe['cst_id_di'] # 첫째열와 행 제거
y_valid = raw_dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28543 entries, 0 to 28542
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   cst_id_di     28543 non-null  int64
 1   MRC_ID_DI_0   28543 non-null  int64
 2   MRC_ID_DI_1   28543 non-null  int64
 3   MRC_ID_DI_2   28543 non-null  int64
 4   MRC_ID_DI_3   28543 non-null  int64
 5   MRC_ID_DI_4   28543 non-null  int64
 6   MRC_ID_DI_5   28543 non-null  int64
 7   MRC_ID_DI_6   28543 non-null  int64
 8   MRC_ID_DI_7   28543 non-null  int64
 9   MRC_ID_DI_8   28543 non-null  int64
 10  MRC_ID_DI_9   28543 non-null  int64
 11  MRC_ID_DI_10  28543 non-null  int64
dtypes: int64(12)
memory usage: 2.6 MB


In [7]:
rs=RobustScaler()
x_train2 = rs.fit_transform(x_train)
x_valid2 = rs.fit_transform(x_valid)

In [8]:
np_x_train=np.array(x_train)
np_y_train=np.array(y_train)
np_x_valid=np.array(x_valid)
np_y_valid=np.array(y_valid)

np_x_train2=np.array(x_train2)
np_x_valid2=np.array(x_valid2)

In [9]:
model = Deep_Model1()
model2 = Deep_Model2()

In [10]:
filename = 'model/best.h5'
rl=ReduceLROnPlateau(patience = 5, verbose=1, factor=0.5)
es=EarlyStopping(patience=20,verbose=1) # val_loss가 안좋아지면 멈춤, patience : val_loss값이 이전보다 감소하면 멈춤, 몇 번 감소하는지 설정
mc=ModelCheckpoint(filename,save_best_only=True,verbose=1) # verbose : 분석 과정에 결과 띄움

In [14]:
model.compile(optimizer=keras.optimizers.Adam(lr=0.005),
              loss='binary_crossentropy',
              metrics=['accuracy'])

model2.compile(optimizer=keras.optimizers.Adam(lr=0.005),
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [15]:
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=0)

for train_index, test_index in mskf.split(x_train,y_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_valid = np_x_train[train_index], np_x_train[test_index]
    Y_train, Y_valid = np_y_train[train_index], np_y_train[test_index]
    
    model.fit(X_train,Y_train,epochs=30,validation_data=(np_x_valid,np_y_valid),callbacks=[es,mc,rl])



TRAIN: [     1      2      3 ... 654945 654946 654947] TEST: [     0     12     17 ... 654931 654932 654941]
Train on 524388 samples, validate on 28543 samples
Epoch 1/30

KeyboardInterrupt: 

In [16]:
filename = 'model/best2.h5'
rl=ReduceLROnPlateau(patience = 5, verbose=1, factor=0.5)
es=EarlyStopping(patience=20,verbose=1) # val_loss가 안좋아지면 멈춤, patience : val_loss값이 이전보다 감소하면 멈춤, 몇 번 감소하는지 설정
mc=ModelCheckpoint(filename,save_best_only=True,verbose=1) # verbose : 분석 과정에 결과 띄움

In [18]:
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=0)

for train_index, test_index in mskf.split(x_train2,y_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_valid = np_x_train2[train_index], np_x_train2[test_index]
    Y_train, Y_valid = np_y_train[train_index], np_y_train[test_index]
    
    model.fit(X_train,Y_train,epochs=30,validation_data=(np_x_valid2,np_y_valid),callbacks=[es,mc,rl])



TRAIN: [     1      2      3 ... 654945 654946 654947] TEST: [     0     12     17 ... 654931 654932 654941]
Train on 524388 samples, validate on 28543 samples
Epoch 1/30

Epoch 00001: val_loss improved from inf to 0.48677, saving model to best2.h5
Epoch 2/30
 22688/524388 [>.............................] - ETA: 36s - loss: 0.4861 - accuracy: 0.7623

KeyboardInterrupt: 