In [54]:
import pandas as pd
import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam, SGD, RMSprop

from sklearn.metrics import mean_squared_error, make_scorer, confusion_matrix, accuracy_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from keras.callbacks import EarlyStopping, ModelCheckpoint


In [2]:
# pickle ファイルから読み込み
train_pkl = pd.read_pickle('./train.pk2')

In [3]:
train_pkl.shape

(850, 9)

## 訓練データとテストデータに分割

In [6]:
# 訓練とテストデータに分割
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [7]:
# ターゲットと特徴量の分割
train_X = train.iloc[:, :-1]
train_y = train.disease

In [8]:
train_X.head(5)

Unnamed: 0,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio
332,2.066406,0.631348,213.935623,14.563273,47.154297,7.503906,2.302734,0.759766
383,0.817871,0.197021,214.644638,15.622564,21.059477,7.511719,3.630859,1.291016
281,0.791992,0.082642,358.339508,12.924613,25.77248,8.648438,4.324219,1.319336
2,0.657227,0.081299,320.770538,12.625011,30.61318,5.949219,2.488281,0.774902
231,1.714844,0.51123,215.885971,27.66971,60.709866,5.953125,3.117188,1.21875


In [14]:
train.shape, train_X.shape, test.shape, train_X.shape[1]

((680, 9), (680, 8), (170, 9), 8)

In [38]:
stscalar = preprocessing.StandardScaler(copy=True)

In [39]:
train_X = stscalar.fit_transform(train_X)

In [41]:
train_X[:5]

array([[ 0.19871783,  0.01434054, -0.2921998 , -0.19679274, -0.1496765 ,
         0.5649345 , -2.2101626 , -1.7243654 ],
       [-0.30765444, -0.25215372, -0.2887634 , -0.18990234, -0.37044647,
         0.5744017 ,  0.2013999 ,  0.6564976 ],
       [-0.31815022, -0.32233506,  0.40768307, -0.20745178, -0.33057308,
         1.9518797 ,  1.4603773 ,  0.78341866],
       [-0.37280753, -0.32315895,  0.2255974 , -0.20940061, -0.2896193 ,
        -1.3190391 , -1.8732531 , -1.6565282 ],
       [ 0.05613354, -0.05936107, -0.28274703, -0.11153886, -0.03499235,
        -1.3143054 , -0.7313074 ,  0.33263022]], dtype=float32)

## 訓練データで訓練

In [78]:
model = Sequential()

model.add(Dense(64, input_dim=train_X.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_22 (Dense)             (None, 64)                576       
_________________________________________________________________
dropout_16 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_23 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_17 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_24 (Dense)             (None, 1)                 65        
Total params: 4,801
Trainable params: 4,801
Non-trainable params: 0
_________________________________________________________________


In [79]:
model.compile(loss='binary_crossentropy', 
              optimizer='rmsprop', metrics=['accuracy'])

In [80]:
model.fit(train_X, train_y, epochs=20, batch_size=128)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x2ccf36241d0>

In [81]:
# ターゲットと特徴量の分割
test_x = test.iloc[:, :-1]
test_y = test.disease

In [82]:
test_x.shape, test_y.shape

((170, 8), (170,))

In [83]:
test_x = stscalar.fit_transform(test_x)

In [84]:
test_x[:5]

array([[-0.32964182, -0.35617512, -0.241994  , -0.17400771, -0.31449765,
         0.62773293,  2.3357067 ,  2.7598972 ],
       [-0.40658566, -0.27879038, -0.30152515, -0.16048895, -0.32632026,
        -0.3385221 ,  0.18808387,  2.3165574 ],
       [-0.4215991 , -0.37365517, -0.28713858, -0.1335774 , -0.32351726,
         0.63208544,  0.2143947 , -0.43618   ],
       [-0.4059601 , -0.30354595, -0.29904044, -0.13072777, -0.393231  ,
         0.5624454 ,  0.19466159,  0.59156233],
       [-0.3767673 , -0.30694747, -0.2303786 , -0.08248614,  0.06129562,
        -0.3254646 , -0.72950697,  0.5109551 ]], dtype=float32)

In [85]:
score = model.evaluate(test_x, test_y, batch_size=128)



In [86]:
score

[0.4004981742185705, 0.8117647171020508]

In [87]:
pred_y = model.predict_classes(test_x)

In [88]:
confusion_matrix(test_y, pred_y)

array([[90,  3],
       [29, 48]], dtype=int64)

In [89]:
accuracy_score(test_y, pred_y)

0.8117647058823529

In [53]:
pred_y.sum(), test_y.sum()

(53, 77)

In [212]:
model = Sequential()

model.add(Dense(64, input_dim=train_X.shape[1], activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_82 (Dense)             (None, 64)                576       
_________________________________________________________________
dropout_59 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_83 (Dense)             (None, 128)               8320      
_________________________________________________________________
dropout_60 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_84 (Dense)             (None, 1)                 129       
Total params: 9,025
Trainable params: 9,025
Non-trainable params: 0
_________________________________________________________________


In [213]:
model.compile(loss='binary_crossentropy', 
              optimizer='Nadam', metrics=['accuracy'])

In [214]:
model.fit(train_X, train_y, epochs=20, batch_size=128)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x2cc85b4b5c0>

In [215]:
score = model.evaluate(test_x, test_y, batch_size=128)



In [216]:
score

[0.3337897349806393, 0.8647058606147766]

In [217]:
pred_y = model.predict_classes(test_x)

In [218]:
confusion_matrix(test_y, pred_y)

array([[90,  3],
       [20, 57]], dtype=int64)

In [219]:
accuracy_score(test_y, pred_y)

0.8647058823529412

## 検証データで実行

In [220]:
# 検証データ読み込み
valid = pd.read_pickle('./test.pk2')

In [222]:
valid = stscalar.fit_transform(valid)

In [223]:
valid.shape

(350, 8)

In [224]:
valid[:5]

array([[-0.29095116, -0.21756063, -0.33720767, -0.20416994, -0.3664311 ,
        -0.3113834 ,  0.0659775 ,  0.5387072 ],
       [-0.2793132 , -0.25788414, -0.23926638, -0.26089007, -0.39051202,
        -0.33366144, -0.7424079 , -0.50195736],
       [-0.29437408, -0.21112007, -0.32929194, -0.21231398, -0.35598063,
         0.4950818 ,  0.10784607,  0.4992881 ],
       [-0.2796555 , -0.22386117, -0.4046019 , -0.23329978, -0.36115134,
        -0.29801658,  0.10784607,  0.4874624 ],
       [-0.3299725 , -0.24696319, -0.36577624, -0.14426076, -0.31762996,
         1.7961196 ,  1.00641   ,  0.5071719 ]], dtype=float32)

In [225]:
# ID の保存
# valid_pass = valid.id.values

In [226]:
# valid_X = valid.iloc[:, 1:]
valid_X = valid.copy()

In [227]:
valid_X.shape, train_X.shape

((350, 8), (680, 8))

In [228]:
pred_valid_y = model.predict(valid_X)

In [229]:
pred_valid_y.shape

(350, 1)

In [230]:
# type(valid_pass), type(pred_valid_y)

In [234]:
valid_y = model.predict_classes(valid_X)

In [235]:
result_df = pd.DataFrame(valid_y)

In [236]:
result_df.to_csv("./MLP_1.csv", header=False)

In [237]:
result_df

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
345,0
346,0
347,0
348,0


In [243]:
result_df.sum()

0    120
dtype: int64