In [1]:
# 필수 라이브러리
import pandas as pd
import numpy as np
import random
import tensorflow as tf

# 랜덤 시드 고정
SEED=12
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)  
print("시드 고정: ", SEED)

시드 고정:  12


In [2]:
data = pd.read_csv("data/train.csv")

print(data.shape)

(5497, 14)


In [4]:
data.head(5)

Unnamed: 0,index,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,0,5,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,white
1,1,5,8.8,0.61,0.14,2.4,0.067,10.0,42.0,0.9969,3.19,0.59,9.5,red
2,2,5,7.9,0.21,0.39,2.0,0.057,21.0,138.0,0.99176,3.05,0.52,10.9,white
3,3,6,7.0,0.21,0.31,6.0,0.046,29.0,108.0,0.9939,3.26,0.5,10.8,white
4,4,6,7.8,0.4,0.26,9.5,0.059,32.0,178.0,0.9955,3.04,0.43,10.9,white


In [5]:
data['quality']

0       5
1       5
2       5
3       6
4       6
       ..
5492    5
5493    6
5494    7
5495    5
5496    6
Name: quality, Length: 5497, dtype: int64

In [6]:
data['type'].value_counts()

white    4159
red      1338
Name: type, dtype: int64

In [8]:
data['type'] = np.where(data['type']=='white', 1, 0).astype(int) # 3항 연산
data['type'].value_counts()

1    4159
0    1338
Name: type, dtype: int64

In [9]:
data['quality'].value_counts()

6    2416
5    1788
7     924
4     186
8     152
3      26
9       5
Name: quality, dtype: int64

In [41]:
data.loc[:, 'quality']

0       5
1       5
2       5
3       6
4       6
       ..
5492    5
5493    6
5494    7
5495    5
5496    6
Name: quality, Length: 5497, dtype: int64

In [10]:
from tensorflow.keras.utils import to_categorical

# quality가 3부터 시작하므로 0부터 시작하게끔 
y_data = to_categorical(data.loc[:, 'quality'] - 3) 
y_data

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [11]:
# 피처 선택
X_data = data.loc[:, 'fixed acidity':]
X_data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,1
1,8.8,0.610,0.14,2.4,0.067,10.0,42.0,0.99690,3.19,0.59,9.5,0
2,7.9,0.210,0.39,2.0,0.057,21.0,138.0,0.99176,3.05,0.52,10.9,1
3,7.0,0.210,0.31,6.0,0.046,29.0,108.0,0.99390,3.26,0.50,10.8,1
4,7.8,0.400,0.26,9.5,0.059,32.0,178.0,0.99550,3.04,0.43,10.9,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5492,7.7,0.150,0.29,1.3,0.029,10.0,64.0,0.99320,3.35,0.39,10.1,1
5493,6.3,0.180,0.36,1.2,0.034,26.0,111.0,0.99074,3.16,0.51,11.0,1
5494,7.8,0.150,0.34,1.1,0.035,31.0,93.0,0.99096,3.07,0.72,11.3,1
5495,6.6,0.410,0.31,1.6,0.042,18.0,101.0,0.99195,3.13,0.41,10.5,1


In [13]:
X_data.shape

(5497, 12)

In [14]:
# 피처 스케일링
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(X_data)
X_data_scaled = scaler.fit_transform(X_data)

In [15]:
# train/test 분리
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data_scaled, 
                                                    y_data, 
                                                    test_size=0.2, 
                                                    shuffle=True, 
                                                    random_state=12)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(4397, 12) (4397, 7)
(1100, 12) (1100, 7)


In [24]:
X_train.shape[1]

12

In [32]:
# 심층 신경망 모델
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

def build_model(train_data, train_target):
    model = Sequential()
    model.add(Dense(128, activation='tanh', input_dim=X_train.shape[1]))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='tanh'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='tanh'))
    model.add(Dropout(0.2))
    model.add(Dense(y_train.shape[1], activation='softmax'))
    
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy', 
                  metrics=['acc', 'mae'])
    return model

model = build_model(X_train, y_train)
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_20 (Dense)            (None, 128)               1664      
                                                                 
 dropout_18 (Dropout)        (None, 128)               0         
                                                                 
 dense_21 (Dense)            (None, 64)                8256      
                                                                 
 dropout_19 (Dropout)        (None, 64)                0         
                                                                 
 dense_22 (Dense)            (None, 32)                2080      
                                                                 
 dropout_20 (Dropout)        (None, 32)                0         
                                                                 
 dense_23 (Dense)            (None, 7)                

In [36]:
# model fit
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss',  patience=10)
history = model.fit(X_train, y_train, batch_size=64, epochs=200,
                    validation_data=(X_test, y_test),
                    callbacks=[early_stopping], 
                    verbose=2)

Epoch 1/200
69/69 - 0s - loss: 1.0385 - acc: 0.5581 - mae: 0.1633 - val_loss: 1.0290 - val_acc: 0.5555 - val_mae: 0.1628 - 199ms/epoch - 3ms/step
Epoch 2/200
69/69 - 0s - loss: 1.0310 - acc: 0.5542 - mae: 0.1628 - val_loss: 1.0315 - val_acc: 0.5600 - val_mae: 0.1615 - 178ms/epoch - 3ms/step
Epoch 3/200
69/69 - 0s - loss: 1.0330 - acc: 0.5611 - mae: 0.1624 - val_loss: 1.0324 - val_acc: 0.5627 - val_mae: 0.1611 - 198ms/epoch - 3ms/step
Epoch 4/200
69/69 - 0s - loss: 1.0351 - acc: 0.5597 - mae: 0.1627 - val_loss: 1.0296 - val_acc: 0.5600 - val_mae: 0.1615 - 185ms/epoch - 3ms/step
Epoch 5/200
69/69 - 0s - loss: 1.0339 - acc: 0.5617 - mae: 0.1624 - val_loss: 1.0309 - val_acc: 0.5773 - val_mae: 0.1588 - 230ms/epoch - 3ms/step
Epoch 6/200
69/69 - 0s - loss: 1.0340 - acc: 0.5540 - mae: 0.1629 - val_loss: 1.0305 - val_acc: 0.5645 - val_mae: 0.1604 - 421ms/epoch - 6ms/step
Epoch 7/200
69/69 - 0s - loss: 1.0392 - acc: 0.5545 - mae: 0.1627 - val_loss: 1.0276 - val_acc: 0.5545 - val_mae: 0.1623 - 1

In [42]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.1, 
                                            shuffle=True, random_state=SEED)

early_stopping = EarlyStopping(monitor='acc',  patience=10)
history = model.fit(X_tr, y_tr, batch_size=12, epochs=200,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping],                    
                    verbose=2)

Epoch 1/200
330/330 - 1s - loss: 1.0375 - acc: 0.5588 - mae: 0.1614 - val_loss: 1.0149 - val_acc: 0.5773 - val_mae: 0.1595 - 1s/epoch - 3ms/step
Epoch 2/200
330/330 - 1s - loss: 1.0461 - acc: 0.5486 - mae: 0.1626 - val_loss: 1.0143 - val_acc: 0.5523 - val_mae: 0.1611 - 1s/epoch - 4ms/step
Epoch 3/200
330/330 - 1s - loss: 1.0382 - acc: 0.5542 - mae: 0.1618 - val_loss: 1.0053 - val_acc: 0.5705 - val_mae: 0.1594 - 870ms/epoch - 3ms/step
Epoch 4/200
330/330 - 1s - loss: 1.0377 - acc: 0.5593 - mae: 0.1618 - val_loss: 1.0145 - val_acc: 0.5545 - val_mae: 0.1606 - 697ms/epoch - 2ms/step
Epoch 5/200
330/330 - 1s - loss: 1.0416 - acc: 0.5557 - mae: 0.1619 - val_loss: 1.0182 - val_acc: 0.5841 - val_mae: 0.1610 - 715ms/epoch - 2ms/step
Epoch 6/200
330/330 - 1s - loss: 1.0453 - acc: 0.5534 - mae: 0.1621 - val_loss: 1.0199 - val_acc: 0.5682 - val_mae: 0.1646 - 720ms/epoch - 2ms/step
Epoch 7/200
330/330 - 1s - loss: 1.0418 - acc: 0.5618 - mae: 0.1624 - val_loss: 1.0295 - val_acc: 0.5523 - val_mae: 0.

In [43]:
# accuracy
loss, acc, _ = model.evaluate(X_val, y_val)
print('loss>> ', loss)
print('acc>> ', acc)

loss>>  1.0272170305252075
acc>>  0.550000011920929


In [39]:
# test 데이터에 대한 예측값 정리
y_pred_proba = model.predict(X_test)
y_pred_proba[:5]

array([[1.3587170e-04, 2.2005385e-02, 6.3287407e-01, 3.1796753e-01,
        2.5135366e-02, 1.8777333e-03, 3.9795873e-06],
       [1.3264662e-04, 1.0209903e-02, 3.2492733e-01, 5.5394942e-01,
        1.0508674e-01, 5.6880964e-03, 5.9419576e-06],
       [4.0983381e-05, 3.7037125e-03, 1.3613890e-01, 6.9345015e-01,
        1.5158464e-01, 1.5080238e-02, 1.4368619e-06],
       [1.4799202e-03, 8.9490734e-02, 6.9890124e-01, 1.9954677e-01,
        9.4318064e-03, 1.1462185e-03, 3.3364083e-06],
       [6.9129834e-05, 1.1603856e-02, 4.3869704e-01, 4.8330644e-01,
        6.1803956e-02, 4.5162910e-03, 3.2725029e-06]], dtype=float32)

In [40]:
# 확률을 타겟값으로 변경 => np.argmax
y_pred_label = np.argmax(y_pred_proba, axis=-1) + 3 
y_pred_label[:5]

array([5, 6, 6, 5, 6])