<a href="https://colab.research.google.com/github/SoA-Lee/Moon-Study/blob/master/wine_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /gdrive/ so files stored in Drive can be read from this Colab runtime.
from google.colab import drive
drive.mount('/gdrive/')

Mounted at /gdrive/


In [None]:
#필수 라이브러리
import pandas as pd
import numpy as np
import random
import tensorflow as tf

# Fix the random seed for Python's `random`, NumPy and TensorFlow (in that order).
SEED = 12
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)
print("시드 고정:", SEED)

시드 고정: 12


In [None]:
# Read the CSV files downloaded from the Dacon site (stored under Drive's wine/ folder).
drive_path = '/gdrive/My Drive/'

train, test, submission = (
    pd.read_csv(drive_path + rel_path)
    for rel_path in ("wine/train.csv", "wine/test.csv", "wine/sample_submission.csv")
)

# Print the three shapes on one line as a quick sanity check.
print(*(frame.shape for frame in (train, test, submission)))

(5497, 14) (1000, 13) (1000, 2)


In [None]:
# Preview the first two training rows to check the column layout.
train.head(2)

Unnamed: 0,index,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,0,5,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,white
1,1,5,8.8,0.61,0.14,2.4,0.067,10.0,42.0,0.9969,3.19,0.59,9.5,red


In [None]:
# Preview the first rows of the submission template.
submission.head()

Unnamed: 0,index,quality
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [None]:
# Count the training rows per wine type.
train['type'].value_counts()

white    4159
red      1338
Name: type, dtype: int64

In [None]:
# Encode wine type as a binary feature: 'white' -> 1, anything else -> 0.
# The encoding is skipped once the column is already numeric, so re-running
# this cell no longer overwrites every row with 0 (already-encoded integers
# never compare equal to 'white').
for wine_df in (train, test):
    if not pd.api.types.is_numeric_dtype(wine_df['type']):
        wine_df['type'] = np.where(wine_df['type'] == 'white', 1, 0).astype(int)
train['type'].value_counts()

1    4159
0    1338
Name: type, dtype: int64

In [None]:
# Inspect the distribution of the quality labels.
train['quality'].value_counts()

6    2416
5    1788
7     924
4     186
8     152
3      26
9       5
Name: quality, dtype: int64

In [None]:
from tensorflow.keras.utils import to_categorical

# Subtract 3 from every quality value, then one-hot encode the shifted labels.
y_train = to_categorical(train['quality'] - 3)
y_train

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:
# Feature selection: every column from 'fixed acidity' through the last one.
x_train = train.loc[:, 'fixed acidity':]
x_test = test.loc[:, 'fixed acidity':]

# Feature scaling: learn each column's min/max from the training data only and
# apply that same mapping to the test data. Refitting the scaler on the test
# set would scale it differently from the data the model is trained on.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

print(x_train_scaled.shape, y_train.shape)
print(x_test_scaled.shape)

(5497, 12) (5497, 7)
(1000, 12)


In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

def build_model(train_data, train_target):
    """Build and compile a fully connected softmax classifier.

    Args:
        train_data: Feature array; only ``shape[1]`` is read, to size the
            input of the first layer.
        train_target: Target array; only ``shape[1]`` is read, to size the
            softmax output layer.

    Returns:
        A compiled ``Sequential`` model using the RMSProp optimizer and
        categorical cross-entropy loss, reporting ``acc`` and ``mae``.
    """
    n_features = train_data.shape[1]
    n_outputs = train_target.shape[1]

    # Three tanh hidden layers (128 -> 64 -> 32), with dropout after the first two.
    stack = [
        Dense(128, activation='tanh', input_dim=n_features),
        Dropout(0.2),
        Dense(64, activation='tanh'),
        Dropout(0.2),
        Dense(32, activation='tanh'),
        Dense(n_outputs, activation='softmax'),
    ]
    model = Sequential(stack)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='RMSProp',
        metrics=['acc', 'mae'],
    )
    return model

# Build the network sized from the scaled training features and the one-hot targets, then print its layer summary.
model = build_model(x_train_scaled,y_train)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 128)               1664      
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_7 (Dense)              (None, 7)                 231       
Total params: 12,231
Trainable params: 12,231
Non-trainable params: 0
__________________________________________________

In [None]:
# Early stopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

# Hold out 15% of the training rows so validation loss can be monitored.
x_tr, x_val, y_tr, y_val = train_test_split(
    x_train_scaled, y_train, test_size=0.15, shuffle=True, random_state=SEED)

# Stop once val_loss has not improved for 10 epochs. restore_best_weights=True
# rolls the model back to the best epoch when early stopping triggers; without
# it the model keeps the weights of the last epoch run, which is `patience`
# epochs past the best val_loss.
early_stopping = EarlyStopping(monitor='val_loss', patience=10,
                               restore_best_weights=True)
history = model.fit(x_tr, y_tr, batch_size=64, epochs=200,
                    validation_data=(x_val, y_val),
                    callbacks=[early_stopping],
                    verbose=2)

Epoch 1/200
73/73 - 1s - loss: 1.2872 - acc: 0.4664 - mae: 0.1917 - val_loss: 1.1571 - val_acc: 0.5091 - val_mae: 0.1787
Epoch 2/200
73/73 - 0s - loss: 1.1722 - acc: 0.5002 - mae: 0.1777 - val_loss: 1.1028 - val_acc: 0.5406 - val_mae: 0.1726
Epoch 3/200
73/73 - 0s - loss: 1.1428 - acc: 0.5182 - mae: 0.1738 - val_loss: 1.0799 - val_acc: 0.5491 - val_mae: 0.1683
Epoch 4/200
73/73 - 0s - loss: 1.1277 - acc: 0.5203 - mae: 0.1722 - val_loss: 1.1126 - val_acc: 0.5079 - val_mae: 0.1705
Epoch 5/200
73/73 - 0s - loss: 1.1133 - acc: 0.5274 - mae: 0.1708 - val_loss: 1.0650 - val_acc: 0.5539 - val_mae: 0.1675
Epoch 6/200
73/73 - 0s - loss: 1.1044 - acc: 0.5287 - mae: 0.1705 - val_loss: 1.0584 - val_acc: 0.5491 - val_mae: 0.1659
Epoch 7/200
73/73 - 0s - loss: 1.0990 - acc: 0.5336 - mae: 0.1699 - val_loss: 1.0554 - val_acc: 0.5503 - val_mae: 0.1652
Epoch 8/200
73/73 - 0s - loss: 1.0983 - acc: 0.5253 - mae: 0.1696 - val_loss: 1.0602 - val_acc: 0.5539 - val_mae: 0.1649
Epoch 9/200
73/73 - 0s - loss: 1

In [None]:
# Report loss and the compiled metrics (acc, mae) on the hold-out split.
# NOTE: this same split was passed as validation_data for early stopping, so the numbers are likely optimistic.
model.evaluate(x_val,y_val)



[1.016330361366272, 0.5721212029457092, 0.15763208270072937]

In [None]:
# Predict class probabilities for the test data.
# The model was trained on min-max scaled features, so it must be given the
# scaled test matrix; feeding the raw `x_test` frame produces meaningless
# probabilities.
y_pred_proba = model.predict(x_test_scaled)
y_pred_proba[:5]

array([[0.40388593, 0.00575143, 0.00817167, 0.02971075, 0.020928  ,
        0.09292078, 0.43863142],
       [0.5774761 , 0.00433352, 0.00770558, 0.02149555, 0.04819646,
        0.08464994, 0.25614282],
       [0.47045806, 0.00515812, 0.00643516, 0.02009124, 0.02587503,
        0.09177318, 0.38020927],
       [0.29328004, 0.00366787, 0.00226387, 0.00620042, 0.00696496,
        0.03489783, 0.65272504],
       [0.59020454, 0.00525206, 0.00877431, 0.02565144, 0.0483153 ,
        0.08779415, 0.23400821]], dtype=float32)

In [None]:
# Take the most probable class index per row and add 3 to turn it back into a quality score.
y_pred_label = 3 + y_pred_proba.argmax(axis=-1)
y_pred_label[:5]

array([9, 3, 3, 9, 3])

In [None]:
# Fill the submission template's quality column with the predicted labels.
submission['quality']= y_pred_label.astype(int)
submission.head()

Unnamed: 0,index,quality
0,0,9
1,1,3
2,2,3
3,3,9
4,4,3


In [None]:
# Save the submission file to the Drive wine/ folder, omitting the DataFrame index.
submission.to_csv(drive_path+"wine/wine_dnn_001.csv",index=False)