In [61]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input, Dropout, Conv2D, Flatten
from sklearn.preprocessing import MinMaxScaler as MMS, StandardScaler as SDS
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
import pandas as pd

# 1. 데이터

In [66]:

path = 'C:/study/keras/keras_data/ddarung/'

# index_col=0 uses the first CSV column as the index, so it is not treated as a feature.
train_data = pd.read_csv(path + 'train.csv', index_col=0)
test_data = pd.read_csv(path + 'test.csv', index_col=0)
submission = pd.read_csv(path + 'submission.csv', index_col=0)

# ---------------------- Missing values (training set: drop rows) ---------------------- #
train_data = train_data.dropna()

# ---------------------- Split features / target ---------------------- #
x = train_data.drop(['count'], axis=1)  # every column except the target; axis=1 -> operate on columns
y = train_data['count']                 # target column only

# print(train_data.shape)        # (1459, 10)
# print(test_data.shape)         # (715, 9)
# print(train_data.columns)      'hour', 'hour_bef_temperature', 'hour_bef_precipitation',
#                               'hour_bef_windspeed', 'hour_bef_humidity', 'hour_bef_visibility',
#                               'hour_bef_ozone', 'hour_bef_pm10', 'hour_bef_pm2.5', 'count'
# print(train_data.info())       # shows missing values (cells with no data)
# print(train_data.describe())   # mean, standard deviation, max, ...

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=44)

# ---------------------- Missing values (submission set: impute) ---------------------- #
# Rows of test_data cannot be dropped because the prediction vector is assigned
# to `submission` row-for-row later on. MinMaxScaler keeps NaN as NaN, which would
# turn into NaN predictions, so fill any gaps with the training-split column means.
# This is a no-op when test.csv has no missing values.
test_data = test_data.fillna(x_train.mean())

scaler = MMS()
# scaler = SDS()
# Fit on the training split only; reuse the fitted scaler for the other sets.
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
test_data = scaler.transform(test_data)

print("x_train: ", x_train.shape, "x_test:", x_test.shape)

# ---------- Reshape to 4D (samples, features, 1, 1) so a Conv2D model can consume it ---------- #
# Use the real feature count instead of a hard-coded 9 so the reshape cannot
# silently regroup values if the column set changes.
n_features = x_train.shape[1]
x_train = x_train.reshape(-1, n_features, 1, 1)
x_test = x_test.reshape(-1, n_features, 1, 1)
test_data = test_data.reshape(-1, n_features, 1, 1)

print("x_train: ", x_train.shape, "x_test:", x_test.shape)


x_train:  (929, 9) x_test: (399, 9)
x_train:  (929, 9, 1, 1) x_test: (399, 9, 1, 1)


# 2. 모델


In [63]:
# Conv2D feature extractor followed by a dense regression head.
# Dropout layers are interleaved to reduce overfitting.
model = Sequential([
    # Input is (9, 1, 1): each feature is one "row" of a 9x1 single-channel image.
    Conv2D(32, (2, 1), input_shape=(9, 1, 1), activation='relu'),
    Dropout(0.5),
    Conv2D(16, (2, 1), activation='relu'),
    Dropout(0.3),
    # Flatten the convolution output so the Dense layers can consume it.
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    # Single linear output unit: one predicted value per sample.
    Dense(1),
])

# 3. 컴파일 및 훈련

In [64]:
# Regression on a numeric target, so the loss is mean squared error.
model.compile(optimizer='adam', loss='mse')

# Stop when val_loss has not improved for 4 epochs and restore the best weights seen.
ES = EarlyStopping(
    monitor='val_loss',
    mode='auto',
    patience=4,
    restore_best_weights=True,
)

# validation_split=0.2 holds out 20% of the training data for validation;
# verbose=2 logs one summary line per epoch.
model.fit(
    x_train,
    y_train,
    epochs=64,
    batch_size=8,
    validation_split=0.2,
    callbacks=[ES],
    verbose=2,
)

Epoch 1/64


  return t[start:end]


93/93 - 1s - loss: 10456.4639 - val_loss: 4729.0308 - 727ms/epoch - 8ms/step
Epoch 2/64
93/93 - 0s - loss: 5519.7495 - val_loss: 3899.2231 - 173ms/epoch - 2ms/step
Epoch 3/64
93/93 - 0s - loss: 4888.3965 - val_loss: 3338.2578 - 194ms/epoch - 2ms/step
Epoch 4/64
93/93 - 0s - loss: 4606.8491 - val_loss: 3118.8027 - 158ms/epoch - 2ms/step
Epoch 5/64
93/93 - 0s - loss: 4907.0791 - val_loss: 2998.6780 - 158ms/epoch - 2ms/step
Epoch 6/64
93/93 - 0s - loss: 4253.4038 - val_loss: 3020.2632 - 154ms/epoch - 2ms/step
Epoch 7/64
93/93 - 0s - loss: 4517.2007 - val_loss: 2602.3010 - 156ms/epoch - 2ms/step
Epoch 8/64
93/93 - 0s - loss: 4147.7642 - val_loss: 2997.7283 - 154ms/epoch - 2ms/step
Epoch 9/64
93/93 - 0s - loss: 3880.4873 - val_loss: 2694.3796 - 161ms/epoch - 2ms/step
Epoch 10/64
93/93 - 0s - loss: 3526.1389 - val_loss: 2422.2131 - 156ms/epoch - 2ms/step
Epoch 11/64
93/93 - 0s - loss: 3900.6306 - val_loss: 2478.4453 - 156ms/epoch - 2ms/step
Epoch 12/64
93/93 - 0s - loss: 3765.1975 - val_loss

<keras.callbacks.History at 0x1f4b0dadcd0>

# 4. 평가 및 예측

In [65]:
# Loss on the held-out split; the model is compiled with loss='mse', so this is the MSE.
loss = model.evaluate(x=x_test, y=y_test, verbose=2)
print('loss(mse): ', loss)

# Predictions on the held-out split, used for the sklearn metrics below.
y_predict = model.predict(x_test)

test_mse = mean_squared_error(y_true=y_test, y_pred=y_predict)
RMSE = np.sqrt(test_mse)
print("RMSE: ", RMSE)

r2 = r2_score(y_true=y_test, y_pred=y_predict)
print("R2: ", r2)

# Predict on the submission set and write the result next to the input files.
y_submit = model.predict(test_data)
submission['count'] = y_submit
submission.to_csv(path + 'submission_0125.csv')

13/13 - 0s - loss: 2729.2253 - 23ms/epoch - 2ms/step
loss(mse):  2729.225341796875
RMSE:  52.24198865943341
R2:  0.5982468264419787
