## Baseline Code

In [169]:
import random
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns

In [170]:
# Fix the random seeds so that the random functions return the same values on every run.
def seed_everything(seed):
    """Seed the pseudo-random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global RNG, and exports
    ``PYTHONHASHSEED`` to the process environment.

    Parameters
    ----------
    seed : int
        Value handed to ``random.seed`` and ``np.random.seed``; its string
        form is stored in ``os.environ['PYTHONHASHSEED']``.
    """
    random.seed(seed)  # Python's built-in random module
    # Only the environment variable is set here; child processes inherit it.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)  # NumPy's global random state


# The call must sit at module level: nested inside the function body it never
# ran on definition and would recurse without end once the function was called.
seed_everything(42)

## 데이터 불러오기

In [171]:
# Read the training set, the test set and the sample submission, in that order.
train_df, test_df, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train.csv',
        './data/test.csv',
        './data/sample_submission.csv',
    )
)

## 데이터 전처리

In [172]:
# Replace every missing value in the training frame with 0.
# NOTE(review): only train_df is filled here; test_df is left untouched - confirm it has no NaNs.
train_df = train_df.fillna(0)

In [173]:
# Korean -> English names for the columns that are renamed in both frames.
shared_column_names = {
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
}

# Columns that are renamed in the training frame only.
train_only_column_names = {
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption',
}

# num_date_time is just building_number + date_time, so it is dropped from both frames.
train_df = (
    train_df
    .rename(columns={**shared_column_names, **train_only_column_names})
    .drop(columns=['num_date_time'])
)
test_df = (
    test_df
    .rename(columns=shared_column_names)
    .drop(columns=['num_date_time'])
)

In [174]:
# Parse the 'YYYYMMDD HH' timestamps, then derive hour / day / month / year
# columns for both the training and the test frame.
for frame in (train_df, test_df):
    frame['date_time'] = pd.to_datetime(frame['date_time'], format='%Y%m%d %H')

    # Each calendar feature is read from the matching `.dt` accessor attribute.
    for part in ('hour', 'day', 'month', 'year'):
        frame[part] = getattr(frame['date_time'].dt, part)

In [175]:
# Print the column names, non-null counts and dtypes of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204000 entries, 0 to 203999
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   building_number    204000 non-null  int64         
 1   date_time          204000 non-null  datetime64[ns]
 2   temperature        204000 non-null  float64       
 3   rainfall           204000 non-null  float64       
 4   windspeed          204000 non-null  float64       
 5   humidity           204000 non-null  float64       
 6   sunshine           204000 non-null  float64       
 7   solar_radiation    204000 non-null  float64       
 8   power_consumption  204000 non-null  float64       
 9   hour               204000 non-null  int64         
 10  day                204000 non-null  int64         
 11  month              204000 non-null  int64         
 12  year               204000 non-null  int64         
dtypes: datetime64[ns](1), float64(7), int64(5)
m

In [176]:
# Mean power consumption for each hour of the day, followed by summary
# statistics of those hourly means.
hourly_groups = train_df.groupby('hour')
mean_power_by_hour = hourly_groups['power_consumption'].mean()
mean_power_by_hour.describe()

count      24.000000
mean     2451.036462
std       548.862070
min      1735.193839
25%      1905.385732
50%      2500.096996
75%      3036.601163
max      3110.862278
Name: power_consumption, dtype: float64

In [177]:
# Power consumption rises during the day, so each hour is assigned to one of
# three usage bands (1, 2, 3).
def hour_session(hour):
    """Return the usage band of an hour of the day.

    Returns 2 for hours 10-16 (mean consumption of 3000 or more), 1 for hours
    7-9 and 17-22 (2000 or more but below 3000), and 3 for every other hour
    (below 2000).
    """
    if 10 <= hour <= 16:
        return 2
    if 7 <= hour <= 9 or 17 <= hour <= 22:
        return 1
    return 3

# Add the hour band as a feature to both frames.
for frame in (train_df, test_df):
    frame['hour_session'] = frame['hour'].apply(hour_session)

In [178]:
# Extract the day of the week (Monday: 0 ... Sunday: 6) for both frames.
for frame in (train_df, test_df):
    frame['day_of_week'] = frame['date_time'].dt.dayofweek


In [179]:
# Mean power consumption for each day of the week.
weekday_groups = train_df.groupby('day_of_week')
mean_power_by_day_of_week = weekday_groups['power_consumption'].mean()
mean_power_by_day_of_week

day_of_week
0    2488.521606
1    2580.850905
2    2518.146239
3    2550.899919
4    2561.655405
5    2266.220305
6    2185.368371
Name: power_consumption, dtype: float64

In [180]:
# Weekends use less power than weekdays, so flag them (weekday: 1, weekend: 0).
def day_of_week_session(day_of_week):
    """Return 1 when `day_of_week` lies in 0..4 (a weekday), otherwise 0."""
    return int(0 <= day_of_week <= 4)

# Add the weekday/weekend flag as a feature to both frames.
for frame in (train_df, test_df):
    frame['day_of_week_session'] = frame['day_of_week'].apply(day_of_week_session)

## 모델링

In [181]:
# Target: power consumption. Features: every remaining column except the raw
# timestamp and the sunshine / solar-radiation columns.
target_column = 'power_consumption'
train_y = train_df[target_column]
train_x = train_df.drop(
    columns=['date_time', 'sunshine', 'solar_radiation', target_column]
)

In [182]:
# Feature matrix for the test set: every column except the raw timestamp.
x_test = test_df.drop(columns=['date_time'])

## Regression Model Fit

In [215]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

In [184]:
# One k-NN regressor per candidate neighbour count (3, 5, 7, 9).
KNR_3, KNR_5, KNR_7, KNR_9 = (
    KNeighborsRegressor(n_neighbors=k, n_jobs=-1) for k in (3, 5, 7, 9)
)

# Shuffled 5-fold cross-validation with a fixed seed; the mean negative MSE
# of each model is printed in order of increasing k.
# NOTE(review): the original marked the k=5 line with "# o" (presumably the
# chosen model) - confirm.
kfold = KFold(n_splits=5, shuffle=True, random_state=10)
for knr in (KNR_3, KNR_5, KNR_7, KNR_9):
    fold_scores = cross_val_score(
        knr, train_x, train_y, cv=kfold, scoring='neg_mean_squared_error'
    )
    print(np.mean(fold_scores))

-2465443.3878222886
-2471335.879383455
-2553889.8670564815
-2634706.215491025


In [187]:
# Hold out 30% of the training rows (fixed seed 42).
# NOTE(review): this rebinds train_x / train_y to the 70% split, so re-running
# the cell splits the already-reduced data again, and every later fit on
# train_x / train_y sees only that subset.
# NOTE(review): test_x / test_y are not used by any cell visible here, and
# test_x is easy to confuse with x_test (the competition test features).
train_x, test_x, train_y, test_y = train_test_split(train_x, train_y, test_size=0.3, random_state=42)

In [201]:
# Base estimator for the grid search. No visible cell defines `RF`, so on a
# fresh kernel this cell raised NameError; it is defined here.
# NOTE(review): a default forest with a fixed seed is assumed - confirm this
# matches the estimator originally used.
RF = RandomForestRegressor(random_state=42)

# Hyper-parameter grid: 3 x 3 x 3 = 27 candidates, each scored with 5-fold CV.
param = {'min_samples_split' : [30, 50, 70],
         'max_depth': [5, 6, 7],
         'n_estimators': [50, 150, 250]}
gs = GridSearchCV(estimator=RF, param_grid=param, cv=5, refit=True, scoring='neg_mean_squared_error')
gs.fit(train_x, train_y)

# best_score_ is the mean cross-validated negative MSE of the best candidate.
print('GridSearchCV 최고 -MSE: {0:.4f}'.format(gs.best_score_))
print('GridSearchCV 최적 파라미터: ' , gs.best_params_)

# refit=True means best_estimator_ has been refitted on all of train_x / train_y.
pred = gs.best_estimator_.predict(x_test)

GridSearchCV 최고 -MSE: -687438.0996
GridSearchCV 최적 파라미터:  {'max_depth': 7, 'min_samples_split': 50, 'n_estimators': 50}


In [209]:
# Default random forest - currently the best-scoring model.
# random_state is fixed so that re-running the notebook reproduces the same
# forest; without it every run produced a different submission.
model = RandomForestRegressor(random_state=42)
model.fit(train_x, train_y)

RandomForestRegressor()

In [223]:
# Predict power consumption for the competition test features with the fitted forest.
pred1 = model.predict(x_test)

In [212]:
# Store the predictions in the 'answer' column of the submission frame.
submission['answer'] = pred1

In [213]:
# Write the submission file without the index column.
submission.to_csv('./data/submission_7_30_3.csv', index = False)

array([2011.9248, 2134.0848, 2001.5376, ...,  761.088 ,  733.3704,
        439.476 ])