In [1]:
import os

import numpy as np
import pandas as pd

import seaborn as sns 
import matplotlib.pyplot as plt

# Default figure size (inches) for every matplotlib figure in this notebook.
plt.rcParams['figure.figsize']=(10,10)
# NOTE(review): 'AppleGothic' is presumably chosen so Korean labels render;
# confirm the font exists on the machine running this notebook.
plt.rcParams['font.family']='AppleGothic'

import warnings
# Silences ALL warnings for the rest of the session, including pandas
# deprecation and chained-assignment warnings raised by later cells.
warnings.filterwarnings(action='ignore')


In [2]:
def read_csv_by_dir(path):
    """Load every ``.csv`` file in ``path`` into one datetime-indexed frame.

    Files are read in sorted file-name order. The first column of each file
    is used as the index and parsed with the format ``%Y-%m-%d %H:%M``.
    Entries that do not end in ``.csv`` are skipped.

    Parameters
    ----------
    path : str
        Directory containing the CSV files.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all CSV files, or an empty DataFrame when
        the directory holds no CSV file.
    """
    frames = []
    for file_name in sorted(os.listdir(path)):
        # Skip non-CSV entries entirely; previously the concat ran for them
        # too and re-appended the last CSV (or failed if none was read yet).
        if not file_name.endswith('.csv'):
            continue
        df = pd.read_csv(os.path.join(path, file_name), index_col=0)
        df.index = pd.to_datetime(df.index, format='%Y-%m-%d %H:%M')
        frames.append(df)

    if not frames:
        return pd.DataFrame()
    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(frames, axis=0)

In [3]:
path = './data'

# Raw inputs: two folders of CSV files plus the sample submission file.
# The underscore-prefixed frames are kept exactly as loaded.
_df_rf_raw = read_csv_by_dir('/'.join([path, 'rf_data']))
_df_water_raw = read_csv_by_dir('/'.join([path, 'water_data']))
_submission_raw = pd.read_csv('/'.join([path, 'sample_submission.csv']), index_col=0)
_submission_raw.index = pd.to_datetime(_submission_raw.index, format='%Y-%m-%d %H:%M')

# Working copies ordered by their datetime index.
df_rf = _df_rf_raw.copy().sort_index()
df_water = _df_water_raw.copy().sort_index()
submission = _submission_raw.copy().sort_index()

In [4]:
# Check the time range covered by the data
def check_datetime(df):
    """Print a separator line, then the first and last index label of ``df``.

    The labels are read by position (``index[0]`` / ``index[-1]``), so they
    are the earliest and latest timestamps only if ``df`` is already sorted
    by its index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index should be inspected.

    Returns
    -------
    None
    """
    print('=========')
    if len(df.index) == 0:
        # Nothing to report; avoid the IndexError an empty index would raise.
        print('(empty index)')
        return None
    # Read the index directly; filtering columns by dtype never changes it.
    print(df.index[0])
    print(df.index[-1])
    return None

# Report the first/last timestamp of each loaded frame.
for frame in (df_rf, df_water, submission):
    check_datetime(frame)

2012-05-01 00:00:00
2022-07-18 23:50:00
2012-05-01 00:00:00
2022-07-18 23:50:00
2022-06-01 00:00:00
2022-07-18 23:50:00


In [5]:
# Split into target and data (features).
# Target = the df_water columns that also appear in the submission file;
# data = all df_rf columns plus the remaining df_water columns.
target = df_water.loc[:,submission.columns]
data = pd.concat((df_rf,df_water.drop(submission.columns,axis=1)),axis=1)

# Shift data against target by one row (because past data must be used).
# reset_index(drop=True) discards the datetime index in favour of 0..N-1.
_target = target.reset_index(drop=True)
_data = data.reset_index(drop=True)

# Data row i is relabelled i+1, so it lines up with target row i+1.
_data.index += 1

# Column-wise concat aligns on the integer labels: each row now pairs the
# target at position i with the data from position i-1.
tot=pd.concat((_data,_target),axis=1)
tot=tot.sort_index()

# Drop the first row (no shifted data) and the last row (no target).
tot=tot.iloc[1:-1]

target = tot.loc[:,submission.columns]
data = tot.drop(submission.columns,axis=1)

In [6]:
# Hold out the last len(submission) rows as the test period.
# A non-negative split position is used instead of a negative slice because
# `-0` would invert the split (empty train, everything in test) if the
# submission frame were empty.
n_test = len(submission)
target_split = max(len(target) - n_test, 0)
data_split = max(len(data) - n_test, 0)

# Explicit copies: later cells fill NaNs in these frames, which should not
# operate on slices of `target` / `data`.
train_target = target.iloc[:target_split, :].copy()
test_target = target.iloc[target_split:, :].copy()

train_data = data.iloc[:data_split, :].copy()
test_data = data.iloc[data_split:, :].copy()


In [22]:
# Forward-fill gaps with the previous row's value.
# `.ffill()` replaces the deprecated `fillna(method='ffill')`; reassigning
# instead of `inplace=True` keeps each frame an independent object.
train_target = train_target.ffill()
test_target = test_target.ffill()
train_data = train_data.ffill()
test_data = test_data.ffill()

# Forward fill cannot fill NaNs at the very start of the test window, so
# fall back to the per-column mean of the training data.
test_data = test_data.fillna(train_data.mean())
test_data.isna().sum()

rf_10184100    0
rf_10184110    0
rf_10184140    0
swl            0
inf            0
sfw            0
ecpc           0
tototf         0
tide_level     0
fw_1018662     0
fw_1018680     0
fw_1018683     0
fw_1019630     0
dtype: int64

In [8]:
# Print (rows, columns) of the train and test parts for features and targets.
for heading, train_part, test_part in (
    ('--data--', train_data, test_data),
    ('--target--', train_target, test_target),
):
    print(heading)
    print(train_part.shape)
    print(test_part.shape)

--data--
(269423, 13)
(6912, 13)
--target--
(269423, 4)
(6912, 4)


In [12]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Base estimator; each forest trains its trees on 2 worker processes/threads.
rf = RandomForestRegressor(n_jobs=2)

# Only the number of trees is searched.
params = {
    "n_estimators" : (100, 150, 200)
}

# No `cv` is passed, so GridSearchCV uses its default splitter.
# To override, pass e.g. cv=KFold(n_splits=2, shuffle=False).
# NOTE(review): n_jobs=-1 here combined with n_jobs=2 on the estimator may
# oversubscribe CPU cores; confirm this is intended.
grid_cv = GridSearchCV(rf,
                       param_grid=params,
                       n_jobs=-1)

In [13]:
# Run the grid search: fits the forest for every n_estimators candidate on
# the training features against the training targets.
grid_cv.fit(train_data,train_target)

GridSearchCV(estimator=RandomForestRegressor(n_jobs=2), n_jobs=-1,
             param_grid={'n_estimators': (100, 150, 200)})

In [14]:
# Display the estimator the grid search selected as best.
grid_cv.best_estimator_

RandomForestRegressor(n_estimators=200, n_jobs=2)

In [23]:
# `best_estimator_` only exists when GridSearchCV refits the winning
# configuration on the full training data, so it is already trained; fitting
# it again would only repeat the most expensive step of the notebook.
model = grid_cv.best_estimator_
y_pred = model.predict(test_data)

# test_data follows the index-sorted order of `submission`, so label the
# predictions with that index and columns first, then reorder them to the
# row order of the raw submission file before writing. Assigning y_pred
# positionally to the unsorted raw frame would misplace rows whenever the
# file is not already in chronological order.
predictions = pd.DataFrame(y_pred, index=submission.index, columns=submission.columns)
_submission_raw.iloc[:,:] = predictions.reindex(_submission_raw.index).to_numpy()
_submission_raw.to_csv('ans.csv')

In [24]:
import pickle

# Serialize the fitted model to disk so it can be reloaded without retraining.
with open('data/model', mode='wb') as model_file:
    pickle.dump(model, model_file)


In [None]:
# Reload the pickled model into `rf` (rebinding that name).
# Relies on `pickle` having been imported by an earlier cell.
# SECURITY: pickle.load executes arbitrary code from the file; only load
# 'data/model' when it was written by this notebook or another trusted source.
with open('data/model', 'rb') as f:
    rf = pickle.load(f)