In [37]:
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
from scipy import interpolate
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [38]:
w_list = sorted(glob("data/water_data/*.csv"))

In [39]:
# 훈련 데이터 수집
# 실험 결과 "tototf", "fw_1019630" 두가지만 사용할때가 가장 좋았음

train_data = []
train_label = []
train_data_append = train_data.append
train_label_append = train_label.append
for i in w_list[:-1]:
    tmp = pd.read_csv(i)
    tmp = tmp.replace(" ", np.nan)

    for j in tqdm(range(len(tmp)-1)):
        # 라벨이 이상하면 패스
        if (tmp.iloc[j+1]["tototf"] == 0) or (tmp.iloc[j+1]["tototf"] != tmp.iloc[j+1]["tototf"]):
            continue
        # 데이터가 이상하면 패스
        if (tmp.iloc[j]["tototf"] == 0) or (tmp.iloc[j]["tototf"] != tmp.iloc[j]["tototf"]):
            continue
        if (tmp.iloc[j]["fw_1019630"] == 0) or (tmp.iloc[j]["fw_1019630"] != tmp.iloc[j]["fw_1019630"]):
            continue

        train_data_append(tmp.iloc[j][["tototf", "fw_1019630"]])
        train_label_append(tmp.iloc[j+1]["tototf"])

100%|██████████| 26495/26495 [00:29<00:00, 898.60it/s]
100%|██████████| 26495/26495 [00:30<00:00, 880.86it/s]
100%|██████████| 26495/26495 [00:32<00:00, 817.06it/s]
100%|██████████| 26495/26495 [00:32<00:00, 827.17it/s]
100%|██████████| 26495/26495 [00:43<00:00, 606.62it/s]
100%|██████████| 26495/26495 [00:46<00:00, 571.32it/s]
100%|██████████| 26495/26495 [00:39<00:00, 666.85it/s]
100%|██████████| 26495/26495 [00:46<00:00, 574.99it/s]
100%|██████████| 26495/26495 [00:51<00:00, 512.97it/s]
100%|██████████| 26495/26495 [00:42<00:00, 624.25it/s]


In [43]:
print(np.array(train_data).shape)
print(np.array(train_label).shape)
train_data = np.array(train_data)
train_label = np.array(train_label)

(263510, 2)
(263510,)


In [57]:
# 검증

X_train, X_test, y_train, y_test = train_test_split(train_data, train_label, test_size=0.2)

model = XGBRegressor()
model.fit(X_train, y_train)
print(mean_absolute_error(model.predict(X_test), y_test))

11.301017692073428


In [58]:
model = XGBRegressor()
model.fit(train_data, train_label)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [69]:
model.feature_importances_

array([0.99668646, 0.00331349], dtype=float32)

In [59]:
w_list = sorted(glob("data/water_data/*.csv"))

In [61]:
df = pd.read_csv(w_list[0], index_col=0)
for i in w_list[1:-1]:
    df = pd.concat([df, pd.read_csv(i, index_col=0)])
df_2022 = pd.read_csv(w_list[-1], index_col=0)

column = "tototf"
print(f"nan count of {column} : {df[str(column)].isna().sum()}")
print(f"zero count of {column} : {(df[str(column)]==0).sum()}")
print("-----------")
print(f"nan count of {column} : {df_2022[str(column)].isna().sum()}")
print(f"zero count of {column} : {(df_2022[str(column)]==0).sum()}")
impute_df = df.copy()
impute_2022 = df_2022.copy()

nan count of tototf : 707
zero count of tototf : 441
-----------
nan count of tototf : 36
zero count of tototf : 6


In [65]:
for i in tqdm(range(len(impute_df)-1)):
    if (impute_df.iloc[i+1]["tototf"] == 0) or (impute_df.iloc[i+1]["tototf"] != impute_df.iloc[i+1]["tototf"]):  # 타겟이 0이거나 nan일때
        if (impute_df.iloc[i]["tototf"] != 0) and (impute_df.iloc[i]["tototf"] == impute_df.iloc[i]["tototf"]):  # xptmxm
            if (impute_df.iloc[i]["fw_1019630"] != 0) and (impute_df.iloc[i]["fw_1019630"] == impute_df.iloc[i]["fw_1019630"]):
                impute_df.iloc[i+1]["tototf"] = model.predict(np.array(impute_df.iloc[i:i+1][["tototf", "fw_1019630"]]))

for i in tqdm(range(len(df_2022)-1)):
    if (df_2022.iloc[i+1]["tototf"] == 0) or (df_2022.iloc[i+1]["tototf"] != df_2022.iloc[i+1]["tototf"]):  # 타겟이 0이거나 nan일때
        if (df_2022.iloc[i]["tototf"] != 0) and (df_2022.iloc[i]["tototf"] == df_2022.iloc[i]["tototf"]):  # xptmxm
            if (df_2022.iloc[i]["fw_1019630"] != 0) and (df_2022.iloc[i]["fw_1019630"] == df_2022.iloc[i]["fw_1019630"]):
                df_2022.iloc[i+1]["tototf"] = model.predict(np.array(df_2022.iloc[i:i+1][["tototf", "fw_1019630"]]))


100%|██████████| 264959/264959 [00:34<00:00, 7690.87it/s]
100%|██████████| 11375/11375 [00:01<00:00, 6567.68it/s]


In [66]:
column = "tototf"
print(f"nan count of {column} : {impute_df[str(column)].isna().sum()}")
print(f"zero count of {column} : {(impute_df[str(column)]==0).sum()}")
print("-----------")
print(f"nan count of {column} : {df_2022[str(column)].isna().sum()}")
print(f"zero count of {column} : {(df_2022[str(column)]==0).sum()}")

nan count of tototf : 0
zero count of tototf : 0
-----------
nan count of tototf : 0
zero count of tototf : 0


In [67]:
impute_df.to_csv("data_2012~2021 impute_tototf.csv")
df_2022.to_csv("data_2022 impute_tototf.csv")