In [1]:
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
from scipy import interpolate
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [2]:
w_list = sorted(glob("data/water_data/*.csv"))

In [3]:
# 훈련 데이터 수집
# 실험 결과 "tototf", "fw_1019630" 두가지만 사용할때가 가장 좋았음

train_data = []
train_label = []
train_data_append = train_data.append
train_label_append = train_label.append
for i in w_list[:-1]:
    tmp = pd.read_csv(i)
    tmp = tmp.replace(" ", np.nan)

    for j in tqdm(range(len(tmp)-1)):
        # 라벨이 이상하면 패스 : 0이거나, nan이거나, 20000 이상의 이상치거나
        if (tmp.iloc[j+1]["tototf"] == 0) or (tmp.iloc[j+1]["tototf"] != tmp.iloc[j+1]["tototf"]) or (tmp.iloc[j+1]["tototf"] > 20000):
            continue
        # 데이터가 이상하면 패스
        if (tmp.iloc[j]["tototf"] == 0) or (tmp.iloc[j]["tototf"] != tmp.iloc[j]["tototf"]) or (tmp.iloc[j]["tototf"] > 20000):
            continue
        if (tmp.iloc[j]["fw_1019630"] == 0) or (tmp.iloc[j]["fw_1019630"] != tmp.iloc[j]["fw_1019630"]):
            continue

        train_data_append(tmp.iloc[j][["tototf", "fw_1019630"]])
        train_label_append(tmp.iloc[j+1]["tototf"])

100%|██████████| 26495/26495 [00:38<00:00, 687.10it/s]
100%|██████████| 26495/26495 [00:40<00:00, 660.94it/s]
100%|██████████| 26495/26495 [00:47<00:00, 554.79it/s]
100%|██████████| 26495/26495 [00:50<00:00, 526.82it/s]
100%|██████████| 26495/26495 [00:39<00:00, 678.54it/s]
100%|██████████| 26495/26495 [00:42<00:00, 629.51it/s]
100%|██████████| 26495/26495 [00:42<00:00, 628.38it/s]
100%|██████████| 26495/26495 [00:41<00:00, 634.38it/s]
100%|██████████| 26495/26495 [00:44<00:00, 601.45it/s]
100%|██████████| 26495/26495 [00:44<00:00, 594.82it/s]


In [4]:
print(np.array(train_data).shape)
print(np.array(train_label).shape)
train_data = np.array(train_data)
train_label = np.array(train_label)

(263506, 2)
(263506,)


In [5]:
# 검증

X_train, X_test, y_train, y_test = train_test_split(train_data, train_label, test_size=0.2)

model = XGBRegressor()
model.fit(X_train, y_train)
print(mean_absolute_error(model.predict(X_test), y_test))

11.508948059313655


In [6]:
model = XGBRegressor()
model.fit(train_data, train_label)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [7]:
model.feature_importances_

array([9.993768e-01, 6.231940e-04], dtype=float32)

In [8]:
w_list = sorted(glob("data/water_data/*.csv"))

In [9]:
df = pd.read_csv(w_list[0], index_col=0)
for i in w_list[1:-1]:
    df = pd.concat([df, pd.read_csv(i, index_col=0)])
df_2022 = pd.read_csv(w_list[-1], index_col=0)

column = "tototf"
print(f"nan count of {column} : {df[str(column)].isna().sum()}")
print(f"zero count of {column} : {(df[str(column)]==0).sum()}")
print(f"anomaly count of {column} : {(df[str(column)] > 20000).sum()}")
print("-----------")
print(f"nan count of {column} : {df_2022[str(column)].isna().sum()}")
print(f"zero count of {column} : {(df_2022[str(column)]==0).sum()}")
print(f"anomaly count of {column} : {(df_2022[str(column)] > 20000).sum()}")
impute_df = df.copy()
impute_2022 = df_2022.copy()

nan count of tototf : 707
zero count of tototf : 441
anomaly count of tototf : 2
-----------
nan count of tototf : 36
zero count of tototf : 6
anomaly count of tototf : 0


In [10]:
for i in tqdm(range(len(impute_df)-1)):
    if (impute_df.iloc[i+1]["tototf"] == 0) or (impute_df.iloc[i+1]["tototf"] != impute_df.iloc[i+1]["tototf"]) or (impute_df.iloc[i+1]["tototf"] > 20000):  # 타겟이 0이거나 nan이거나 이상치일때
        if (impute_df.iloc[i]["tototf"] != 0) and (impute_df.iloc[i]["tototf"] == impute_df.iloc[i]["tototf"]) and (impute_df.iloc[i]["tototf"] <= 20000):
            if (impute_df.iloc[i]["fw_1019630"] != 0) and (impute_df.iloc[i]["fw_1019630"] == impute_df.iloc[i]["fw_1019630"]):
                impute_df.iloc[i+1]["tototf"] = model.predict(np.array(impute_df.iloc[i:i+1][["tototf", "fw_1019630"]]))

for i in tqdm(range(len(df_2022)-1)):
    if (df_2022.iloc[i+1]["tototf"] == 0) or (df_2022.iloc[i+1]["tototf"] != df_2022.iloc[i+1]["tototf"]) or (df_2022.iloc[i+1]["tototf"] > 20000):  # 타겟이 0이거나 nan이거나 이상치일때
        if (df_2022.iloc[i]["tototf"] != 0) and (df_2022.iloc[i]["tototf"] == df_2022.iloc[i]["tototf"]) and (df_2022.iloc[i]["tototf"] <= 20000):  # xptmxm
            if (df_2022.iloc[i]["fw_1019630"] != 0) and (df_2022.iloc[i]["fw_1019630"] == df_2022.iloc[i]["fw_1019630"]):
                df_2022.iloc[i+1]["tototf"] = model.predict(np.array(df_2022.iloc[i:i+1][["tototf", "fw_1019630"]]))


100%|██████████| 264959/264959 [01:21<00:00, 3262.51it/s]
100%|██████████| 11375/11375 [00:03<00:00, 3336.25it/s]


In [11]:
column = "tototf"
print(f"nan count of {column} : {impute_df[str(column)].isna().sum()}")
print(f"zero count of {column} : {(impute_df[str(column)]==0).sum()}")
print(f"anomaly count of {column} : {(impute_df[str(column)] > 20000).sum()}")
print("-----------")
print(f"nan count of {column} : {df_2022[str(column)].isna().sum()}")
print(f"zero count of {column} : {(df_2022[str(column)]==0).sum()}")
print(f"anomaly count of {column} : {(df_2022[str(column)] > 20000).sum()}")

nan count of tototf : 0
zero count of tototf : 0
anomaly count of tototf : 0
-----------
nan count of tototf : 0
zero count of tototf : 0
anomaly count of tototf : 0


In [12]:
impute_df.to_csv("data_2012~2021 impute_tototf.csv")
df_2022.to_csv("data_2022 impute_tototf.csv")