# Modules

In [4]:
import sys, os, datetime
sys.path.append("../src")

import numpy as np
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

from dataset import ds_generator
from config import data_configuration
from exp_helpers import result_path, format_results

## Config

In [5]:
# Select the indices to calculate: rows of data_details.pkl where
# 'network exist' and 'gauge available' are set and neither
# 'lack of data' nor 'other dams' is.
df = pd.read_pickle('data_details.pkl')
not_lacking_data = ~(df['lack of data'])
no_other_dams = ~(df['other dams'])
usable = not_lacking_data & (df['network exist']) & no_other_dams & (df['gauge available'])
idxs = df.loc[usable].index

In [6]:
# Model settings: the candidate regressors compared in this notebook.
mlp = MLPRegressor(
    max_iter=1000,
    learning_rate_init=0.005,
    random_state=2021,
)
rf = RandomForestRegressor(random_state=2021)
xgboost = xgb.XGBRegressor()
# Disabled alternative: model = lgb.LGBMRegressor(seed=0)

# Grid search wrapped around the XGBoost regressor; it is fit later,
# inside the prediction loop.
xgb_param_grid = {
    'max_depth': [2, 4, 6],
    'n_estimators': [50, 100, 200],
}
reg_cv = GridSearchCV(xgboost, xgb_param_grid, verbose=1)

# Short name -> estimator; the name is also passed to data_configuration().
models = dict(mlp=mlp, rf=rf, xgb=xgboost)

## Prediction

In [None]:
def mean(x):
    """Return the mean along axis 1 of a DataFrame; pass other inputs through."""
    if isinstance(x, pd.DataFrame):
        return x.mean(1)
    return x


results = {}

for model_name, model in models.items():
    # One data configuration (and one output directory) per model.
    config_args = data_configuration(rain_obs='msm', pca_exec=False, model=model_name)

    # Make sure the output directory exists before any result is written.
    dirname = result_path(config_args)
    os.makedirs(dirname, exist_ok=True)

    for sample in ds_generator(idxs, **config_args):
        (idx, te_ds,
         train_x, train_y, train_date,
         val_x, val_y, val_date,
         test_x, test_y, test_date) = sample

        if isinstance(model, xgb.XGBRegressor):
            # The grid search is fit on val_x/val_y; a fresh regressor is
            # then built from the best parameters. `model` is rebound here,
            # so the check above stays true on later iterations.
            reg_cv.fit(val_x, val_y)
            model = xgb.XGBRegressor(**reg_cv.best_params_)

        # Train on the training split, predict on the test split, and store
        # the formatted result as one pickle per index.
        model.fit(train_x, train_y)
        predict = model.predict(test_x)
        result = format_results(predict, test_y, te_ds, test_date)
        result.to_pickle(f"{dirname}/{idx}.pkl")

  0%|          | 0/86 [00:00<?, ?it/s]