In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Load the 2019 zone-2 / floor-6 table; the first CSV column becomes the index.
sample_df = pd.read_csv(
    "../datasets/generated/2019_z2_Floor6.csv",
    index_col=[0],
)
# Parse the index labels into timestamps so calendar attributes
# (.month, .hour, ...) can be read from the index later on.
sample_df = sample_df.set_index(pd.to_datetime(sample_df.index))
sample_df.head(5)

In [None]:
# Read the Thai holiday table; it is filtered by its 'Type' column and its
# 'Date' column is split into month/day in the cells that follow.
df_holiday = pd.read_csv(
    "../datasets/Thailand_Holidays.csv",
)
df_holiday.head(5)

In [None]:
# Keep only rows whose 'Type' is a government or national holiday.
# .copy() makes the result an independent frame: new columns are assigned to
# df_holiday afterwards, and doing that on a .loc slice of the original frame
# raises SettingWithCopyWarning.
df_holiday = df_holiday.loc[
    df_holiday['Type'].isin(['Government Holiday', 'National Holiday'])
].copy()

In [None]:
# Split each 'Date' string on "/" and store the first two fields as integers.
# NOTE(review): this treats the first field as the month and the second as the
# day (i.e. month/day/... strings) — confirm against the CSV's date format.
df_holiday['month'] = df_holiday['Date'].map(
    lambda date_str: int(date_str.split("/")[0])
)
df_holiday['day'] = df_holiday['Date'].map(
    lambda date_str: int(date_str.split("/")[1])
)

In [None]:
def check_holiday(df_holiday, date):
    """Return 1 if ``date`` falls on a listed holiday, otherwise 0.

    Parameters
    ----------
    df_holiday : DataFrame
        Holiday table with integer-comparable ``month`` and ``day`` columns.
    date : object exposing ``.day`` and ``.month``
        The date to look up (e.g. a ``pandas.Timestamp``). The year is not
        consulted; only month and day are matched.

    Returns
    -------
    int
        1 when at least one row matches both month and day, else 0.
    """
    same_month = df_holiday['month'] == int(date.month)
    same_day = df_holiday['day'] == int(date.day)

    if (same_month & same_day).any():
        return 1
    return 0

In [None]:
# Create the prediction target as a copy of the 'z2_AC1(kW)' reading and remove
# the 'frame_id' column. 'z2_AC1(kW)' itself is kept in the frame.
# Written without inplace=True and with errors='ignore' so the cell can be
# re-run: the original in-place drop raised KeyError on a second execution
# because 'frame_id' was already gone.
sample_df = (
    sample_df
    .assign(output=sample_df['z2_AC1(kW)'])
    .drop(columns=['frame_id'], errors='ignore')
)

In [None]:
def apply_m2(data):
    """Add calendar features read from the frame's datetime index.

    The columns ``month``, ``DoW`` (day of week), ``hour`` and ``holiday``
    (1/0 flag) are written into ``data`` in place, and the same object is
    returned.

    Parameters
    ----------
    data : DataFrame
        Frame whose index exposes ``.month``, ``.dayofweek``, ``.hour`` and
        ``.day`` (e.g. a ``DatetimeIndex``).

    Returns
    -------
    DataFrame
        ``data`` itself, with the four columns added.

    Notes
    -----
    Reads the notebook-level ``df_holiday`` table, which must already contain
    the ``month`` and ``day`` columns.
    """
    timestamps = data.index

    data.loc[:, 'month'] = timestamps.month
    data.loc[:, 'DoW'] = timestamps.dayofweek
    data.loc[:, 'hour'] = timestamps.hour

    # Build the (month, day) lookup once. The previous per-row call filtered
    # the whole holiday table for every timestamp, which is very slow on large
    # frames; a set membership test gives the same 1/0 flag.
    holiday_dates = set(zip(df_holiday['month'], df_holiday['day']))
    data.loc[:, 'holiday'] = [
        int((month, day) in holiday_dates)
        for month, day in zip(timestamps.month, timestamps.day)
    ]

    return data

In [None]:
def produce_training_validation_dfs(all_df):
    """Split, standardise and feature-engineer a frame for model fitting.

    Steps:
      1. Random 70/30 train/validation split (``random_state=0``).
      2. Standardise every column except ``output``; the scaler is fitted on
         the training split only and then applied to the validation split.
      3. Add calendar features via ``apply_m2``.
      4. One-hot encode ``month``, ``DoW`` and ``hour`` (first level dropped).

    Parameters
    ----------
    all_df : DataFrame
        Frame with a datetime index and an ``output`` column; all other
        columns are treated as numeric features.

    Returns
    -------
    (DataFrame, DataFrame)
        ``(scaled_train_df, scaled_validation_df)`` with identical columns.
    """
    training_set, validation_set = train_test_split(all_df, test_size=0.3, random_state=0)

    # Take the feature names from the argument itself rather than from a
    # notebook-level frame, so the function works for any input.
    feature_columns = all_df.columns.drop('output')

    scaler = StandardScaler()
    scaled_train_df = pd.DataFrame(
        scaler.fit_transform(training_set[feature_columns]),
        columns=feature_columns,
        index=training_set.index,
    )
    scaled_train_df['output'] = training_set['output'].values

    # transform(), not fit_transform(): the validation split must be scaled
    # with the training statistics, otherwise the two splits end up on
    # different scales and the validation error is not meaningful.
    scaled_validation_df = pd.DataFrame(
        scaler.transform(validation_set[feature_columns]),
        columns=feature_columns,
        index=validation_set.index,
    )
    scaled_validation_df['output'] = validation_set['output'].values

    # Each split gets its own calendar features. (Previously the validation
    # frame was assigned to both names, discarding the training data.)
    scaled_train_df = apply_m2(scaled_train_df)
    scaled_validation_df = apply_m2(scaled_validation_df)

    # Encode both splits in one pass so they share the same dummy columns and
    # the same dropped reference level even if a category is absent from one
    # of the splits; then cut the result back into the two splits by position.
    n_train = len(scaled_train_df)
    encoded = pd.get_dummies(
        pd.concat([scaled_train_df, scaled_validation_df]),
        columns=['month', 'DoW', 'hour'],
        drop_first=True,
    )
    scaled_train_df = encoded.iloc[:n_train].copy()
    scaled_validation_df = encoded.iloc[n_train:].copy()

    return scaled_train_df, scaled_validation_df

In [None]:
def produce_shifted_df(sample_df, future):
    """Return a copy of ``sample_df`` whose ``output`` is taken ``future`` rows ahead.

    Row *i* of the result carries the ``output`` value that row *i + future*
    had in the input. Rows containing any missing value afterwards (including
    the trailing rows that have no value ``future`` rows ahead) are removed.
    The input frame is not modified.

    Parameters
    ----------
    sample_df : DataFrame
        Frame with an ``output`` column.
    future : int
        Number of rows to look ahead.

    Returns
    -------
    DataFrame
        New frame with the shifted ``output`` and no missing values.
    """
    shifted = sample_df.copy()

    # A negative shift pulls later values up to earlier rows.
    future_output = shifted['output'].shift(-future)
    shifted.loc[:, 'output'] = future_output

    return shifted.dropna()

In [None]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
from tqdm.notebook import trange, tqdm
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error


# Hyper-parameter grid searched for each algorithm key.
algorith_parm_map = {
    'rf': {'max_depth': [3, 5, 10],
           'min_samples_split': [2, 5, 10]},
    'gboost': {'max_depth': [3, 5, 10],
               'min_samples_split': [2, 5, 10]},
    'lasso': {'alpha': np.logspace(-4, -0.5, 10)},
    'ridge': {'alpha': np.logspace(-4, -0.5, 10)}
}

# Estimator class for each algorithm key; every estimator is built with
# random_state=0.
estimator_classes = {
    'rf': RandomForestRegressor,
    'gboost': GradientBoostingRegressor,
    'lasso': Lasso,
    'ridge': Ridge,
}

# Forecast horizons, expressed as a number of rows ahead.
future_values = [5, 10, 15, 20]

# Make sure the output directory exists before any (expensive) fit is run.
os.makedirs("../results/stats", exist_ok=True)

# Fixed random subset of rows used for every horizon (capped at the frame size
# so a smaller input does not make sample() fail).
sampling_df = sample_df.sample(n = min(100000, len(sample_df)), random_state = 0)

validation_records = []
for f in tqdm(future_values):
    dict_result = {'future': f}

    # Shift the target on the full frame in its original row order, and only
    # then keep the sampled rows. Shifting after sampling / the shuffled
    # train-validation split would pair each row with the target of an
    # unrelated timestamp.
    # NOTE(review): assumes sample_df rows are in chronological order with a
    # regular step — confirm.
    shifted_df = produce_shifted_df(sample_df, f)
    shifted_sample = shifted_df.loc[shifted_df.index.isin(sampling_df.index)]

    shifted_df_train, shifted_df_validation = produce_training_validation_dfs(shifted_sample)

    # The design matrices do not depend on the algorithm, so build them once
    # per horizon.
    X_train, y_train = shifted_df_train.drop(columns=['output']), shifted_df_train['output'].values
    X_validation, y_validation = shifted_df_validation.drop(columns=['output']), shifted_df_validation['output'].values

    for alg, param_grid in tqdm(algorith_parm_map.items()):
        base_estimator = estimator_classes[alg](random_state = 0)

        # 3-fold grid search on the training split, scored by negative MAE.
        cv_regressor = GridSearchCV(base_estimator, param_grid, scoring = 'neg_mean_absolute_error', cv = 3, verbose=1)
        cv_regressor.fit(X_train, y_train)
        results_df = pd.DataFrame.from_dict(cv_regressor.cv_results_)

        # Validation MAE of the best configuration found by the search.
        y_predictions = cv_regressor.predict(X_validation)
        dict_result[alg] = mean_absolute_error(y_validation, y_predictions)

        results_df.to_csv(f"../results/stats/2019_z2_Floor6_m2_{alg}_f-{f}_training.csv")

    validation_records.append(dict_result)

# One row per horizon, one column per algorithm; built once from the collected
# records instead of concatenating inside the loop.
alg_best = pd.DataFrame(validation_records)
alg_best.to_csv("../results/stats/2019_z2_Floor6_m2_validation.csv")