In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the two generated CSVs: the 2019 file is used as training data and the
# 2018 file as testing data. The first CSV column becomes the frame index.
training_data = pd.read_csv("../datasets/generated/2019_z2_Floor6.csv", index_col=[0])
testing_data = pd.read_csv("../datasets/generated/2018_z2_Floor6.csv", index_col=[0])

In [None]:
training_data.shape

In [None]:
training_data.head()

In [None]:
testing_data.head()

# Method 2 Analysis

In [None]:
# Holiday lookup table: keep only government/national holidays and expose the
# first two "/"-separated fields of `Date` as integer `month` and `day` columns.
df_holiday = pd.read_csv("../datasets/Thailand_Holidays.csv")
# .copy() detaches the filtered rows from the frame they were selected from, so
# the column assignments below act on an independent frame and do not raise
# SettingWithCopyWarning.
df_holiday = df_holiday.loc[df_holiday.Type.isin(['Government Holiday', 'National Holiday']), :].copy()
df_holiday['month']= df_holiday['Date'].apply(lambda x : int(x.split("/")[0]))
df_holiday['day']= df_holiday['Date'].apply(lambda x : int(x.split("/")[1]))

In [None]:
def check_holiday(df_holiday, date):
    """Return 1 when ``date`` matches a row of the holiday table, else 0.

    Parameters
    ----------
    df_holiday : DataFrame
        Table with ``month`` and ``day`` columns.
    date : object exposing ``day`` and ``month`` attributes
        The date to look up; only its day and month are compared.

    Returns
    -------
    int
        1 if at least one row has the same month and day, otherwise 0.
    """
    month_matches = df_holiday.month == int(date.month)
    day_matches = df_holiday.day == int(date.day)
    return int((month_matches & day_matches).any())

In [None]:
testing_data.index = pd.to_datetime(testing_data.index)

In [None]:
print(np.max(testing_data.index) - np.min(testing_data.index))

In [None]:
# Copy the target column to 'output', then drop the original target column and
# the frame identifier so that only features + 'output' remain.
# NOTE(review): the in-place drop mutates training_data, so re-running this cell
# without reloading the CSV fails because 'z2_AC1(kW)' no longer exists.
training_data['output'] = training_data['z2_AC1(kW)']
training_data.drop(columns = ['frame_id', 'z2_AC1(kW)'], inplace=True)

# Standardise every column except 'output' and append 'output' unscaled as the
# last column. Reusing training_data.columns as labels is only correct because
# 'output' was added last above.
# NOTE(review): the scaler is fitted on all rows before any train/test split —
# confirm this is intended.
scaler = StandardScaler()
scaled_df = pd.DataFrame(np.c_[scaler.fit_transform(training_data.drop(columns = ['output'])), training_data.output.values], columns = training_data.columns)

In [None]:
scaled_df.index = training_data.index
scaled_df.index = pd.to_datetime(scaled_df.index)

In [None]:
scaled_df.head()

In [None]:
# Derive calendar features from the datetime index and flag holidays.
scaled_df.loc[:, 'month'] = scaled_df.index.month
scaled_df.loc[:, 'DoW'] = scaled_df.index.dayofweek
scaled_df.loc[:, 'hour'] = scaled_df.index.hour
# check_holiday is evaluated once per row of the index.
scaled_df.loc[:, 'holiday'] = scaled_df.index.to_series().apply(lambda x: check_holiday(df_holiday, x))
# One-hot encode the calendar columns. drop_first=True removes the dummy of the
# first level of each encoded column, so e.g. the first month has no
# 'month_<m>' column in the result.
scaled_df = pd.get_dummies(scaled_df, columns = ['month', 'DoW', 'hour'], drop_first=True)
# Replace the datetime index with a positional 0..n-1 index.
scaled_df.index = range(scaled_df.shape[0])

In [None]:
def produce_shifted_df(sample_df, future):
    """Pair each row with the 'output' value found ``future`` rows later.

    Parameters
    ----------
    sample_df : DataFrame
        Frame containing an 'output' column; it is not modified.
    future : int
        Number of rows the 'output' column is pulled forward by.

    Returns
    -------
    DataFrame
        A copy of ``sample_df`` with the shifted 'output'; every row that
        contains a missing value (including the trailing rows left without a
        future target) is removed.
    """
    shifted = sample_df.copy()
    future_output = shifted.loc[:, 'output'].shift(periods=-future)
    shifted.loc[:, 'output'] = future_output
    return shifted.dropna()

In [None]:
scaled_df.head()

In [None]:
_, bins = np.histogram(scaled_df.output.values)

In [None]:
print(bins)

In [None]:
# Fraction of samples whose output is at or above the histogram edge bins[6].
print(np.sum((scaled_df.output.values >= bins[6]).astype(int)) / scaled_df.shape[0])

In [None]:
plt.plot(scaled_df.output.values)

In [None]:
print(scaled_df.columns)

In [None]:
print(np.unique(pd.to_datetime(training_data.index).month))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from tqdm.notebook import trange, tqdm
from sklearn.metrics import mean_absolute_error

# Method 2: leave-one-month-out evaluation for every forecasting horizon.
# For each horizon a model is trained on all rows outside the held-out month,
# and its MAE is reported on the held-out rows whose target is >= each
# histogram edge of the output (cumulative thresholds, not disjoint bins).
future_values = [5, 10, 15, 20]
alg_best = pd.DataFrame()
unique_months = [f"month_{m}" for m in np.unique(pd.to_datetime(training_data.index).month)]
_, bins = np.histogram(scaled_df.output.values)

# One dict per horizon. The result frame is built once after the loop instead
# of being grown with pd.concat on every iteration (which also left every row
# with index 0).
bin_records = []

for f in tqdm(future_values):
    dict_result = {'future': f}

    shifted_df = produce_shifted_df(scaled_df, f)

    # unique_months[0] is skipped: scaled_df was one-hot encoded with
    # drop_first=True, so the first month has no dummy column to select on.
    for month in tqdm(unique_months[1:]):
        shifted_df_train = shifted_df.loc[shifted_df[month] == 0, :]
        shifted_df_test = shifted_df.loc[shifted_df[month] == 1, :]
        X_train, y_train = shifted_df_train.drop(columns = ['output']), shifted_df_train.output.values
        X_test, y_test = shifted_df_test.drop(columns = ['output']), shifted_df_test.output.values

        base_estimator = GradientBoostingRegressor(max_depth = 10, min_samples_split = 2)
        base_estimator.fit(X_train, y_train)

        # Predict the held-out month once and reuse the predictions for every
        # threshold instead of calling predict() again per bin edge.
        y_pred = base_estimator.predict(X_test) if len(y_test) > 0 else np.empty(0)

        for bin_v in bins:
            indices = np.where(y_test >= bin_v)[0]
            if len(indices) > 0:
                dict_result[f'testing_{month}_{bin_v}'] = mean_absolute_error(y_test[indices], y_pred[indices])
            else:
                # No held-out sample reaches this threshold.
                dict_result[f'testing_{month}_{bin_v}'] = "N/A"

    bin_records.append(dict_result)

df_bins = pd.DataFrame(bin_records)

In [None]:
df_bins.head()

In [None]:
df_bins.index = range(df_bins.shape[0])
df_bins.to_csv("../results/stats/detailed_results_m2.csv")

# Method 3 Analysis

In [None]:
training_data = pd.read_csv("../datasets/generated/2019_z2_Floor6.csv", index_col=[0])
testing_data = pd.read_csv("../datasets/generated/2018_z2_Floor6.csv", index_col=[0])

In [None]:
training_data.index = pd.to_datetime(training_data.index)
training_data.head()

In [None]:
unique_frames = np.unique(training_data.frame_id.values)
unique_months = np.unique(training_data.index.month)

In [None]:
print(unique_months)

In [None]:
# Use the first column of the frame as the prediction target. The source
# column is not removed here, so it also remains available as a feature.
training_data['output'] = training_data[training_data.columns[0]]

In [None]:
from tqdm.notebook import tqdm

In [None]:
def lag_based_FE(df, lag_length, future, step):
    new_df = df.copy()
    for feature in new_df.columns:
        if feature != "output":
            for i in range(1, lag_length, step):
                new_df.loc[:, f'{feature}_{i}'] = new_df.loc[:, feature].shift(periods = i)
            new_df.drop(columns = [feature], inplace=True)
        
    new_df.loc[:, 'output'] = new_df.loc[:, 'output'].shift(periods = -future)
    new_df.dropna(inplace=True)
    return new_df

In [None]:
def produce_df_frames(all_df, lag_length, future, step, frames):
    """Build the lagged dataset frame by frame and stack the results.

    Lag features are computed inside each frame separately, so no lagged value
    crosses a frame boundary.

    Parameters
    ----------
    all_df : DataFrame
        Data with a ``frame_id`` column and an 'output' column; not modified.
    lag_length, future, step : int
        Forwarded unchanged to ``lag_based_FE``.
    frames : iterable
        The ``frame_id`` values to process, in the order they are stacked.

    Returns
    -------
    DataFrame
        Row-wise concatenation of the lagged frames (without ``frame_id``), or
        an empty DataFrame when ``frames`` is empty.
    """
    lagged_frames = []
    for frame in tqdm(frames):
        # drop() returns a new frame, so the slice of all_df is never mutated
        # in place (the in-place variant raised SettingWithCopyWarning).
        df_frame = all_df.loc[all_df.frame_id == frame, :].drop(columns=['frame_id'])
        lagged_frames.append(lag_based_FE(df_frame, lag_length, future, step))

    if not lagged_frames:
        # pd.concat rejects an empty list; keep the previous empty-frame result.
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame every iteration.
    return pd.concat(lagged_frames)

In [None]:
# Gradient-boosting configuration per history length; the evaluation loop looks
# an entry up with HPO_algorithms[h].
# NOTE(review): presumably the outcome of a hyper-parameter search — confirm.
# NOTE(review): each value is a single estimator instance, so every fit for a
# given key reuses and overwrites the same object.
HPO_algorithms = {
    5: GradientBoostingRegressor(max_depth = 3, min_samples_split = 10),
    10: GradientBoostingRegressor(max_depth = 10, min_samples_split = 10),
    15: GradientBoostingRegressor(max_depth = 10, min_samples_split = 10),
    20: GradientBoostingRegressor(max_depth = 10, min_samples_split = 5)
}

In [None]:
_, bins = np.histogram(training_data.output.values, bins = 5)
print(bins)

In [None]:
import time
from sklearn.metrics import mean_absolute_error


# Method 3, frame-wise: leave-one-month-out validation for each
# (history, future) pair. Lag features are built per frame, the scaler is
# fitted on the training months only, and the MAE is reported on the held-out
# rows whose target is >= each histogram edge of the output.
_, bins = np.histogram(training_data.output.values, bins = 5)

set_hs_fs = [[5, 5], [10, 10],[15, 15],[20, 20]]

# One dict per (history, future) pair; the result frame is built once after
# the loop instead of being grown with pd.concat on every iteration.
month_records = []

for h_f in tqdm(set_hs_fs):
    h, f = h_f[0], h_f[1]
    dict_result = {'history': h, 'future': f}

    # Timings are accumulated over all months. Previously each month
    # overwrote the value, so only the last month's timing was saved.
    preprocess_seconds = 0.0
    train_seconds = 0.0

    for month in unique_months:
        month_data = training_data.loc[training_data.index.month == month, :]
        nonmonth_data = training_data.loc[training_data.index.month != month, :]

        training_frames = np.unique(nonmonth_data.frame_id.values)
        validation_frames = np.unique(month_data.frame_id.values)

        start_time = time.time()
        training_set = produce_df_frames(nonmonth_data, h, f, 1, training_frames)
        validation_set = produce_df_frames(month_data, h, f, 1, validation_frames)
        preprocess_seconds += time.time() - start_time
        dict_result['preprocess_time'] = round(preprocess_seconds / 60, 2)  # minutes

        # 'output' is placed first to match the column order of the lagged
        # frames; only the lagged features are standardised.
        scaler = StandardScaler()
        scaled_train_df = pd.DataFrame(np.c_[training_set.output.values, scaler.fit_transform(training_set.drop(columns = ['output']))], columns = training_set.columns)
        scaled_validation_df = pd.DataFrame(np.c_[validation_set.output.values, scaler.transform(validation_set.drop(columns = ['output']))], columns = training_set.columns)

        base_estimator = HPO_algorithms[h]
        X_train, y_train = scaled_train_df.drop(columns  =['output']), scaled_train_df.output.values
        X_validation, y_validation = scaled_validation_df.drop(columns  =['output']), scaled_validation_df.output.values

        start_time = time.time()
        base_estimator.fit(X_train, y_train)
        train_seconds += time.time() - start_time

        feature_imp_df = pd.DataFrame(base_estimator.feature_importances_, index = X_train.columns, columns = ['importance'])

        dict_result['total_train_time'] = round(train_seconds / 60, 2)  # minutes

        # Predict the held-out month once and reuse it for every threshold.
        y_validation_pred = base_estimator.predict(X_validation) if len(y_validation) > 0 else np.empty(0)

        for idx, bin_v in enumerate(bins):
            indices = np.where(y_validation >= bin_v)[0]
            if len(indices) > 0:
                dict_result[f'testing-{month}_bin-{idx}'] = mean_absolute_error(y_validation[indices], y_validation_pred[indices])
            else:
                # No held-out sample reaches this threshold.
                dict_result[f'testing-{month}_bin-{idx}'] = "N/A"

        feature_imp_df.to_csv(f"../results/stats/stats_2019_z2_h-{h}_Floor6_m3_month-{month}_feature_imp.csv")

    month_records.append(dict_result)

alg_best_months = pd.DataFrame(month_records)
alg_best_months.to_csv("../results/stats/stats_2019_z2_Floor6_m3_months_validation.csv")

In [None]:
alg_best_months.to_csv("../results/stats/stats_2019_z2_Floor6_m3_months_validation.csv")

In [None]:
removed_frame_data = training_data.copy()
removed_frame_data.drop(columns = ['frame_id'], inplace=True)

In [None]:
# Standardise every column except 'output' and append 'output' unscaled as the
# last column. Reusing removed_frame_data.columns as labels assumes 'output' is
# its last column. The resulting frame has a default 0..n-1 index.
# NOTE(review): the scaler is fitted on all rows before any train/test split —
# confirm this is intended.
scaler = StandardScaler()
scaled_df = pd.DataFrame(np.c_[scaler.fit_transform(removed_frame_data.drop(columns = ['output'])), removed_frame_data.output.values], columns = removed_frame_data.columns)

In [None]:
training_data.index = pd.to_datetime(training_data.index)

In [None]:
print(training_data.shape, scaled_df.shape, len(training_data.index))

In [None]:
unique_months = np.unique(training_data.index.month)

In [None]:
print(bins)

In [None]:
# Method 3, detailed results: for each (history, future) pair, hold one month
# out, train on the remaining months and report the MAE on the held-out rows
# whose target is >= each edge in `bins`.
# NOTE(review): `bins` is taken from an earlier cell.
set_hs_fs = [[5, 5], [10, 10],[15, 15],[20, 20]]

# One dict per (history, future) pair; the result frame is built once after
# the loop instead of being grown with pd.concat on every iteration.
bin_records = []

for h_f in tqdm(set_hs_fs):
    h, f = h_f[0], h_f[1]
    dict_result = {'history': h, 'future': f}

    # NOTE(review): the first month is skipped here although nothing in this
    # section requires it — presumably to mirror the Method 2 results; confirm.
    for month in tqdm(unique_months[1:]):
        # scaled_df has a positional index, so the boolean array derived from
        # training_data's datetime index selects rows by position.
        is_test_month = training_data.index.month == month
        # Train on every other month and test on the held-out one. The
        # original assignment had the two masks swapped, so the model was
        # fitted on the single month and scored on the rest, contradicting the
        # 'testing_<month>' result keys and the other evaluation loops.
        scaled_df_train, scaled_df_test = scaled_df.loc[~is_test_month, :], scaled_df.loc[is_test_month, :]
        # NOTE(review): lags are computed over the concatenated rows, i.e.
        # across frame and month boundaries — confirm this is acceptable.
        shifted_df_train, shifted_df_test  = lag_based_FE(scaled_df_train, h, f, 1), lag_based_FE(scaled_df_test, h, f, 1)

        X_train, y_train = shifted_df_train.drop(columns = ['output']), shifted_df_train.output.values
        X_test, y_test = shifted_df_test.drop(columns = ['output']), shifted_df_test.output.values

        base_estimator = GradientBoostingRegressor(max_depth = 10, min_samples_split = 2)
        base_estimator.fit(X_train, y_train)

        # Predict the held-out month once and reuse it for every threshold.
        y_pred = base_estimator.predict(X_test) if len(y_test) > 0 else np.empty(0)

        for bin_v in bins:
            indices = np.where(y_test >= bin_v)[0]
            if len(indices) > 0:
                dict_result[f'testing_{month}_{bin_v}'] = mean_absolute_error(y_test[indices], y_pred[indices])
            else:
                # No held-out sample reaches this threshold.
                dict_result[f'testing_{month}_{bin_v}'] = "N/A"

    bin_records.append(dict_result)

df_bins = pd.DataFrame(bin_records)

In [None]:
df_bins.index = range(df_bins.shape[0])
df_bins.to_csv("../results/stats/detailed_results_m3.csv")

# 1D-CNN Analysis

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import trange, tqdm

In [None]:
def create_model(set_conv, dense_layers, n_timesteps,n_features, activation, lr):
    """Build and compile a 1D-CNN regressor.

    Parameters
    ----------
    set_conv : sequence of int
        Filter count of each Conv1D layer (kernel size 3); must be non-empty.
    dense_layers : iterable of int
        Unit count of each Dense layer placed after the flattened features.
    n_timesteps, n_features : int
        Shape of one input window, ``(n_timesteps, n_features)``.
    activation : str
        Activation used by every layer, including the single-unit output layer.
    lr : float
        Learning rate of the Adam optimizer.

    Returns
    -------
    A compiled ``Sequential`` model with MSE loss and MAE as metric.
    """
    network = Sequential()

    # Convolutional stack: only the first layer declares the input shape.
    network.add(
        Conv1D(
            filters=set_conv[0],
            kernel_size=3,
            activation=activation,
            input_shape=(n_timesteps, n_features),
        )
    )
    for n_filters in set_conv[1:]:
        network.add(Conv1D(filters=n_filters, kernel_size=3, activation=activation))
    network.add(Dropout(0.5))
    network.add(Flatten())

    optimizer = keras.optimizers.Adam(learning_rate=lr)

    # Fully connected head followed by the single-unit output layer.
    for units in dense_layers:
        network.add(Dense(units, activation))
    network.add(Dense(1, activation))

    network.compile(
        loss='mean_squared_error',
        optimizer=optimizer,
        metrics=['mean_absolute_error'],
        run_eagerly=True,
    )
    return network


def split_sequences(sequences, n_steps_in, out_steps, output_var):
    """Cut a frame into sliding input windows and their future targets.

    Window ``i`` covers rows ``i .. i + n_steps_in - 1`` (by position) of all
    columns except ``output_var``; its target is ``output_var`` at the row
    ``out_steps`` after the last row of the window. Windows whose target row
    would fall beyond the end of the frame are not produced.

    Parameters
    ----------
    sequences : DataFrame
        Time-ordered rows; not modified. Rows are addressed by position, so
        any index is accepted.
    n_steps_in : int
        Number of rows per input window.
    out_steps : int
        Distance (in rows, expected >= 0) from the end of a window to its target.
    output_var : str or list of str
        Target column(s), excluded from the inputs.

    Returns
    -------
    (X, y) : tuple of ndarray
        ``X`` has shape ``(n_windows, n_steps_in, n_features)``. ``y`` has shape
        ``(n_windows,)`` for a string ``output_var`` and
        ``(n_windows, len(output_var))`` for a list. Both are empty 1-D arrays
        when no window fits.
    """
    # Separate inputs and targets once. The previous per-window .loc slice
    # followed by an in-place drop raised SettingWithCopyWarning, was very
    # slow, and only worked when the index was exactly 0..n-1.
    feature_values = sequences.drop(columns=output_var).values
    target_values = sequences[output_var].values

    X, y = list(), list()
    n_rows = len(sequences)
    for start_ix in range(n_rows):
        # last row of this window (inclusive) and the row holding its target
        end_ix = start_ix + (n_steps_in - 1)
        out_idx = end_ix + out_steps
        if out_idx >= n_rows:
            break
        X.append(feature_values[start_ix:end_ix + 1])
        y.append(target_values[out_idx])
    return np.array(X), np.array(y)

In [None]:
training_data = pd.read_csv("../datasets/generated/2019_z2_Floor6.csv", index_col=[0])

In [None]:
training_data.index = pd.to_datetime(training_data.index)

In [None]:
training_data['output'] = training_data['z2_AC1(kW)']

In [None]:
def get_framewise_seq(sample_df, h, f, output_var):
    """Build windowed inputs and targets frame by frame and stack them.

    Windows are cut inside each frame separately, so no window spans two frames.

    Parameters
    ----------
    sample_df : DataFrame
        Data with a ``frame_id`` column plus the feature and output columns;
        not modified.
    h : int
        Window length, forwarded to ``split_sequences`` as ``n_steps_in``.
    f : int
        Forecast distance, forwarded to ``split_sequences`` as ``out_steps``.
    output_var : str or list of str
        Target column(s), forwarded to ``split_sequences``.

    Returns
    -------
    (features, outputs) : tuple of float32 ndarray
        Windows and targets of all frames concatenated in sorted ``frame_id``
        order; empty arrays when no frame yields a window.
    """
    unique_frames = np.unique(sample_df.frame_id)

    feature_parts, output_parts = [], []
    # Iterate over the frame ids actually present. The previous loop looked up
    # the hard-coded names f"frame_{i}", which silently skipped frames whenever
    # the ids in sample_df were not exactly frame_1 .. frame_{n-1}.
    for frame_id in tqdm(unique_frames):
        # drop() returns a new frame, so sample_df is never mutated in place.
        frame_df = sample_df[sample_df.frame_id == frame_id].drop(columns=['frame_id'])
        # Positional 0..n-1 index for the window slicing in split_sequences.
        frame_df.index = range(frame_df.shape[0])
        frame_X, frame_y = split_sequences(frame_df, h, f, output_var)
        # Frames too short for a single window yield empty arrays; skip them so
        # they cannot break the concatenation below.
        if frame_X.shape[0] != 0:
            feature_parts.append(frame_X)
            output_parts.append(frame_y)

    if not feature_parts:
        return np.asarray([]).astype(np.float32), np.asarray([]).astype(np.float32)

    features = np.concatenate(feature_parts)
    outputs = np.concatenate(output_parts)
    return np.asarray(features).astype(np.float32), np.asarray(outputs).astype(np.float32)

In [None]:
# CNN architecture per history length; the evaluation loop looks an entry up
# with optimal_values[h]. 'conv' lists the filter count of each Conv1D layer
# and 'dense' the unit count of each Dense layer, as passed to create_model.
# NOTE(review): presumably the outcome of a hyper-parameter search — confirm.
optimal_values = {
    5: {'conv': [32], 'dense': [128]},
    10: {'conv': [128], 'dense': [32, 64, 128]},
    15: {'conv': [32, 64], 'dense': [32, 64]},
    20: {'conv': [32, 64], 'dense': [32, 64, 128]},
}

In [None]:
print(np.unique(training_data.index.month))

In [None]:
import time

# 1D-CNN: for each (history, future) pair, hold one month out, train on the
# remaining months and report the MAE on the held-out windows whose target is
# >= each edge in `bins`.
# NOTE(review): `unique_months` and `bins` are not defined in this section;
# they must already exist from earlier cells.
set_hs_fs = [[5, 5], [10, 10],[15, 15],[20, 20]]

# One dict per (history, future) pair; the result frame is built once after
# the loop instead of being grown with pd.concat on every iteration.
bin_records = []

for h_f in tqdm(set_hs_fs):
    h, f = h_f[0], h_f[1]
    dict_result = {'history': h, 'future': f}

    # Training time is accumulated over all months. Previously each month
    # overwrote the value, so only the last month's timing was saved.
    train_seconds = 0.0

    for month in tqdm(unique_months[1:]):
        month_data = training_data.loc[training_data.index.month == month, :]
        nonmonth_data = training_data.loc[training_data.index.month != month, :]

        month_data.index = range(month_data.shape[0])
        nonmonth_data.index = range(nonmonth_data.shape[0])

        # Fit the scaler on the training months only and apply it to the
        # held-out month. Reusing the original column labels assumes the last
        # two columns are 'frame_id' and 'output', in that order.
        scaler = StandardScaler()
        scaled_df_train = pd.DataFrame(np.c_[scaler.fit_transform(nonmonth_data.drop(columns = ['frame_id', 'output'])), nonmonth_data.frame_id.values, nonmonth_data.output.values], columns = nonmonth_data.columns)
        scaled_df_test = pd.DataFrame(np.c_[scaler.transform(month_data.drop(columns = ['frame_id', 'output'])), month_data.frame_id.values, month_data.output.values], columns = month_data.columns)
        output_var = ['output']

        X_train, y_train = get_framewise_seq(scaled_df_train, h, f, output_var)
        # Build the test windows from the held-out month. They were previously
        # built from scaled_df_train, so the model was scored on its own
        # training data.
        X_test, y_test = get_framewise_seq(scaled_df_test, h, f, output_var)

        hpo_values = optimal_values[h]
        conv_values, dense_values = hpo_values['conv'], hpo_values['dense']

        # Every column except 'frame_id' and 'output' is a model input.
        model = create_model(conv_values, dense_values, h,len(nonmonth_data.columns) - 2, 'linear', 1e-3)
        print(X_train.shape, y_train.shape)
        str_conv = "_".join([str(conv) for conv in conv_values])
        str_dense = "_".join([str(conv) for conv in dense_values])

        dict_result['conv'] = str_conv
        dict_result['dense'] = str_dense
        start_time = time.time()
        model.fit(X_train, y_train, verbose=0, batch_size=32, epochs = 5)
        train_seconds += time.time() - start_time

        dict_result['total_train_time'] = round(train_seconds / 60, 2)  # minutes

        # Predict the held-out month once and reuse it for every threshold.
        y_pred = model.predict(X_test) if len(y_test) > 0 else np.empty(0)

        for bin_v in bins:
            indices = np.where(y_test >= bin_v)[0]
            if len(indices) > 0:
                dict_result[f'testing_{month}_{bin_v}'] = mean_absolute_error(y_test[indices], y_pred[indices])
            else:
                # No held-out sample reaches this threshold.
                dict_result[f'testing_{month}_{bin_v}'] = "N/A"

    bin_records.append(dict_result)

df_bins = pd.DataFrame(bin_records)

In [None]:
df_bins.index = range(df_bins.shape[0])
df_bins.to_csv("../results/stats/detailed_results_m1.csv")