In [None]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm


# Method 3 Analysis

In [None]:
# Read both generated Floor 6 CSVs with their first column as the index:
# the z2 file becomes the training data, the z1 file the comparison data.
training_data, z1_data = (
    pd.read_csv(csv_path, index_col=[0])
    for csv_path in (
        "../datasets/generated/2019_z2_Floor6.csv",
        "../datasets/generated/2019_z1_Floor6.csv",
    )
)

In [None]:
# Convert both indexes to datetimes; later cells filter rows by `.index.month`.
training_data.index, z1_data.index = (
    pd.to_datetime(training_data.index),
    pd.to_datetime(z1_data.index),
)

In [None]:
# Preview the first five rows of the z1 frame.
z1_data.head(n=5)

In [None]:
# Expose each frame's AC1 power column under the common name 'output', which is
# the column the lag-feature helper shifts to build the prediction target.
training_data['output'] = training_data.loc[:, 'z2_AC1(kW)']
z1_data['output'] = z1_data.loc[:, 'z1_AC1(kW)']

# Sorted distinct frame ids present in the training data.
unique_frames = np.unique(training_data['frame_id'].values)

In [None]:
# Preview the z1 frame again after the 'output' column has been added.
z1_data.head(n=5)

In [None]:
# Sorted distinct frame ids present in the z1 data.
unique_frames_z1 = np.unique(z1_data['frame_id'].values)

In [None]:
def produce_df_frames(all_df, lag_length, future, step, frames):
    """Build lag features frame by frame and stack the results.

    Each frame is processed on its own so that shifted values never leak
    from one frame into the next.

    Parameters
    ----------
    all_df : pandas.DataFrame
        Data containing a ``frame_id`` column plus whatever columns
        ``lag_based_FE`` expects (it requires an ``output`` column).
    lag_length, future, step
        Forwarded unchanged to ``lag_based_FE``.
    frames : iterable
        The ``frame_id`` values to process, in the order they should appear
        in the result.

    Returns
    -------
    pandas.DataFrame
        The per-frame outputs of ``lag_based_FE`` concatenated row-wise, or an
        empty DataFrame when ``frames`` is empty.
    """
    lagged_frames = []
    for frame in tqdm(frames):
        # `.drop` without `inplace` returns a new object, so neither the
        # caller's data nor a temporary `.loc` slice is ever mutated.
        df_frame = all_df.loc[all_df.frame_id == frame, :].drop(columns=['frame_id'])
        lagged_frames.append(lag_based_FE(df_frame, lag_length, future, step))

    if not lagged_frames:
        # pd.concat refuses an empty list; keep returning an empty frame instead.
        return pd.DataFrame()
    # One concat at the end instead of re-copying the accumulated frame each iteration.
    return pd.concat(lagged_frames)
def lag_based_FE(df, lag_length, future, step):
    """Turn one contiguous frame into a supervised-learning table of lag features.

    Every column except ``output`` is replaced by its lagged copies named
    ``"<column>_<lag>"`` for each ``lag`` in ``range(1, lag_length, step)``;
    the un-lagged column itself is not kept. The ``output`` column is shifted
    ``future`` rows backwards so each row carries the value observed ``future``
    rows later. Rows left with any NaN by the shifts are dropped.

    NOTE(review): ``range(1, lag_length, step)`` stops at ``lag_length - 1``;
    confirm whether ``lag_length`` itself was meant to be included.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain an ``output`` column. It is not modified.
    lag_length : int
        Exclusive upper bound of the generated lags.
    future : int
        Number of rows the target is shifted ahead.
    step : int
        Stride between consecutive lags.

    Returns
    -------
    pandas.DataFrame
        New frame whose first column is ``output``, followed by the lag
        columns grouped by source column in the input's column order.

    Raises
    ------
    KeyError
        If ``df`` has no ``output`` column.
    """
    # Plain Series shift: the NaN-bearing result gets its own (float) dtype
    # instead of being written back into the existing column.
    target = df['output'].shift(periods=-future)

    pieces = [target]
    for feature in df.columns:
        if feature == "output":
            continue
        for i in range(1, lag_length, step):
            pieces.append(df[feature].shift(periods=i).rename(f'{feature}_{i}'))

    # Assemble every column in a single concat; adding them one at a time
    # fragments the frame and triggers pandas' PerformanceWarning on wide inputs.
    new_df = pd.concat(pieces, axis=1)
    return new_df.dropna()

In [None]:
# Seed shared by every regressor so that re-running the notebook reproduces the same fits.
RANDOM_STATE = 42

# Gradient-boosting configuration per history length; the evaluation loop looks an
# entry up with its history value `h`.
# NOTE(review): the entries for 10 and 15 carry identical hyperparameters -
# presumably that is what the tuning produced; confirm.
HPO_algorithms = {
    5: GradientBoostingRegressor(max_depth = 3, min_samples_split = 10, random_state = RANDOM_STATE),
    10: GradientBoostingRegressor(max_depth = 10, min_samples_split = 10, random_state = RANDOM_STATE),
    15: GradientBoostingRegressor(max_depth = 10, min_samples_split = 10, random_state = RANDOM_STATE),
    20: GradientBoostingRegressor(max_depth = 10, min_samples_split = 5, random_state = RANDOM_STATE)
}

In [None]:
# Hold-one-month-out evaluation: for each held-out month the model is trained on
# every other month of `training_data`, validated on the held-out month, and its
# errors are correlated against each selected column of the z1 data for that month.

# Edges of a 5-bin histogram of the training target (not used further in this cell).
bins = np.histogram(training_data.output.values, bins = 5)[1]

# Each entry is [history, future]: history is passed to produce_df_frames as the
# lag length and is also the key into HPO_algorithms; future is the target shift.
# set_hs_fs = [[5, 5], [10, 10],[15, 15],[20, 20]]
# set_hs_fs = [[5, 5],[20, 20]]
set_hs_fs = [[10, 10]]

results_path = "../results/stats/stats_2019_z2_Floor6_m3_correlation_z1.csv"

# One dict per (history/future, month, z1 column); converted to a frame once at the
# end instead of re-copying a growing DataFrame on every row.
result_rows = []

for h_f in tqdm(set_hs_fs):
    h, f = h_f[0], h_f[1]

    for month in [3, 10]:
        month_data = training_data.loc[training_data.index.month == month, :]
        nonmonth_data = training_data.loc[training_data.index.month != month, :]
        month_data_z1 = z1_data.loc[z1_data.index.month == month, :]

        training_frames = np.unique(nonmonth_data.frame_id.values)
        validation_frames = np.unique(month_data.frame_id.values)
        z1_frames = np.unique(month_data_z1.frame_id.values)

        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments.
        start_time = time.perf_counter()
        training_set = produce_df_frames(nonmonth_data, h, f, 1, training_frames)
        validation_set = produce_df_frames(month_data, h, f, 1, validation_frames)
        end_time = time.perf_counter()
        # Feature-building time in minutes; kept for inspection, not written to the results.
        total_seq_time = round((end_time - start_time) / 60, 2)

        # The lagged z1 frame is built only to learn which index values survive
        # the lagging; the correlations below use the un-lagged z1 rows.
        z1_frame_data = produce_df_frames(month_data_z1, h, f, 1, z1_frames)

        # Keep only index values present in both sets so rows line up one-to-one.
        common_index = np.intersect1d(validation_set.index, z1_frame_data.index)
        month_data_z1 = month_data_z1.loc[common_index, :]
        validation_set = validation_set.loc[common_index, :]

        # Select features by name so the scaled matrices never depend on where
        # the 'output' column happens to sit in the lagged frames.
        feature_columns = training_set.columns.drop('output')
        scaler = StandardScaler()
        X_train = pd.DataFrame(scaler.fit_transform(training_set[feature_columns]), columns = feature_columns)
        X_validation = pd.DataFrame(scaler.transform(validation_set[feature_columns]), columns = feature_columns)
        y_train = training_set['output'].values
        y_validation = validation_set['output'].values

        base_estimator = HPO_algorithms[h]

        start_time = time.perf_counter()
        base_estimator.fit(X_train, y_train)
        end_time = time.perf_counter()
        total_train_time = round((end_time - start_time) / 60, 2)  # minutes
        predictions = base_estimator.predict(X_validation)

        # None of these depend on the z1 column, so compute them once per month.
        loss = y_validation - predictions
        ae = abs(loss)
        validation_r2 = r2_score(y_validation, predictions)

        # NOTE(review): [:-2] presumably skips trailing 'frame_id' and 'output'
        # columns - confirm against the z1 CSV's column order.
        for col in month_data_z1.columns[:-2]:
            col_values = month_data_z1[[col]].values.reshape(-1,)

            # Key order fixes the column order of the saved CSV.
            result_rows.append({
                'total_train_time': total_train_time,
                'month': month,
                'history': h,
                'future': f,
                'parameter': col,
                'corr_orig': pearsonr(col_values, y_validation)[0],
                'corr_loss': pearsonr(col_values, loss)[0],
                'corr_ae': pearsonr(col_values, ae)[0],
                'r2_score': validation_r2,
            })

alg_best_months = pd.DataFrame(result_rows)

# Create the target folder if needed so the results of a long run are not lost
# to a missing directory.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
alg_best_months.to_csv(results_path)

In [None]:
# Preview the first five rows of the correlation results.
alg_best_months.head(n=5)