In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the generated dataset CSV, using its first column as the row index.
sample_df = pd.read_csv(
    "../datasets/generated/2019_z2_Floor6.csv",
    index_col=[0],
)

In [None]:
# Sorted array of the distinct frame identifiers found in the `frame_id` column.
unique_frames = np.unique(sample_df["frame_id"].values)

In [None]:
# Copy the 'z2_AC1(kW)' column into 'output', the column name that the
# feature-engineering helper `lag_based_FE` treats as the prediction target.
# Both 'frame_id' (used by `produce_df_frames` to select rows per frame) and
# the source column 'z2_AC1(kW)' are left in the frame.
sample_df["output"] = sample_df["z2_AC1(kW)"]

In [None]:
def lag_based_FE(df, lag_length, future, step):
    """Build lagged features and a future-shifted target from one data frame.

    Every column other than ``output`` is replaced by shifted copies named
    ``<column>_<lag>``, one per lag in ``range(1, lag_length, step)``; the
    un-shifted column itself is not kept. The ``output`` column is shifted
    ``future`` rows upwards, so each row carries the target value found
    ``future`` rows later. Rows containing any NaN (including the ones the
    shifts introduce at both ends) are dropped.

    NOTE(review): the lag range excludes both lag 0 (the current row) and
    ``lag_length`` itself, so ``lag_length=5`` yields lags 1..4 -- confirm
    this is the intended history window.

    Args:
        df: Input frame; must contain an ``output`` column. Not modified.
        lag_length: Exclusive upper bound of the lags to generate.
        future: Number of rows ahead from which the target is taken.
        step: Stride between consecutive lags.

    Returns:
        A new DataFrame whose first column is ``output`` followed by the lag
        columns grouped by source column, in the source columns' order.

    Raises:
        KeyError: If ``df`` has no ``output`` column.
    """
    lags = range(1, lag_length, step)

    # Build every lagged series up front and assemble the frame once; adding
    # the columns one by one fragments the frame and makes pandas emit a
    # PerformanceWarning when many lag columns are generated.
    lagged_columns = {
        f'{feature}_{lag}': df[feature].shift(periods=lag)
        for feature in df.columns
        if feature != "output"
        for lag in lags
    }
    target = df['output'].shift(periods=-future)

    lagged_df = pd.concat(
        [target, pd.DataFrame(lagged_columns, index=df.index)],
        axis=1,
    )
    return lagged_df.dropna()

In [None]:
def produce_df_frames(all_df, lag_length, future, step, frames):
    """Apply ``lag_based_FE`` to each requested frame and stack the results.

    Each frame is processed separately, so the shifts performed by
    ``lag_based_FE`` never mix rows that belong to different frames.

    Args:
        all_df: Frame containing a ``frame_id`` column plus the columns
            expected by ``lag_based_FE``. Not modified.
        lag_length: Forwarded to ``lag_based_FE``.
        future: Forwarded to ``lag_based_FE``.
        step: Forwarded to ``lag_based_FE``.
        frames: Iterable of ``frame_id`` values to include; a value that
            appears several times is processed (and included) several times.

    Returns:
        The row-wise concatenation of the lagged frames, in the order of
        ``frames``; an empty DataFrame when ``frames`` is empty.
    """
    lagged_frames = []
    for frame in tqdm(frames):
        # `drop` without `inplace` returns a new object, so the selection taken
        # from `all_df` is never mutated (no SettingWithCopyWarning).
        df_frame = all_df.loc[all_df.frame_id == frame, :].drop(columns=['frame_id'])
        lagged_frames.append(lag_based_FE(df_frame, lag_length, future, step))

    # pd.concat rejects an empty list; keep returning an empty frame instead.
    if not lagged_frames:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the accumulated frame on
    # every iteration.
    return pd.concat(lagged_frames)

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error
from tqdm.notebook import trange, tqdm
# Hyper-parameter grids explored by GridSearchCV, keyed by the short algorithm
# name that is also used for the result columns and the output file names.
algorith_parm_map = {
    'rf': {'max_depth': [3, 5, 10],
           'min_samples_split': [2, 5, 10]},
    'gboost': {'max_depth': [3, 5, 10],
               'min_samples_split': [2, 5, 10]},
    'lasso': {'alpha': np.logspace(-4, -0.5, 10)},
    'ridge': {'alpha': np.logspace(-4, -0.5, 10)}
}

# Factory for the base estimator of every algorithm in `algorith_parm_map`;
# an explicit table fails loudly (KeyError) if a grid is added without a
# matching estimator instead of silently falling back to Ridge.
estimator_factories = {
    'rf': lambda: RandomForestRegressor(random_state=0),
    'gboost': lambda: GradientBoostingRegressor(random_state=0),
    'lasso': lambda: Lasso(random_state=0),
    'ridge': lambda: Ridge(random_state=0),
}

# [history, future] pairs: `history` is passed to `produce_df_frames` as
# lag_length and `future` as the target shift.
set_hs_fs = [[5, 5], [10, 10], [15, 15], [20, 20]]

# Draw up to 120 *distinct* frame positions. Sampling without replacement
# guarantees that no frame is drawn twice, which would otherwise let the same
# frame end up in both the training and the validation split. The seeded
# generator makes the split reproducible across kernel restarts.
rng = np.random.default_rng(0)
n_sampled_frames = min(120, unique_frames.shape[0])
random_frames = rng.choice(unique_frames.shape[0], n_sampled_frames, replace=False)

# NOTE: despite its name, `test_size` is the share of the sampled frames that
# goes to *training*; the remaining frames are used for validation.
test_size = 0.7
n_training_frames = int(test_size * len(random_frames))
training_frames = unique_frames[random_frames[:n_training_frames]]
testing_frames = unique_frames[random_frames[n_training_frames:]]

# Make sure the output directory exists before any result file is written.
os.makedirs("../results/stats", exist_ok=True)

validation_records = []
for h, f in tqdm(set_hs_fs):
    dict_result = {'history': h, 'future': f}
    training_set = produce_df_frames(sample_df, h, f, 1, training_frames)
    validation_set = produce_df_frames(sample_df, h, f, 1, testing_frames)

    # Split features and target explicitly (no reliance on column position).
    train_features = training_set.drop(columns=['output'])
    validation_features = validation_set.drop(columns=['output'])
    y_train = training_set['output'].values
    y_validation = validation_set['output'].values

    # The scaler is fitted on the training features only and then re-used for
    # the validation features; the target is left unscaled.
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(train_features),
                           columns=train_features.columns)
    X_validation = pd.DataFrame(scaler.transform(validation_features),
                                columns=train_features.columns)

    for alg, param_grid in tqdm(algorith_parm_map.items()):
        cv_regressor = GridSearchCV(estimator_factories[alg](), param_grid,
                                    scoring='neg_mean_absolute_error', cv=3, verbose=1)
        cv_regressor.fit(X_train, y_train)

        # Validation MAE of the grid search's refitted best estimator.
        y_predictions = cv_regressor.predict(X_validation)
        dict_result[alg] = mean_absolute_error(y_validation, y_predictions)

        # Full cross-validation table for this algorithm / (history, future) pair.
        results_df = pd.DataFrame.from_dict(cv_regressor.cv_results_)
        results_df.to_csv(f"../results/stats/2019_z2_Floor6_m3_{alg}_h-{h}_f-{f}_training.csv")

    validation_records.append(dict_result)

# One row per (history, future) pair, built in a single step so the rows get
# unique index labels.
alg_best = pd.DataFrame(validation_records)
alg_best.to_csv("../results/stats/2019_z2_Floor6_m3_validation.csv")

In [None]:
# Preview the first rows of the validation table (one row per history/future pair,
# one MAE column per algorithm).
alg_best.head()