In [None]:
import time
import warnings

import numpy as np
import pickle as pk
import pandas as pd
import lightgbm as lgb

from sklearn.multioutput import MultiOutputRegressor


In [None]:
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas diagnostics (SettingWithCopyWarning,
# PerformanceWarning, FutureWarning) that the data-preparation code can raise.
warnings.simplefilter('ignore')

def get_train_data(path_to_bu, path_to_weather, horizon=24, history=24 * 7):
    """Build the feature matrix and multi-step targets for one building.

    The building CSV and the weather CSV are joined row by row (on their
    positional index), the joined columns are renamed positionally, and
    shifted copies of the weather and load columns are added so that each
    row carries its own look-ahead weather, look-back load and look-ahead
    load (the targets).

    Parameters
    ----------
    path_to_bu : str
        Path of the building CSV.
    path_to_weather : str
        Path of the weather CSV.
    horizon : int, default 24
        Number of look-ahead rows used for the weather features and for the
        ``Equipment_Electric_Power_<i>`` targets.
    history : int, default 168
        Number of look-back rows used for the ``Load_Past_<i>`` features.

    Returns
    -------
    (x_train, y_train) : tuple of pandas.DataFrame
        Rows containing any NaN (the first ``history`` and last ``horizon``
        rows, plus any row with missing input data) are dropped; the original
        row index is kept.

    Raises
    ------
    ValueError
        If the joined frame does not have exactly 28 columns (raised by the
        positional rename). NOTE(review): a frame with 28 columns in a
        different order is silently mislabeled -- confirm the CSV layout.
    """
    print("Loading train data")

    df_bu, df_we = pd.read_csv(path_to_bu), pd.read_csv(path_to_weather)
    df = pd.merge(df_bu, df_we, left_index=True, right_index=True)

    cols = ['Month', 'Hour', 'Day_Type', 'Daylight_Savings_Status', 'Indoor_Temperature',
            'Average_Unmet_Cooling_Setpoint_Difference',
            'Indoor_Relative_Humidity',
            'Equipment_Electric_Power',
            'DHW_Heating',
            'Cooling_Load',
            'Heating_Load',
            'Solar_Generation',
            'Outdoor_Drybulb_Temperature',
            'Relative_Humidity',
            'Diffuse_Solar_Radiation',
            'Direct_Solar_Radiation',
            '6h_Prediction_Outdoor_Drybulb_Temperature',
            '12h_Prediction_Outdoor_Drybulb_Temperature',
            '24h_Prediction_Outdoor_Drybulb_Temperature',
            '6h_Prediction_Relative_Humidity',
            '12h_Prediction_Relative_Humidity',
            '24h_Prediction_Relative_Humidity',
            '6h_Prediction_Diffuse_Solar_Radiation',
            '12h_Prediction_Diffuse_Solar_Radiation',
            '24h_Prediction_Diffuse_Solar_Radiation',
            '6h_Prediction_Direct_Solar_Radiation',
            '12h_Prediction_Direct_Solar_Radiation',
            '24h_Prediction_Direct_Solar_Radiation',
            ]
    df.columns = cols

    selected_cols = ['Month', 'Hour', 'Day_Type',
                     'Equipment_Electric_Power',
                     'Solar_Generation',
                     'Outdoor_Drybulb_Temperature',
                     'Relative_Humidity',
                     'Diffuse_Solar_Radiation',
                     'Direct_Solar_Radiation',
                     ]
    # Explicit copy: the assignments below must write to an independent frame,
    # not to a slice of the merged one (that is what SettingWithCopyWarning
    # complains about).
    df = df[selected_cols].copy()

    df['Solar_Generation'] = df['Solar_Generation'] / 1000.  # * pv_cap / 1000.0
    df['Day'] = df.index // 24
    df['Hour'] = df.Hour % 24  # an hour value of 24 becomes 0

    weather_cols = ['Outdoor_Drybulb_Temperature', 'Relative_Humidity',
                    'Diffuse_Solar_Radiation', 'Direct_Solar_Radiation']
    power = df['Equipment_Electric_Power']

    # Collect every shifted column first and attach them in one concat.
    # Insertion order is kept identical to the historical column order
    # (weather leads, then past load, then targets) because a fitted model
    # depends on the feature order.
    derived = {}
    for i in range(horizon):
        for name in weather_cols:
            # shift(-k) brings the value k rows later onto the current row
            derived['{}_{}'.format(name, i)] = df[name].shift(-i - 1)
    for i in range(history):
        derived['Load_Past_{}'.format(i)] = power.shift(i + 1)
    for i in range(horizon):
        derived['Equipment_Electric_Power_{}'.format(i)] = power.shift(-i - 1)

    df = pd.concat([df, pd.DataFrame(derived, index=df.index)], axis=1)
    df_drop = df.dropna()

    # The trailing underscore keeps the current-hour 'Equipment_Electric_Power'
    # column on the feature side.
    targets = [item for item in df_drop.columns if 'Equipment_Electric_Power_' in item]
    x_train = df_drop.drop(targets, axis=1)
    y_train = df_drop[targets]

    print("Loading train data finish")
    return x_train, y_train

In [None]:
# Inputs for this run: the five building files and the shared weather file
# of the opt_data set.
path_to_bu = [
    "data/opt_data/Building_1.csv",
    "data/opt_data/Building_2.csv",
    "data/opt_data/Building_3.csv",
    "data/opt_data/Building_4.csv",
    "data/opt_data/Building_5.csv",
]
# Numbered aliases kept so anything still referring to them keeps working.
path_to_bu1, path_to_bu2, path_to_bu3, path_to_bu4, path_to_bu5 = path_to_bu
path_to_weather = "data/opt_data/weather.csv"

# Load each building once and stack the pieces with a single concat per
# matrix, instead of repeatedly concatenating onto an initially empty frame.
per_building = [get_train_data(building_csv, path_to_weather) for building_csv in path_to_bu]
x_train = pd.concat([features for features, _ in per_building], ignore_index=True)
y_train = pd.concat([labels for _, labels in per_building], ignore_index=True)
targets = [item for item in y_train.columns if 'Equipment_Electric_Power_' in item]

start_time = time.time()
print("------------------train---------------------------")
# One LightGBM regressor per target column; the wrapper fits them in
# parallel (n_jobs=24) while each booster itself stays single-threaded.
gbm = MultiOutputRegressor(
    lgb.LGBMRegressor(objective='regression', n_estimators=750, learning_rate=0.05, num_leaves=16, random_state=0,
                      verbose=-1, n_jobs=1), n_jobs=24)
gbm.fit(x_train, y_train)
print('Training time: {}'.format(time.time() - start_time))

# Save to file in the current working directory
pkl_filename = "pred.pkl"
with open(pkl_filename, 'wb') as file:
    pk.dump(gbm, file)

# In-sample check: the model is scored on the same rows it was fitted on, so
# this is a fit-quality indicator, not a generalization estimate.
pred_train = pd.DataFrame(gbm.predict(x_train), columns=targets)
pred_train.index = y_train.index
print('Building 1-5 WMAPE:')
# Per target column: mean absolute error divided by mean absolute target.
print((y_train - pred_train).abs().mean() / y_train.abs().mean())

In [None]:
import matplotlib.pyplot as plt

# Start offset into the flattened value arrays; 48 consecutive entries are drawn.
t = 53
window = slice(t, t + 48)

# NOTE(review): flatten() walks the (rows x target columns) matrix row by row,
# so these 48 points run across target columns and then into the next row --
# confirm this is the series that is meant to be compared.
actual_values = y_train.values.flatten()[window]
predicted_values = pred_train.values.flatten()[window]

# Plot real data vs prediction
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(actual_values, label='Real Data')
ax.plot(predicted_values, label='Predicted Data')
ax.legend()
ax.set_title('Real Data vs Predicted Data')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Equipment Electric Power')
plt.show()

In [None]:
# List every feature column of the current training matrix, one name per line.
for column_name in x_train.columns:
    print(column_name)

In [None]:
# Inputs for this run: the five building files and the shared weather file
# of the ref_data set.
path_to_bu = [
    "data/ref_data/Building_1.csv",
    "data/ref_data/Building_2.csv",
    "data/ref_data/Building_3.csv",
    "data/ref_data/Building_4.csv",
    "data/ref_data/Building_5.csv",
]
# Numbered aliases kept so anything still referring to them keeps working.
path_to_bu1, path_to_bu2, path_to_bu3, path_to_bu4, path_to_bu5 = path_to_bu
path_to_weather = "data/ref_data/weather.csv"

# Load each building once and stack the pieces with a single concat per
# matrix, instead of repeatedly concatenating onto an initially empty frame.
per_building = [get_train_data(building_csv, path_to_weather) for building_csv in path_to_bu]
x_train = pd.concat([features for features, _ in per_building], ignore_index=True)
y_train = pd.concat([labels for _, labels in per_building], ignore_index=True)
targets = [item for item in y_train.columns if 'Equipment_Electric_Power_' in item]

start_time = time.time()
print("------------------train---------------------------")
# One LightGBM regressor per target column; the wrapper fits them in
# parallel (n_jobs=24) while each booster itself stays single-threaded.
gbm = MultiOutputRegressor(
    lgb.LGBMRegressor(objective='regression', n_estimators=750, learning_rate=0.01, num_leaves=16, random_state=2022,
                      verbose=-1, n_jobs=1), n_jobs=24)
gbm.fit(x_train, y_train)
print('Training time: {}'.format(time.time() - start_time))

# Save to file in the current working directory
# NOTE(review): same file name as the opt_data training cell, so running this
# cell overwrites that model -- confirm this is intended.
pkl_filename = "pred.pkl"
with open(pkl_filename, 'wb') as file:
    pk.dump(gbm, file)

# In-sample check: the model is scored on the same rows it was fitted on, so
# this is a fit-quality indicator, not a generalization estimate.
pred_train = pd.DataFrame(gbm.predict(x_train), columns=targets)
pred_train.index = y_train.index
print('Building 1-5 WMAPE:')
# Per target column: mean absolute error divided by mean absolute target.
print((y_train - pred_train).abs().mean() / y_train.abs().mean())

In [None]:
from sklearn.impute import SimpleImputer

# Module-level mean imputers used by the get_train_data defined in this cell.
# Every call there goes through fit_transform, so each imputer only ever
# holds the column means of the most recently loaded building.
imputer_x = SimpleImputer(strategy='mean')
imputer_y = SimpleImputer(strategy='mean')

def get_train_data(path_to_bu, path_to_weather):
    """Build lag-feature inputs and short-horizon targets for one building.

    NOTE: this definition replaces the earlier ``get_train_data`` of the
    notebook once the cell has run; the two produce different column sets.

    The building CSV and the weather CSV are joined row by row (on their
    positional index) and renamed positionally. Features are the selected
    current-row columns plus lags 1..11 of the load and measured-weather
    columns; targets are the load shifted by -4..-1 rows (per the shift
    convention, negative periods bring later rows onto the current one).
    NaNs created at the edges by the shifts (and any NaN in the inputs) are
    replaced with the column mean by the module-level ``imputer_x`` /
    ``imputer_y``, which are re-fitted on every call.

    Parameters
    ----------
    path_to_bu : str
        Path of the building CSV.
    path_to_weather : str
        Path of the weather CSV.

    Returns
    -------
    (x_train, y_train) : tuple of pandas.DataFrame
        Both have a fresh 0..n-1 index and one row per input row.

    Raises
    ------
    ValueError
        If the joined frame does not have exactly 28 columns (raised by the
        positional rename).
    """
    print("Loading train data")

    df_bu, df_we = pd.read_csv(path_to_bu), pd.read_csv(path_to_weather)
    df = pd.merge(df_bu, df_we, left_index=True, right_index=True)

    cols = [
        'Month', 'Hour', 'Day_Type', 'Daylight_Savings_Status', 'Indoor_Temperature',
        'Average_Unmet_Cooling_Setpoint_Difference',
        'Indoor_Relative_Humidity',
        'Equipment_Electric_Power',
        'DHW_Heating',
        'Cooling_Load',
        'Heating_Load',
        'Solar_Generation',
        'Outdoor_Drybulb_Temperature',
        'Relative_Humidity',
        'Diffuse_Solar_Radiation',
        'Direct_Solar_Radiation',
        '6h_Prediction_Outdoor_Drybulb_Temperature',
        '12h_Prediction_Outdoor_Drybulb_Temperature',
        '24h_Prediction_Outdoor_Drybulb_Temperature',
        '6h_Prediction_Relative_Humidity',
        '12h_Prediction_Relative_Humidity',
        '24h_Prediction_Relative_Humidity',
        '6h_Prediction_Diffuse_Solar_Radiation',
        '12h_Prediction_Diffuse_Solar_Radiation',
        '24h_Prediction_Diffuse_Solar_Radiation',
        '6h_Prediction_Direct_Solar_Radiation',
        '12h_Prediction_Direct_Solar_Radiation',
        '24h_Prediction_Direct_Solar_Radiation',
    ]

    df.columns = cols

    fixed_cols = ['Month', 'Hour', 'Day_Type']

    selected_cols = [
        'Equipment_Electric_Power',
        'Solar_Generation',
        'Outdoor_Drybulb_Temperature',
        'Relative_Humidity',
        'Diffuse_Solar_Radiation',
        'Direct_Solar_Radiation',
        '6h_Prediction_Outdoor_Drybulb_Temperature',
        '12h_Prediction_Outdoor_Drybulb_Temperature',
        '24h_Prediction_Outdoor_Drybulb_Temperature',
        '6h_Prediction_Relative_Humidity',
        '12h_Prediction_Relative_Humidity',
        '24h_Prediction_Relative_Humidity',
        '6h_Prediction_Diffuse_Solar_Radiation',
        '12h_Prediction_Diffuse_Solar_Radiation',
        '24h_Prediction_Diffuse_Solar_Radiation',
        '6h_Prediction_Direct_Solar_Radiation',
        '12h_Prediction_Direct_Solar_Radiation',
        '24h_Prediction_Direct_Solar_Radiation',
    ]

    # Explicit copy: the assignments below must write to an independent frame,
    # not to a slice of the merged one (that is what SettingWithCopyWarning
    # complains about).
    df = df[fixed_cols + selected_cols].copy()

    df['Solar_Generation'] = df['Solar_Generation'] / 1000.  # * pv_cap / 1000.0
    df['Day'] = df.index // 24
    df['Hour'] = df.Hour % 24  # an hour value of 24 becomes 0

    # Add lags 1..11 of the load and measured-weather columns as extra features.
    # Passing several periods to shift() needs a pandas version that accepts
    # an iterable there; the shifted columns then carry the period as a name
    # suffix, so they do not collide with the unshifted ones in the merge.
    x_train = pd.merge(
        df,
        df[['Equipment_Electric_Power', 'Outdoor_Drybulb_Temperature', 'Relative_Humidity', 'Diffuse_Solar_Radiation', 'Direct_Solar_Radiation']].shift(range(1,12)),
        left_index=True, right_index=True
    )
    # Targets: the load 4, 3, 2 and 1 rows ahead of the current row.
    y_train = df['Equipment_Electric_Power'].shift(list(range(-4,0)))

    # Use the imputers to fill the missing values with the column means.
    # NOTE(review): this also mean-fills the targets of the last rows, whose
    # real future load is unknown -- confirm that is acceptable for training.
    x_train = pd.DataFrame(imputer_x.fit_transform(x_train), columns=x_train.columns)
    y_train = pd.DataFrame(imputer_y.fit_transform(y_train), columns=y_train.columns)

    print("Loading train data finish")
    return x_train, y_train

In [None]:
from sklearn.model_selection import GridSearchCV

def train_model_with_gridsearch(path_to_bu, path_to_weather, param_grid):
    """Grid-search a multi-output LightGBM model over all given buildings.

    Parameters
    ----------
    path_to_bu : sequence of str
        Building CSV paths; every entry is loaded (previously exactly the
        first five were, regardless of the sequence length).
    path_to_weather : str
        Weather CSV path shared by all buildings.
    param_grid : dict
        Grid passed to ``GridSearchCV``; keys address the wrapped regressor
        (``estimator__...``).

    Returns
    -------
    (best_model, best_params, y_train, pred_train)
        The refitted best estimator, its parameters, the stacked targets and
        the model's predictions on the training rows.

    Raises
    ------
    ValueError
        If ``path_to_bu`` is empty.

    Side effects: prints progress and the in-sample score, and writes the
    best model to ``best_model.pkl`` in the current working directory.
    """
    if len(path_to_bu) == 0:
        raise ValueError("path_to_bu must contain at least one building file")

    # Load each building once and stack with a single concat per matrix.
    per_building = [get_train_data(building_csv, path_to_weather) for building_csv in path_to_bu]
    x_train = pd.concat([features for features, _ in per_building], ignore_index=True)
    y_train = pd.concat([labels for _, labels in per_building], ignore_index=True)
    targets = [item for item in y_train.columns if 'Equipment_Electric_Power_' in item]

    gbm = MultiOutputRegressor(
        lgb.LGBMRegressor(objective='regression', random_state=0, verbose=-1, n_jobs=1), n_jobs=24
    )

    # NOTE(review): the search (n_jobs=-1) and the wrapped multi-output
    # estimator (n_jobs=24) both request parallel workers -- confirm the
    # resulting process count is intended.
    grid_search = GridSearchCV(gbm, param_grid, cv=5, scoring='neg_mean_absolute_error', verbose=1, n_jobs=-1)
    grid_search.fit(x_train, y_train)

    best_model = grid_search.best_estimator_

    # Save to file in the current working directory
    pkl_filename = "best_model.pkl"
    with open(pkl_filename, 'wb') as file:
        pk.dump(best_model, file)

    # In-sample check: scored on the same rows the model was fitted on.
    pred_train = pd.DataFrame(best_model.predict(x_train), columns=targets)
    pred_train.index = y_train.index
    print('Building 1-5 WMAPE:')
    # Per target column: mean absolute error divided by mean absolute target.
    print((y_train - pred_train).abs().mean() / y_train.abs().mean())

    return best_model, grid_search.best_params_, y_train, pred_train

# Example usage
# The five opt_data building files, in building order.
path_to_bu = [
    "data/opt_data/Building_1.csv",
    "data/opt_data/Building_2.csv",
    "data/opt_data/Building_3.csv",
    "data/opt_data/Building_4.csv",
    "data/opt_data/Building_5.csv",
]
path_to_bu1, path_to_bu2, path_to_bu3, path_to_bu4, path_to_bu5 = path_to_bu
path_to_weather = "data/opt_data/weather.csv"

# Wider search space, left disabled:
# param_grid = {
#     'estimator__n_estimators': [50, 100, 200, 300, 400, 500, 750, 1000],
#     'estimator__learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1],
#     'estimator__num_leaves': [16, 32, 64, 128]
# }

# Single-point grid: the search only cross-validates this one configuration.
param_grid = {
    'estimator__n_estimators': [500],
    'estimator__learning_rate': [0.01],
    'estimator__num_leaves': [16],
}

# Rebinds the notebook-level y_train / pred_train used by the plot cell.
search_result = train_model_with_gridsearch(path_to_bu, path_to_weather, param_grid=param_grid)
best_model, best_params, y_train, pred_train = search_result
print("Best parameters found: ", best_params)

In [None]:
import matplotlib.pyplot as plt

# Start offset into the flattened value arrays; 48 consecutive entries are drawn.
t = 53
window = slice(t, t + 48)

# NOTE(review): flatten() walks the (rows x target columns) matrix row by row,
# so these 48 points run across target columns and then into the next row --
# confirm this is the series that is meant to be compared.
actual_values = y_train.values.flatten()[window]
predicted_values = pred_train.values.flatten()[window]

# Plot real data vs prediction
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(actual_values, label='Real Data')
ax.plot(predicted_values, label='Predicted Data')
ax.legend()
ax.set_title('Real Data vs Predicted Data')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Equipment Electric Power')
plt.show()

In [None]:
# Synthetic one-column frame (100 standard-normal draws from NumPy's global
# RNG) used as a sandbox for the shift experiments.
test_df = pd.DataFrame(
    np.random.randn(100, 1),
    columns=[f'c_{k}' for k in range(1)],
)

test_df.head()

In [None]:
# Offsets to try: two negative (look-ahead) steps, then two positive (look-back) steps.
l = [*range(-2, 0), *range(1, 3)]

print(l)

# - is future, + is past
a = test_df.shift(l).head()

In [None]:
def shift_and_append(df, n):
    """Shift ``df`` by ``n`` rows and wrap the displaced rows into the gap.

    Unlike a plain ``shift``, the rows pushed off one end are written into
    the rows that the shift left empty at the other end, so the result holds
    no NaN introduced by the shift (a circular shift). The index of ``df``
    is kept and ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Data to shift.
    n : int
        Number of rows; positive moves values down, negative moves them up,
        zero returns an unshifted copy. ``abs(n)`` is expected to be at most
        ``len(df)``; larger values are not reduced modulo the length.

    Returns
    -------
    Same type as ``df``.
    """
    # Shift by n positions (downward for n > 0, upward for n < 0)
    shifted_df = df.shift(n)

    if n > 0:
        # The last n rows fell off the bottom; they refill the first n rows,
        # which the shift left as NaN.
        shifted_df.iloc[:n] = df.iloc[-n:].values
    elif n < 0:
        # The first |n| rows fell off the top; they refill the last |n| rows,
        # which the shift left as NaN.
        shifted_df.iloc[n:] = df.iloc[:-n].values
    # n == 0: nothing was displaced, so there is nothing to refill.

    return shifted_df

In [None]:
# Scratch check: copying a dict and overriding one key keeps the key order
# and leaves the source dict untouched.
a = {'a': [1, 2, 3, 4, 5], 'b': [6, 7, 8, 9, 10]}

# Shallow copy first, then replace the entry stored under 'a'.
b = dict(a)
b['a'] = [11, 12, 13, 14, 15]

a, b

In [None]:

# pandas' own multi-period shift, kept as the NaN-padded reference.
a = test_df.shift(l)

# Replace nan values with the values we displaced: build one wrap-around
# shifted copy of c_0 per offset, once clean and once with noise added.
# The noise vector accumulates a fresh draw after every offset, so later
# columns carry more noise than earlier ones.
noise = np.random.normal(0, 0.01, 100)
clean_columns, noisy_columns = [], []

for s in l:
    sdf = shift_and_append(test_df['c_0'], s)
    clean_columns.append(sdf)
    noisy_columns.append(sdf + noise)  # new Series; later updates of `noise` do not touch it
    noise += np.random.normal(0, 0.01, 100)

# One concat per frame instead of growing an empty frame inside the loop.
c = pd.concat(clean_columns, axis=1)
c_noise = pd.concat(noisy_columns, axis=1)

# Rename the columns to reflect the shift
# NOTE(review): abs() drops the sign, so offsets such as -2 and 2 receive the
# same label -- confirm whether past and future columns should be told apart.
c.columns = [f'c_{abs(s)}h' for s in l]
def shift_and_concat(df, shifts):
    """Return one shifted copy of the first column of ``df`` per offset.

    Each offset in ``shifts`` is applied with ``shift_and_append`` to the
    first column of ``df``; the results are placed side by side and named
    ``<first column name>_<abs(offset)>h``.
    """
    pieces = [shift_and_append(df.iloc[:, 0], offset) for offset in shifts]
    # Starting from an empty frame keeps the result an (empty) DataFrame even
    # when no offsets are given.
    c = pd.concat([pd.DataFrame(), *pieces], axis=1)
    base_name = df.columns[0]
    c.columns = [f'{base_name}_{abs(offset)}h' for offset in shifts]
    return c

# Show the helper's output next to the source column and pandas' plain shift (`a`).
shifted_df = shift_and_concat(test_df, l)
shifted_df.tail(10), test_df.head(10), a.tail(10)

In [None]:
# Row offsets for the noisy look-ahead columns; negative periods bring later rows forward.
l = [-4, -6, -12, -24]

def shift_and_concat(df, shifts, noise_std=0.01, rng=None):
    """Build noisy, shifted copies of a Series, one column per offset.

    NOTE: once this cell has run, this definition replaces the earlier
    ``shift_and_concat`` of the notebook, which took a DataFrame and added
    no noise.

    For each offset the series is shifted with ``shift_and_append`` and a
    Gaussian noise vector is added. The noise vector accumulates a fresh draw
    after every offset, so columns later in ``shifts`` are noisier than
    earlier ones.

    Parameters
    ----------
    df : pandas.Series
        Source series; its ``name`` is used as the column-name prefix.
    shifts : sequence of int
        Offsets passed to ``shift_and_append``.
    noise_std : float, default 0.01
        Standard deviation of every noise draw.
    rng : object with a ``normal(loc, scale, size)`` method, optional
        For example a ``numpy.random.Generator``, to make the result
        reproducible. ``None`` uses NumPy's global RNG, as before.

    Returns
    -------
    pandas.DataFrame
        Columns named ``<series name>_<abs(offset)>h``; empty when ``shifts``
        is empty.
    """
    sampler = np.random if rng is None else rng
    n_rows = df.shape[0]

    noise = sampler.normal(0, noise_std, n_rows)
    noisy_columns = []
    for s in shifts:
        noisy_columns.append(shift_and_append(df, s) + noise)
        # Rebind instead of updating in place, so no stored column can ever
        # share memory with the running noise vector.
        noise = noise + sampler.normal(0, noise_std, n_rows)

    if not noisy_columns:
        return pd.DataFrame()

    # One concat instead of growing an empty frame inside the loop.
    c = pd.concat(noisy_columns, axis=1)
    c.columns = [f'{df.name}_{abs(s)}h' for s in shifts]
    return c

# NOTE(review): x_train is whatever notebook-level frame an earlier training cell
# left behind; train_model_with_gridsearch does not return or rebind it.
shifted_df = shift_and_concat(x_train['Equipment_Electric_Power'], l)

In [None]:
# Inspect the last rows, where shift_and_append writes the wrapped-around values for negative offsets.
shifted_df.tail()

In [None]:
# Element-wise gap between the clean and the noisy shifted columns, i.e. minus the noise added per column.
(c.values - c_noise.values)