# Prep

In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [2]:
def get_np_array(df, columns):
    """Build one feature array from DataFrame columns whose cells hold sequences.

    Every requested column is converted with ``np.stack`` (one output row per
    DataFrame row) and the per-column blocks are then joined side by side along
    axis 1, in the order the columns are listed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing the columns.
    columns : sequence of column labels
        Columns to stack; must contain at least one label.

    Returns
    -------
    numpy.ndarray
        The stacked block of the single column when only one is requested,
        otherwise the axis-1 concatenation of all stacked blocks.

    Raises
    ------
    IndexError
        If ``columns`` is empty.
    """
    if len(columns) == 0:
        raise IndexError("get_np_array requires at least one column name")

    blocks = [np.stack(df[name].to_numpy()) for name in columns]
    if len(blocks) == 1:
        # Nothing to join; return the block as-is so its shape is not constrained.
        return blocks[0]
    # Join everything in one call instead of re-copying the growing array per column.
    return np.concatenate(blocks, axis=1)

# Load sites A,B,C,D

In [3]:
# Read the interval-level feature table for the training sites and parse both
# time columns as datetimes.
df = pd.read_csv("./data/sites_ABCD_NewFeatures.csv", index_col=0)
for time_col in ('date', 'timestamp'):
    df[time_col] = pd.to_datetime(df[time_col])

### Array aggregate to daily level

working_hours = [10,11,12,13,14,15,16,17]

# Keep only rows inside the working hours, then collapse each (site, date) group
# so every remaining column holds the list of that day's values.
in_working_hours = df["hour"].isin(working_hours)
df = (
    df[in_working_hours]
    .groupby(["site", "date"])
    .aggregate(list)
    .sort_index()
    .reset_index()
)

# Daily label derived from the smallest and largest interval flag of the day:
#    1 -> min is 0 and max is 1
#   -1 -> min is -1 and max is 0
#    2 -> min is -1 and max is 1
#    0 -> every other combination
minf = df['demand_response'].apply(min)
maxf = df['demand_response'].apply(max)
df["DayResponse"] = np.select(
    [
        (minf==0) & (maxf==1),
        (minf==-1) & (maxf==0),
        (minf==-1) & (maxf==1),
    ],
    [1, -1, 2],
    default=0,
)

# NOTE: not the last expression of the cell, so the notebook does not render it.
df['DayResponse'].value_counts()

len(df) # Number of days in the dataset (365 days * 4 sites)

1460

# Train classifier on A,B,C,D

In [4]:
# Columns fed to the classifier; each one holds a per-day list after the daily
# aggregation, and get_np_array lays them out side by side in this order.
good_features = [
    'temp_corr_dev',
    'power_zscore_sh',
    'power_zscore_sh_diff_t',
    'power_zscore_sh_diff_wdt',
    'power_zscore_sh_peek_diff',
    'power_zscore_sh_diff',
    'power_zscore_sh_peek_diff_t',
    'power_zscore_sh_hourly_std',
    'power_share_zscore_sh',
    'power_share_zscore_sh_diff',
    'power_share_zscore_sh_diff_t',
    'power_share_zscore_sh_diff_wdt',
    'power_share_zscore_sh_peek_diff',
    'power_share_zscore_st_hourly_std',
    'power_zscore_sh_peek4_diff',
    'power_zscore_sh_lag4_diff',
    'power_share_zscore_sh_peek4_diff',
    'power_share_zscore_sh_lag4_diff',
    'power_share_zscore_st_peek4_diff_t',
    'season',
    'month',
]

x_tr = get_np_array(df, good_features)

# Add 1 so all DayResponse values are non-negative
day_response = df['DayResponse'].to_numpy()
y_tr = day_response + 1

x_tr.shape, y_tr.shape

((1460, 672), (1460,))

- Train

In [5]:
# Resample the training set with imblearn's SMOTE over-sampler before fitting;
# random_state is fixed so the resampled set is the same on every run.
smote = SMOTE(random_state=94)
x_tr_bal, y_tr_bal = smote.fit_resample(x_tr, y_tr)

# Fit an XGBoost classifier with default settings on the resampled data.
# Labels are the DayResponse values shifted by +1 (see y_tr).
xgb = XGBClassifier()
xgb.fit(x_tr_bal,y_tr_bal)

# Load test sites

In [6]:
# Read the interval-level feature table for the test sites and parse both time
# columns as datetimes. NOTE: this rebinds `df`, replacing the training frame.
df = pd.read_csv("./data/test_sites_NewFeatures.csv", index_col=0)
for time_col in ('date', 'timestamp'):
    df[time_col] = pd.to_datetime(df[time_col])

### Array aggregate to daily level

working_hours = [10,11,12,13,14,15,16,17]

# Keep only rows inside the working hours, then collapse each (site, date) group
# so every remaining column holds the list of that day's values.
in_working_hours = df["hour"].isin(working_hours)
df = (
    df[in_working_hours]
    .groupby(["site", "date"])
    .aggregate(list)
    .sort_index()
    .reset_index()
)

len(df) # Number of days in the dataset

3015

# Predict demand response flag for test sites

- Predict one value for each day

In [7]:
# Build the test feature matrix from the same feature columns, in the same
# order, as were used for training.
x_te = get_np_array(df, good_features)

# One predicted class per day; values are in the +1-shifted label space the
# classifier was trained on.
preds = xgb.predict(x_te)

- Post Process Daily

In [8]:
# Undo the +1 shift applied to the training labels so predictions use the
# original DayResponse coding again.
df['Pred'] = preds-1

# Each day's "month" cell is a list of per-interval values; its first entry is
# used as the month of the day.
shoulder_months = [3,4,5,9,10,11]
first_month_of_day = df["month"].map(lambda months: months[0])
df.loc[first_month_of_day.isin(shoulder_months), 'Pred'] = 0 # No preds in shoulder seasons

print("Day Response predictions:")
df['Pred'].value_counts()

Day Response predictions:


Pred
 0    2599
 2     262
-1      99
 1      55
Name: count, dtype: int64

- Map Daily Preds to 15-Minute Interval Preds

In [9]:
# 1-based position, within the 32 working-hour intervals of a day, at which the
# flag pattern of each daily class starts.
neg1_start_ts = 9
pos1_start_ts = 9
pos2_start_ts = 2


def _day_to_interval_flags(day_pred):
    """Return the list of 32 per-interval flags for one daily prediction.

    -1 -> zeros up to ``neg1_start_ts``, then -1 to the end of the day
     0 -> all zeros
     1 -> zeros up to ``pos1_start_ts``, then 1 to the end of the day
    any other value (2 in practice) -> zeros up to ``pos2_start_ts``, then 1
    through interval 8, then -1 for the remaining 24 intervals
    """
    if day_pred == -1:
        return [0]*(neg1_start_ts-1) + [-1]*(32-neg1_start_ts+1)
    if day_pred == 0:
        return [0]*32
    if day_pred == 1:
        return [0]*(pos1_start_ts-1) + [1]*(32-pos1_start_ts+1)
    return [0]*(pos2_start_ts-1) + [1]*(8-pos2_start_ts+1) + [-1]*(24)


df['Pred_Interval'] = df['Pred'].apply(_day_to_interval_flags)

- Expand the DataFrame from daily back to 15-minute intervals

In [10]:
# Emit one record per (day, interval): pair each day's timestamp list with its
# interval-flag list. zip stops at the shorter of the two lists.
expanded_rows = [
    {'Site': site, 'Timestamp_Local': ts, 'Pred_Interval': pred}
    for site, day_timestamps, day_preds in zip(df['site'], df['timestamp'], df['Pred_Interval'])
    for ts, pred in zip(day_timestamps, day_preds)
]

expanded_df = pd.DataFrame(expanded_rows)
expanded_df["Timestamp_Local"] = pd.to_datetime(expanded_df["Timestamp_Local"])

# Number of days represented, at 32 intervals per day.
len(expanded_df)/32

3015.0

- Join to submission df

In [11]:
# Read the submission template and parse its timestamps so both sides of the
# join use datetime keys.
submission = pd.read_csv("./data/test_data_v3.csv")
submission["Timestamp_Local"] = pd.to_datetime(submission["Timestamp_Local"])

col_names = ['Site', 'Timestamp_Local', 'Demand_Response_Flag']
join_keys = ["Site", "Timestamp_Local"]
n_template_rows = len(submission)

# Left join: keep exactly the template's rows, in the template's order. An outer
# join sorts the keys and would also keep prediction rows absent from the
# template; the later step that copies the flag back onto a fresh read of this
# same file aligns on the row index, so order and row count must match the file.
submission = pd.merge(submission, expanded_df, how='left', on=join_keys)

assert len(submission) == n_template_rows, (
    "merge changed the number of rows; expanded_df has duplicate (Site, Timestamp_Local) keys"
)

# Intervals without a prediction (outside working hours) get flag 0. Only the
# prediction column is filled so the key columns are never altered.
submission = submission[join_keys + ["Pred_Interval"]].fillna({"Pred_Interval": 0})

submission.columns = col_names

submission["Demand_Response_Flag"] = submission["Demand_Response_Flag"].astype(int)

submission

Unnamed: 0,Site,Timestamp_Local,Demand_Response_Flag
0,siteA,2020-01-01 00:00:00,0
1,siteA,2020-01-01 00:15:00,0
2,siteA,2020-01-01 00:30:00,0
3,siteA,2020-01-01 00:45:00,0
4,siteA,2020-01-01 01:00:00,0
...,...,...,...
289435,siteM,2020-08-31 22:45:00,0
289436,siteM,2020-08-31 23:00:00,0
289437,siteM,2020-08-31 23:15:00,0
289438,siteM,2020-08-31 23:30:00,0


- Join Flag Preds to D,E,F Interval Features

In [12]:
# Re-read the raw interval data, attach the predicted flag, normalise the column
# names, and write the result for downstream use.
output_columns = ['site','timestamp','temp','irr','power','demand_response']

df_output = pd.read_csv("./data/test_data_v3.csv")
# Column assignment aligns on the row index, so this relies on `submission`
# having the same index as the freshly read file.
df_output['demand_response'] = submission['Demand_Response_Flag']
df_output.columns = output_columns
df_output[output_columns].to_csv("./data/test_sites_WithDemandResponsePred.csv")