In [11]:
import pandas as pd

# Load the raw CSV, discard the "MW Demand" column, and parse the timestamp
# column into datetimes, all in one chain so the cell can be re-run safely.
df = (
    pd.read_csv("AA.Complete Data.csv")
    .drop(columns=["MW Demand"])
    .assign(**{"DATE & TIME": lambda frame: pd.to_datetime(frame["DATE & TIME"])})
)

# Not the cell's last expression, so this preview is not displayed.
df.head()

# List the columns that remain after the drop.
print(df.columns)

Index(['DATE & TIME', 'Solar Radiation (Wh/m)2', 'UV Intensity',
       'Atmospheric Clarity', 'Temperature (2m)', 'Humidity (2M)', 'PS',
       'Wind Speed (10m)', 'Wind Direction (10m)', 'MW Supply'],
      dtype='object')


## Create Lag Features <br>
Decision trees cannot understand time-series data on their own, so I need to explicitly give them past values by creating new features from past data. <br>
I am creating 24 lags per feature so the tree can "look back" at the past 24 hours of data


In [14]:
def create_lag_features(data, columns, lags=24):
    """Append shifted ("lagged") copies of selected columns to a DataFrame.

    For every name in ``columns`` and every ``lag`` in ``1..lags``, a column
    named ``"<column>_lag<lag>"`` is added that holds ``data[column].shift(lag)``,
    i.e. the value found ``lag`` rows earlier.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame. It is not modified; a new frame is returned.
        NOTE(review): shifting is purely positional, so the rows presumably
        need to be in chronological order at a regular interval -- confirm
        this holds for the input data.
    columns : iterable of column labels
        Columns of ``data`` to generate lags for.
    lags : int, default 24
        Number of lagged copies to create per column.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the lag columns, with every row that
        contains a NaN in *any* column removed. The first ``lags`` rows always
        have NaN in the lag columns and are therefore dropped; rows with
        missing values in the original columns are dropped as well.
        If ``lags`` is 0 or ``columns`` is empty, no lag columns are added and
        ``data.dropna()`` is returned.

    Raises
    ------
    ValueError
        If ``lags`` is negative.
    """
    if lags < 0:
        raise ValueError(f"lags must be non-negative, got {lags}")

    # Build every shifted Series first and concatenate once; adding columns to
    # the frame one at a time would fragment it and slow pandas down.
    lagged_columns = [
        data[column].shift(lag).rename(f"{column}_lag{lag}")
        for column in columns
        for lag in range(1, lags + 1)
    ]

    # pd.concat raises on an empty list, so handle "nothing to lag" explicitly
    # while keeping the same NaN-dropping contract as the normal path.
    if not lagged_columns:
        return data.dropna()

    lagged_features = pd.concat(lagged_columns, axis=1)

    # Join with the original data and drop the rows left incomplete by shifting.
    return pd.concat([data, lagged_features], axis=1).dropna()

# Every column except the timestamp ('DATE & TIME') gets lagged copies.
columns_to_lag = [
    'Solar Radiation (Wh/m)2',
    'UV Intensity',
    'Atmospheric Clarity',
    'Temperature (2m)',
    'Humidity (2M)',
    'PS',
    'Wind Speed (10m)',
    'Wind Direction (10m)',
    'MW Supply',
]

# Add 24 lagged copies of each listed column; rows left incomplete are dropped.
df_lagged = create_lag_features(df, columns_to_lag, lags=24)

# Target: the current-row 'MW Supply' value.
y = df_lagged['MW Supply']

# Features: everything except the timestamp and the target itself
# (the 'MW Supply' lag columns stay in as predictors).
X = df_lagged.drop(columns=['DATE & TIME', 'MW Supply'])


## Splitting the Data Chronologically

In [16]:
# Split by row position (no shuffling) so that earlier rows train the model
# and later rows are held out: 80% train, next 10% validation, final 10% test.
n = len(X)
train_end, val_end = int(n * 0.8), int(n * 0.9)

# Rows [0, train_end)
X_train, y_train = X.iloc[:train_end], y.iloc[:train_end]

# Rows [train_end, val_end)
X_val, y_val = X.iloc[train_end:val_end], y.iloc[train_end:val_end]

# Rows [val_end, n)
X_test, y_test = X.iloc[val_end:], y.iloc[val_end:]