In [None]:
pip install holidays

In [None]:
import tensorflow as tf
from keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import RNN, Dense, LSTM, Bidirectional, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanAbsoluteError


In [6]:
import pandas as pd
import numpy as np
import torch
import  matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
import holidays
from sklearn.model_selection import GridSearchCV


In [7]:
# Read every raw table from the working directory in one pass
train_df, gas_df, electricity_df, client_df, fw_df, hw_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "train.csv",
        "gas_prices.csv",
        "electricity_prices.csv",
        "client.csv",
        "forecast_weather.csv",
        "historical_weather.csv",
    )
)

# Coordinate -> county lookup; the "Unnamed: 0" column is dropped right after reading
location = pd.read_csv("county_lon_lats.csv").drop(columns=["Unnamed: 0"])

In [None]:
class FeatureProcessorClass():
    '''Turn the raw tables into a single feature table.

    Calling an instance runs one feature builder per input table (main data,
    client, historical weather, forecast weather, electricity prices, gas
    prices), left-joins the results onto the main data, adds a 0/1 holiday
    flag and casts the columns in `category_columns` to pandas `category`.

    The weather builders read the notebook-level `location` DataFrame and the
    holiday builder uses the `holidays` package; both must be available in
    the namespace where this class is defined.
    '''

    def __init__(self, holiday_country='US'):
        '''Set up join keys and options.

        Parameters
        ----------
        holiday_country : str, default 'US'
            Country code handed to `holidays.country_holidays` when the
            holiday flag is built.
            NOTE(review): the default is kept for backward compatibility -
            confirm it matches the country the data comes from.
        '''
        # Columns to join on for the different datasets
        self.weather_join = ['datetime', 'county', 'data_block_id']
        self.gas_join = ['datetime','data_block_id']
        self.electricity_join = ['datetime', 'data_block_id']
        self.client_join = ['county', 'is_business', 'product_type', 'data_block_id']
        # Kept for backward compatibility; the holiday flag is now matched on the
        # calendar day derived from this column rather than merged on it directly
        self.holiday = ['datetime']
        # Columns of latitude & longitude
        self.lat_lon_columns = ['latitude', 'longitude']

        # Aggregate stats applied to every weather column when grouping by county
        self.agg_stats = ['mean'] #, 'min', 'max', 'std', 'median']

        # Columns cast to `category` at the end of __call__
        self.category_columns = ['county', 'is_business', 'product_type', 'is_consumption', 'data_block_id','holiday']

        # Country whose public holidays are used for the holiday flag
        self.holiday_country = holiday_country

    def create_new_column_names(self, df, suffix, columns_no_change):
        '''Append `suffix` to every column name not listed in `columns_no_change`.

        The columns of `df` are renamed in place and the same frame is returned.
        '''
        df.columns = [col + suffix
                      if col not in columns_no_change
                      else col
                      for col in df.columns
                      ]
        return df

    def flatten_multi_index_columns(self, df):
        '''Collapse tuple (MultiIndex) column labels into single strings.

        The non-empty parts of each label are joined with '_'. The columns of
        `df` are replaced in place and the same frame is returned.
        '''
        df.columns = ['_'.join([col for col in multi_col if len(col)>0])
                      for multi_col in df.columns]
        return df

    def create_data_features(self, data):
        '''Add calendar features derived from `datetime` to the main data.

        `data` is modified in place (its `datetime` column is converted with
        `pd.to_datetime` and new columns are added) and returned.
        '''
        # To datetime
        data['datetime'] = pd.to_datetime(data['datetime'])
        stamps = data['datetime'].dt

        # Time period features
        data['date'] = stamps.normalize()
        data['year'] = stamps.year
        data['quarter'] = stamps.quarter
        data['month'] = stamps.month
        data['week'] = stamps.isocalendar().week
        data['hour'] = stamps.hour

        # Day features
        data['day_of_year'] = stamps.day_of_year
        data['day_of_month']  = stamps.day
        data['day_of_week'] = stamps.day_of_week
        return data

    def create_client_features(self, client):
        '''Suffix the non-key client columns with `_client` and shift `data_block_id` by -2.'''
        client = self.create_new_column_names(client,
                                              suffix='_client',
                                              columns_no_change = self.client_join
                                              )
        client['data_block_id']-=2
        return client

    def _aggregate_weather(self, weather, suffix):
        '''Attach a county to each weather row and aggregate per `weather_join` key.

        Shared by the historical and forecast weather builders. Returns one
        row per (datetime, county, data_block_id) with every other column
        aggregated by `agg_stats` and named `<column><suffix>_<stat>`.
        '''
        # Coordinates are rounded to 1 decimal before being used as join keys;
        # assumes the notebook-level `location` table uses the same precision - TODO confirm
        weather[self.lat_lon_columns] = weather[self.lat_lon_columns].astype(float).round(1)
        weather = weather.merge(location, how = 'left', on = self.lat_lon_columns)

        # Modify column names - keys stay untouched, everything else gets the suffix
        key_columns = self.lat_lon_columns + self.weather_join
        weather = self.create_new_column_names(weather,
                                               suffix=suffix,
                                               columns_no_change = key_columns
                                               )

        # Group by & calculate aggregate stats
        agg_columns = [col for col in weather.columns if col not in key_columns]
        agg_dict = {agg_col: self.agg_stats for agg_col in agg_columns}
        weather = weather.groupby(self.weather_join).agg(agg_dict).reset_index()

        # Flatten the multi column aggregates
        return self.flatten_multi_index_columns(weather)

    def create_historical_weather_features(self, historical_weather):
        '''Aggregate historical weather per county (suffix `_h`) and shift `data_block_id` by -1.'''
        # To datetime
        historical_weather['datetime'] = pd.to_datetime(historical_weather['datetime'])

        historical_weather = self._aggregate_weather(historical_weather, suffix='_h')

        historical_weather['data_block_id'] = historical_weather['data_block_id'].astype(int)
        historical_weather['data_block_id'] -= 1

        return historical_weather

    def create_forecast_weather_features(self, forecast_weather):
        '''Aggregate forecast weather per county (suffix `_f`) and shift `data_block_id` by -1.

        `forecast_datetime` becomes the `datetime` join key and
        `origin_datetime` is dropped.
        '''
        # Rename column and drop
        forecast_weather = (forecast_weather
                            .rename(columns = {'forecast_datetime': 'datetime'})
                            .drop(columns = 'origin_datetime')
                           )

        # To datetime, with any timezone information removed
        forecast_weather['datetime'] = (pd.to_datetime(forecast_weather['datetime'])
                                        .dt
                                        .tz_localize(None)
                                       )

        forecast_weather = self._aggregate_weather(forecast_weather, suffix='_f')
        forecast_weather['data_block_id'] -= 1
        return forecast_weather

    def create_electricity_features(self, electricity):
        '''Prepare electricity prices for joining on (`datetime`, `data_block_id`).

        `datetime` is a copy of `forecast_date` (no time offset is applied),
        the remaining columns get the `_electricity` suffix and
        `data_block_id` is shifted by -1.
        '''
        # To datetime
        electricity['forecast_date'] = pd.to_datetime(electricity['forecast_date'])

        # The join timestamp is the forecast timestamp itself
        electricity['datetime'] = electricity['forecast_date']

        # Modify column names - specify suffix
        electricity = self.create_new_column_names(electricity,
                                                   suffix='_electricity',
                                                   columns_no_change = self.electricity_join)
        electricity['data_block_id']-=1
        return electricity

    def create_gas_features(self, gas):
        '''Prepare gas prices for joining on (`datetime`, `data_block_id`).

        Adds the mean of the lowest and highest price, uses `forecast_date`
        as `datetime`, suffixes the remaining columns with `_gas` and shifts
        `data_block_id` by -1.
        '''
        gas['forecast_date'] = pd.to_datetime(gas['forecast_date'])
        gas['datetime'] = gas['forecast_date']

        # Mean gas price
        gas['mean_price_per_mwh'] = (gas['lowest_price_per_mwh'] + gas['highest_price_per_mwh'])/2

        # Modify column names - specify suffix
        gas = self.create_new_column_names(gas,
                                           suffix='_gas',
                                           columns_no_change = self.gas_join
                                          )
        gas['data_block_id']-=1

        return gas

    def get_holiday_features(self, df, country_code='US'):
        '''Return the public holidays covering the years present in `df['datetime']`.

        Parameters
        ----------
        df : DataFrame with a datetime-typed `datetime` column.
        country_code : country code passed to `holidays.country_holidays`.

        Returns
        -------
        DataFrame with columns `datetime` (the holiday's date as a timestamp)
        and `holiday` (its name). Empty when `df` has no valid timestamps or
        no holidays are found.
        '''
        years = df['datetime'].dt.year.dropna()
        if years.empty:
            return pd.DataFrame({'datetime': pd.Series(dtype='datetime64[ns]'),
                                 'holiday': pd.Series(dtype=object)})

        year_range = list(range(int(years.min()), int(years.max()) + 1))
        country_holidays = holidays.country_holidays(
            country_code,
            years=year_range,
            observed=False
        )
        # Passing the column names up front keeps this working for an empty holiday list
        holiday = pd.DataFrame(list(country_holidays.items()), columns=['datetime', 'holiday'])
        holiday['datetime'] = pd.to_datetime(holiday['datetime'])

        return holiday

    def _flag_holidays(self, df, holiday):
        '''Add a 0/1 `holiday` column to `df`, set to 1 for every row whose calendar day is a holiday.

        Holidays are whole days while `df['datetime']` carries a time of day,
        so rows are matched on the normalized (midnight) date; matching on the
        raw timestamp would only ever flag rows stamped exactly 00:00.
        '''
        holiday_days = pd.to_datetime(holiday['datetime']).dt.normalize().unique()
        df['holiday'] = df['datetime'].dt.normalize().isin(holiday_days).astype(int)
        return df

    def __call__(self, data, client, historical_weather, forecast_weather, electricity, gas):
        '''Build the features of every dataset, merge them and return the combined DataFrame.

        The input frames are modified by the individual builders, so pass
        copies when the originals must stay untouched.
        '''
        # Create features for relevant dataset
        data = self.create_data_features(data)
        client = self.create_client_features(client)
        historical_weather = self.create_historical_weather_features(historical_weather)
        forecast_weather = self.create_forecast_weather_features(forecast_weather)
        electricity = self.create_electricity_features(electricity)
        gas = self.create_gas_features(gas)
        holiday = self.get_holiday_features(data, country_code=self.holiday_country)

        # Merge all datasets into one df
        df = data.merge(client, how='left', on = self.client_join)
        df = df.merge(historical_weather, how='left', on = self.weather_join)
        df = df.merge(forecast_weather, how='left', on = self.weather_join)
        df = df.merge(electricity, how='left', on = self.electricity_join)
        df = df.merge(gas, how='left', on = self.gas_join)

        # 1 on holidays, 0 otherwise
        df = self._flag_holidays(df, holiday)

        # Change columns to categorical
        df[self.category_columns] = df[self.category_columns].astype('category')
        return df

In [None]:
def create_revealed_targets_train(data, N_day_lags):
    '''Attach lagged copies of `target` to the training data.

    For every lag from 2 up to and including `N_day_lags` days, the target
    observed that many days earlier for the same `prediction_unit_id` and
    `is_consumption` is left-joined as `target_<lag>_days_ago`. Rows without
    a matching earlier observation get NaN. With `N_day_lags` below 2 the
    data is returned unchanged.
    '''
    join_keys = ['datetime', 'prediction_unit_id', 'is_consumption']
    base_timestamps = data['datetime']
    lagged = data[join_keys + ['target']].copy()

    for day_lag in range(2, N_day_lags + 1):
        # Moving the timestamps forward by `day_lag` days lines each past target
        # up with the row it should become a feature of
        lagged['datetime'] = base_timestamps + pd.DateOffset(day_lag)
        lag_suffix = f'_{day_lag}_days_ago'
        data = data.merge(lagged, how='left', on=join_keys, suffixes=('', lag_suffix))

    return data


In [None]:
%%time
# Create all features

FeatureProcessor = FeatureProcessorClass()

data = FeatureProcessor(data = train_df.copy(),
                      client = client_df.copy(),
                      historical_weather = hw_df.copy(),
                      forecast_weather = fw_df.copy(),
                      electricity = electricity_df.copy(),
                      gas = gas_df.copy(),
                     )


In [None]:
# Number of days of target history to attach as lag features.
# Lags start at 2 days, so values below 2 add no columns.
N_day_lags = 7

df = create_revealed_targets_train(data.copy(), N_day_lags=N_day_lags)

In [None]:
# Keep only the rows that actually have a target value
target = 'target'
df = df.dropna(subset=[target]).reset_index(drop=True)

In [None]:
# Single chronological fold: data_block_ids 0-599 train, everything else validates
train_block_id = list(range(600))

tr = df.loc[df['data_block_id'].isin(train_block_id)]
val = df.loc[~df['data_block_id'].isin(train_block_id)]

In [None]:
# Name fragments of columns that must not be used as model inputs.
# Matching is by substring, so e.g. 'date' also excludes 'datetime'.
no_features = ['date',
               'latitude',
               'longitude',
               'data_block_id',
               'row_id',
               'hours_ahead',
               'hour_h',
               ]

# Each excluded column is listed once, even if it matches several fragments
remove_columns = [col for col in df.columns
                  if any(no_feature in col for no_feature in no_features)]
remove_columns.append(target)
features = [col for col in df.columns if col not in remove_columns]


In [None]:
def to_lstm_input(frame, feature_columns, nan_fill=0.0):
    '''Return `frame[feature_columns]` as a float32 array shaped [samples, 1, features].

    Casting to float32 avoids the object-dtype array that a frame mixing
    `category`, nullable-integer and float columns would otherwise produce.
    Missing values (unmatched left joins, target lags with no history) are
    replaced by `nan_fill`, because a single NaN input makes the loss NaN.
    '''
    matrix = frame[feature_columns].astype('float32').fillna(nan_fill).to_numpy()
    # reshape features for LSTM: [samples, timesteps, features] with one timestep per row
    return matrix.reshape((matrix.shape[0], 1, matrix.shape[1]))


tr_features = to_lstm_input(tr, features)
val_features = to_lstm_input(val, features)

In [None]:

# Define the model: one bidirectional LSTM, two stacked LSTMs and a small dense head.
# The input shape is declared with an explicit Input layer instead of an `input_shape`
# argument buried inside the Bidirectional wrapper, so the model is built on creation.
timesteps, n_features = tr_features.shape[1], tr_features.shape[2]

model = Sequential([
    tf.keras.Input(shape=(timesteps, n_features)),
    Bidirectional(LSTM(1024, return_sequences=True, activation='relu')),
    Dropout(0.2),
    LSTM(512, return_sequences=True, activation='relu'),
    Dropout(0.2),
    LSTM(256, return_sequences=False, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1),
])

# Compile the model. This is a regression task: the loss is MAE, and MSE is tracked
# as a second metric (classification accuracy carries no information here).
model.compile(optimizer=Adam(), loss=MeanAbsoluteError(), metrics=['mse'])

# Now you can access the summary
model.summary()


In [None]:
earlyStop = EarlyStopping(
    monitor="val_loss",
    verbose=1,                  # report when training is stopped early
    mode='min',                 # stop when the monitored quantity has stopped decreasing
    patience=5,                 # number of epochs with no improvement before stopping
    restore_best_weights=True   # keep the weights of the best epoch, not of the last one
)

# fit the LSTM model to the training data
history = model.fit(
    tr_features, tr[target],                        # training data and labels
    epochs=15,                                      # maximum number of epochs to run
    batch_size=1024,                                # batch size for training
    validation_data=(val_features, val[target]),    # validation data for evaluating the model
    callbacks=[earlyStop],                          # list of callbacks, in this case just EarlyStopping
    verbose=1,                                      # print progress per epoch
    shuffle=False                                   # keep the rows in their original order
)