In [1]:
!pip install -U xgboost -f /kaggle/input/xgboost-python-package/ --no-index

Looking in links: /kaggle/input/xgboost-python-package/


ERROR: Could not find a version that satisfies the requirement xgboost (from versions: none)
ERROR: No matching distribution found for xgboost


In [2]:


#General
import pandas as pd
import numpy as np
import json

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from colorama import Fore, Style, init;

# Modeling
import xgboost as xgb
import lightgbm as lgb
import torch

# Options
pd.set_option('display.max_columns', 100)
DEBUG = False 

In [3]:
def display_df(df, name):
    '''Display df shape and first row '''
    PrintColor(text = f'{name} data has {df.shape[0]} rows and {df.shape[1]} columns. \n ===> First row:')
    display(df.head(1))

# Color printing    
def PrintColor(text:str, color = Fore.BLUE, style = Style.BRIGHT):
    '''Prints color outputs using colorama of a text string'''
    print(style + color + text + Style.RESET_ALL); 

In [4]:
CSV_DIR = "predict-energy-behavior-of-prosumers/"

train = pd.read_csv(CSV_DIR + "train.csv")
client = pd.read_csv(CSV_DIR + "client.csv")
historical_weather = pd.read_csv(CSV_DIR + "historical_weather.csv")
forecast_weather = pd.read_csv(CSV_DIR + "forecast_weather.csv")
electricity = pd.read_csv(CSV_DIR + "electricity_prices.csv")
gas = pd.read_csv(CSV_DIR + "gas_prices.csv")

In [6]:
display_df(train, 'train')
display_df(client, 'client')
display_df(historical_weather, 'historic weather')
display_df(forecast_weather, 'forecast weather')
display_df(electricity, 'electricity prices')
display_df(gas, 'gas prices')

[1m[34mtrain data has 2018352 rows and 9 columns. 
 ===> First row:[0m


Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,0.713,0,2021-09-01 00:00:00,0,0,0


[1m[34mclient data has 41919 rows and 7 columns. 
 ===> First row:[0m


Unnamed: 0,product_type,county,eic_count,installed_capacity,is_business,date,data_block_id
0,1,0,108,952.89,0,2021-09-01,2


[1m[34mhistoric weather data has 1710800 rows and 18 columns. 
 ===> First row:[0m


Unnamed: 0,datetime,temperature,dewpoint,rain,snowfall,surface_pressure,cloudcover_total,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,winddirection_10m,shortwave_radiation,direct_solar_radiation,diffuse_radiation,latitude,longitude,data_block_id
0,2021-09-01 00:00:00,14.4,12.0,0.0,0.0,1015.8,4,4,0,0,6.694444,3,0.0,0.0,0.0,57.6,21.7,1


[1m[34mforecast weather data has 3424512 rows and 18 columns. 
 ===> First row:[0m


Unnamed: 0,latitude,longitude,origin_datetime,hours_ahead,temperature,dewpoint,cloudcover_high,cloudcover_low,cloudcover_mid,cloudcover_total,10_metre_u_wind_component,10_metre_v_wind_component,data_block_id,forecast_datetime,direct_solar_radiation,surface_solar_radiation_downwards,snowfall,total_precipitation
0,57.6,21.7,2021-09-01 00:00:00+00:00,1,15.655786,11.553613,0.904816,0.019714,0.0,0.905899,-0.411328,-9.106137,1,2021-09-01 01:00:00+00:00,0.0,0.0,0.0,0.0


[1m[34melectricity prices data has 15286 rows and 4 columns. 
 ===> First row:[0m


Unnamed: 0,forecast_date,euros_per_mwh,origin_date,data_block_id
0,2021-09-01 00:00:00,92.51,2021-08-31 00:00:00,1


[1m[34mgas prices data has 637 rows and 5 columns. 
 ===> First row:[0m


Unnamed: 0,forecast_date,lowest_price_per_mwh,highest_price_per_mwh,origin_date,data_block_id
0,2021-09-01,45.23,46.32,2021-08-31,1


In [7]:
class FeatureProcessorClass():
    def __init__(self):         
        # Columns to join on for the different datasets
        self.historical_weather_join = ['datetime', 'data_block_id']
        self.forecast_weather_join = ['datetime', 'data_block_id'] 
        self.gas_join = ['data_block_id']
        self.electricity_join = ['datetime', 'data_block_id']
        self.client_join = ['county', 'is_business', 'product_type', 'data_block_id']
        
        # Categorical columns (specify for XGBoost)
        self.category_columns = ['county', 'is_business', 'product_type', 'is_consumption', 'data_block_id']

    def create_new_column_names(self, df, suffix, columns_no_change):
        '''Change column names by given suffix, keep columns_no_change, and return back the data'''
        df.columns = [col + suffix 
                      if col not in columns_no_change
                      else col
                      for col in df.columns
                      ]
        return df 
    
    def create_data_features(self, data):
        '''📊Create features for main data (test or train) set📊'''
        # To datetime
        data['datetime'] = pd.to_datetime(data['datetime'])
        
        # Time period features
        data['date'] = data['datetime'].dt.normalize()
        data['year'] = data['datetime'].dt.year
        data['quarter'] = data['datetime'].dt.quarter
        data['month'] = data['datetime'].dt.month
        data['week'] = data['datetime'].dt.isocalendar().week
        data['hour'] = data['datetime'].dt.hour
        
        # Day features
        data['day_of_year'] = data['datetime'].dt.day_of_year
        data['day_of_month']  = data['datetime'].dt.day
        data['day_of_week'] = data['datetime'].dt.day_of_week
        return data

    def create_client_features(self, client):
        '''💼 Create client features 💼'''
        # Modify column names - specify suffix
        client = self.create_new_column_names(client, 
                                           suffix='_client',
                                           columns_no_change = self.client_join
                                          )       
        return client
    
    def create_historical_weather_features(self, historical_weather):
        '''⌛🌤️ Create historical weather features 🌤️⌛'''
        # To datetime
        historical_weather['datetime'] = pd.to_datetime(historical_weather['datetime'])
        
        # Group by mean for datetime & data_block_id (note: the latitude/longitude not taken into account)
        historical_weather_mean = historical_weather.groupby(['datetime', 'data_block_id']).mean().reset_index() 
        
        # Test set has 1 day offset for hour<11 and 2 day offset for hour>11
        historical_weather_mean['hour'] = historical_weather_mean['datetime'].dt.hour
        historical_weather_mean['datetime'] = (historical_weather_mean
                                               .apply(lambda x: 
                                                      x['datetime'] + pd.DateOffset(1) 
                                                      if x['hour']< 11 
                                                      else x['datetime'] + pd.DateOffset(2),
                                                      axis=1)
                                              )
        
        # Modify column names - specify suffix
        historical_weather_mean = self.create_new_column_names(historical_weather_mean,
                                                               suffix='_historical',
                                                               columns_no_change = self.historical_weather_join
                                                              )                                      
        return historical_weather_mean
    
    def create_forecast_weather_features(self, forecast_weather):
        '''🔮🌤️ Create forecast weather features 🌤️🔮'''
        # Rename column
        forecast_weather = forecast_weather.rename(columns = {'forecast_datetime': 'datetime'})   
        
        # To datetime
        forecast_weather['origin_datetime'] = pd.to_datetime(forecast_weather['origin_datetime']).dt.tz_localize(None)
        forecast_weather['datetime'] = pd.to_datetime(forecast_weather['datetime']).dt.tz_localize(None)

        # Groupby mean for origin_datetime & hours_ahead & data_block_id (note: the latitude/longitude not taken into account)
        forecast_weather_mean = forecast_weather.groupby(['origin_datetime', 'hours_ahead', 'data_block_id']).mean().reset_index() 
        
        # Modify column names - specify suffix
        forecast_weather_mean = self.create_new_column_names(forecast_weather_mean, 
                                                             suffix='_forecast',
                                                             columns_no_change = self.forecast_weather_join
                                                            )                                     
        return forecast_weather_mean

    def create_electricity_features(self, electricity):
        '''⚡ Create electricity prices features ⚡'''
        # To datetime
        electricity['forecast_date'] = pd.to_datetime(electricity['forecast_date'])
        
        # Test set has 1 day offset
        electricity['datetime'] = electricity['forecast_date'] + pd.DateOffset(1)
        
        # Modify column names - specify suffix
        electricity = self.create_new_column_names(electricity, 
                                                   suffix='_electricity',
                                                   columns_no_change = self.electricity_join
                                                  )             
        return electricity

    def create_gas_features(self, gas):
        '''⛽ Create gas prices features ⛽'''
        # Mean gas price
        gas['mean_price_per_mwh'] = (gas['lowest_price_per_mwh'] + gas['highest_price_per_mwh'])/2
        
        # Modify column names - specify suffix
        gas = self.create_new_column_names(gas, 
                                           suffix='_gas',
                                           columns_no_change = self.gas_join
                                          )       
        return gas
    
    def __call__(self, data, client, historical_weather, forecast_weather, electricity, gas):
        '''Processing of features from all datasets, merge together and return features for dataframe df '''
        # Create features for relevant dataset
        data = self.create_data_features(data)
        client = self.create_client_features(client)
        historical_weather = self.create_historical_weather_features(historical_weather)
        forecast_weather = self.create_forecast_weather_features(forecast_weather)
        electricity = self.create_electricity_features(electricity)
        gas = self.create_gas_features(gas)
        
        # 🔗 Merge all datasets into one df 🔗
        df = data.merge(client, how='left', on = self.client_join)
        df = df.merge(historical_weather, how='left', on = self.historical_weather_join)
        df = df.merge(forecast_weather, how='left', on = self.forecast_weather_join)
        df = df.merge(electricity, how='left', on = self.electricity_join)
        df = df.merge(gas, how='left', on = self.gas_join)
        
        # Change columns to categorical for XGBoost
        df[self.category_columns] = df[self.category_columns].astype('category')
        return df

In [8]:
def create_revealed_targets_train(data, N_day_shifts):
    '''🎯 Create new train data based on N_day_shifts 🎯 '''    
    original_datetime = data['datetime']
    revealed_targets = data[['datetime', 'prediction_unit_id', 'is_consumption', 'target']].copy()
    
    # Create revealed targets for all day shifts
    for day_shift in range(2, N_day_shifts+1):
        revealed_targets['datetime'] = original_datetime + pd.DateOffset(day_shift)
        data = data.merge(revealed_targets, 
                          how='left', 
                          on = ['datetime', 'prediction_unit_id', 'is_consumption'],
                          suffixes = ('', f'_{day_shift}_days_ago')
                         )
    return data

In [9]:
%%time
# Create all features
N_day_shifts = 10 # Specify how many days we want to go back (at least 2)

FeatureProcessor = FeatureProcessorClass()

data = FeatureProcessor(data = train.copy(),
                      client = client.copy(),
                      historical_weather = historical_weather.copy(),
                      forecast_weather = forecast_weather.copy(),
                      electricity = electricity.copy(),
                      gas = gas.copy(),
                     )

df = create_revealed_targets_train(data.copy(), 
                                  N_day_shifts = N_day_shifts)

  forecast_weather_mean = forecast_weather.groupby(['origin_datetime', 'hours_ahead', 'data_block_id']).mean().reset_index()


KeyError: 'datetime'