In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training set from the project's raw-data folder.
data = pd.read_csv(filepath_or_buffer="raw_data/train.csv")

In [4]:
# Show the column names available in the training data.
data.columns

Index(['time', 'pv_production', 'wind_production', 'consumption',
       'spot_market_price', 'precip_1h:mm', 'precip_type:idx',
       'prob_precip_1h:p', 'clear_sky_rad:W', 'clear_sky_energy_1h:J',
       'diffuse_rad:W', 'diffuse_rad_1h:Wh', 'direct_rad:W',
       'direct_rad_1h:Wh', 'global_rad:W', 'global_rad_1h:Wh',
       'sunshine_duration_1h:min', 'sun_azimuth:d', 'sun_elevation:d',
       'low_cloud_cover:p', 'medium_cloud_cover:p', 'high_cloud_cover:p',
       'total_cloud_cover:p', 'effective_cloud_cover:p', 'temp',
       'relative_humidity_2m:p', 'dew_point_2m:C', 'wind_speed_2m:ms',
       'wind_dir_2m:d', 't_10m:C', 'relative_humidity_10m:p',
       'dew_point_10m:C', 'wind_speed_10m:ms', 'wind_dir_10m:d', 't_50m:C',
       'relative_humidity_50m:p', 'dew_point_50m:C', 'wind_speed_50m:ms',
       'wind_dir_50m:d', 't_100m:C', 'relative_humidity_100m:p',
       'dew_point_100m:C', 'wind_speed_100m:ms', 'wind_dir_100m:d'],
      dtype='object')

# Which features need which scaling? 

The features are grouped into lists below, one list per scaler, for use in the pipeline.

In [6]:
# Columns not assigned to any of the scaler lists below (kept for reference):
# targets = ['pv_production', 'wind_production', 'consumption']
# other = ['spot_market_price']

# Features intended for min-max scaling (per the list name), grouped by kind.
# The order of the entries is preserved exactly.
f_minmax = (
    # sine/cosine encodings of hour, month and season
    ['hour_sine', 'hour_cosine',
     'month_sine', 'month_cosine',
     'season_sine', 'season_cosine']
    # log-prefixed precipitation and radiation columns
    + ['log_precip_1h:mm', 'log_prob_precip_1h:p',
       'log_clear_sky_rad:W', 'log_clear_sky_energy_1h:J',
       'log_diffuse_rad:W', 'log_diffuse_rad_1h:Wh',
       'log_direct_rad:W', 'log_direct_rad_1h:Wh',
       'log_global_rad:W', 'log_global_rad_1h:Wh']
    # sunshine duration and cloud cover
    + ['sunshine_duration_1h:min',
       'low_cloud_cover:p', 'medium_cloud_cover:p', 'high_cloud_cover:p',
       'total_cloud_cover:p', 'effective_cloud_cover:p']
    # sine/cosine of sun azimuth and of wind direction at each height
    + ['sin_sun_azimuth:d', 'cos_sun_azimuth:d',
       'sin_wind_dir_2m:d', 'cos_wind_dir_2m:d',
       'sin_wind_dir_10m:d', 'cos_wind_dir_10m:d',
       'sin_wind_dir_50m:d', 'cos_wind_dir_50m:d',
       'sin_wind_dir_100m:d', 'cos_wind_dir_100m:d']
    # relative humidity and dew point at each height, plus 'temp'
    + ['relative_humidity_2m:p', 'relative_humidity_10m:p',
       'relative_humidity_50m:p', 'relative_humidity_100m:p']
    + ['dew_point_2m:C', 'dew_point_10m:C',
       'dew_point_50m:C', 'dew_point_100m:C']
    + ['temp']
)

# Feature intended for standard scaling (per the list name).
f_standard = ['sun_elevation:d']

# Features intended for robust scaling (per the list name):
# temperatures at 10/50/100 m and log-prefixed wind speeds.
f_robust = (
    ['t_10m:C', 't_50m:C', 't_100m:C']
    + ['log_wind_speed_2m:ms', 'log_wind_speed_10m:ms',
       'log_wind_speed_50m:ms', 'log_wind_speed_100m:ms']
)

# Feature intended for one-hot encoding (per the list name).
f_ohe = ['precip_type:idx']

# Log transforms and other encodings

In [8]:
# Raw columns that receive a log transform, grouped by kind
# (precipitation, radiation, wind speed). Entry order is preserved.
f_logs = (
    ['precip_1h:mm', 'prob_precip_1h:p']
    + ['clear_sky_rad:W', 'clear_sky_energy_1h:J',
       'diffuse_rad:W', 'diffuse_rad_1h:Wh',
       'direct_rad:W', 'direct_rad_1h:Wh',
       'global_rad:W', 'global_rad_1h:Wh']
    + ['wind_speed_2m:ms', 'wind_speed_10m:ms',
       'wind_speed_50m:ms', 'wind_speed_100m:ms']
)


14

In [9]:
# Small offset added before taking the log, so that a value of exactly 0 maps
# to a finite number instead of -inf.
epsilon = 1e-5

# Write each transformed feature to a new `log_<name>` column instead of
# overwriting the raw one:
#   * the scaler lists (f_minmax / f_robust) refer to the `log_`-prefixed names;
#   * the raw column stays untouched, so re-running this cell cannot apply the
#     log a second time (log(0 + epsilon) is negative, and a second log would
#     turn it into NaN).
# NOTE(review): if a later cell renames these columns to `log_*`, or relies on
# the raw columns holding logged values, adjust that cell accordingly.
for f in f_logs:
    data['log_' + f] = np.log(data[f] + epsilon)

In [None]:
# Parse the 'time' column into pandas datetimes
# (it may already have been converted earlier).
data['time'] = pd.to_datetime(data['time'])

# Helper columns that feed the sine/cosine encodings:
# the hour and the month taken from each timestamp.
data['hour'], data['month'] = data['time'].dt.hour, data['time'].dt.month

#creating column indicating the season
def assign_season(month):
    """Map a calendar month number to a season code.

    Returns 1 for March-May (spring), 2 for June-August (summer),
    3 for September-November (fall) and 4 for every other value
    (December, January, February - winter).
    """
    season_by_month = {
        3: 1, 4: 1, 5: 1,     # Spring
        6: 2, 7: 2, 8: 2,     # Summer
        9: 3, 10: 3, 11: 3,   # Fall
    }
    # Anything not listed above is treated as winter.
    return season_by_month.get(month, 4)

# Season code for every row, derived from the month helper column.
data['season'] = data['month'].map(assign_season)

# Encode each periodic quantity as a sine/cosine pair of its angle on a circle.
# Each entry: (helper column, sine column, cosine column, period length).
cyclical_specs = (
    ('hour', 'hour_sine', 'hour_cosine', 24),
    ('month', 'month_sine', 'month_cosine', 12),
    ('season', 'season_sine', 'season_cosine', 4),
)
for source_col, sine_col, cosine_col, period in cyclical_specs:
    data[sine_col] = np.sin(2 * np.pi * data[source_col] / period)
    data[cosine_col] = np.cos(2 * np.pi * data[source_col] / period)

# The helper columns are no longer needed once the encodings exist.
data = data.drop(columns=['hour', 'month', 'season'])