In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer

In [28]:
data = pd.read_csv("raw_data/train.csv")

In [3]:
f_logs = [
    'precip_1h:mm', 
    'prob_precip_1h:p', 
    'clear_sky_rad:W',
    'clear_sky_energy_1h:J',
    'diffuse_rad:W', 
    'diffuse_rad_1h:Wh', 
    'direct_rad:W',
    'direct_rad_1h:Wh', 
    'global_rad:W', 
    'global_rad_1h:Wh',
    'wind_speed_2m:ms',
    'wind_speed_10m:ms', 
    'wind_speed_50m:ms',
    'wind_speed_100m:ms'
]
f_time = ['time']
f_degree = ['sun_azimuth:d', 'wind_dir_2m:d', 'wind_dir_10m:d', 'wind_dir_50m:d', 'wind_dir_100m:d']
f_ohe = ['precip_type:idx']
f_only_scaling = [
    't_10m:C',
    't_50m:C',
    't_100m:C',
    'sun_elevation:d',
    'sunshine_duration_1h:min',
    'low_cloud_cover:p', 
    'medium_cloud_cover:p', 
    'high_cloud_cover:p',
    'total_cloud_cover:p', 
    'effective_cloud_cover:p',
    'relative_humidity_2m:p', 
    'relative_humidity_10m:p',
    'relative_humidity_50m:p',
    'relative_humidity_100m:p',
    'dew_point_2m:C',
    'dew_point_10m:C', 
    'dew_point_50m:C',
    'dew_point_100m:C',
    'temp'
]

## Custom Functions 

In [25]:
def log_transformed(data):
    for col in f_logs:
        data[col] = np.log(data[col] + 1e-5)
        data = data.drop(columns=[col])
    return data

In [21]:
# log_transformed(data)

In [17]:
def time_transformed(data):
    """takes a df of datetime features and splits it into three features each: hour, month, season"""
    # feature = pd.to_datetime(data.time, format='%Y/%m/%d %H:%M:%S')
    feature = pd.to_datetime(data.time)
    
    hour = feature.dt.hour
    month  = feature.dt.month

    def assign_season(month):
        if month in [3, 4, 5]:
            return 1  # Spring
        elif month in [6, 7, 8]:
            return 2  # Summer
        elif month in [9, 10, 11]:
            return 3  # Fall
        else:  # December, January, February
            return 4  # Winter

    season = month.apply(assign_season)
    hour_sine = np.sin(2 * np.pi * hour / 24)
    hour_cosine = np.cos(2 * np.pi * hour / 24)
    month_sine = np.sin(2 * np.pi * month / 12)
    month_cosine = np.cos(2 * np.pi * month / 12)
    season_sine = np.sin(2 * np.pi * season / 4)
    season_cosine = np.cos(2 * np.pi * season / 4)

    # df = pd.DataFrame({
    #     "hour_sine": hour_sine, 
    #     "hour_cosine": hour_cosine, 
    #     "month_sine": month_sine, 
    #     "month_cosine": month_cosine, 
    #     "season_sine": season_sine, 
    #     "season_cosine": season_cosine})

    data["hour_sine"] = hour_sine
    data["hour_cosine"] = hour_cosine
    data["month_sine"] = month_sine
    data["month_cosine"] = month_cosine
    data["season_sine"] = season_sine
    data["season_cosine"] = season_cosine

    data = data.drop(columns=["time"])

    return data

In [14]:
# time_transformed(data)

In [15]:
def degree_transformed(data):
    """ takes a df 'data' of features with degree units (eg angles) and creates a sin and cos column to make it cyclical"""
    df = pd.DataFrame({})
    
    for col in f_degree: #list(data.columns):
        sin_column = np.sin(2 * np.pi * data[col]/360)
        cos_column = np.cos(2 * np.pi * data[col]/360)

        data[f"sin_{col}"] = sin_column
        data[f"cos_{col}"] = cos_column

    return data

In [22]:
# degree_transformed(data)

# Pipeline building

In [56]:
data = pd.read_csv("raw_data/train.csv")

In [57]:
# data = log_transformed(data)
# data = time_transformed(data)
# data = degree_transformed(data)

data = degree_transformed(time_transformed(log_transformed(data)))

In [58]:
all_col = list(data.columns)

In [59]:
drop_col = ['pv_production',
            'wind_production',
            'consumption',
            'spot_market_price',
            'precip_type:idx']

In [60]:
scale_col = [col for col in all_col if col not in drop_col and f_ohe]

In [61]:
# scale_col

In [62]:
minmax = MinMaxScaler()
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output = False)

In [64]:
preproc = make_column_transformer(
    (ohe, f_ohe),
    (minmax, scale_col),
    remainder = "drop"
)

In [65]:
data_transformed = preproc.fit_transform(data)

In [66]:
data_transformed = pd.DataFrame(data_transformed, columns=preproc.get_feature_names_out())

In [68]:
data_transformed.columns

Index(['onehotencoder__precip_type:idx_0.0',
       'onehotencoder__precip_type:idx_1.0',
       'onehotencoder__precip_type:idx_2.0',
       'onehotencoder__precip_type:idx_3.0',
       'minmaxscaler__sunshine_duration_1h:min', 'minmaxscaler__sun_azimuth:d',
       'minmaxscaler__sun_elevation:d', 'minmaxscaler__low_cloud_cover:p',
       'minmaxscaler__medium_cloud_cover:p',
       'minmaxscaler__high_cloud_cover:p', 'minmaxscaler__total_cloud_cover:p',
       'minmaxscaler__effective_cloud_cover:p', 'minmaxscaler__temp',
       'minmaxscaler__relative_humidity_2m:p', 'minmaxscaler__dew_point_2m:C',
       'minmaxscaler__wind_dir_2m:d', 'minmaxscaler__t_10m:C',
       'minmaxscaler__relative_humidity_10m:p',
       'minmaxscaler__dew_point_10m:C', 'minmaxscaler__wind_dir_10m:d',
       'minmaxscaler__t_50m:C', 'minmaxscaler__relative_humidity_50m:p',
       'minmaxscaler__dew_point_50m:C', 'minmaxscaler__wind_dir_50m:d',
       'minmaxscaler__t_100m:C', 'minmaxscaler__relative_humidi