In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer

In [2]:
# Load the training CSV from the project-relative raw_data/ folder.
data = pd.read_csv("raw_data/train.csv")

In [3]:
# Columns that log_transformed() replaces with log(value + 1e-5).
f_logs = [
    'precip_1h:mm', 'prob_precip_1h:p',
    'clear_sky_rad:W', 'clear_sky_energy_1h:J',
    'diffuse_rad:W', 'diffuse_rad_1h:Wh',
    'direct_rad:W', 'direct_rad_1h:Wh',
    'global_rad:W', 'global_rad_1h:Wh',
    'wind_speed_2m:ms', 'wind_speed_10m:ms',
    'wind_speed_50m:ms', 'wind_speed_100m:ms',
]

# Timestamp column consumed (and dropped) by time_transformed().
f_time = ['time']

# Angle columns in degrees; degree_transformed() swaps each for a sin/cos pair.
f_degree = [
    'sun_azimuth:d',
    'wind_dir_2m:d',
    'wind_dir_10m:d',
    'wind_dir_50m:d',
    'wind_dir_100m:d',
]

# Categorical column routed to the one-hot encoder.
f_ohe = ['precip_type:idx']

# Columns listed as needing scaling only (no other custom transform).
f_only_scaling = [
    't_10m:C', 't_50m:C', 't_100m:C',
    'sun_elevation:d', 'sunshine_duration_1h:min',
    'low_cloud_cover:p', 'medium_cloud_cover:p', 'high_cloud_cover:p',
    'total_cloud_cover:p', 'effective_cloud_cover:p',
    'relative_humidity_2m:p', 'relative_humidity_10m:p',
    'relative_humidity_50m:p', 'relative_humidity_100m:p',
    'dew_point_2m:C', 'dew_point_10m:C',
    'dew_point_50m:C', 'dew_point_100m:C',
    'temp',
]

## Custom Functions 

In [4]:
# Re-read the same training CSV, rebinding `data` to a freshly loaded frame.
data = pd.read_csv("raw_data/train.csv")

In [5]:
def log_transformed(data, columns=None):
    """Return a copy of ``data`` with the selected columns log-transformed.

    Each selected column is replaced by ``log(value + 1e-5)``; the small
    offset keeps zero entries finite. The input frame is never modified, so
    calling this repeatedly on the same frame always gives the same result.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``columns``.
    columns : iterable of str, optional
        Columns to transform. Defaults to the notebook-level ``f_logs`` list.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, in the same order, as ``data``.

    Raises
    ------
    KeyError
        If a requested column is not present in ``data``.
    """
    if columns is None:
        columns = f_logs

    # Work on a copy: assigning into `data` directly would silently alter the
    # caller's frame and apply the log twice if the cell were re-run.
    out = data.copy()
    for col in columns:
        out[col] = np.log(out[col] + 1e-5)
    return out

In [6]:
# Column-count check only: pass a copy so `data` is left unmodified and the
# cell stays safe to re-run.
len(log_transformed(data.copy()).columns)

44

In [7]:
def time_transformed(data):
    """Replace the ``time`` column with cyclical hour, month and season features.

    The ``time`` column is parsed with ``pd.to_datetime`` and expanded into six
    columns, appended in this order: ``hour_sine``, ``hour_cosine``,
    ``month_sine``, ``month_cosine``, ``season_sine``, ``season_cosine``.
    Seasons are coded 1 (Mar-May), 2 (Jun-Aug), 3 (Sep-Nov), 4 (Dec-Feb).

    The input frame is not modified. A missing timestamp yields NaN in all six
    features (the season is no longer silently labelled as winter).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``time`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame without ``time`` and with the six cyclical columns added.

    Raises
    ------
    KeyError
        If ``data`` has no ``time`` column.
    """
    timestamps = pd.to_datetime(data["time"])
    hour = timestamps.dt.hour
    month = timestamps.dt.month

    # Month number -> season code; a dict lookup avoids a Python call per row.
    season_by_month = {
        3: 1, 4: 1, 5: 1,     # spring
        6: 2, 7: 2, 8: 2,     # summer
        9: 3, 10: 3, 11: 3,   # fall
        12: 4, 1: 4, 2: 4,    # winter
    }
    season = month.map(season_by_month)

    def _cyclical(values, period):
        """Project a periodic quantity onto the unit circle as (sin, cos)."""
        angle = 2 * np.pi * values / period
        return np.sin(angle), np.cos(angle)

    hour_sine, hour_cosine = _cyclical(hour, 24)
    month_sine, month_cosine = _cyclical(month, 12)
    season_sine, season_cosine = _cyclical(season, 4)

    # drop() and assign() both return new frames, so the caller's `data`
    # is never written to.
    return data.drop(columns=["time"]).assign(
        hour_sine=hour_sine,
        hour_cosine=hour_cosine,
        month_sine=month_sine,
        month_cosine=month_cosine,
        season_sine=season_sine,
        season_cosine=season_cosine,
    )

In [8]:
# Column-count check only: pass a copy so the demo call cannot add the
# time-feature columns to `data` itself.
len(time_transformed(data.copy()).columns)

49

In [9]:
# Rebind `data` to the time-transformed frame; the degree_transformed checks below read it.
# NOTE(review): not re-runnable on its own — time_transformed drops "time", so a
# second run needs `data` to be reloaded first.
data = time_transformed(data)

In [10]:
def degree_transformed(data, columns=None):
    """Replace degree-valued columns with sin/cos pairs so they become cyclical.

    For every selected column ``c`` the result gains ``sin_c`` and ``cos_c``
    (computed from ``2 * pi * c / 360``) and loses ``c``. The new columns are
    appended after the remaining ones, pair by pair, in the order given by
    ``columns``. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``columns``, in degrees.
    columns : iterable of str, optional
        Columns to encode. Defaults to the notebook-level ``f_degree`` list.

    Returns
    -------
    pandas.DataFrame
        New frame with the angle columns replaced by their sin/cos encoding.

    Raises
    ------
    KeyError
        If a requested column is not present in ``data``.
    """
    if columns is None:
        columns = f_degree
    columns = list(columns)

    # Drop once up front: this yields a fresh frame to write into (so the
    # caller's `data` is untouched) and avoids one full copy per column.
    out = data.drop(columns=columns)
    for col in columns:
        angle = 2 * np.pi * data[col] / 360
        out[f"sin_{col}"] = np.sin(angle)
        out[f"cos_{col}"] = np.cos(angle)
    return out

In [11]:
# Column-count check only: pass a copy so the demo call cannot add sin/cos
# columns to `data` itself.
len(degree_transformed(data.copy()).columns)

54

In [12]:
# Inspect the resulting column names; run on a copy so `data` is left unchanged.
degree_transformed(data.copy()).columns

Index(['pv_production', 'wind_production', 'consumption', 'spot_market_price',
       'precip_1h:mm', 'precip_type:idx', 'prob_precip_1h:p',
       'clear_sky_rad:W', 'clear_sky_energy_1h:J', 'diffuse_rad:W',
       'diffuse_rad_1h:Wh', 'direct_rad:W', 'direct_rad_1h:Wh', 'global_rad:W',
       'global_rad_1h:Wh', 'sunshine_duration_1h:min', 'sun_elevation:d',
       'low_cloud_cover:p', 'medium_cloud_cover:p', 'high_cloud_cover:p',
       'total_cloud_cover:p', 'effective_cloud_cover:p', 'temp',
       'relative_humidity_2m:p', 'dew_point_2m:C', 'wind_speed_2m:ms',
       't_10m:C', 'relative_humidity_10m:p', 'dew_point_10m:C',
       'wind_speed_10m:ms', 't_50m:C', 'relative_humidity_50m:p',
       'dew_point_50m:C', 'wind_speed_50m:ms', 't_100m:C',
       'relative_humidity_100m:p', 'dew_point_100m:C', 'wind_speed_100m:ms',
       'hour_sine', 'hour_cosine', 'month_sine', 'month_cosine', 'season_sine',
       'season_cosine', 'sin_sun_azimuth:d', 'cos_sun_azimuth:d',
       'sin_win

In [13]:
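# Expected: 54 columns — 44 raw, minus `time` plus 6 time features (49), plus a net +1 for each of the 5 degree columns.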

In [14]:
# Number of degree columns; degree_transformed swaps each one for a sin/cos
# pair, so every entry adds one column net.
len(f_degree)

5

# Pipeline building

In [15]:
# Reload the raw CSV so the pipeline section starts from an untransformed frame.
data = pd.read_csv("raw_data/train.csv")

In [16]:
# Record the column names of the freshly loaded frame, then show how many there are.
og_col = data.columns.tolist()
len(og_col)

44

In [17]:
# Apply the three feature-engineering steps in order: log -> time -> degree.
# They run on a copy so the raw `data` frame stays intact and re-running this
# cell cannot apply the log transform a second time.
data_ft = (
    data.copy()
    .pipe(log_transformed)
    .pipe(time_transformed)
    .pipe(degree_transformed)
)

In [18]:
# Column names of the fully feature-engineered frame.
all_col = data_ft.columns.tolist()

In [19]:
# Column count after feature engineering.
len(all_col)

54

In [20]:
# Display the engineered column names for inspection.
all_col

['pv_production',
 'wind_production',
 'consumption',
 'spot_market_price',
 'precip_1h:mm',
 'precip_type:idx',
 'prob_precip_1h:p',
 'clear_sky_rad:W',
 'clear_sky_energy_1h:J',
 'diffuse_rad:W',
 'diffuse_rad_1h:Wh',
 'direct_rad:W',
 'direct_rad_1h:Wh',
 'global_rad:W',
 'global_rad_1h:Wh',
 'sunshine_duration_1h:min',
 'sun_elevation:d',
 'low_cloud_cover:p',
 'medium_cloud_cover:p',
 'high_cloud_cover:p',
 'total_cloud_cover:p',
 'effective_cloud_cover:p',
 'temp',
 'relative_humidity_2m:p',
 'dew_point_2m:C',
 'wind_speed_2m:ms',
 't_10m:C',
 'relative_humidity_10m:p',
 'dew_point_10m:C',
 'wind_speed_10m:ms',
 't_50m:C',
 'relative_humidity_50m:p',
 'dew_point_50m:C',
 'wind_speed_50m:ms',
 't_100m:C',
 'relative_humidity_100m:p',
 'dew_point_100m:C',
 'wind_speed_100m:ms',
 'hour_sine',
 'hour_cosine',
 'month_sine',
 'month_cosine',
 'season_sine',
 'season_cosine',
 'sin_sun_azimuth:d',
 'cos_sun_azimuth:d',
 'sin_wind_dir_2m:d',
 'cos_wind_dir_2m:d',
 'sin_wind_dir_10m:d',


In [21]:
# diffs = [col for col in og_col if col not in all_col]

In [22]:
# Columns kept out of the MinMax-scaled set (see scale_col).
# 'precip_type:idx' is listed here because it is one-hot encoded instead.
drop_col = [
    'pv_production', 'wind_production', 'consumption', 'spot_market_price',
    'precip_type:idx',
]

In [23]:
# Scale every engineered column except those in drop_col and the one-hot
# encoded ones. The previous condition `col not in drop_col and f_ohe` only
# tested whether f_ohe was non-empty; it never checked membership in f_ohe.
_not_scaled = set(drop_col) | set(f_ohe)
scale_col = [col for col in all_col if col not in _not_scaled]

In [24]:
# Number of columns that will go through the MinMax scaler.
len(scale_col)

49

In [25]:
# Scaler for the numeric columns and dense one-hot encoder for the categorical
# one; categories unseen during fit are ignored rather than raising.
minmax = MinMaxScaler()
ohe = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
)

In [26]:
# Column-wise preprocessing: one-hot encode f_ohe, MinMax-scale scale_col,
# and discard every column not named in either list.
preproc = make_column_transformer(
    (ohe, f_ohe), (minmax, scale_col), remainder="drop",
)

In [27]:
# Fit the encoder and scaler on the engineered frame and transform it in one call.
# NOTE(review): this fits on the whole loaded file — if a validation split is
# made later, confirm the fit uses only the training portion.
data_transformed = preproc.fit_transform(data_ft)

In [28]:
# Wrap the transformed array in a DataFrame labelled with the transformer's
# generated feature names.
data_transformed = pd.DataFrame(
    data=data_transformed,
    columns=preproc.get_feature_names_out(),
)

In [29]:
# Column count after preprocessing (one-hot columns plus scaled columns).
len(data_transformed.columns)

53

In [30]:
# Inspect the generated names; each carries a transformer prefix
# (onehotencoder__ / minmaxscaler__).
data_transformed.columns

Index(['onehotencoder__precip_type:idx_0.0',
       'onehotencoder__precip_type:idx_1.0',
       'onehotencoder__precip_type:idx_2.0',
       'onehotencoder__precip_type:idx_3.0', 'minmaxscaler__precip_1h:mm',
       'minmaxscaler__prob_precip_1h:p', 'minmaxscaler__clear_sky_rad:W',
       'minmaxscaler__clear_sky_energy_1h:J', 'minmaxscaler__diffuse_rad:W',
       'minmaxscaler__diffuse_rad_1h:Wh', 'minmaxscaler__direct_rad:W',
       'minmaxscaler__direct_rad_1h:Wh', 'minmaxscaler__global_rad:W',
       'minmaxscaler__global_rad_1h:Wh',
       'minmaxscaler__sunshine_duration_1h:min',
       'minmaxscaler__sun_elevation:d', 'minmaxscaler__low_cloud_cover:p',
       'minmaxscaler__medium_cloud_cover:p',
       'minmaxscaler__high_cloud_cover:p', 'minmaxscaler__total_cloud_cover:p',
       'minmaxscaler__effective_cloud_cover:p', 'minmaxscaler__temp',
       'minmaxscaler__relative_humidity_2m:p', 'minmaxscaler__dew_point_2m:C',
       'minmaxscaler__wind_speed_2m:ms', 'minmaxscaler__t

In [31]:
# Display the preprocessed frame for a visual sanity check.
data_transformed

Unnamed: 0,onehotencoder__precip_type:idx_0.0,onehotencoder__precip_type:idx_1.0,onehotencoder__precip_type:idx_2.0,onehotencoder__precip_type:idx_3.0,minmaxscaler__precip_1h:mm,minmaxscaler__prob_precip_1h:p,minmaxscaler__clear_sky_rad:W,minmaxscaler__clear_sky_energy_1h:J,minmaxscaler__diffuse_rad:W,minmaxscaler__diffuse_rad_1h:Wh,...,minmaxscaler__sin_sun_azimuth:d,minmaxscaler__cos_sun_azimuth:d,minmaxscaler__sin_wind_dir_2m:d,minmaxscaler__cos_wind_dir_2m:d,minmaxscaler__sin_wind_dir_10m:d,minmaxscaler__cos_wind_dir_10m:d,minmaxscaler__sin_wind_dir_50m:d,minmaxscaler__cos_wind_dir_50m:d,minmaxscaler__sin_wind_dir_100m:d,minmaxscaler__cos_wind_dir_100m:d
0,1.0,0.0,0.0,0.0,0.0,0.0,0.757424,0.854846,0.781999,0.815432,...,0.307401,0.038568,0.045382,0.291860,0.045382,0.291860,0.042169,0.299026,0.038731,0.307049
1,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.779972,0.000000,0.742604,...,0.205451,0.095955,0.023405,0.348815,0.023405,0.348815,0.023669,0.347984,0.024203,0.346324
2,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.121531,0.173244,0.018654,0.364700,0.018654,0.364700,0.019129,0.363021,0.019853,0.360507
3,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.058350,0.265584,0.017037,0.370590,0.017037,0.370590,0.017952,0.367222,0.018891,0.363862
4,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.017398,0.369241,0.031970,0.324079,0.031970,0.324079,0.031664,0.324897,0.031056,0.326534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9510,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.010188,0.600413,0.142764,0.150168,0.142764,0.150168,0.110331,0.186698,0.074594,0.237266
9511,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.051479,0.720968,0.183310,0.113080,0.183310,0.113080,0.147064,0.145830,0.107072,0.190797
9512,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.130026,0.836329,0.187379,0.109785,0.187379,0.109785,0.149545,0.143375,0.107612,0.190112
9513,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.250030,0.933029,0.204697,0.096520,0.204697,0.096520,0.160279,0.133135,0.110878,0.186020
