# Pipeline v3 - no stacking

Taking the original pipeline without stacking column transformers. Let's go!

In [17]:
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer

In [2]:
data = pd.read_csv("raw_data/train.csv")

In [4]:
f_logs = [
    'precip_1h:mm', 
    'prob_precip_1h:p', 
    'clear_sky_rad:W',
    'clear_sky_energy_1h:J',
    'diffuse_rad:W', 
    'diffuse_rad_1h:Wh', 
    'direct_rad:W',
    'direct_rad_1h:Wh', 
    'global_rad:W', 
    'global_rad_1h:Wh',
    'wind_speed_2m:ms',
    'wind_speed_10m:ms', 
    'wind_speed_50m:ms',
    'wind_speed_100m:ms'
]

In [6]:
f_time = ['time']

In [7]:
f_degree = ['sun_azimuth:d', 'wind_dir_2m:d', 'wind_dir_10m:d', 'wind_dir_50m:d', 'wind_dir_100m:d']

In [8]:
f_ohe = ['precip_type:idx']

In [9]:
f_only_scaling = [
    't_10m:C',
    't_50m:C',
    't_100m:C',
    'sun_elevation:d',
    'sunshine_duration_1h:min',
    'low_cloud_cover:p', 
    'medium_cloud_cover:p', 
    'high_cloud_cover:p',
    'total_cloud_cover:p', 
    'effective_cloud_cover:p',
    'relative_humidity_2m:p', 
    'relative_humidity_10m:p',
    'relative_humidity_50m:p',
    'relative_humidity_100m:p',
    'dew_point_2m:C',
    'dew_point_10m:C', 
    'dew_point_50m:C',
    'dew_point_100m:C',
    'temp'
]

In [12]:
# len(f_logs) + len(f_degree) + len(f_only_scaling) ## checking that we're account for all the features

## Functions for the FunctionTransformers

In [78]:
# def log_transformed(data):
#     for col in f_logs:
#         data[col] = np.log(data[col] + 1e-5)
#     return data

# log_ft = FunctionTransformer(log_transformed)

log_ft = FunctionTransformer(lambda feature: np.log(feature + 1e-5))

In [79]:
def time_transformed(data):
    """takes a df of datetime features and splits it into three features each: hour, month, season"""
    # feature = pd.to_datetime(data.time, format='%Y/%m/%d %H:%M:%S')
    feature = pd.to_datetime(data.time)
    
    hour = feature.dt.hour
    month  = feature.dt.month

    def assign_season(month):
        if month in [3, 4, 5]:
            return 1  # Spring
        elif month in [6, 7, 8]:
            return 2  # Summer
        elif month in [9, 10, 11]:
            return 3  # Fall
        else:  # December, January, February
            return 4  # Winter

    season = month.apply(assign_season)
    hour_sine = np.sin(2 * np.pi * hour / 24)
    hour_cosine = np.cos(2 * np.pi * hour / 24)
    month_sine = np.sin(2 * np.pi * month / 12)
    month_cosine = np.cos(2 * np.pi * month / 12)
    season_sine = np.sin(2 * np.pi * season / 4)
    season_cosine = np.cos(2 * np.pi * season / 4)

    df = pd.DataFrame({
        "hour_sine": hour_sine, 
        "hour_cosine": hour_cosine, 
        "month_sine": month_sine, 
        "month_cosine": month_cosine, 
        "season_sine": season_sine, 
        "season_cosine": season_cosine})

    return df

time_ft = FunctionTransformer(time_transformed)

In [80]:
def degree_transformed(data):
    """ takes a df 'data' of features with degree units (eg angles) and creates a sin and cos column to make it cyclical"""
    df = pd.DataFrame({})
    
    for col in f_degree: #list(data.columns):
        sin_column = np.sin(2 * np.pi * data[col]/360)
        cos_column = np.cos(2 * np.pi * data[col]/360)

        df[f"sin_{col}"] = sin_column
        df[f"cos_{col}"] = cos_column

    return df

degree_ft = FunctionTransformer(degree_transformed)

## Creating the pipeline

- create dataframe without scaler
- - save column names in list
  - apply list to columns post preproc
  - 

### Transformer pipelines

In [82]:
log_transformer = make_pipeline(
    log_ft,
    minmax
)

In [83]:
time_transformer = make_pipeline(
    time_ft,
    minmax
)

In [84]:
degree_transformer = make_pipeline(
    degree_ft,
    minmax
)

### Scalers

In [163]:
ohe_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output = False)
minmax = MinMaxScaler()

### Preprocessor pipeline

In [145]:
preproc = make_column_transformer(
    (log_transformer, f_logs),
    (time_transformer, f_time),
    (degree_transformer, f_degree),
    (ohe_transformer, f_ohe),
    (minmax, f_only_scaling),
    remainder = "drop"
)

In [149]:
f_logs

['precip_1h:mm',
 'prob_precip_1h:p',
 'clear_sky_rad:W',
 'clear_sky_energy_1h:J',
 'diffuse_rad:W',
 'diffuse_rad_1h:Wh',
 'direct_rad:W',
 'direct_rad_1h:Wh',
 'global_rad:W',
 'global_rad_1h:Wh',
 'wind_speed_2m:ms',
 'wind_speed_10m:ms',
 'wind_speed_50m:ms',
 'wind_speed_100m:ms']

In [90]:
data_transformed = preproc.fit_transform(data)

In [161]:
data_transformed = pd.DataFrame(data_transformed)

In [162]:
data_transformed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,43,44,45,46,47,48,49,50,51,52
0,0.0,0.0,0.757424,0.854846,0.781999,0.815432,0.571465,0.673655,0.746423,0.782939,...,0.807,0.489189,0.487936,0.486945,0.489060,0.541787,0.533724,0.528529,0.531722,0.534066
1,0.0,0.0,0.000000,0.779972,0.000000,0.742604,0.000000,0.526130,0.000000,0.708139,...,0.810,0.514865,0.510724,0.498695,0.494208,0.547550,0.539589,0.534535,0.537764,0.529670
2,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.963,0.490541,0.487936,0.481723,0.480051,0.541787,0.536657,0.528529,0.531722,0.534066
3,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.991,0.531081,0.529491,0.528721,0.528958,0.559078,0.554252,0.549550,0.552870,0.534066
4,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.999,0.737838,0.734584,0.724543,0.715573,0.610951,0.607038,0.603604,0.607251,0.505495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9510,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.136,0.937838,0.931635,0.909922,0.891892,0.371758,0.366569,0.363363,0.371601,0.268132
9511,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.053,0.962162,0.954424,0.922977,0.898327,0.365994,0.360704,0.360360,0.368580,0.257143
9512,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.285,0.952703,0.943700,0.908616,0.879022,0.357349,0.351906,0.351351,0.362538,0.252747
9513,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.264,0.947297,0.936997,0.896867,0.864865,0.345821,0.340176,0.345345,0.353474,0.246154


In [158]:
# preproc_1 = make_column_transformer(
#     (log_ft, f_logs),
#     (time_ft, f_time),
#     (degree_ft, f_degree),
#     (ohe_transformer, f_ohe),
#     remainder = "passthrough"
# )

In [159]:
# data_transformed_1 = preproc_1.fit_transform(data)

In [160]:
# data_transformed_1 = pd.DataFrame(data_transformed_1, columns=preproc_1.get_feature_names_out())

In [151]:
data_transformed.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,43,44,45,46,47,48,49,50,51,52
0,0.0,0.0,0.757424,0.854846,0.781999,0.815432,0.571465,0.673655,0.746423,0.782939,...,0.807,0.489189,0.487936,0.486945,0.48906,0.541787,0.533724,0.528529,0.531722,0.534066


In [150]:
data.head(1)

Unnamed: 0,time,pv_production,wind_production,consumption,spot_market_price,precip_1h:mm,precip_type:idx,prob_precip_1h:p,clear_sky_rad:W,clear_sky_energy_1h:J,...,t_50m:C,relative_humidity_50m:p,dew_point_50m:C,wind_speed_50m:ms,wind_dir_50m:d,t_100m:C,relative_humidity_100m:p,dew_point_100m:C,wind_speed_100m:ms,wind_dir_100m:d
0,2020-01-01 13:00:00,0.0,40.59,26.514689,0.28969,0.0,0.0,1.0,10.0,64826.0,...,8.4,60.7,1.3,8.4,246.3,8.3,60.3,1.0,10.4,247.3


In [144]:
data_transformed[0].mean()


np.float64(0.07302998401588674)

In [124]:
preproc.transformers_

[('pipeline-1',
  Pipeline(steps=[('functiontransformer',
                   FunctionTransformer(func=<function <lambda> at 0x7f9705915900>)),
                  ('minmaxscaler', MinMaxScaler())]),
  ['precip_1h:mm',
   'prob_precip_1h:p',
   'clear_sky_rad:W',
   'clear_sky_energy_1h:J',
   'diffuse_rad:W',
   'diffuse_rad_1h:Wh',
   'direct_rad:W',
   'direct_rad_1h:Wh',
   'global_rad:W',
   'global_rad_1h:Wh',
   'wind_speed_2m:ms',
   'wind_speed_10m:ms',
   'wind_speed_50m:ms',
   'wind_speed_100m:ms']),
 ('pipeline-2',
  Pipeline(steps=[('functiontransformer',
                   FunctionTransformer(func=<function time_transformed at 0x7f9705915120>)),
                  ('minmaxscaler', MinMaxScaler())]),
  ['time']),
 ('pipeline-3',
  Pipeline(steps=[('functiontransformer',
                   FunctionTransformer(func=<function degree_transformed at 0x7f97059176d0>)),
                  ('minmaxscaler', MinMaxScaler())]),
  ['sun_azimuth:d',
   'wind_dir_2m:d',
   'wind_dir_10m:d',

In [122]:
preproc.transformers_[0][2]

['precip_1h:mm',
 'prob_precip_1h:p',
 'clear_sky_rad:W',
 'clear_sky_energy_1h:J',
 'diffuse_rad:W',
 'diffuse_rad_1h:Wh',
 'direct_rad:W',
 'direct_rad_1h:Wh',
 'global_rad:W',
 'global_rad_1h:Wh',
 'wind_speed_2m:ms',
 'wind_speed_10m:ms',
 'wind_speed_50m:ms',
 'wind_speed_100m:ms']

In [126]:
# preproc[:-1].get_feature_names_out # ERROR

In [127]:
# preproc.get_feature_names_out() # ERROR

In [131]:
data.columns

Index(['time', 'pv_production', 'wind_production', 'consumption',
       'spot_market_price', 'precip_1h:mm', 'precip_type:idx',
       'prob_precip_1h:p', 'clear_sky_rad:W', 'clear_sky_energy_1h:J',
       'diffuse_rad:W', 'diffuse_rad_1h:Wh', 'direct_rad:W',
       'direct_rad_1h:Wh', 'global_rad:W', 'global_rad_1h:Wh',
       'sunshine_duration_1h:min', 'sun_azimuth:d', 'sun_elevation:d',
       'low_cloud_cover:p', 'medium_cloud_cover:p', 'high_cloud_cover:p',
       'total_cloud_cover:p', 'effective_cloud_cover:p', 'temp',
       'relative_humidity_2m:p', 'dew_point_2m:C', 'wind_speed_2m:ms',
       'wind_dir_2m:d', 't_10m:C', 'relative_humidity_10m:p',
       'dew_point_10m:C', 'wind_speed_10m:ms', 'wind_dir_10m:d', 't_50m:C',
       'relative_humidity_50m:p', 'dew_point_50m:C', 'wind_speed_50m:ms',
       'wind_dir_50m:d', 't_100m:C', 'relative_humidity_100m:p',
       'dew_point_100m:C', 'wind_speed_100m:ms', 'wind_dir_100m:d'],
      dtype='object')

In [132]:
pd.DataFrame(data_transformed, columns=(f_logs))

ValueError: Shape of passed values is (9515, 53), indices imply (9515, 14)