In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer


In [4]:
data = pd.read_csv("raw_data/train.csv")

In [5]:
data.columns

Index(['time', 'pv_production', 'wind_production', 'consumption',
       'spot_market_price', 'precip_1h:mm', 'precip_type:idx',
       'prob_precip_1h:p', 'clear_sky_rad:W', 'clear_sky_energy_1h:J',
       'diffuse_rad:W', 'diffuse_rad_1h:Wh', 'direct_rad:W',
       'direct_rad_1h:Wh', 'global_rad:W', 'global_rad_1h:Wh',
       'sunshine_duration_1h:min', 'sun_azimuth:d', 'sun_elevation:d',
       'low_cloud_cover:p', 'medium_cloud_cover:p', 'high_cloud_cover:p',
       'total_cloud_cover:p', 'effective_cloud_cover:p', 'temp',
       'relative_humidity_2m:p', 'dew_point_2m:C', 'wind_speed_2m:ms',
       'wind_dir_2m:d', 't_10m:C', 'relative_humidity_10m:p',
       'dew_point_10m:C', 'wind_speed_10m:ms', 'wind_dir_10m:d', 't_50m:C',
       'relative_humidity_50m:p', 'dew_point_50m:C', 'wind_speed_50m:ms',
       'wind_dir_50m:d', 't_100m:C', 'relative_humidity_100m:p',
       'dew_point_100m:C', 'wind_speed_100m:ms', 'wind_dir_100m:d'],
      dtype='object')

# Which features need which scaling? 

Putting them into lists for the pipeline

In [6]:
# targets = ['pv_production', 'wind_production', 'consumption']

f_minmax = [
    'hour_sine',
    'hour_cosine',
    'month_sine',
    'month_cosine',
    'season_sine',
    'season_cosine',
    'log_precip_1h:mm', 
    'log_prob_precip_1h:p', 
    'log_clear_sky_rad:W',
    'log_clear_sky_energy_1h:J',
    'log_diffuse_rad:W', 
    'log_diffuse_rad_1h:Wh', 
    'log_direct_rad:W',
    'log_direct_rad_1h:Wh', 
    'log_global_rad:W', 
    'log_global_rad_1h:Wh',
    'sunshine_duration_1h:min',
    'low_cloud_cover:p', 
    'medium_cloud_cover:p', 
    'high_cloud_cover:p',
    'total_cloud_cover:p', 
    'effective_cloud_cover:p',
    'sin_sun_azimuth:d',
    'cos_sun_azimuth:d',
    'sin_wind_dir_2m:d',
    'cos_wind_dir_2m:d',
    'sin_wind_dir_10m:d',
    'cos_wind_dir_10m:d',
    'sin_wind_dir_50m:d',
    'cos_wind_dir_50m:d',
    'sin_wind_dir_100m:d',
    'cos_wind_dir_100m:d',
    'relative_humidity_2m:p', 
    'relative_humidity_10m:p',
    'relative_humidity_50m:p',
    'relative_humidity_100m:p',
    'dew_point_2m:C',
    'dew_point_10m:C', 
    'dew_point_50m:C',
    'dew_point_100m:C',
    'temp'
]
f_standard = ['sun_elevation:d']
f_robust = [
    't_10m:C',
    't_50m:C',
    't_100m:C',
    'log_wind_speed_2m:ms',
    'log_wind_speed_10m:ms', 
    'log_wind_speed_50m:ms',
    'log_wind_speed_100m:ms',
]

f_ohe = ['precip_type:idx']


# other = ['spot_market_price']

# Log functions and other encoding

In [7]:
f_logs = [
    'precip_1h:mm', 
    'prob_precip_1h:p', 
    'clear_sky_rad:W',
    'clear_sky_energy_1h:J',
    'diffuse_rad:W', 
    'diffuse_rad_1h:Wh', 
    'direct_rad:W',
    'direct_rad_1h:Wh', 
    'global_rad:W', 
    'global_rad_1h:Wh',
    'wind_speed_2m:ms',
    'wind_speed_10m:ms', 
    'wind_speed_50m:ms',
    'wind_speed_100m:ms'
]


In [8]:
epsilon = 1e-5

for f in f_logs:
    data[f] = np.log(data[f] + epsilon)

In [9]:
# Converting time to datetime
#We might have done that before already
data['time']= pd.to_datetime(data['time'])

#the following two steps creates new columns to get the input for the sine & cosine columns
#creating columns indicating the hour and the month
data['hour'] = data['time'].dt.hour
data['month'] = data['time'].dt.month

#creating column indicating the season
def assign_season(month):
    if month in [3, 4, 5]:
        return 1  # Spring
    elif month in [6, 7, 8]:
        return 2  # Summer
    elif month in [9, 10, 11]:
        return 3  # Fall
    else:  # December, January, February
        return 4  # Winter

data['season'] = data['month'].apply(assign_season)

data['hour_sine'] = np.sin(2 * np.pi * data['hour'] / 24)
data['hour_cosine'] = np.cos(2 * np.pi * data['hour'] / 24)

data['month_sine'] = np.sin(2 * np.pi * data['month'] / 12)
data['month_cosine'] = np.cos(2 * np.pi * data['month'] / 12)

data['season_sine'] = np.sin(2 * np.pi * data['season'] / 4)
data['season_cosine'] = np.cos(2 * np.pi * data['season'] / 4)

data = data.drop(columns = ['hour', 'month', 'season'])

# Creating FunctionTransformers for the above

## Log FunctionTransformer

In [10]:
#Log columns
f_logs = [
    'precip_1h:mm',
    'prob_precip_1h:p',
    'clear_sky_rad:W',
    'clear_sky_energy_1h:J',
    'diffuse_rad:W',
    'diffuse_rad_1h:Wh',
    'direct_rad:W',
    'direct_rad_1h:Wh',
    'global_rad:W',
    'global_rad_1h:Wh',
    'wind_speed_2m:ms',
    'wind_speed_10m:ms',
    'wind_speed_50m:ms',
    'wind_speed_100m:ms'
]

In [11]:
log_transformer = FunctionTransformer(lambda feature: np.log(feature + 1e-5))

## Time FunctionTransformer

In [12]:

def time_transformed(feature):
    """takes a datetime feature and splits it into three features: hour, month, season"""
    pd.to_datetime(feature)
    
    hour = feature.dt.hour
    month  = feature.dt.month

    def assign_season(month):
        if month in [3, 4, 5]:
            return 1  # Spring
        elif month in [6, 7, 8]:
            return 2  # Summer
        elif month in [9, 10, 11]:
            return 3  # Fall
        else:  # December, January, February
            return 4  # Winter

    season = month.apply(assign_season)
    hour_sine = np.sin(2 * np.pi * hour / 24)
    hour_cosine = np.cos(2 * np.pi * hour / 24)
    month_sine = np.sin(2 * np.pi * month / 12)
    month_cosine = np.cos(2 * np.pi * month / 12)
    season_sine = np.sin(2 * np.pi * season / 4)
    season_cosine = np.cos(2 * np.pi * season / 4)

    df = pd.DataFrame({
        "hour_sine": hour_sine, 
        "hour_cosine": hour_cosine, 
        "month_sine": month_sine, 
        "month_cosine": month_cosine, 
        "season_sine": season_sine, 
        "season_cosine": season_cosine})

    return df

time_transformer = FunctionTransformer(time_transformed)


In [13]:
time_transformed(data.time)

Unnamed: 0,hour_sine,hour_cosine,month_sine,month_cosine,season_sine,season_cosine
0,-0.258819,-0.965926,0.5,0.866025,-2.449294e-16,1.0
1,-0.500000,-0.866025,0.5,0.866025,-2.449294e-16,1.0
2,-0.707107,-0.707107,0.5,0.866025,-2.449294e-16,1.0
3,-0.866025,-0.500000,0.5,0.866025,-2.449294e-16,1.0
4,-0.965926,-0.258819,0.5,0.866025,-2.449294e-16,1.0
...,...,...,...,...,...,...
9510,-0.965926,0.258819,0.5,0.866025,-2.449294e-16,1.0
9511,-0.866025,0.500000,0.5,0.866025,-2.449294e-16,1.0
9512,-0.707107,0.707107,0.5,0.866025,-2.449294e-16,1.0
9513,-0.500000,0.866025,0.5,0.866025,-2.449294e-16,1.0


In [14]:
# HOW TO CHANGE THE COLUMN NAMES 

## 360 degree FunctionTransformer

In [15]:
cyclical_features = ['sun_azimuth:d', 'wind_dir_2m:d', 'wind_dir_10m:d', 'wind_dir_50m:d', 'wind_dir_100m:d']

In [16]:
def degrees_transformed(feat):
    sin_column = np.sin(2 * np.pi * feat/360)
    cos_column = np.cos(2 * np.pi * feat/360)
    
    df = pd.DataFrame({
        f"sin_{feat.name}": sin_column,
        f"cos_{feat.name}": cos_column
    })

    return df


# Testing preproc pipeline

In [18]:
data = pd.read_csv("raw_data/train.csv")

In [25]:
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer


# create the lists of features for each transformer
# write the functions and the FunctionTransformers for each list 
# create the pipeline


## LISTS BY SCALER

f_minmax = [
    'hour_sine',
    'hour_cosine',
    'month_sine',
    'month_cosine',
    'season_sine',
    'season_cosine',
    'precip_1h:mm', 
    'prob_precip_1h:p', 
    'clear_sky_rad:W',
    'clear_sky_energy_1h:J',
    'diffuse_rad:W', 
    'diffuse_rad_1h:Wh', 
    'direct_rad:W',
    'direct_rad_1h:Wh', 
    'global_rad:W', 
    'global_rad_1h:Wh',
    'sunshine_duration_1h:min',
    'low_cloud_cover:p', 
    'medium_cloud_cover:p', 
    'high_cloud_cover:p',
    'total_cloud_cover:p', 
    'effective_cloud_cover:p',
    'sin_sun_azimuth:d',
    'cos_sun_azimuth:d',
    'sin_wind_dir_2m:d',
    'cos_wind_dir_2m:d',
    'sin_wind_dir_10m:d',
    'cos_wind_dir_10m:d',
    'sin_wind_dir_50m:d',
    'cos_wind_dir_50m:d',
    'sin_wind_dir_100m:d',
    'cos_wind_dir_100m:d',
    'relative_humidity_2m:p', 
    'relative_humidity_10m:p',
    'relative_humidity_50m:p',
    'relative_humidity_100m:p',
    'dew_point_2m:C',
    'dew_point_10m:C', 
    'dew_point_50m:C',
    'dew_point_100m:C',
    'temp'
]
f_standard = ['sun_elevation:d']
f_robust = [
    't_10m:C',
    't_50m:C',
    't_100m:C',
    'wind_speed_2m:ms',
    'wind_speed_10m:ms', 
    'wind_speed_50m:ms',
    'wind_speed_100m:ms',
]

f_ohe = ['precip_type:idx']


## LISTS BY FUNCTIONTRANSFORMER

f_logs = [
    'precip_1h:mm',
    'prob_precip_1h:p',
    'clear_sky_rad:W',
    'clear_sky_energy_1h:J',
    'diffuse_rad:W',
    'diffuse_rad_1h:Wh',
    'direct_rad:W',
    'direct_rad_1h:Wh',
    'global_rad:W',
    'global_rad_1h:Wh',
    'wind_speed_2m:ms',
    'wind_speed_10m:ms',
    'wind_speed_50m:ms',
    'wind_speed_100m:ms'
]

f_time = ["time"]

f_degree = ['sun_azimuth:d', 'wind_dir_2m:d', 'wind_dir_10m:d', 'wind_dir_50m:d', 'wind_dir_100m:d']


## FUNCTION TRANSFORMERS 

log_transformer = FunctionTransformer(lambda feature: np.log(feature + 1e-5))

def time_transformed(data):
    """takes a df of datetime features and splits it into three features each: hour, month, season"""
    feature = pd.to_datetime(data.time, format='%Y/%m/%d %H:%M:%S')
    
    hour = feature.dt.hour
    month  = feature.dt.month

    def assign_season(month):
        if month in [3, 4, 5]:
            return 1  # Spring
        elif month in [6, 7, 8]:
            return 2  # Summer
        elif month in [9, 10, 11]:
            return 3  # Fall
        else:  # December, January, February
            return 4  # Winter

    season = month.apply(assign_season)
    hour_sine = np.sin(2 * np.pi * hour / 24)
    hour_cosine = np.cos(2 * np.pi * hour / 24)
    month_sine = np.sin(2 * np.pi * month / 12)
    month_cosine = np.cos(2 * np.pi * month / 12)
    season_sine = np.sin(2 * np.pi * season / 4)
    season_cosine = np.cos(2 * np.pi * season / 4)

    df = pd.DataFrame({
        "hour_sine": hour_sine, 
        "hour_cosine": hour_cosine, 
        "month_sine": month_sine, 
        "month_cosine": month_cosine, 
        "season_sine": season_sine, 
        "season_cosine": season_cosine})

    return df

time_transformer = FunctionTransformer(time_transformed)

def degree_transformed(data):
    """ takes a df 'data' of features with degree units (eg angles) and creates a sin and cos column to make it cyclical"""
    df = pd.DataFrame({})
    
    for col in f_degree: #list(data.columns):
        sin_column = np.sin(2 * np.pi * data[col]/360)
        cos_column = np.cos(2 * np.pi * data[col]/360)

        df[f"sin_{col}"] = sin_column
        df[f"cos_{col}"] = cos_column

    return df

degree_transformer = FunctionTransformer(degree_transformed)


## CREATING THE PIPELINE

# Preprocessing Pipeline
minmax_scaler = MinMaxScaler()
standard_scaler = StandardScaler()
ohe = OneHotEncoder(sparse_output = False)
robust_scaler = RobustScaler()

preproc_1 = make_column_transformer(
    (log_transformer, f_logs),
    # (time_transformer, f_time),
    (degree_transformer, f_degree),
    remainder='passthrough'
)

preproc_2 = make_column_transformer(
    (minmax_scaler, f_minmax),
    (standard_scaler, f_standard),
    (robust_scaler, f_robust),
    (ohe, f_ohe),
    remainder='passthrough'
)

### CHAIN PREPROC_1 and PREPROC_2

preproc = make_pipeline(
    preproc_1,
    preproc_2
)

In [26]:
data_transformed = preproc.fit_transform(data)

ValueError: Specifying the columns using strings is only supported for dataframes.

In [None]:
data.head()