#### Voeg je code toe in het bijbehorende codeblok

In [None]:
import pandas as pd
import pickle
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Show every column when a frame is displayed instead of truncating wide output.
pd.set_option('display.max_columns', None)

# Load the training set and normalise the header to lower-case column names.
df = pd.read_csv('trainingsset.csv')
df.columns = [label.lower() for label in df.columns]

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

#### Data cleaning

In [None]:
# Turn the '#' placeholder into a real missing value. np.nan is used instead of
# None because older pandas releases treat an explicit ``value=None`` as
# "no value given" and forward-fill the matches instead of blanking them.
df.replace('#', np.nan, inplace=True)

# Blank out unrealistic values: reistijd_a below 28 and reistijd_c equal to 0.
df.loc[df['reistijd_a'] < 28, 'reistijd_a'] = np.nan
df.loc[df['reistijd_c'] == 0, 'reistijd_c'] = np.nan

# The total cannot be trusted when either leg is missing. `== None` is an
# element-wise comparison that never matches a missing value, so isna() is
# required to actually select those rows.
missing_leg = df['reistijd_a'].isna() | df['reistijd_c'].isna()
df.loc[missing_leg, 'reistijd_totaal'] = np.nan

## Feature engineering

#### Tijd

In [None]:
# feature engineering m.b.t. starttijd

#### Wind

In [None]:
# feature engineering m.b.t. windrichting, windkracht en windsnelheid

#### Losse feature engineering (experimenteel; nog niet duidelijk of deze features nuttig zijn)
Dit wordt nog opgeschoond en verdeeld over andere functies.

In [None]:
def plink(df):
    """Add time, wind, log, z-score and utilisation features to ``df`` in place.

    The frame must contain the columns ``datumtijd_startpunt``,
    ``windsnelheid``, ``windrichting``, ``lengte_schip_huidig``,
    ``havenbekken``, ``berth_name`` and the four ``vaarafstand_*_in_km``
    columns.

    Args:
        df: Frame to extend; new columns are written directly onto it.

    Returns:
        The same ``df`` object, so the call can also be reassigned or chained
        like the other helpers in this file.
    """
    # Time features (in case they were not created in the time section).
    # Parse the start timestamp once instead of once per derived column.
    start = pd.to_datetime(df['datumtijd_startpunt'])
    df['start_hour'] = start.dt.hour
    df['start_day_of_week'] = start.dt.dayofweek
    df['start_month'] = start.dt.month
    df['start_week_of_year'] = start.dt.isocalendar().week
    # dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
    df['is_weekend'] = (df['start_day_of_week'] >= 5).astype(int)

    # Wind features (in case they were not created in the wind section).
    df['wind_speed_ship_length_interaction'] = (
        df['windsnelheid'] * df['lengte_schip_huidig']
    )
    df['wind_direction_trip_distance_interaction'] = (
        df['windrichting'] * df['vaarafstand_totaal_in_km']
    )
    # Summed wind speed over all rows that share the same wind direction.
    df['total_wind_impact'] = (
        df.groupby(['windrichting'])['windsnelheid'].transform('sum')
    )
    # Average wind speed over all rows that share the same harbour basin.
    df['mean_wind_speed_per_harbor'] = (
        df.groupby('havenbekken')['windsnelheid'].transform('mean')
    )

    wind_bins = [0, 8, 16, 20, 24]
    wind_labels = ['calm', 'windy', 'windy-stormy', 'stormy']
    # include_lowest keeps a wind speed of exactly 0 in the 'calm' bin; without
    # it the first interval is (0, 8] and 0 becomes NaN. Values above 24 still
    # fall outside every bin and stay NaN.
    df['wind_speed_category'] = pd.cut(
        df['windsnelheid'],
        bins=wind_bins,
        labels=wind_labels,
        include_lowest=True,
    )

    df['wind_speed_direction_interaction'] = (
        df['windsnelheid'] * df['windrichting']
    )

    # Log transforms; log1p keeps zero values finite.
    df['log_ship_length'] = np.log1p(df['lengte_schip_huidig'])
    df['log_trip_distance'] = np.log1p(df['vaarafstand_totaal_in_km'])

    # Z-score normalisation using the statistics of this frame.
    distance_cols = [
        'vaarafstand_a_in_km',
        'vaarafstand_b_in_km',
        'vaarafstand_c_in_km',
        'vaarafstand_totaal_in_km',
    ]
    for col in distance_cols:
        df[col + '_normalized'] = (df[col] - df[col].mean()) / df[col].std()

    # Utilisation: how many rows of this frame use each berth / harbour basin.
    berth_usage = df['berth_name'].value_counts().to_dict()
    df['berth_utilization'] = df['berth_name'].map(berth_usage)

    havenbekken_usage = df['havenbekken'].value_counts().to_dict()
    df['bekken_utilization'] = df['havenbekken'].map(havenbekken_usage)

    return df



## Scaling

In [None]:
def scaling(df, is_train=True):
    """Standardise the continuous numeric columns of ``df``.

    Identifier/target columns, 0/1-only numeric columns and non-numeric
    columns are passed through unscaled.

    Args:
        df: Frame to scale; it is not modified.
        is_train: When True a new ``StandardScaler`` is fitted on ``df`` and
            written to ``scaler.pkl``. When False the scaler is loaded from
            ``scaler.pkl`` and only applied.

    Returns:
        A new frame with a fresh 0..n-1 index: the scaled columns first,
        followed by the unscaled columns.
    """
    numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
    non_numerical_cols = df.select_dtypes(exclude=['number']).columns.tolist()

    # Identifier and targets are never scaled. Only the ones actually present
    # are kept, so a frame without target columns does not raise a KeyError.
    exclusions = [
        col
        for col in ['id', 'reistijd_a', 'reistijd_b', 'reistijd_c', 'reistijd_totaal']
        if col in df.columns
    ]

    # Numeric columns holding only 0/1 (ignoring missing values) are treated
    # as one-hot / boolean flags and left untouched.
    boolean_like_cols = [
        col for col in numerical_cols if df[col].dropna().isin([0, 1]).all()
    ]

    cols_to_scale = [
        col
        for col in numerical_cols
        if col not in exclusions and col not in boolean_like_cols
    ]

    if is_train:
        scaler = StandardScaler()
        scaler.fit(df[cols_to_scale])

        with open('scaler.pkl', 'wb') as file:
            pickle.dump(scaler, file)
    else:
        # NOTE(security): pickle.load can execute arbitrary code; only load a
        # scaler.pkl that was produced by this project.
        with open('scaler.pkl', 'rb') as file:
            scaler = pickle.load(file)

        # Reuse the columns the scaler was fitted on. Re-deriving them from
        # this frame can give a different set, e.g. when a continuous column
        # happens to contain only 0/1 in a small batch.
        fitted_cols = getattr(scaler, 'feature_names_in_', None)
        if fitted_cols is not None:
            cols_to_scale = list(fitted_cols)

    # Everything that is not scaled is passed through exactly once, in the
    # order: non-numeric, exclusions, boolean-like, any remaining numeric.
    scaled_set = set(cols_to_scale)
    candidates = non_numerical_cols + exclusions + boolean_like_cols + numerical_cols
    cols_not_to_scale = list(
        dict.fromkeys(col for col in candidates if col not in scaled_set)
    )

    scaled_data = scaler.transform(df[cols_to_scale])
    scaled_df = pd.DataFrame(scaled_data, columns=cols_to_scale)

    # drop=True aligns both parts on a fresh positional index without adding
    # the old index as an extra 'index' column.
    passthrough_df = df[cols_not_to_scale].reset_index(drop=True)

    return pd.concat([scaled_df, passthrough_df], axis=1)

## Encoding

In [None]:
def onehot_encode(df, columns_to_encode=None):
    """One-hot encode selected columns and return the result as a new frame.

    All object-dtype columns are first converted to ``category`` dtype.

    Args:
        df: Input frame; it is copied, so the caller's frame is not modified.
        columns_to_encode: Names of the columns to expand into 0/1 indicator
            columns. Names that are not present in ``df`` are ignored. When
            None, ``pandas.get_dummies`` picks the columns itself (its default
            for ``columns=None``).

    Returns:
        A new frame in which the encoded columns are replaced by integer
        indicator columns (no category level is dropped).
    """
    # Work on a copy so the dtype conversion below does not leak to the caller.
    df = df.copy()

    object_columns = df.select_dtypes(include='object').columns
    if len(object_columns) > 0:
        df[object_columns] = df[object_columns].astype('category')

    # Ignore requested columns that do not exist in this frame. None is passed
    # through unchanged instead of being iterated, which would raise.
    if columns_to_encode is not None:
        columns_to_encode = [col for col in columns_to_encode if col in df.columns]

    return pd.get_dummies(df, columns=columns_to_encode, drop_first=False, dtype=int)


def ordinal_encode(df, encoding_dict=None):
    """Replace categorical values by ordinal codes, in place.

    Args:
        df: Frame to encode; the listed columns are overwritten on this frame.
        encoding_dict: Dictionary of ``column name -> mapping``, where each
            mapping is itself a dictionary of ``category value -> ordinal
            value``. Columns that are not present in ``df`` are skipped.
            None means there is nothing to encode.

    Returns:
        The same ``df`` object. Remaining object-dtype columns are converted
        to ``category`` dtype.
    """
    for column, mapping in (encoding_dict or {}).items():
        if column not in df.columns:
            continue
        # Map the plain values: mapping a categorical column can return a
        # categorical result again instead of numeric codes.
        df[column] = df[column].astype(object).map(mapping)

    # Convert the object columns that are still left to category dtype.
    object_columns = df.select_dtypes(include='object').columns
    if len(object_columns) > 0:
        df[object_columns] = df[object_columns].astype('category')

    return df

## Pipeline
Deze pipeline maak ik zodra de feature engineering van wind en tijd is afgerond.