In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
from catboost import CatBoostRegressor
from autogluon.tabular import TabularDataset, TabularPredictor
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Aggregation applied to each column when the data is grouped into hourly buckets.
# Every key appears exactly once. The original literal listed 'cloud_base_agl:m'
# twice ('mean', then 'max') and 'snow_density:kgm3' twice; Python keeps the LAST
# value of a repeated key, so 'max' and 'mean' below are the values that were
# effectively in use.
aggregation_methods = {
    'date_forecast': 'first',
    'diffuse_rad:W': 'sum',
    'direct_rad:W': 'last',
    'clear_sky_rad:W': 'sum',
    'diffuse_rad_1h:J': 'last',
    'direct_rad_1h:J': 'last',
    'clear_sky_energy_1h:J': 'last',
    'absolute_humidity_2m:gm3': 'mean',
    'air_density_2m:kgm3': 'mean',
    'ceiling_height_agl:m': 'max',
    'cloud_base_agl:m': 'max',
    'dew_or_rime:idx': 'min',
    'dew_point_2m:K': 'mean',
    'effective_cloud_cover:p': 'sum',
    'elevation:m': 'first',
    'fresh_snow_12h:cm': 'max',
    'fresh_snow_1h:cm': 'sum',
    'fresh_snow_24h:cm': 'max',
    'fresh_snow_3h:cm': 'max',
    'fresh_snow_6h:cm': 'max',
    'is_day:idx': 'max',
    'is_in_shadow:idx': 'max',
    'msl_pressure:hPa': 'mean',
    'precip_5min:mm': 'sum',
    'precip_type_5min:idx': 'sum',
    'pressure_100m:hPa': 'mean',
    'pressure_50m:hPa': 'mean',
    'prob_rime:p': 'max',
    'rain_water:kgm2': 'sum',
    'relative_humidity_1000hPa:p': 'mean',
    'sfc_pressure:hPa': 'mean',
    'snow_density:kgm3': 'mean',
    'snow_depth:cm': 'max',
    'snow_drift:idx': 'max',
    'snow_melt_10min:mm': 'sum',
    'snow_water:kgm2': 'sum',
    'sun_azimuth:d': 'first',
    'sun_elevation:d': 'sum',
    'super_cooled_liquid_water:kgm2': 'sum',
    't_1000hPa:K': 'mean',
    'total_cloud_cover:p': 'mean',
    'visibility:m': 'mean',
    'wind_speed_10m:ms': 'mean',
    'wind_speed_u_10m:ms': 'mean',
    'wind_speed_v_10m:ms': 'mean',
    'wind_speed_w_1000hPa:ms': 'mean',
}

# --- Load the raw parquet files for the three sites (A, B, C) ---------------
def _read_site(site):
    """Read one site's four parquet files.

    Returns (targets, X_train_observed, X_train_estimated, X_test_estimated).
    The targets' 'time' column is renamed to 'date_forecast' so it matches the
    timestamp column name used by the weather frames.
    """
    folder = f'./data/{site}'
    targets = pd.read_parquet(f'{folder}/train_targets.parquet')
    observed = pd.read_parquet(f'{folder}/X_train_observed.parquet')
    estimated = pd.read_parquet(f'{folder}/X_train_estimated.parquet')
    test_estimated = pd.read_parquet(f'{folder}/X_test_estimated.parquet')
    targets.rename(columns={'time': 'date_forecast'}, inplace=True)
    return targets, observed, estimated, test_estimated


x_target_A, x_train_obs_A, x_train_est_A, x_test_est_A = _read_site('A')
x_target_B, x_train_obs_B, x_train_est_B, x_test_est_B = _read_site('B')
x_target_C, x_train_obs_C, x_train_est_C, x_test_est_C = _read_site('C')

# --- Fix missing data in the test sets ---------------------------------------
# Assumption made by this notebook: NaN means 0 in these columns.
_ZERO_FILLED_TEST_COLUMNS = [
    'effective_cloud_cover:p',
    'total_cloud_cover:p',
    'cloud_base_agl:m',
    'ceiling_height_agl:m',
    'snow_density:kgm3',
    'snow_drift:idx',
]
for _test_frame in (x_test_est_A, x_test_est_B, x_test_est_C):
    for _column in _ZERO_FILLED_TEST_COLUMNS:
        _test_frame[_column] = _test_frame[_column].fillna(0)


# --- Resample every weather frame to hourly rows -----------------------------
def _hourly(frame):
    """Group on 'date_forecast' in 1-hour buckets and aggregate with aggregation_methods."""
    return frame.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)


x_train_obs_A_resampled = _hourly(x_train_obs_A)
x_train_est_A_resampled = _hourly(x_train_est_A)
x_test_est_A_resampled = _hourly(x_test_est_A)

x_train_obs_B_resampled = _hourly(x_train_obs_B)
x_train_est_B_resampled = _hourly(x_train_est_B)
x_test_est_B_resampled = _hourly(x_test_est_B)

x_train_obs_C_resampled = _hourly(x_train_obs_C)
x_train_est_C_resampled = _hourly(x_train_est_C)
x_test_est_C_resampled = _hourly(x_test_est_C)


# --- Merge weather features with the targets ---------------------------------
def _split_targets(targets, estimated_raw):
    """Cut the targets where the estimated weather data starts.

    The cut point is the index label of the target row whose timestamp equals
    the first 'date_forecast' of the estimated frame; that label is then used
    positionally with iloc.
    NOTE(review): this presumes the targets carry a default 0..n-1 index - confirm.
    """
    first_estimated_time = estimated_raw['date_forecast'].iloc[0]
    cut = targets[targets['date_forecast'] == first_estimated_time].index[0]
    return targets.iloc[:cut], targets.iloc[cut:]


x_target_obs_A, x_target_est_A = _split_targets(x_target_A, x_train_est_A)
obs_A = x_train_obs_A_resampled.merge(x_target_obs_A, left_index=True, right_on='date_forecast')
est_A = x_train_est_A_resampled.merge(x_target_est_A, left_index=True, right_on='date_forecast')

x_target_obs_B, x_target_est_B = _split_targets(x_target_B, x_train_est_B)
obs_B = x_train_obs_B_resampled.merge(x_target_obs_B, left_index=True, right_on='date_forecast')
est_B = x_train_est_B_resampled.merge(x_target_est_B, left_index=True, right_on='date_forecast')

x_target_obs_C, x_target_est_C = _split_targets(x_target_C, x_train_est_C)
obs_C = x_train_obs_C_resampled.merge(x_target_obs_C, left_index=True, right_on='date_forecast')
est_C = x_train_est_C_resampled.merge(x_target_est_C, left_index=True, right_on='date_forecast')

# Test features, one frame per site ('date_forecast' is kept as a column).
# dropna() removes every hourly row that still contains a NaN.
# Fix: the original assigned all three resampled frames to `test_A`, so test_A
# ended up holding site C's data and test_B / test_C were never defined here.
test_A = x_test_est_A_resampled.dropna()
test_B = x_test_est_B_resampled.dropna()
test_C = x_test_est_C_resampled.dropna()

In [None]:
def add_experimental_features(df):
    """
    Add hand-crafted feature columns to `df` in place and return the same frame.

    New columns: radiation sums/differences/ratio, temperature and pressure
    derivatives (including min-max scaled pressures), wind aggregates,
    cloud/snow aggregates, several pairwise interaction products and a
    precipitation product. As a last step every +/-inf in the frame is
    replaced by 0 (NaNs are left untouched).
    """
    # Short aliases for the raw columns that are reused several times
    direct_w, diffuse_w = df['direct_rad:W'], df['diffuse_rad:W']
    direct_j, diffuse_j = df['direct_rad_1h:J'], df['diffuse_rad_1h:J']
    temp_k, dew_k = df['t_1000hPa:K'], df['dew_point_2m:K']
    wind_u = df['wind_speed_u_10m:ms']

    # Radiation
    df['total_radiation:W'] = direct_w + diffuse_w
    df['total_radiation_1h:J'] = direct_j + diffuse_j
    df['rad_diff:W'] = direct_w - diffuse_w
    df['rad_diff_1h:J'] = direct_j - diffuse_j
    # Division by zero yields inf here; it is zeroed by the final replace
    df['diffuse_direct_ratio'] = diffuse_w / direct_w

    # Temperature and pressure
    df['temp_dewpoint_diff'] = temp_k - dew_k
    df['pressure_gradient'] = df['pressure_100m:hPa'] - df['pressure_50m:hPa']
    df['t_1000hPa:C'] = temp_k - 273.15
    df['dew_point_2m:C'] = dew_k - 273.15
    # The scaler is fitted on the frame it transforms, so the scaling is per call
    for pressure_col in ('msl_pressure:hPa', 'sfc_pressure:hPa'):
        as_column_vector = df[pressure_col].values.reshape(-1, 1)
        df[pressure_col + '_scaled'] = MinMaxScaler().fit_transform(as_column_vector)

    # Wind
    squared_components = wind_u**2 + df['wind_speed_v_10m:ms']**2 + df['wind_speed_w_1000hPa:ms']**2
    df['wind_vector_magnitude'] = squared_components**0.5
    df['average_wind_speed'] = (df['wind_speed_10m:ms'] + wind_u) / 2

    # Cloud and snow
    df['cloud_humidity_product'] = df['total_cloud_cover:p'] * df['absolute_humidity_2m:gm3']
    snow_columns = ['fresh_snow_24h:cm', 'fresh_snow_12h:cm', 'fresh_snow_6h:cm', 'fresh_snow_3h:cm', 'fresh_snow_1h:cm']
    df['snow_accumulation'] = df[snow_columns].sum(axis=1)

    # Pairwise interaction products
    df['radiation_cloud_interaction'] = direct_w * df['effective_cloud_cover:p']      # direct radiation x cloud cover
    df['temp_rad_interaction'] = temp_k * df['total_radiation:W']                     # temperature x total radiation
    df['wind_temp_interaction'] = df['average_wind_speed'] * temp_k                   # wind x temperature
    df['humidity_temp_interaction'] = df['absolute_humidity_2m:gm3'] * temp_k         # humidity x temperature
    df['sun_elevation_direct_rad_interaction'] = df['sun_elevation:d'] * direct_w     # sun elevation x direct radiation

    # Precipitation
    df['precip'] = df['precip_5min:mm']*df['precip_type_5min:idx']

    # Safeguard in case of inf values
    df.replace([np.inf, -np.inf], 0, inplace=True)

    return df

In [None]:
def add_date_features(df):
    """
    Derive calendar features from the 'date_forecast' column (in place).

    Adds 'month', 'year', 'hour', 'day' plus the cyclic encodings 'hour_sin'
    and 'hour_cos'. 'date_forecast' itself is converted to datetime and kept.
    If the column is missing, a warning is printed and `df` is returned as is.
    """
    if 'date_forecast' not in df.columns:
        print("Warning: 'date_forecast' column not found in the dataframe. No date features added.")
        return df

    df['date_forecast'] = pd.to_datetime(df['date_forecast'])
    timestamp_parts = df['date_forecast'].dt

    for part in ('month', 'year', 'hour', 'day'):
        df[part] = getattr(timestamp_parts, part)

    # Map the hour onto the unit circle so 23:00 and 00:00 end up adjacent
    df['hour_sin'] = np.sin(2*np.pi*df['hour']/24)
    df['hour_cos'] = np.cos(2*np.pi*df['hour']/24)

    return df

In [None]:
# Adding discretized features for the continuous variables to help tree-based models

def bin_columns(dataframe, columns_to_bin, n_bins=5):
    """
    Add a quantile-binned copy of each requested column.

    Parameters:
    - dataframe: pd.DataFrame, left unmodified (the work happens on a copy)
    - columns_to_bin: list of strings, the names of the columns to bin
    - n_bins: int or dict. An int is used for every column; a dict maps
              column name -> number of quantiles, with 5 for missing names.

    Returns:
    - pd.DataFrame: a copy of `dataframe` with one extra integer-coded column
      'binned_<column>' per entry of `columns_to_bin`. Duplicate quantile
      edges are dropped, so fewer bins than requested may result.
    """
    result = dataframe.copy()
    same_count_everywhere = isinstance(n_bins, int)

    for name in columns_to_bin:
        quantile_count = n_bins if same_count_everywhere else n_bins.get(name, 5)
        result[f'binned_{name}'] = pd.qcut(
            result[name],
            q=quantile_count,
            labels=False,
            duplicates='drop'
        )

    return result

def add_binned_features(df):
    """
    Add the quantile-binned columns used by the models.

    Creates 'binned_effective_cloud_cover:p' (2 quantiles),
    'binned_ceiling_height_agl:m' (3) and 'binned_average_wind_speed' (5),
    in that order. 'average_wind_speed' must already exist in `df`.

    Returns a new dataframe; `df` itself is not modified.
    """
    # One bin_columns call with a per-column dict gives the same columns, in
    # the same order, as three separate calls, but copies the frame once.
    bins_per_column = {
        'effective_cloud_cover:p': 2,
        'ceiling_height_agl:m': 3,
        'average_wind_speed': 5,
    }
    return bin_columns(df, list(bins_per_column), n_bins=bins_per_column)

In [None]:
def add_rate_of_change_features(df, features, second_order=False):
    """
    Add row-to-row difference columns for the given features (in place).

    For every feature a '<feature>_rate_of_change' column holding the first
    difference is added; with second_order=True a
    '<feature>_rate_of_change_of_change' column (difference of the
    difference) is added as well. The NaN that diff() leaves in the first
    row is replaced by 0. Rows are differenced in their current order, so
    the frame is expected to be time sorted.
    """
    for feature in features:
        first_diff_name = feature + '_rate_of_change'
        df[first_diff_name] = df[feature].diff().fillna(0)

        if not second_order:
            continue

        second_diff_name = feature + '_rate_of_change_of_change'
        df[second_diff_name] = df[first_diff_name].diff().fillna(0)

    return df

def add_rate_of_change_features_to_df(df):
    """
    Add first-order rate-of-change columns for the notebook's fixed feature
    list (temperature, the three radiation columns, effective cloud cover and
    total radiation). 'total_radiation:W' must already exist in `df`.
    """
    return add_rate_of_change_features(
        df,
        [
            't_1000hPa:K',
            'clear_sky_rad:W', 'diffuse_rad:W', 'direct_rad:W',
            'effective_cloud_cover:p', 'total_radiation:W'
        ],
        second_order=False,
    )

In [None]:
def add_est_obs_feature(df):
    """
    Flag each row as observed (1) or estimated (0) in a new 'observed' column.

    A frame is treated as estimated when it has a 'date_calc' column; that
    column is then dropped from the returned frame. The 'observed' column is
    written to `df` itself in both cases.

    NOTE(review): the hourly aggregation dict in this notebook has no
    'date_calc' entry, so frames produced by that aggregation would always be
    flagged as observed - confirm this is intended.
    """
    is_estimated = 'date_calc' in df.columns
    df['observed'] = 0 if is_estimated else 1

    if is_estimated:
        return df.drop(columns=['date_calc'])
    return df

In [None]:
def remove_constant_regions(dataframe, column_name="pv_measurement", threshold=72):
    """
    Drop long runs of identical consecutive values in `column_name`.

    A run is removed in full (its first row included) when more than
    `threshold` of its rows repeat the value of the row before them, i.e.
    when the run is longer than threshold + 1 rows. NaNs never compare equal,
    so they do not form runs. If the column is missing, a warning is printed
    and the frame is returned unchanged.
    """
    if column_name not in dataframe.columns:
        print(f"Warning: '{column_name}' column not found in the dataframe. No rows removed.")
        return dataframe

    values = dataframe[column_name]

    # True on every row whose value differs from the previous row
    starts_new_run = values.ne(values.shift())
    run_id = starts_new_run.cumsum()

    # Per run: how many rows merely repeat their predecessor
    repeats_per_run = run_id[~starts_new_run].value_counts()
    long_run_ids = repeats_per_run[repeats_per_run > threshold].index

    rows_in_long_runs = dataframe[run_id.isin(long_run_ids)].index
    return dataframe.drop(rows_in_long_runs)

In [None]:
def add_lagged_features(df, features_with__lags, fill_value=None):
    """
    Add shifted ("lagged") copies of selected columns to `df` (in place).

    Parameters:
    - df: pd.DataFrame, rows are shifted in their current order
    - features_with__lags: dict mapping column name -> lag in rows. A positive
      lag takes the value from earlier rows, a negative lag from later rows.
      The new column is named '<feature>_lag_<lag>'.
    - fill_value: value written into the gaps left by the shift (and any other
      NaN in the shifted column). With None, the default, NaNs are kept.

    Returns the same dataframe.
    """
    for feature, specific_lag in features_with__lags.items():
        shifted = df[feature].shift(specific_lag)
        # Series.fillna(None) raises ValueError, so fill only when a value is given
        if fill_value is not None:
            shifted = shifted.fillna(fill_value)
        df[f"{feature}_lag_{specific_lag}"] = shifted
    return df

def add_lagged_features_to_df(df):
    """
    Add the previous-row (lag 1) and next-row (lag -1) value of the radiation
    features, with the gaps at the ends filled by 0.

    The original put both lags into ONE dict literal with repeated keys, so
    each `: 1` entry was overwritten by its `: -1` twin and only the
    '<feature>_lag_-1' columns were ever created. One dict per lag produces
    both sets of columns.

    The three source columns must already exist in `df`.
    """
    lagged_features = ['total_radiation:W', 'rad_diff:W', 'total_radiation_1h:J']

    for lag in (1, -1):
        df = add_lagged_features(df, {feature: lag for feature in lagged_features}, fill_value=0)

    return df

In [None]:
def handle_nan(df):
    """
    Drop rows without a target, then replace every remaining NaN with 0.

    Frames that have no 'pv_measurement' column (e.g. test features) skip the
    row filter and only get the zero fill. Returns a new dataframe.
    """
    if 'pv_measurement' in df.columns:
        df = df[df['pv_measurement'].notna()]

    return df.fillna(0)

In [None]:
def preprocessing(df):
    """
    Run the full feature pipeline on one dataframe and return the result.

    The steps run in the listed order; later steps read columns created by
    earlier ones (e.g. binning uses 'average_wind_speed', the lag and
    rate-of-change steps use 'total_radiation:W').
    """
    pipeline = (
        add_experimental_features,
        add_date_features,
        add_binned_features,
        add_rate_of_change_features_to_df,
        add_est_obs_feature,
        remove_constant_regions,
        add_lagged_features_to_df,
        handle_nan,
    )
    for step in pipeline:
        df = step(df)

    return df

# Preprocess the observed, estimated and test frames of every site.
# Fix: test_B and test_C used to be produced by running preprocessing on
# test_A again, so all sites shared one test frame. Each test set is now built
# from its own site's hourly-resampled test data (rows containing NaN dropped).
obs_A = preprocessing(obs_A)
est_A = preprocessing(est_A)
test_A = preprocessing(x_test_est_A_resampled.dropna())

obs_B = preprocessing(obs_B)
est_B = preprocessing(est_B)
test_B = preprocessing(x_test_est_B_resampled.dropna())

obs_C = preprocessing(obs_C)
est_C = preprocessing(est_C)
test_C = preprocessing(x_test_est_C_resampled.dropna())

In [None]:
# Random seeds used for reproducibility
# 32 weights: 0.3, 0.3, 0.4
# 24 weights: 0.3, 0.3, 0.4
# 33 (without winter months 1 and 12) weights: 0.2, 0.4, 0.4
# 11 (without winter months 1, 2 and 11, 12) weights: 0.25, 0.35, 0.4
# 5 weights: 0.4, 0.3, 0.3

# Best score is the mean prediction of all the 5 seeds mentioned above. The first weight is xgboost, the second is catboost, and the third is autogluon.

# Set the random seed
np.random.seed(32)

TARGET_COLUMN = 'pv_measurement'
DATE_COLUMNS = ['date_forecast', 'date_forecast_x', 'date_forecast_y']

# Months kept for training. All twelve are kept here; remove winter months
# from this list to repeat the "without winter months" experiments.
TRAIN_MONTHS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
# Validation days are sampled from May and June, because the test set is from summer months.
VALIDATION_MONTHS = [5, 6]
# Share of the candidate days that goes into the validation set
VALIDATION_DAY_FRACTION = 0.4

# Characters unparseable for CatBoost; they are deleted from column names
_UNPARSEABLE_CHARS = str.maketrans('', '', '[],{}()"\':\\')


def _sanitize_columns(columns):
    """Return the column names with every unparseable character removed."""
    return [name.translate(_UNPARSEABLE_CHARS) for name in columns]


def _split_by_validation_days(site_df):
    """Split one site's frame into (filtered, train, val) by whole days.

    Rows outside TRAIN_MONTHS are dropped first. A random
    VALIDATION_DAY_FRACTION of the distinct calendar days that fall in
    VALIDATION_MONTHS is then drawn with np.random.choice (global RNG, without
    replacement); every row of a drawn day goes to `val`, all other rows to
    `train`.
    """
    site_df = site_df[site_df['date_forecast'].dt.month.isin(TRAIN_MONTHS)]
    days = site_df['date_forecast'].dt.date
    candidate_days = days[site_df['date_forecast'].dt.month.isin(VALIDATION_MONTHS)].unique()
    sampled_days = np.random.choice(
        candidate_days,
        size=int(len(candidate_days) * VALIDATION_DAY_FRACTION),
        replace=False
    )
    in_validation = days.isin(sampled_days)
    return site_df, site_df[~in_validation], site_df[in_validation]


def _features_and_target(frame):
    """Return (frame without the target column, target column)."""
    return frame.drop(columns=TARGET_COLUMN), frame[TARGET_COLUMN]


# Concatenate observed and estimated training data per site
A = pd.concat([obs_A, est_A])
B = pd.concat([obs_B, est_B])
C = pd.concat([obs_C, est_C])

# Remove characters unparseable for CatBoost
A.columns = _sanitize_columns(A.columns)
B.columns = _sanitize_columns(B.columns)
C.columns = _sanitize_columns(C.columns)

test_A.columns = _sanitize_columns(test_A.columns)
test_B.columns = _sanitize_columns(test_B.columns)
test_C.columns = _sanitize_columns(test_C.columns)

# Split each site. The order A, B, C matters: every call consumes random
# numbers from the seeded global generator.
A, train_A, val_A = _split_by_validation_days(A)
B, train_B, val_B = _split_by_validation_days(B)
C, train_C, val_C = _split_by_validation_days(C)

# Drop the date columns; the models only see the derived date features
train_A = train_A.drop(columns=DATE_COLUMNS)
train_B = train_B.drop(columns=DATE_COLUMNS)
train_C = train_C.drop(columns=DATE_COLUMNS)
val_A = val_A.drop(columns=DATE_COLUMNS)
val_B = val_B.drop(columns=DATE_COLUMNS)
val_C = val_C.drop(columns=DATE_COLUMNS)
test_A = test_A.drop(columns=['date_forecast'])
test_B = test_B.drop(columns=['date_forecast'])
test_C = test_C.drop(columns=['date_forecast'])

# Prepare the feature matrices and target vectors
X_train_A, y_train_A = _features_and_target(train_A)
X_val_A, y_val_A = _features_and_target(val_A)

X_train_B, y_train_B = _features_and_target(train_B)
X_val_B, y_val_B = _features_and_target(val_B)

X_train_C, y_train_C = _features_and_target(train_C)
X_val_C, y_val_C = _features_and_target(val_C)

In [None]:
# AutoGluon datasets (train / validation) per site
train_auto_A, val_auto_A = TabularDataset(train_A), TabularDataset(val_A)
train_auto_B, val_auto_B = TabularDataset(train_B), TabularDataset(val_B)
train_auto_C, val_auto_C = TabularDataset(train_C), TabularDataset(val_C)

auto_label = 'pv_measurement'

# XGBoost hyper-parameters. The shared part is identical for all sites;
# site A has its own regularisation / learning rate and two parallel trees,
# sites B and C use the same values.
_xgb_shared_params = {
    'colsample_bytree': 0.8,
    'max_depth': 15,
    'min_child_weight': 10,
    'n_estimators': 600,
    'subsample': 0.912,
    'random_state': 0,
    'booster': 'gbtree',
    'n_jobs': -1
}

params_xgb_A = {
    **_xgb_shared_params,
    'gamma': 0.4,
    'learning_rate': 0.012,
    'reg_alpha': 0.8,
    'reg_lambda': 0.8,
    'num_parallel_tree': 2
}

params_xgb_B = {
    **_xgb_shared_params,
    'gamma': 0.8,
    'learning_rate': 0.008,
    'reg_alpha': 1,
    'reg_lambda': 3
}

params_xgb_C = dict(params_xgb_B)

xgb_A = xgb.XGBRegressor(**params_xgb_A)
xgb_B = xgb.XGBRegressor(**params_xgb_B)
xgb_C = xgb.XGBRegressor(**params_xgb_C)

# CatBoost: one identically configured model per site. MAE is both the loss
# being optimised and the metric reported on the validation set; learning
# rate, depth and seed are left at CatBoost's defaults.
_catboost_settings = {
    'iterations': 5000,       # The number of trees to build
    'loss_function': 'MAE',
    'eval_metric': 'MAE'
}

cat_A = CatBoostRegressor(**_catboost_settings)
cat_B = CatBoostRegressor(**_catboost_settings)
cat_C = CatBoostRegressor(**_catboost_settings)


In [None]:
# Prepare data for the XGBoost models. They worked best with fewer columns.
xgb_columns = [
    'total_radiationW',
    'snow_accumulation',
    'super_cooled_liquid_waterkgm2',
    'average_wind_speed',
    'sun_elevationd',
    'sun_azimuthd',
    'clear_sky_radW',
    'month',
    't_1000hPaC',
    'msl_pressurehPa_scaled',
    'rain_waterkgm2',
    'cloud_base_aglm',
    'effective_cloud_coverp',
    'dew_or_rimeidx'
]

# Fix: every site's test matrix is now sliced from that site's own test frame.
# X_test_xgb_B and X_test_xgb_C used to be sliced from test_A.
X_train_xgb_A = train_A[xgb_columns]
y_train_xgb_A = train_A['pv_measurement']
X_test_xgb_A = test_A[xgb_columns]

X_train_xgb_B = train_B[xgb_columns]
y_train_xgb_B = train_B['pv_measurement']
X_test_xgb_B = test_B[xgb_columns]

X_train_xgb_C = train_C[xgb_columns]
y_train_xgb_C = train_C['pv_measurement']
X_test_xgb_C = test_C[xgb_columns]

# Train the XGBoost models
# NOTE(review): no eval_set is passed, so eval_metric presumably has no effect
# here - confirm against the installed xgboost version.
xgb_A.fit(
    X=X_train_xgb_A, y=y_train_xgb_A,
    eval_metric='mae',
    verbose=False
)

xgb_B.fit(
    X=X_train_xgb_B, y=y_train_xgb_B,
    eval_metric='mae',
    verbose=False
)

xgb_C.fit(
    X=X_train_xgb_C, y=y_train_xgb_C,
    eval_metric='mae',
    verbose=False
)

In [None]:
# Train the CatBoost models, one per site, in the order A, B, C.
# Each model is evaluated on its site's validation split and, through
# use_best_model=True, keeps the iteration that scored best on it.
_catboost_jobs = (
    (cat_A, X_train_A, y_train_A, X_val_A, y_val_A),
    (cat_B, X_train_B, y_train_B, X_val_B, y_val_B),
    (cat_C, X_train_C, y_train_C, X_val_C, y_val_C),
)

for _model, _X_fit, _y_fit, _X_eval, _y_eval in _catboost_jobs:
    _model.fit(
        _X_fit, _y_fit,
        eval_set=(_X_eval, _y_eval),
        use_best_model=True
    )

In [None]:
def _fit_autogluon(train_data, tuning_data):
    """Fit an AutoGluon regression predictor (MAE metric, 'medium_quality' preset).

    `tuning_data` is passed as the validation set; bagged folds are fitted
    sequentially in the local process.
    """
    predictor = TabularPredictor(
        label=auto_label,
        eval_metric='mean_absolute_error',
        problem_type='regression'
    )
    return predictor.fit(
        train_data,
        presets='medium_quality',
        tuning_data=tuning_data,
        use_bag_holdout=True,
        ag_args_ensemble={'fold_fitting_strategy': 'sequential_local'}
    )


auto_A = _fit_autogluon(train_auto_A, val_auto_A)
auto_B = _fit_autogluon(train_auto_B, val_auto_B)
auto_C = _fit_autogluon(train_auto_C, val_auto_C)


In [None]:
# Ensemble weights (xgboost, catboost, autogluon)
# NOTE(review): the seed notes earlier in this notebook list 0.3 / 0.3 / 0.4
# for seed 32 and 0.4 / 0.3 / 0.3 for seed 5 - confirm these weights belong to
# the seed that is actually set.
xgb_weight = 0.4
cat_weight = 0.3
auto_weight = 0.3

# Fix: each XGBoost model now predicts on its own site's test matrix.
# All three models used to predict on X_test_xgb_C.
pred_xgb_A = xgb_A.predict(X_test_xgb_A)
pred_xgb_B = xgb_B.predict(X_test_xgb_B)
pred_xgb_C = xgb_C.predict(X_test_xgb_C)

pred_auto_A = auto_A.predict(test_A)
pred_auto_B = auto_B.predict(test_B)
pred_auto_C = auto_C.predict(test_C)

pred_cat_A = cat_A.predict(test_A)
pred_cat_B = cat_B.predict(test_B)
pred_cat_C = cat_C.predict(test_C)

# Weighted ensemble that seemed the best after some experimentation
pred_A = (pred_xgb_A*xgb_weight + pred_cat_A*cat_weight + pred_auto_A*auto_weight)
pred_B = (pred_xgb_B*xgb_weight + pred_cat_B*cat_weight + pred_auto_B*auto_weight)
pred_C = (pred_xgb_C*xgb_weight + pred_cat_C*cat_weight + pred_auto_C*auto_weight)

# Negative predictions are clipped to 0
pred_A = np.clip(pred_A, 0, None)
pred_B = np.clip(pred_B, 0, None)
pred_C = np.clip(pred_C, 0, None)

# Concatenate predictions in site order A, B, C
predictions = np.concatenate([pred_A, pred_B, pred_C])

# Save predictions
predictions_1 = predictions

# Create an id array: consecutive row ids starting at 0
ids = np.arange(0, len(predictions_1))

# Create a DataFrame
df = pd.DataFrame({
    'id': ids,
    'prediction': predictions_1
})

# Save to CSV
df.to_csv('predictions_1.csv', index=False)

In [None]:
# Sequence 2

%reset -f

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
from catboost import CatBoostRegressor
from autogluon.tabular import TabularDataset, TabularPredictor
import warnings
warnings.filterwarnings('ignore')

def add_experimental_features(df):
    """
    Add hand-crafted feature columns to `df` in place and return the same frame.

    New columns: radiation sums/differences/ratio, temperature and pressure
    derivatives (including min-max scaled pressures), wind aggregates,
    cloud/snow aggregates, several pairwise interaction products and a
    precipitation product. As a last step every +/-inf in the frame is
    replaced by 0 (NaNs are left untouched).
    """
    # Short aliases for the raw columns that are reused several times
    direct_w, diffuse_w = df['direct_rad:W'], df['diffuse_rad:W']
    direct_j, diffuse_j = df['direct_rad_1h:J'], df['diffuse_rad_1h:J']
    temp_k, dew_k = df['t_1000hPa:K'], df['dew_point_2m:K']
    wind_u = df['wind_speed_u_10m:ms']

    # Radiation
    df['total_radiation:W'] = direct_w + diffuse_w
    df['total_radiation_1h:J'] = direct_j + diffuse_j
    df['rad_diff:W'] = direct_w - diffuse_w
    df['rad_diff_1h:J'] = direct_j - diffuse_j
    # Division by zero yields inf here; it is zeroed by the final replace
    df['diffuse_direct_ratio'] = diffuse_w / direct_w

    # Temperature and pressure
    df['temp_dewpoint_diff'] = temp_k - dew_k
    df['pressure_gradient'] = df['pressure_100m:hPa'] - df['pressure_50m:hPa']
    df['t_1000hPa:C'] = temp_k - 273.15
    df['dew_point_2m:C'] = dew_k - 273.15
    # The scaler is fitted on the frame it transforms, so the scaling is per call
    for pressure_col in ('msl_pressure:hPa', 'sfc_pressure:hPa'):
        as_column_vector = df[pressure_col].values.reshape(-1, 1)
        df[pressure_col + '_scaled'] = MinMaxScaler().fit_transform(as_column_vector)

    # Wind
    squared_components = wind_u**2 + df['wind_speed_v_10m:ms']**2 + df['wind_speed_w_1000hPa:ms']**2
    df['wind_vector_magnitude'] = squared_components**0.5
    df['average_wind_speed'] = (df['wind_speed_10m:ms'] + wind_u) / 2

    # Cloud and snow
    df['cloud_humidity_product'] = df['total_cloud_cover:p'] * df['absolute_humidity_2m:gm3']
    snow_columns = ['fresh_snow_24h:cm', 'fresh_snow_12h:cm', 'fresh_snow_6h:cm', 'fresh_snow_3h:cm', 'fresh_snow_1h:cm']
    df['snow_accumulation'] = df[snow_columns].sum(axis=1)

    # Pairwise interaction products
    df['radiation_cloud_interaction'] = direct_w * df['effective_cloud_cover:p']      # direct radiation x cloud cover
    df['temp_rad_interaction'] = temp_k * df['total_radiation:W']                     # temperature x total radiation
    df['wind_temp_interaction'] = df['average_wind_speed'] * temp_k                   # wind x temperature
    df['humidity_temp_interaction'] = df['absolute_humidity_2m:gm3'] * temp_k         # humidity x temperature
    df['sun_elevation_direct_rad_interaction'] = df['sun_elevation:d'] * direct_w     # sun elevation x direct radiation

    # Precipitation
    df['precip'] = df['precip_5min:mm']*df['precip_type_5min:idx']

    # Safeguard in case of inf values
    df.replace([np.inf, -np.inf], 0, inplace=True)

    return df

def add_date_features(df):
    """
    Derive calendar features from the 'date_forecast' column.

    When the column is present it is converted with ``pd.to_datetime`` and the
    columns 'month', 'year', 'hour' and 'day' are added, followed by 'hour_sin'
    and 'hour_cos' (sine/cosine of the hour over a 24-hour period).
    When the column is missing a warning is printed and nothing is added.

    The dataframe is modified in place and also returned.
    """
    # Guard clause: without the timestamp column there is nothing to derive
    if 'date_forecast' not in df.columns:
        print("Warning: 'date_forecast' column not found in the dataframe. No date features added.")
        return df

    df['date_forecast'] = pd.to_datetime(df['date_forecast'])
    timestamps = df['date_forecast'].dt

    # Plain calendar components, added in a fixed column order
    for part in ('month', 'year', 'hour', 'day'):
        df[part] = getattr(timestamps, part)

    # Cyclic encoding of the hour of day
    hour_angle = 2*np.pi*df['hour']/24
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)

    return df

# Adding discretized features for the continuous variables to help tree-based models

def bin_columns(dataframe, columns_to_bin, n_bins=5):
    """
    Add quantile-based bin indices for the requested columns.

    For every column in ``columns_to_bin`` a new column ``binned_<column>`` is
    created with ``pd.qcut`` (integer bin codes, ``labels=False``). Duplicate
    quantile edges are dropped, so a column may end up with fewer bins than
    requested.

    Parameters:
    - dataframe: pd.DataFrame, left unmodified (a copy is returned)
    - columns_to_bin: list of strings, the names of the columns to bin
    - n_bins: int or dict. An int is used for all columns; a dict maps column
              names to their bin count, with 5 used for columns missing from it

    Returns:
    - pd.DataFrame, a copy of the input with the ``binned_*`` columns appended
    """
    result = dataframe.copy()
    same_count_for_all = isinstance(n_bins, int)

    for name in columns_to_bin:
        quantile_count = n_bins if same_count_for_all else n_bins.get(name, 5)
        result[f'binned_{name}'] = pd.qcut(
            result[name], q=quantile_count, labels=False, duplicates='drop'
        )

    return result

def add_binned_features(df):
    """
    Append quantile-binned versions of selected weather columns.

    Adds 'binned_effective_cloud_cover:p' (2 bins), 'binned_ceiling_height_agl:m'
    (3 bins) and 'binned_average_wind_speed' (5 bins) via ``bin_columns``.
    The input dataframe is not modified; a new dataframe is returned.
    """
    # Column -> number of quantile bins; dict order fixes the order of the new columns
    bins_per_column = {
        'effective_cloud_cover:p': 2,
        'ceiling_height_agl:m': 3,
        'average_wind_speed': 5,
    }

    return bin_columns(df, list(bins_per_column), n_bins=bins_per_column)

def add_rate_of_change_features(df, features, second_order=False):
    """
    Add row-to-row difference columns for the given features.

    For each feature a column '<feature>_rate_of_change' holding the difference
    to the previous row is added (first row filled with 0). With
    ``second_order=True`` the difference of that column is added as well, named
    '<feature>_rate_of_change_of_change'. Rows are differenced in their current
    order, so the dataframe is expected to be sorted by time.

    The dataframe is modified in place and also returned.
    """
    for name in features:
        first_diff = name + '_rate_of_change'
        # The first row has no predecessor; its NaN difference becomes 0
        df[first_diff] = df[name].diff().fillna(0)

        if not second_order:
            continue

        df[first_diff + '_of_change'] = df[first_diff].diff().fillna(0)

    return df

def add_rate_of_change_features_to_df(df):
    """
    Add first-order rate-of-change columns for the temperature, radiation and
    cloud-cover features used by the models.

    Delegates to ``add_rate_of_change_features`` with ``second_order=False``.
    """
    radiation_features = ['clear_sky_rad:W', 'diffuse_rad:W', 'direct_rad:W']
    features_to_diff = [
        't_1000hPa:K',
        *radiation_features,
        'effective_cloud_cover:p',
        'total_radiation:W',
    ]

    return add_rate_of_change_features(df, features_to_diff, second_order=False)

def add_est_obs_feature(df):
    """
    Flag each frame as observed (1) or estimated (0) in a new 'observed' column.

    A frame without a 'date_calc' column is flagged 1 and returned as-is.
    A frame with a 'date_calc' column is flagged 0 and returned without that
    column. In both cases 'observed' is written onto the input dataframe.
    """
    has_calc_date = 'date_calc' in df.columns
    df['observed'] = 0 if has_calc_date else 1

    # Only frames carrying 'date_calc' need it removed
    return df.drop(columns=['date_calc']) if has_calc_date else df
    
def remove_constant_regions(dataframe, column_name="pv_measurement", threshold=72):
    """
    Remove long runs of consecutive identical values in ``column_name``.

    A run is removed in its entirety when more than ``threshold`` of its rows
    repeat the value of the row directly before them, i.e. when the run is
    longer than ``threshold + 1`` rows. NaN does not compare equal to NaN, so
    runs of missing values are never treated as constant.

    Parameters:
    - dataframe: pd.DataFrame, left unmodified
    - column_name: str, column whose constant runs are detected
    - threshold: int, number of repeated rows a run may contain and still be kept

    Returns:
    - pd.DataFrame without the rows of the removed runs. If ``column_name`` is
      missing, a warning is printed and the input dataframe itself is returned.
    """
    if column_name not in dataframe.columns:
        print(f"Warning: '{column_name}' column not found in the dataframe. No rows removed.")
        return dataframe

    values = dataframe[column_name]
    same_as_previous = values.eq(values.shift())
    # Every change of value starts a new run id
    run_ids = (~same_as_previous).cumsum()
    repeats_per_run = run_ids[same_as_previous].value_counts()
    runs_to_remove = repeats_per_run[repeats_per_run > threshold].index

    # Select rows by position rather than dropping by index label: a label-based
    # drop would also delete kept rows that share a label with a removed row.
    keep_mask = ~run_ids.isin(runs_to_remove)
    return dataframe.loc[keep_mask.to_numpy()].copy()

def add_lagged_features(df, features_with__lags, fill_value=None):
    """
    Add shifted (lagged) copies of the given features.

    Parameters:
    - df: pd.DataFrame, modified in place and returned
    - features_with__lags: dict mapping a feature name to the number of rows to
      shift it by (negative values shift upwards). Each entry adds a column
      named '<feature>_lag_<lag>'.
    - fill_value: value used to fill NaNs in the shifted column. With the
      default ``None`` the NaNs are left in place.
    """
    for feature, specific_lag in features_with__lags.items():
        shifted = df[feature].shift(specific_lag)

        # fillna(None) raises a ValueError, so only fill when a value was given
        if fill_value is not None:
            shifted = shifted.fillna(fill_value)

        df[f"{feature}_lag_{specific_lag}"] = shifted
    return df

def add_lagged_features_to_df(df):
    """
    Add the previous-row (lag 1) and next-row (lag -1) values of the radiation
    features as new columns, with missing edge values filled with 0.

    The (feature, lag) pairs are kept in a list rather than a dict literal:
    a dict cannot hold two lags for the same feature, so repeating the key
    silently keeps only the last lag.
    """
    lag_specs = [
        ('total_radiation:W', 1),
        ('total_radiation:W', -1),
        ('rad_diff:W', 1),
        ('rad_diff:W', -1),
        ('total_radiation_1h:J', 1),
        ('total_radiation_1h:J', -1),
    ]

    # One call per pair so that both lags of a feature are actually created
    for feature, lag in lag_specs:
        df = add_lagged_features(df, {feature: lag}, fill_value=0)

    return df

def handle_nan(df):
    """
    Drop rows without a target and zero-fill every remaining NaN.

    Rows whose 'pv_measurement' is NaN are removed when that column exists
    (frames without the column are left with all their rows). All NaNs that
    remain are then replaced by 0. A new dataframe is returned.
    """
    if 'pv_measurement' in df.columns:
        has_target = df['pv_measurement'].notna()
        df = df[has_target]

    return df.fillna(0)

# Aggregate the data to hourly with some aggregation methods for each column.
# Every column appears exactly once: a repeated key in a dict literal silently
# overrides the earlier entry, which is why 'cloud_base_agl:m' is listed with
# the value that was effectively in use ('max').
aggregation_methods = {
    'date_forecast': 'first',
    'diffuse_rad:W': 'sum',
    'direct_rad:W': 'last',
    'clear_sky_rad:W': 'sum',
    'diffuse_rad_1h:J': 'last',
    'direct_rad_1h:J': 'last',
    'clear_sky_energy_1h:J': 'last',
    'absolute_humidity_2m:gm3': 'mean',
    'air_density_2m:kgm3': 'mean',
    'ceiling_height_agl:m': 'max',
    'cloud_base_agl:m': 'max',
    'dew_or_rime:idx': 'min',
    'dew_point_2m:K': 'mean',
    'effective_cloud_cover:p': 'sum',
    'elevation:m': 'first',
    'fresh_snow_12h:cm': 'max',
    'fresh_snow_1h:cm': 'sum',
    'fresh_snow_24h:cm': 'max',
    'fresh_snow_3h:cm': 'max',
    'fresh_snow_6h:cm': 'max',
    'is_day:idx': 'max',
    'is_in_shadow:idx': 'max',
    'msl_pressure:hPa': 'mean',
    'precip_5min:mm': 'sum',
    'precip_type_5min:idx': 'sum',
    'pressure_100m:hPa': 'mean',
    'pressure_50m:hPa': 'mean',
    'prob_rime:p': 'max',
    'rain_water:kgm2': 'sum',
    'relative_humidity_1000hPa:p': 'mean',
    'sfc_pressure:hPa': 'mean',
    'snow_density:kgm3': 'mean',
    'snow_depth:cm': 'max',
    'snow_drift:idx': 'max',
    'snow_melt_10min:mm': 'sum',
    'snow_water:kgm2': 'sum',
    'sun_azimuth:d': 'first',
    'sun_elevation:d': 'sum',
    'super_cooled_liquid_water:kgm2': 'sum',
    't_1000hPa:K': 'mean',
    'total_cloud_cover:p': 'mean',
    'visibility:m': 'mean',
    'wind_speed_10m:ms': 'mean',
    'wind_speed_u_10m:ms': 'mean',
    'wind_speed_v_10m:ms': 'mean',
    'wind_speed_w_1000hPa:ms': 'mean'
}

# Read in the data: for each location (A, B, C) the target series and the
# observed / estimated / test feature sets, from parquet files under ./data
x_target_A = pd.read_parquet('./data/A/train_targets.parquet')
x_train_obs_A = pd.read_parquet('./data/A/X_train_observed.parquet')
x_train_est_A = pd.read_parquet('./data/A/X_train_estimated.parquet')
x_test_est_A = pd.read_parquet('./data/A/X_test_estimated.parquet')

x_target_B = pd.read_parquet('./data/B/train_targets.parquet')
x_train_obs_B = pd.read_parquet('./data/B/X_train_observed.parquet')
x_train_est_B = pd.read_parquet('./data/B/X_train_estimated.parquet')
x_test_est_B = pd.read_parquet('./data/B/X_test_estimated.parquet')

x_target_C = pd.read_parquet('./data/C/train_targets.parquet')
x_train_obs_C = pd.read_parquet('./data/C/X_train_observed.parquet')
x_train_est_C = pd.read_parquet('./data/C/X_train_estimated.parquet')
x_test_est_C = pd.read_parquet('./data/C/X_test_estimated.parquet')

# Rename time to date_forecast in target, so the targets use the same column
# name as the feature frames they are merged with further down
x_target_A.rename(columns={'time': 'date_forecast'}, inplace=True)
x_target_B.rename(columns={'time': 'date_forecast'}, inplace=True)
x_target_C.rename(columns={'time': 'date_forecast'}, inplace=True)

# Fix missing data for the test sets, assuming NaN means 0 in these columns
_zero_fill_columns = [
    'effective_cloud_cover:p',
    'total_cloud_cover:p',
    'cloud_base_agl:m',
    'ceiling_height_agl:m',
    'snow_density:kgm3',
    'snow_drift:idx',
]

for _column in _zero_fill_columns:
    for _test_frame in (x_test_est_A, x_test_est_B, x_test_est_C):
        _test_frame[_column] = _test_frame[_column].fillna(0)

# Resample
def _resample_hourly(frame):
    """Group ``frame`` into 1-hour buckets of 'date_forecast' and aggregate each column as configured in ``aggregation_methods``."""
    hourly_buckets = pd.Grouper(key='date_forecast', freq='1H')
    return frame.groupby(hourly_buckets).aggregate(aggregation_methods)

x_train_obs_A_resampled = _resample_hourly(x_train_obs_A)
x_train_est_A_resampled = _resample_hourly(x_train_est_A)
x_test_est_A_resampled = _resample_hourly(x_test_est_A)

x_train_obs_B_resampled = _resample_hourly(x_train_obs_B)
x_train_est_B_resampled = _resample_hourly(x_train_est_B)
x_test_est_B_resampled = _resample_hourly(x_test_est_B)

x_train_obs_C_resampled = _resample_hourly(x_train_obs_C)
x_train_est_C_resampled = _resample_hourly(x_train_est_C)
x_test_est_C_resampled = _resample_hourly(x_test_est_C)

# Merge the hourly features with the targets, separately for the observed and
# the estimated period of each location.
# The targets are split at the first timestamp of the estimated features.
# NOTE(review): split_index is an index *label* that is then used as a position
# in .iloc; this presumably relies on the targets having a default 0..n-1
# index - confirm. An IndexError is raised if the timestamp is not in the targets.
split_value = x_train_est_A['date_forecast'].iloc[0]
split_index = x_target_A[x_target_A['date_forecast'] == split_value].index[0]

x_target_obs_A = x_target_A.iloc[:split_index]
x_target_est_A = x_target_A.iloc[split_index:]

# Join on the hourly bucket (index of the resampled frame) == target timestamp
obs_A = x_train_obs_A_resampled.merge(x_target_obs_A, left_index=True, right_on='date_forecast')
est_A = x_train_est_A_resampled.merge(x_target_est_A, left_index=True, right_on='date_forecast')

# Same split and merge for location B
split_value = x_train_est_B['date_forecast'].iloc[0]
split_index = x_target_B[x_target_B['date_forecast'] == split_value].index[0]

x_target_obs_B = x_target_B.iloc[:split_index]
x_target_est_B = x_target_B.iloc[split_index:]

obs_B = x_train_obs_B_resampled.merge(x_target_obs_B, left_index=True, right_on='date_forecast')
est_B = x_train_est_B_resampled.merge(x_target_est_B, left_index=True, right_on='date_forecast')

# Same split and merge for location C
split_value = x_train_est_C['date_forecast'].iloc[0]
split_index = x_target_C[x_target_C['date_forecast'] == split_value].index[0]

x_target_obs_C = x_target_C.iloc[:split_index]
x_target_est_C = x_target_C.iloc[split_index:]

obs_C = x_train_obs_C_resampled.merge(x_target_obs_C, left_index=True, right_on='date_forecast')
est_C = x_train_est_C_resampled.merge(x_target_est_C, left_index=True, right_on='date_forecast')

# Keep date_forecast in test dfs (the test frames are not merged with targets)
test_A = x_test_est_A_resampled
test_B = x_test_est_B_resampled
test_C = x_test_est_C_resampled

# Drop all the NaNs, i.e. every test row that has a NaN in any column
test_A = test_A.dropna()
test_B = test_B.dropna()
test_C = test_C.dropna()

def preprocessing(df):
    """
    Run the full feature-engineering and cleaning pipeline on one dataframe.

    The steps are applied in this order: experimental features, date features,
    binned features, rate-of-change features, observed/estimated flag, removal
    of constant target regions, lagged features and finally NaN handling.

    The input dataframe is not modified: the pipeline works on a single
    defensive copy (several steps write columns in place), instead of copying
    the whole frame again before every step.
    """
    pipeline = (
        add_experimental_features,
        add_date_features,
        add_binned_features,
        add_rate_of_change_features_to_df,
        add_est_obs_feature,
        remove_constant_regions,
        add_lagged_features_to_df,
        handle_nan,
    )

    df = df.copy()
    for step in pipeline:
        df = step(df)

    return df

# Preprocess every frame of every location (observed, estimated, test)
obs_A, est_A, test_A = (preprocessing(frame.copy()) for frame in (obs_A, est_A, test_A))
obs_B, est_B, test_B = (preprocessing(frame.copy()) for frame in (obs_B, est_B, test_B))
obs_C, est_C, test_C = (preprocessing(frame.copy()) for frame in (obs_C, est_C, test_C))

# Random seeds used for reproducibility, with the ensemble weights used for each:
# 32 weights: 0.3, 0.3, 0.4
# 24 weights: 0.3, 0.3, 0.4
# 33 (without winter months 1 and 12) weights: 0.2, 0.4, 0.4
# 11 (without winter months 1, 2 and 11, 12) weights: 0.25, 0.35, 0.4
# 5 weights: 0.4, 0.3, 0.3

# The best score is the mean prediction of all 5 seeds mentioned above.
# The first weight is xgboost, the second is catboost, and the third is autogluon.

# Set the random seed
np.random.seed(24)

# Concatenate the observed and estimated periods of each location
A = pd.concat([obs_A, est_A])
B = pd.concat([obs_B, est_B])
C = pd.concat([obs_C, est_C])

# Remove characters unparseable for CatBoost from all column names
_strip_unparseable = str.maketrans('', '', '[],{}()"\':\\')
for _frame in (A, B, C, test_A, test_B, test_C):
    _frame.columns = [col.translate(_strip_unparseable) for col in _frame.columns]

# Getting validation data from summer months, because the test set is from summer months.
# We experimented with excluding winter months from the training data here.
# The order of the np.random.choice calls (A, then B, then C) determines which
# days are sampled for a given seed, so it must not be changed.

# Step 1: Month filter for A. All twelve months are currently listed, so no
# month is excluded; shorten the list to leave months out of the data.
A = A[A['date_forecast'].dt.month.isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])]

# Step 2: Identify unique days within May and June
summer_months = A[A['date_forecast'].dt.month.isin([5, 6])]
unique_summer_days = summer_months['date_forecast'].dt.date.unique()

# Step 3: Sample 40% of these days (without replacement) for val_A
sampled_days = np.random.choice(unique_summer_days, size=int(len(unique_summer_days) * 0.4), replace=False)
val_A = A[A['date_forecast'].dt.date.isin(sampled_days)]

# Step 4: Define train_A as the remaining data
train_A = A[~A['date_forecast'].dt.date.isin(sampled_days)]

# Prepare your features and target variables
X_train_A = train_A.drop(columns='pv_measurement')
y_train_A = train_A['pv_measurement']
X_val_A = val_A.drop(columns='pv_measurement')
y_val_A = val_A['pv_measurement']

# Repeat for B and C (same four steps, each with its own random day sample)
B = B[B['date_forecast'].dt.month.isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])]
summer_months = B[B['date_forecast'].dt.month.isin([5, 6])]
unique_summer_days = summer_months['date_forecast'].dt.date.unique()
sampled_days = np.random.choice(unique_summer_days, size=int(len(unique_summer_days) * 0.4), replace=False)
val_B = B[B['date_forecast'].dt.date.isin(sampled_days)]
train_B = B[~B['date_forecast'].dt.date.isin(sampled_days)]
X_train_B = train_B.drop(columns='pv_measurement')
y_train_B = train_B['pv_measurement']
X_val_B = val_B.drop(columns='pv_measurement')
y_val_B = val_B['pv_measurement']

C = C[C['date_forecast'].dt.month.isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])]
summer_months = C[C['date_forecast'].dt.month.isin([5, 6])]
unique_summer_days = summer_months['date_forecast'].dt.date.unique()
sampled_days = np.random.choice(unique_summer_days, size=int(len(unique_summer_days) * 0.4), replace=False)
val_C = C[C['date_forecast'].dt.date.isin(sampled_days)]
train_C = C[~C['date_forecast'].dt.date.isin(sampled_days)]
X_train_C = train_C.drop(columns='pv_measurement')
y_train_C = train_C['pv_measurement']
X_val_C = val_C.drop(columns='pv_measurement')
y_val_C = val_C['pv_measurement']

# Drop the date columns: the train/validation frames carry three of them after
# the merge, the test frames only 'date_forecast'
_merged_date_columns = ['date_forecast', 'date_forecast_x', 'date_forecast_y']

train_A, train_B, train_C = (frame.drop(columns=_merged_date_columns) for frame in (train_A, train_B, train_C))
val_A, val_B, val_C = (frame.drop(columns=_merged_date_columns) for frame in (val_A, val_B, val_C))
X_train_A, X_train_B, X_train_C = (frame.drop(columns=_merged_date_columns) for frame in (X_train_A, X_train_B, X_train_C))
X_val_A, X_val_B, X_val_C = (frame.drop(columns=_merged_date_columns) for frame in (X_val_A, X_val_B, X_val_C))
test_A, test_B, test_C = (frame.drop(columns=['date_forecast']) for frame in (test_A, test_B, test_C))

# AutoGluon datasets (features and label together in one frame)
train_auto_A, val_auto_A = TabularDataset(train_A), TabularDataset(val_A)
train_auto_B, val_auto_B = TabularDataset(train_B), TabularDataset(val_B)
train_auto_C, val_auto_C = TabularDataset(train_C), TabularDataset(val_C)

auto_label = 'pv_measurement'

# Set the parameters for the XGBoost models.
# Settings shared by all three locations:
_xgb_common = {
    'colsample_bytree': 0.8,
    'max_depth': 15,
    'min_child_weight': 10,
    'n_estimators': 600,
    'subsample': 0.912,
    'random_state': 0,
    'booster': 'gbtree',
    'n_jobs': -1,
}

# Location A: own regularisation / learning rate, plus two parallel trees
params_xgb_A = {
    **_xgb_common,
    'gamma': 0.4,
    'learning_rate': 0.012,
    'reg_alpha': 0.8,
    'reg_lambda': 0.8,
    'num_parallel_tree': 2,
}

# Locations B and C use identical settings
params_xgb_B = {
    **_xgb_common,
    'gamma': 0.8,
    'learning_rate': 0.008,
    'reg_alpha': 1,
    'reg_lambda': 3,
}
params_xgb_C = dict(params_xgb_B)

xgb_A = xgb.XGBRegressor(**params_xgb_A)
xgb_B = xgb.XGBRegressor(**params_xgb_B)
xgb_C = xgb.XGBRegressor(**params_xgb_C)


def _new_catboost():
    """
    Build one CatBoost regressor: 5000 trees, MAE as both the optimised loss
    and the evaluation metric. learning_rate, depth, random_seed and verbose
    are not passed, so the library defaults apply.
    """
    return CatBoostRegressor(iterations=5000, loss_function='MAE', eval_metric='MAE')


# One identically configured CatBoost model per location
cat_A = _new_catboost()
cat_B = _new_catboost()
cat_C = _new_catboost()

# Prepare data for the XGBoost models. We got them to work the best when having fewer columns
xgb_columns = [
    'total_radiationW',
    'snow_accumulation',
    'super_cooled_liquid_waterkgm2',
    'average_wind_speed',
    'sun_elevationd',
    'sun_azimuthd',
    'clear_sky_radW',
    'month',
    't_1000hPaC',
    'msl_pressurehPa_scaled',
    'rain_waterkgm2',
    'cloud_base_aglm',
    'effective_cloud_coverp',
    'dew_or_rimeidx'
]

X_train_xgb_A, y_train_xgb_A, X_test_xgb_A = train_A[xgb_columns], train_A['pv_measurement'], test_A[xgb_columns]
X_train_xgb_B, y_train_xgb_B, X_test_xgb_B = train_B[xgb_columns], train_B['pv_measurement'], test_B[xgb_columns]
X_train_xgb_C, y_train_xgb_C, X_test_xgb_C = train_C[xgb_columns], train_C['pv_measurement'], test_C[xgb_columns]

# Train the XGBoost models (A, B, C) on the reduced column set
for _model, _features, _target in (
    (xgb_A, X_train_xgb_A, y_train_xgb_A),
    (xgb_B, X_train_xgb_B, y_train_xgb_B),
    (xgb_C, X_train_xgb_C, y_train_xgb_C),
):
    _model.fit(
        X=_features, y=_target,
        eval_metric='mae',
        verbose=False
    )

# Train the CatBoost models (A, B, C); the validation split selects the best iteration
for _model, _train_X, _train_y, _val_X, _val_y in (
    (cat_A, X_train_A, y_train_A, X_val_A, y_val_A),
    (cat_B, X_train_B, y_train_B, X_val_B, y_val_B),
    (cat_C, X_train_C, y_train_C, X_val_C, y_val_C),
):
    _model.fit(
        _train_X, _train_y,
        eval_set=(_val_X, _val_y),
        use_best_model=True
    )


def _fit_autogluon(train_data, tuning_data):
    """Fit an AutoGluon regressor (MAE metric, 'medium_quality' preset) using ``tuning_data`` as holdout."""
    predictor = TabularPredictor(label=auto_label, eval_metric='mean_absolute_error', problem_type='regression')
    return predictor.fit(
        train_data,
        presets='medium_quality',
        tuning_data=tuning_data,
        use_bag_holdout=True,
        ag_args_ensemble={'fold_fitting_strategy': 'sequential_local'},
    )


# Train the AutoGluon models (A, B, C)
auto_A = _fit_autogluon(train_auto_A, val_auto_A)
auto_B = _fit_autogluon(train_auto_B, val_auto_B)
auto_C = _fit_autogluon(train_auto_C, val_auto_C)

# Ensemble weights for the three model families
# NOTE(review): the seed notes earlier in this cell list 0.3 / 0.3 / 0.4 for
# seed 24, while the values below match the ones noted for seed 5 - confirm
# which combination is intended.
xgb_weight = 0.4
cat_weight = 0.3
auto_weight = 0.3

# Each location's XGBoost model scores that location's own test features
# (all three models used to be fed the features of location C).
pred_xgb_A = xgb_A.predict(X_test_xgb_A)
pred_xgb_B = xgb_B.predict(X_test_xgb_B)
pred_xgb_C = xgb_C.predict(X_test_xgb_C)

pred_auto_A = auto_A.predict(test_A)
pred_auto_B = auto_B.predict(test_B)
pred_auto_C = auto_C.predict(test_C)

pred_cat_A = cat_A.predict(test_A)
pred_cat_B = cat_B.predict(test_B)
pred_cat_C = cat_C.predict(test_C)

# Ensemble that seemed the best after some experimentation
pred_A = (pred_xgb_A*xgb_weight + pred_cat_A*cat_weight + pred_auto_A*auto_weight)
pred_B = (pred_xgb_B*xgb_weight + pred_cat_B*cat_weight + pred_auto_B*auto_weight)
pred_C = (pred_xgb_C*xgb_weight + pred_cat_C*cat_weight + pred_auto_C*auto_weight)

# Negative predictions are clipped to 0
pred_A = np.clip(pred_A, 0, None)
pred_B = np.clip(pred_B, 0, None)
pred_C = np.clip(pred_C, 0, None)

# Concatenate predictions in the order A, B, C
predictions = np.concatenate([pred_A, pred_B, pred_C])

# Save predictions
predictions_2 = predictions

# Create an id array (0-based row number of each prediction)
ids = np.arange(0, len(predictions_2))

# Create a DataFrame
df = pd.DataFrame({
    'id': ids,
    'prediction': predictions_2
})

# Save to CSV
df.to_csv('predictions_2.csv', index=False)

In [None]:
# Sequence 3

%reset -f

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
from catboost import CatBoostRegressor
from autogluon.tabular import TabularDataset, TabularPredictor
import warnings
warnings.filterwarnings('ignore')

def add_experimental_features(df):
    """
    Experimental feature engineering.

    Adds derived radiation, temperature/pressure, wind, cloud/snow,
    interaction and precipitation columns to the dataframe. All columns are
    written onto the dataframe that is passed in, which is also returned.
    Infinite values anywhere in the dataframe are replaced by 0 at the end;
    NaN values are not touched by this function.
    """

    # Radiation Features
    df['total_radiation:W'] = df['direct_rad:W'] + df['diffuse_rad:W']
    df['total_radiation_1h:J'] = df['direct_rad_1h:J'] + df['diffuse_rad_1h:J']
    df['rad_diff:W'] = df['direct_rad:W'] - df['diffuse_rad:W']
    df['rad_diff_1h:J'] = df['direct_rad_1h:J'] - df['diffuse_rad_1h:J']
    # Division by a zero direct radiation yields inf, which is zeroed by the safeguard below
    df['diffuse_direct_ratio'] = df['diffuse_rad:W'] / df['direct_rad:W']

    # Temperature and Pressure Features
    df['temp_dewpoint_diff'] = df['t_1000hPa:K'] - df['dew_point_2m:K']
    df['pressure_gradient'] = df['pressure_100m:hPa'] - df['pressure_50m:hPa']
    # Kelvin to Celsius
    df['t_1000hPa:C'] = df['t_1000hPa:K'] - 273.15
    df['dew_point_2m:C'] = df['dew_point_2m:K'] - 273.15
    # NOTE(review): a fresh scaler is fitted on every call, so each dataframe
    # (e.g. train vs. test) is scaled by its own min/max - confirm this is intended.
    df['msl_pressure:hPa_scaled'] = MinMaxScaler().fit_transform(df['msl_pressure:hPa'].values.reshape(-1, 1))
    df['sfc_pressure:hPa_scaled'] = MinMaxScaler().fit_transform(df['sfc_pressure:hPa'].values.reshape(-1, 1))

    # Wind Features
    df['wind_vector_magnitude'] = (df['wind_speed_u_10m:ms']**2 + df['wind_speed_v_10m:ms']**2 + df['wind_speed_w_1000hPa:ms']**2)**0.5
    # NOTE(review): averages the 10m wind speed with the u component only - confirm this is intended.
    df['average_wind_speed'] = (df['wind_speed_10m:ms'] + df['wind_speed_u_10m:ms']) / 2

    # Cloud and Snow Features
    df['cloud_humidity_product'] = df['total_cloud_cover:p'] * df['absolute_humidity_2m:gm3']
    df['snow_accumulation'] = df[['fresh_snow_24h:cm', 'fresh_snow_12h:cm', 'fresh_snow_6h:cm', 'fresh_snow_3h:cm', 'fresh_snow_1h:cm']].sum(axis=1)

    # Interaction between radiation and cloud cover
    df['radiation_cloud_interaction'] = df['direct_rad:W'] * df['effective_cloud_cover:p']

    # Interaction between temperature and radiation (considering that high temperature may reduce efficiency)
    df['temp_rad_interaction'] = df['t_1000hPa:K'] * df['total_radiation:W']

    # Interaction between wind cooling effect and temperature
    df['wind_temp_interaction'] = df['average_wind_speed'] * df['t_1000hPa:K']

    # Interaction between humidity and temperature
    df['humidity_temp_interaction'] = df['absolute_humidity_2m:gm3'] * df['t_1000hPa:K']

    # Interaction between sun elevation and direct radiation
    df['sun_elevation_direct_rad_interaction'] = df['sun_elevation:d'] * df['direct_rad:W']

    # Precipitation Features
    df['precip'] = df['precip_5min:mm']*df['precip_type_5min:idx']

    # Safeguard in case of inf values
    df.replace([np.inf, -np.inf], 0, inplace=True)
    
    return df

def add_date_features(df):
    """
    Derive calendar features from the 'date_forecast' column.

    When the column is present it is converted with ``pd.to_datetime`` and the
    columns 'month', 'year', 'hour' and 'day' are added, followed by 'hour_sin'
    and 'hour_cos' (sine/cosine of the hour over a 24-hour period).
    When the column is missing a warning is printed and nothing is added.

    The dataframe is modified in place and also returned.
    """
    # Guard clause: without the timestamp column there is nothing to derive
    if 'date_forecast' not in df.columns:
        print("Warning: 'date_forecast' column not found in the dataframe. No date features added.")
        return df

    df['date_forecast'] = pd.to_datetime(df['date_forecast'])
    timestamps = df['date_forecast'].dt

    # Plain calendar components, added in a fixed column order
    for part in ('month', 'year', 'hour', 'day'):
        df[part] = getattr(timestamps, part)

    # Cyclic encoding of the hour of day
    hour_angle = 2*np.pi*df['hour']/24
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)

    return df

# Adding discretized features for the continuous variables to help tree-based models

def bin_columns(dataframe, columns_to_bin, n_bins=5):
    """
    Add quantile-based bin indices for the requested columns.

    For every column in ``columns_to_bin`` a new column ``binned_<column>`` is
    created with ``pd.qcut`` (integer bin codes, ``labels=False``). Duplicate
    quantile edges are dropped, so a column may end up with fewer bins than
    requested.

    Parameters:
    - dataframe: pd.DataFrame, left unmodified (a copy is returned)
    - columns_to_bin: list of strings, the names of the columns to bin
    - n_bins: int or dict. An int is used for all columns; a dict maps column
              names to their bin count, with 5 used for columns missing from it

    Returns:
    - pd.DataFrame, a copy of the input with the ``binned_*`` columns appended
    """
    result = dataframe.copy()
    same_count_for_all = isinstance(n_bins, int)

    for name in columns_to_bin:
        quantile_count = n_bins if same_count_for_all else n_bins.get(name, 5)
        result[f'binned_{name}'] = pd.qcut(
            result[name], q=quantile_count, labels=False, duplicates='drop'
        )

    return result

def add_binned_features(df):
    """
    Append quantile-binned versions of selected weather columns.

    Adds 'binned_effective_cloud_cover:p' (2 bins), 'binned_ceiling_height_agl:m'
    (3 bins) and 'binned_average_wind_speed' (5 bins) via ``bin_columns``.
    The input dataframe is not modified; a new dataframe is returned.
    """
    # Column -> number of quantile bins; dict order fixes the order of the new columns
    bins_per_column = {
        'effective_cloud_cover:p': 2,
        'ceiling_height_agl:m': 3,
        'average_wind_speed': 5,
    }

    return bin_columns(df, list(bins_per_column), n_bins=bins_per_column)

def add_rate_of_change_features(df, features, second_order=False):
    """
    Add row-to-row difference columns for the given features.

    For each feature a column '<feature>_rate_of_change' holding the difference
    to the previous row is added (first row filled with 0). With
    ``second_order=True`` the difference of that column is added as well, named
    '<feature>_rate_of_change_of_change'. Rows are differenced in their current
    order, so the dataframe is expected to be sorted by time.

    The dataframe is modified in place and also returned.
    """
    for name in features:
        first_diff = name + '_rate_of_change'
        # The first row has no predecessor; its NaN difference becomes 0
        df[first_diff] = df[name].diff().fillna(0)

        if not second_order:
            continue

        df[first_diff + '_of_change'] = df[first_diff].diff().fillna(0)

    return df

def add_rate_of_change_features_to_df(df):
    """
    Adds first-order rate-of-change columns for the temperature, radiation and
    cloud-cover features listed below (second-order differences are not requested).
    """
    return add_rate_of_change_features(
        df,
        [
            't_1000hPa:K',
            'clear_sky_rad:W',
            'diffuse_rad:W',
            'direct_rad:W',
            'effective_cloud_cover:p',
            'total_radiation:W',
        ],
        second_order=False,
    )

def add_est_obs_feature(df):
    """
    Flags every row as observed (1) or estimated (0) in a new 'observed' column.

    The presence of a 'date_calc' column is the marker for estimated data: when it is
    present every row gets observed = 0 and 'date_calc' is dropped from the returned
    frame; when it is absent every row gets observed = 1.

    The 'observed' column is written into `df` itself.

    NOTE(review): the hourly aggregation mapping used in this notebook does not list
    'date_calc', so the aggregated frames may never carry it - confirm that this flag
    is not always 1.
    """
    is_estimated = 'date_calc' in df.columns
    df['observed'] = 0 if is_estimated else 1

    if is_estimated:
        # The calculation timestamp only serves as the marker; it is not kept.
        return df.drop(columns=['date_calc'])
    return df
    
def remove_constant_regions(dataframe, column_name="pv_measurement", threshold=72):
    """
    Removes runs of rows in which `column_name` keeps repeating the same value.

    A run is a maximal stretch of consecutive rows with an identical value. A run is
    removed completely (including its first row) when more than `threshold` of its rows
    repeat the value of the row before them, i.e. when the run is longer than
    `threshold + 1` rows. NaN never compares equal to NaN, so stretches of missing
    values are not treated as constant.

    Parameters:
    - dataframe: pd.DataFrame
    - column_name: str, the column to scan for constant runs
    - threshold: int, number of repeated rows a run may contain before it is removed

    Returns:
    - pd.DataFrame without the offending runs. When `column_name` is missing, a warning
      is printed and the input frame itself is returned unchanged.
    """
    if column_name not in dataframe.columns:
        print(f"Warning: '{column_name}' column not found in the dataframe. No rows removed.")
        return dataframe

    values = dataframe[column_name]
    same_as_previous = values.eq(values.shift())

    # Every change of value starts a new run, so the running count of changes labels the runs.
    run_ids = (~same_as_previous).cumsum()

    # Number of repeated rows per run (the first row of a run is not a repeat).
    repeats_per_run = run_ids[same_as_previous].value_counts()
    runs_to_remove = repeats_per_run[repeats_per_run > threshold].index

    # Filter by position with a boolean mask instead of dropping by index label: a
    # label-based drop would also delete unrelated rows that share an index label with
    # a removed row whenever the index contains duplicates.
    keep_mask = ~run_ids.isin(runs_to_remove)
    return dataframe[keep_mask.to_numpy()].copy()

def add_lagged_features(df, features_with__lags, fill_value=None):
    """
    Adds shifted ("lagged") copies of selected columns.

    Parameters:
    - df: pd.DataFrame, expected to be sorted by time; modified in place
    - features_with__lags: dict mapping a column name to either a single integer lag or
      a list/tuple of integer lags. A positive lag takes the value from earlier rows, a
      negative lag takes it from later rows. Each (feature, lag) pair produces a column
      named '<feature>_lag_<lag>'.
    - fill_value: value used to replace the NaNs in the shifted column; when None the
      NaNs are kept

    Returns:
    - pd.DataFrame: the same `df` object with the lag columns added
    """
    for feature, lags in features_with__lags.items():
        # A dict literal cannot repeat a key, so several lags for one feature have to be
        # passed as a list or tuple; a bare integer is treated as a one-element list.
        lag_list = lags if isinstance(lags, (list, tuple)) else [lags]

        for lag in lag_list:
            shifted = df[feature].shift(lag)
            # fillna(None) raises in pandas, so filling is skipped when no value is given.
            if fill_value is not None:
                shifted = shifted.fillna(fill_value)
            df[f"{feature}_lag_{lag}"] = shifted
    return df

def add_lagged_features_to_df(df):
    """
    Adds the previous-row (lag 1) and next-row (lag -1) value of the main radiation
    features as '<feature>_lag_1' and '<feature>_lag_-1' columns. NaNs in the shifted
    columns are filled with 0.

    Parameters:
    - df: pd.DataFrame containing 'total_radiation:W', 'rad_diff:W' and
      'total_radiation_1h:J'

    Returns:
    - pd.DataFrame: the frame returned by add_lagged_features
    """
    lagged_features = ['total_radiation:W', 'rad_diff:W', 'total_radiation_1h:J']

    # A dict literal keeps only the last value of a repeated key, so listing the same
    # feature with lag 1 and lag -1 in a single dict silently dropped every lag-1
    # column. One mapping per lag is built instead.
    for lag in (1, -1):
        df = add_lagged_features(df, {feature: lag for feature in lagged_features}, fill_value=0)

    return df

def handle_nan(df):
    """
    Drops the rows that have no target value and zero-fills every remaining NaN.

    Rows whose 'pv_measurement' is NaN are removed when that column exists; frames
    without the column only get the zero fill.
    """
    if 'pv_measurement' in df.columns:
        df = df[df['pv_measurement'].notna()]

    return df.fillna(0)

# Aggregate the data to hourly with some aggregation methods for each column.
# Every column is listed exactly once: a repeated key in a dict literal silently keeps
# only the last value, which is how 'cloud_base_agl:m' ended up aggregated with 'max'
# although it was first listed with 'mean'. The effective mapping ('max') is kept here.
aggregation_methods = {
    'date_forecast': 'first',
    'diffuse_rad:W': 'sum',
    'direct_rad:W': 'last',
    'clear_sky_rad:W': 'sum',
    'diffuse_rad_1h:J': 'last',
    'direct_rad_1h:J': 'last',
    'clear_sky_energy_1h:J': 'last',
    'absolute_humidity_2m:gm3': 'mean',
    'air_density_2m:kgm3': 'mean',
    'ceiling_height_agl:m': 'max',
    'cloud_base_agl:m': 'max',
    'dew_or_rime:idx': 'min',
    'dew_point_2m:K': 'mean',
    'effective_cloud_cover:p': 'sum',
    'elevation:m': 'first',
    'fresh_snow_12h:cm': 'max',
    'fresh_snow_1h:cm': 'sum',
    'fresh_snow_24h:cm': 'max',
    'fresh_snow_3h:cm': 'max',
    'fresh_snow_6h:cm': 'max',
    'is_day:idx': 'max',
    'is_in_shadow:idx': 'max',
    'msl_pressure:hPa': 'mean',
    'precip_5min:mm': 'sum',
    'precip_type_5min:idx': 'sum',
    'pressure_100m:hPa': 'mean',
    'pressure_50m:hPa': 'mean',
    'prob_rime:p': 'max',
    'rain_water:kgm2': 'sum',
    'relative_humidity_1000hPa:p': 'mean',
    'sfc_pressure:hPa': 'mean',
    'snow_density:kgm3': 'mean',
    'snow_depth:cm': 'max',
    'snow_drift:idx': 'max',
    'snow_melt_10min:mm': 'sum',
    'snow_water:kgm2': 'sum',
    'sun_azimuth:d': 'first',
    'sun_elevation:d': 'sum',
    'super_cooled_liquid_water:kgm2': 'sum',
    't_1000hPa:K': 'mean',
    'total_cloud_cover:p': 'mean',
    'visibility:m': 'mean',
    'wind_speed_10m:ms': 'mean',
    'wind_speed_u_10m:ms': 'mean',
    'wind_speed_v_10m:ms': 'mean',
    'wind_speed_w_1000hPa:ms': 'mean'
}

# Read in the data: targets plus observed / estimated / test features per location
x_target_A = pd.read_parquet('./data/A/train_targets.parquet')
x_train_obs_A = pd.read_parquet('./data/A/X_train_observed.parquet')
x_train_est_A = pd.read_parquet('./data/A/X_train_estimated.parquet')
x_test_est_A = pd.read_parquet('./data/A/X_test_estimated.parquet')

x_target_B = pd.read_parquet('./data/B/train_targets.parquet')
x_train_obs_B = pd.read_parquet('./data/B/X_train_observed.parquet')
x_train_est_B = pd.read_parquet('./data/B/X_train_estimated.parquet')
x_test_est_B = pd.read_parquet('./data/B/X_test_estimated.parquet')

x_target_C = pd.read_parquet('./data/C/train_targets.parquet')
x_train_obs_C = pd.read_parquet('./data/C/X_train_observed.parquet')
x_train_est_C = pd.read_parquet('./data/C/X_train_estimated.parquet')
x_test_est_C = pd.read_parquet('./data/C/X_test_estimated.parquet')

# The target files name their timestamp column 'time'; rename it to 'date_forecast'
# so it matches the feature files.
for _target in (x_target_A, x_target_B, x_target_C):
    _target.rename(columns={'time': 'date_forecast'}, inplace=True)

# Fix missing data for the test sets, assuming NaN means 0 in these columns.
_test_columns_nan_is_zero = [
    'effective_cloud_cover:p',
    'total_cloud_cover:p',
    'cloud_base_agl:m',
    'ceiling_height_agl:m',
    'snow_density:kgm3',
    'snow_drift:idx',
]
for _column in _test_columns_nan_is_zero:
    for _test_frame in (x_test_est_A, x_test_est_B, x_test_est_C):
        _test_frame[_column] = _test_frame[_column].fillna(0)


def _resample_hourly(frame):
    """Groups `frame` into 1-hour buckets of 'date_forecast' and aggregates every column as configured in aggregation_methods."""
    return frame.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)


# Resample
x_train_obs_A_resampled = _resample_hourly(x_train_obs_A)
x_train_est_A_resampled = _resample_hourly(x_train_est_A)
x_test_est_A_resampled = _resample_hourly(x_test_est_A)

x_train_obs_B_resampled = _resample_hourly(x_train_obs_B)
x_train_est_B_resampled = _resample_hourly(x_train_est_B)
x_test_est_B_resampled = _resample_hourly(x_test_est_B)

x_train_obs_C_resampled = _resample_hourly(x_train_obs_C)
x_train_est_C_resampled = _resample_hourly(x_train_est_C)
x_test_est_C_resampled = _resample_hourly(x_test_est_C)

# Merge
# The first timestamp of the estimated features marks where the targets switch from the
# observed part to the estimated part.
# NOTE(review): the index label found here is used as a position in .iloc, which assumes
# the target frames carry a default 0..n-1 index - confirm.

# Location A
split_value = x_train_est_A['date_forecast'].iloc[0]
split_index = x_target_A[x_target_A['date_forecast'] == split_value].index[0]
x_target_obs_A, x_target_est_A = x_target_A.iloc[:split_index], x_target_A.iloc[split_index:]
obs_A = x_train_obs_A_resampled.merge(x_target_obs_A, left_index=True, right_on='date_forecast')
est_A = x_train_est_A_resampled.merge(x_target_est_A, left_index=True, right_on='date_forecast')

# Location B
split_value = x_train_est_B['date_forecast'].iloc[0]
split_index = x_target_B[x_target_B['date_forecast'] == split_value].index[0]
x_target_obs_B, x_target_est_B = x_target_B.iloc[:split_index], x_target_B.iloc[split_index:]
obs_B = x_train_obs_B_resampled.merge(x_target_obs_B, left_index=True, right_on='date_forecast')
est_B = x_train_est_B_resampled.merge(x_target_est_B, left_index=True, right_on='date_forecast')

# Location C
split_value = x_train_est_C['date_forecast'].iloc[0]
split_index = x_target_C[x_target_C['date_forecast'] == split_value].index[0]
x_target_obs_C, x_target_est_C = x_target_C.iloc[:split_index], x_target_C.iloc[split_index:]
obs_C = x_train_obs_C_resampled.merge(x_target_obs_C, left_index=True, right_on='date_forecast')
est_C = x_train_est_C_resampled.merge(x_target_est_C, left_index=True, right_on='date_forecast')

# The test frames keep their 'date_forecast' column; rows that contain any NaN after
# the resampling are dropped.
test_A = x_test_est_A_resampled.dropna()
test_B = x_test_est_B_resampled.dropna()
test_C = x_test_est_C_resampled.dropna()

def preprocessing(df):
    """
    Runs the full feature-engineering pipeline on one dataframe.

    The steps are applied in the order listed below. Each step receives a fresh copy of
    the previous step's result, so the frame passed in is never modified.
    """
    pipeline = (
        add_experimental_features,
        add_date_features,
        add_binned_features,
        add_rate_of_change_features_to_df,
        add_est_obs_feature,
        remove_constant_regions,
        add_lagged_features_to_df,
        handle_nan,
    )

    for step in pipeline:
        df = step(df.copy())

    return df

# Preprocess
obs_A = preprocessing(obs_A.copy())
est_A = preprocessing(est_A.copy())
test_A = preprocessing(test_A.copy())

obs_B = preprocessing(obs_B.copy())
est_B = preprocessing(est_B.copy())
test_B = preprocessing(test_B.copy())

obs_C = preprocessing(obs_C.copy())
est_C = preprocessing(est_C.copy())
test_C = preprocessing(test_C.copy())

# Random seeds used for reproducibility
# 32 weights: 0.3, 0.3, 0.4
# 24 weights: 0.3, 0.3, 0.4
# 33 (without winter months 1 and 12) weights: 0.2, 0.4, 0.4
# 11 (without winter months 1, 2 and 11, 12) weights: 0.25, 0.35, 0.4
# 5 weights: 0.4, 0.3, 0.3

# Best score is the mean prediction of all the 5 seeds mentioned above. The first weight is xgboost, the second is catboost, and the third is autogluon.

# Set the random seed
np.random.seed(33)

# Concatenate the observed and the estimated training data of each location
A = pd.concat([obs_A, est_A])
B = pd.concat([obs_B, est_B])
C = pd.concat([obs_C, est_C])

# Remove characters unparseable for CatBoost: [ ] , { } ( ) " ' : and backslash
_UNPARSEABLE_CHARS = str.maketrans('', '', '[],{}()"\':\\')


def _sanitize_columns(columns):
    """Returns the column names with every CatBoost-unparseable character removed."""
    return [col.translate(_UNPARSEABLE_CHARS) for col in columns]


A.columns = _sanitize_columns(A.columns)
B.columns = _sanitize_columns(B.columns)
C.columns = _sanitize_columns(C.columns)

test_A.columns = _sanitize_columns(test_A.columns)
test_B.columns = _sanitize_columns(test_B.columns)
test_C.columns = _sanitize_columns(test_C.columns)

# Getting validation data from summer months, because the test set is from summer months.
# We experimented with excluding winter months from the training data here.
_TRAIN_MONTHS = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]  # February-November (January and December excluded)
_VALIDATION_MONTHS = [5, 6]                       # validation days are drawn from May and June
_VALIDATION_DAY_FRACTION = 0.4                    # share of those days that goes to validation


def _split_train_val(frame):
    """
    Splits one location's data into training and validation rows by whole days.

    Returns a tuple (filtered, train, val):
    - filtered: `frame` restricted to the months in _TRAIN_MONTHS
    - val: all rows of a random sample of the distinct days in _VALIDATION_MONTHS
    - train: every other row of `filtered`

    The days are drawn with np.random.choice, so the result depends on the global NumPy
    seed and on the order in which the locations are split.
    """
    filtered = frame[frame['date_forecast'].dt.month.isin(_TRAIN_MONTHS)]

    candidates = filtered[filtered['date_forecast'].dt.month.isin(_VALIDATION_MONTHS)]
    unique_days = candidates['date_forecast'].dt.date.unique()
    sampled_days = np.random.choice(
        unique_days,
        size=int(len(unique_days) * _VALIDATION_DAY_FRACTION),
        replace=False,
    )

    in_validation = filtered['date_forecast'].dt.date.isin(sampled_days)
    return filtered, filtered[~in_validation], filtered[in_validation]


# Keep the A, B, C order: each call consumes random numbers from the shared generator.
A, train_A, val_A = _split_train_val(A)
B, train_B, val_B = _split_train_val(B)
C, train_C, val_C = _split_train_val(C)

# Drop the date columns; they were only needed for the split above
_date_columns = ['date_forecast', 'date_forecast_x', 'date_forecast_y']
train_A = train_A.drop(columns=_date_columns)
train_B = train_B.drop(columns=_date_columns)
train_C = train_C.drop(columns=_date_columns)
val_A = val_A.drop(columns=_date_columns)
val_B = val_B.drop(columns=_date_columns)
val_C = val_C.drop(columns=_date_columns)
test_A = test_A.drop(columns=['date_forecast'])
test_B = test_B.drop(columns=['date_forecast'])
test_C = test_C.drop(columns=['date_forecast'])

# Prepare the features and target variables
X_train_A, y_train_A = train_A.drop(columns='pv_measurement'), train_A['pv_measurement']
X_val_A, y_val_A = val_A.drop(columns='pv_measurement'), val_A['pv_measurement']

X_train_B, y_train_B = train_B.drop(columns='pv_measurement'), train_B['pv_measurement']
X_val_B, y_val_B = val_B.drop(columns='pv_measurement'), val_B['pv_measurement']

X_train_C, y_train_C = train_C.drop(columns='pv_measurement'), train_C['pv_measurement']
X_val_C, y_val_C = val_C.drop(columns='pv_measurement'), val_C['pv_measurement']

# AutoGluon works on the full frames (features and label together)
train_auto_A = TabularDataset(train_A)
val_auto_A = TabularDataset(val_A)

train_auto_B = TabularDataset(train_B)
val_auto_B = TabularDataset(val_B)

train_auto_C = TabularDataset(train_C)
val_auto_C = TabularDataset(val_C)

auto_label = 'pv_measurement'

# Set the parameters for the XGBoost models
# Settings shared by all three locations
_xgb_shared_params = {
    'colsample_bytree': 0.8,
    'max_depth': 15,
    'min_child_weight': 10,
    'n_estimators': 600,
    'subsample': 0.912,
    'random_state': 0,
    'booster': 'gbtree',
    'n_jobs': -1,
}

# Location A has its own regularisation / learning rate and uses two parallel trees
params_xgb_A = {
    **_xgb_shared_params,
    'gamma': 0.4,
    'learning_rate': 0.012,
    'reg_alpha': 0.8,
    'reg_lambda': 0.8,
    'num_parallel_tree': 2,
}

# Locations B and C use the same values, but each gets its own dict
_xgb_params_B_and_C = {
    **_xgb_shared_params,
    'gamma': 0.8,
    'learning_rate': 0.008,
    'reg_alpha': 1,
    'reg_lambda': 3,
}
params_xgb_B = dict(_xgb_params_B_and_C)
params_xgb_C = dict(_xgb_params_B_and_C)

xgb_A = xgb.XGBRegressor(**params_xgb_A)
xgb_B = xgb.XGBRegressor(**params_xgb_B)
xgb_C = xgb.XGBRegressor(**params_xgb_C)


def _new_catboost_regressor():
    """
    Builds one CatBoost model: 5000 trees, MAE as both loss and evaluation metric.
    learning_rate, depth, random_seed and verbose are deliberately not passed.
    """
    return CatBoostRegressor(
        iterations=5000,
        loss_function='MAE',
        eval_metric='MAE',
    )


# One identically configured CatBoost model per location
cat_A = _new_catboost_regressor()
cat_B = _new_catboost_regressor()
cat_C = _new_catboost_regressor()

# Prepare data for the XGBoost models. They worked best with a reduced set of columns.
xgb_columns = [
    'total_radiationW',
    'snow_accumulation',
    'super_cooled_liquid_waterkgm2',
    'average_wind_speed',
    'sun_elevationd',
    'sun_azimuthd',
    'clear_sky_radW',
    'month',
    't_1000hPaC',
    'msl_pressurehPa_scaled',
    'rain_waterkgm2',
    'cloud_base_aglm',
    'effective_cloud_coverp',
    'dew_or_rimeidx'
]
print(train_A.columns)

# Per location: reduced training features, training target, reduced test features
X_train_xgb_A, y_train_xgb_A, X_test_xgb_A = train_A[xgb_columns], train_A['pv_measurement'], test_A[xgb_columns]
X_train_xgb_B, y_train_xgb_B, X_test_xgb_B = train_B[xgb_columns], train_B['pv_measurement'], test_B[xgb_columns]
X_train_xgb_C, y_train_xgb_C, X_test_xgb_C = train_C[xgb_columns], train_C['pv_measurement'], test_C[xgb_columns]

# Train the XGBoost models
# NOTE(review): newer XGBoost releases expect eval_metric on the estimator rather than
# in fit() - confirm against the installed version.
for _xgb_model, _xgb_features, _xgb_target in (
    (xgb_A, X_train_xgb_A, y_train_xgb_A),
    (xgb_B, X_train_xgb_B, y_train_xgb_B),
    (xgb_C, X_train_xgb_C, y_train_xgb_C),
):
    _xgb_model.fit(
        X=_xgb_features, y=_xgb_target,
        eval_metric='mae',
        verbose=False
    )

# Train the CatBoost models; the validation split picks the best iteration
for _cat_model, _cat_X, _cat_y, _cat_X_val, _cat_y_val in (
    (cat_A, X_train_A, y_train_A, X_val_A, y_val_A),
    (cat_B, X_train_B, y_train_B, X_val_B, y_val_B),
    (cat_C, X_train_C, y_train_C, X_val_C, y_val_C),
):
    _cat_model.fit(
        _cat_X, _cat_y,
        eval_set=(_cat_X_val, _cat_y_val),
        use_best_model=True
    )


def _fit_autogluon(train_data, tuning_data):
    """Fits one AutoGluon regressor on `train_data`, scored by MAE, with `tuning_data` as the holdout set."""
    predictor = TabularPredictor(label=auto_label, eval_metric='mean_absolute_error', problem_type='regression')
    return predictor.fit(
        train_data,
        presets='medium_quality',
        tuning_data=tuning_data,
        use_bag_holdout=True,
        ag_args_ensemble={'fold_fitting_strategy': 'sequential_local'},
    )


# Train the AutoGluon models
auto_A = _fit_autogluon(train_auto_A, val_auto_A)
auto_B = _fit_autogluon(train_auto_B, val_auto_B)
auto_C = _fit_autogluon(train_auto_C, val_auto_C)

# Ensemble weights of the three model families
xgb_weight = 0.2
cat_weight = 0.4
auto_weight = 0.4

# Every XGBoost model predicts on the test set of its own location. Models A and B
# were previously fed location C's test features, which produced predictions for the
# wrong site without raising any error.
pred_xgb_A = xgb_A.predict(X_test_xgb_A)
pred_xgb_B = xgb_B.predict(X_test_xgb_B)
pred_xgb_C = xgb_C.predict(X_test_xgb_C)

pred_auto_A = auto_A.predict(test_A)
pred_auto_B = auto_B.predict(test_B)
pred_auto_C = auto_C.predict(test_C)

pred_cat_A = cat_A.predict(test_A)
pred_cat_B = cat_B.predict(test_B)
pred_cat_C = cat_C.predict(test_C)

# Ensemble that seemed the best after some experimentation
pred_A = (pred_xgb_A*xgb_weight + pred_cat_A*cat_weight + pred_auto_A*auto_weight)
pred_B = (pred_xgb_B*xgb_weight + pred_cat_B*cat_weight + pred_auto_B*auto_weight)
pred_C = (pred_xgb_C*xgb_weight + pred_cat_C*cat_weight + pred_auto_C*auto_weight)

# Negative predictions are clipped to 0
pred_A = np.clip(pred_A, 0, None)
pred_B = np.clip(pred_B, 0, None)
pred_C = np.clip(pred_C, 0, None)

# Concatenate predictions in the order A, B, C
predictions = np.concatenate([pred_A, pred_B, pred_C])

# Save predictions
predictions_3 = predictions

# Create an id array
ids = np.arange(0, len(predictions_3))

# Create a DataFrame
df = pd.DataFrame({
    'id': ids,
    'prediction': predictions_3
})

# Save to CSV
df.to_csv('predictions_3.csv', index=False)

In [None]:
# Sequence 4

%reset -f

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
from catboost import CatBoostRegressor
from autogluon.tabular import TabularDataset, TabularPredictor
import warnings
warnings.filterwarnings('ignore')

def add_experimental_features(df):
    """
    Experimental feature engineering.

    Derives radiation, temperature/pressure, wind, cloud/snow, interaction and
    precipitation columns from the raw weather columns. The new columns are written into
    `df` itself, which is also returned. Infinite values anywhere in the frame (for
    example from dividing by a zero direct radiation) are replaced with 0 at the end.
    """
    direct_w = df['direct_rad:W']
    diffuse_w = df['diffuse_rad:W']
    direct_j = df['direct_rad_1h:J']
    diffuse_j = df['diffuse_rad_1h:J']
    temp_k = df['t_1000hPa:K']
    dew_point_k = df['dew_point_2m:K']
    abs_humidity = df['absolute_humidity_2m:gm3']

    # --- Radiation: sums, differences and ratio of the direct and diffuse parts ---
    total_w = direct_w + diffuse_w
    df['total_radiation:W'] = total_w
    df['total_radiation_1h:J'] = direct_j + diffuse_j
    df['rad_diff:W'] = direct_w - diffuse_w
    df['rad_diff_1h:J'] = direct_j - diffuse_j
    df['diffuse_direct_ratio'] = diffuse_w / direct_w

    # --- Temperature and pressure ---
    df['temp_dewpoint_diff'] = temp_k - dew_point_k
    df['pressure_gradient'] = df['pressure_100m:hPa'] - df['pressure_50m:hPa']
    df['t_1000hPa:C'] = temp_k - 273.15
    df['dew_point_2m:C'] = dew_point_k - 273.15
    # The scaler is fitted on the frame it transforms, so the scaling is per dataframe.
    for pressure_col in ('msl_pressure:hPa', 'sfc_pressure:hPa'):
        df[pressure_col + '_scaled'] = MinMaxScaler().fit_transform(df[pressure_col].values.reshape(-1, 1))

    # --- Wind ---
    wind_u = df['wind_speed_u_10m:ms']
    df['wind_vector_magnitude'] = (wind_u**2 + df['wind_speed_v_10m:ms']**2 + df['wind_speed_w_1000hPa:ms']**2)**0.5
    # NOTE(review): this averages the 10 m wind speed with its u component only - confirm this is intended.
    mean_wind = (df['wind_speed_10m:ms'] + wind_u) / 2
    df['average_wind_speed'] = mean_wind

    # --- Cloud and snow ---
    df['cloud_humidity_product'] = df['total_cloud_cover:p'] * abs_humidity
    snow_columns = ['fresh_snow_24h:cm', 'fresh_snow_12h:cm', 'fresh_snow_6h:cm', 'fresh_snow_3h:cm', 'fresh_snow_1h:cm']
    df['snow_accumulation'] = df[snow_columns].sum(axis=1)

    # --- Interaction terms ---
    # direct radiation x effective cloud cover
    df['radiation_cloud_interaction'] = direct_w * df['effective_cloud_cover:p']
    # temperature x total radiation (high temperature may reduce efficiency)
    df['temp_rad_interaction'] = temp_k * total_w
    # wind cooling effect x temperature
    df['wind_temp_interaction'] = mean_wind * temp_k
    # humidity x temperature
    df['humidity_temp_interaction'] = abs_humidity * temp_k
    # sun elevation x direct radiation
    df['sun_elevation_direct_rad_interaction'] = df['sun_elevation:d'] * direct_w

    # --- Precipitation: amount weighted by the precipitation type index ---
    df['precip'] = df['precip_5min:mm']*df['precip_type_5min:idx']

    # Safeguard in case of inf values
    df.replace([np.inf, -np.inf], 0, inplace=True)

    return df

def add_date_features(df):
    """
    Derives calendar features from the 'date_forecast' column.

    Adds 'month', 'year', 'hour' and 'day', plus 'hour_sin' and 'hour_cos', which place
    the hour of the day on a 24-hour circle. 'date_forecast' itself is converted to
    datetime and kept. `df` is modified in place and returned.

    When 'date_forecast' is missing, a warning is printed and `df` is returned unchanged.
    """
    if 'date_forecast' not in df.columns:
        print("Warning: 'date_forecast' column not found in the dataframe. No date features added.")
        return df

    df['date_forecast'] = pd.to_datetime(df['date_forecast'])
    timestamps = df['date_forecast'].dt

    # Plain calendar components, added in this column order
    for part in ('month', 'year', 'hour', 'day'):
        df[part] = getattr(timestamps, part)

    # Cyclic encoding of the hour
    hour_angle = 2*np.pi*df['hour']/24
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)

    return df

# Adding discretized features for the continuous variables to help tree-based models

def bin_columns(dataframe, columns_to_bin, n_bins=5):
    """
    Adds quantile-bin indices for the requested columns.

    Each column is cut with pd.qcut into the requested number of quantile bins and the
    integer bin index is stored in a new 'binned_<column>' column. Duplicate bin edges
    are dropped, so fewer bins than requested can come out.

    Parameters:
    - dataframe: pd.DataFrame
    - columns_to_bin: list of strings, the names of the columns to bin
    - n_bins: int or dict. An int is used for every column; a dict maps column names to
              their bin count, and columns missing from it get 5 bins.

    Returns:
    - pd.DataFrame: a copy of `dataframe` with the 'binned_<column>' columns added
    """
    result = dataframe.copy()
    per_column_counts = not isinstance(n_bins, int)

    for name in columns_to_bin:
        quantile_count = n_bins.get(name, 5) if per_column_counts else n_bins
        result[f'binned_{name}'] = pd.qcut(result[name], q=quantile_count, labels=False, duplicates='drop')

    return result

def add_binned_features(df):
    """
    Adds quantile-binned versions of selected weather columns.

    Three columns are binned, each through its own bin_columns call so that every
    column can use its own bin count:
    - 'effective_cloud_cover:p' -> 2 bins
    - 'ceiling_height_agl:m'    -> 3 bins
    - 'average_wind_speed'      -> 5 bins

    Each result is stored in a new 'binned_<column>' column.

    Parameters:
    - df: pd.DataFrame containing the three columns listed above

    Returns:
    - pd.DataFrame: the frame returned by bin_columns with the binned columns added
    """
    df = bin_columns(df, ['effective_cloud_cover:p'], n_bins=2)
    df = bin_columns(df, ['ceiling_height_agl:m'], n_bins=3)
    df = bin_columns(df, ['average_wind_speed'], n_bins=5)

    return df

def add_rate_of_change_features(df, features, second_order=False):
    """
    Appends first-difference ("rate of change") columns for the given features.

    For every name in `features` a '<feature>_rate_of_change' column is written that holds
    the row-to-row difference of that feature, with the leading NaN replaced by 0. When
    `second_order` is True, a '<feature>_rate_of_change_of_change' column holding the
    difference of that first-difference column is written as well.

    Differences are taken between consecutive rows, so the frame is expected to be sorted
    by time. `df` is modified in place and also returned.
    """
    first_suffix = '_rate_of_change'
    second_suffix = '_rate_of_change_of_change'

    for name in features:
        first_col = name + first_suffix
        delta = df[name].diff()
        df[first_col] = delta.fillna(0)

        if not second_order:
            continue

        # Difference the already zero-filled first-order column once more.
        df[name + second_suffix] = df[first_col].diff().fillna(0)

    return df

def add_rate_of_change_features_to_df(df):
    """
    Adds first-order rate-of-change columns for the temperature, radiation and
    cloud-cover features listed below (second-order differences are not requested).
    """
    return add_rate_of_change_features(
        df,
        [
            't_1000hPa:K',
            'clear_sky_rad:W',
            'diffuse_rad:W',
            'direct_rad:W',
            'effective_cloud_cover:p',
            'total_radiation:W',
        ],
        second_order=False,
    )

def add_est_obs_feature(df):
    """
    Flags every row as observed (1) or estimated (0) in a new 'observed' column.

    The presence of a 'date_calc' column is the marker for estimated data: when it is
    present every row gets observed = 0 and 'date_calc' is dropped from the returned
    frame; when it is absent every row gets observed = 1.

    The 'observed' column is written into `df` itself.

    NOTE(review): the hourly aggregation mapping used in this notebook does not list
    'date_calc', so the aggregated frames may never carry it - confirm that this flag
    is not always 1.
    """
    is_estimated = 'date_calc' in df.columns
    df['observed'] = 0 if is_estimated else 1

    if is_estimated:
        # The calculation timestamp only serves as the marker; it is not kept.
        return df.drop(columns=['date_calc'])
    return df
    
def remove_constant_regions(dataframe, column_name="pv_measurement", threshold=72):
    """
    Removes runs of rows in which `column_name` keeps repeating the same value.

    A run is a maximal stretch of consecutive rows with an identical value. A run is
    removed completely (including its first row) when more than `threshold` of its rows
    repeat the value of the row before them, i.e. when the run is longer than
    `threshold + 1` rows. NaN never compares equal to NaN, so stretches of missing
    values are not treated as constant.

    Parameters:
    - dataframe: pd.DataFrame
    - column_name: str, the column to scan for constant runs
    - threshold: int, number of repeated rows a run may contain before it is removed

    Returns:
    - pd.DataFrame without the offending runs. When `column_name` is missing, a warning
      is printed and the input frame itself is returned unchanged.
    """
    if column_name not in dataframe.columns:
        print(f"Warning: '{column_name}' column not found in the dataframe. No rows removed.")
        return dataframe

    values = dataframe[column_name]
    same_as_previous = values.eq(values.shift())

    # Every change of value starts a new run, so the running count of changes labels the runs.
    run_ids = (~same_as_previous).cumsum()

    # Number of repeated rows per run (the first row of a run is not a repeat).
    repeats_per_run = run_ids[same_as_previous].value_counts()
    runs_to_remove = repeats_per_run[repeats_per_run > threshold].index

    # Filter by position with a boolean mask instead of dropping by index label: a
    # label-based drop would also delete unrelated rows that share an index label with
    # a removed row whenever the index contains duplicates.
    keep_mask = ~run_ids.isin(runs_to_remove)
    return dataframe[keep_mask.to_numpy()].copy()

def add_lagged_features(df, features_with__lags, fill_value=None):
    """
    Adds shifted ("lagged") copies of selected columns.

    Parameters:
    - df: pd.DataFrame, expected to be sorted by time; modified in place
    - features_with__lags: dict mapping a column name to either a single integer lag or
      a list/tuple of integer lags. A positive lag takes the value from earlier rows, a
      negative lag takes it from later rows. Each (feature, lag) pair produces a column
      named '<feature>_lag_<lag>'.
    - fill_value: value used to replace the NaNs in the shifted column; when None the
      NaNs are kept

    Returns:
    - pd.DataFrame: the same `df` object with the lag columns added
    """
    for feature, lags in features_with__lags.items():
        # A dict literal cannot repeat a key, so several lags for one feature have to be
        # passed as a list or tuple; a bare integer is treated as a one-element list.
        lag_list = lags if isinstance(lags, (list, tuple)) else [lags]

        for lag in lag_list:
            shifted = df[feature].shift(lag)
            # fillna(None) raises in pandas, so filling is skipped when no value is given.
            if fill_value is not None:
                shifted = shifted.fillna(fill_value)
            df[f"{feature}_lag_{lag}"] = shifted
    return df

def add_lagged_features_to_df(df):
    """
    Adds the previous-row (lag 1) and next-row (lag -1) value of the main radiation
    features as '<feature>_lag_1' and '<feature>_lag_-1' columns. NaNs in the shifted
    columns are filled with 0.

    Parameters:
    - df: pd.DataFrame containing 'total_radiation:W', 'rad_diff:W' and
      'total_radiation_1h:J'

    Returns:
    - pd.DataFrame: the frame returned by add_lagged_features
    """
    lagged_features = ['total_radiation:W', 'rad_diff:W', 'total_radiation_1h:J']

    # A dict literal keeps only the last value of a repeated key, so listing the same
    # feature with lag 1 and lag -1 in a single dict silently dropped every lag-1
    # column. One mapping per lag is built instead.
    for lag in (1, -1):
        df = add_lagged_features(df, {feature: lag for feature in lagged_features}, fill_value=0)

    return df

def handle_nan(df):
    """
    Drops rows without a target value and replaces every remaining NaN with 0.

    Frames that have no 'pv_measurement' column skip the row filter and are only
    zero-filled.

    Returns:
    - a new dataframe; the input is not modified
    """
    if 'pv_measurement' in df.columns:
        has_target = df['pv_measurement'].notna()
        df = df[has_target]

    return df.fillna(0)

# Aggregate the data to hourly with some aggregation methods for each column.
# Every column appears exactly once: a repeated key in a dict literal silently
# replaces the earlier entry, which hides which aggregation is really applied.
aggregation_methods = {
    'date_forecast': 'first',
    'diffuse_rad:W': 'sum',
    'direct_rad:W': 'last',
    'clear_sky_rad:W': 'sum',
    'diffuse_rad_1h:J': 'last',
    'direct_rad_1h:J': 'last',
    'clear_sky_energy_1h:J': 'last',
    'absolute_humidity_2m:gm3': 'mean',
    'air_density_2m:kgm3': 'mean',
    'ceiling_height_agl:m': 'max',
    # 'max' is the aggregation that was effectively in use for this column
    'cloud_base_agl:m': 'max',
    'dew_or_rime:idx': 'min',
    'dew_point_2m:K': 'mean',
    'effective_cloud_cover:p': 'sum',
    'elevation:m': 'first',
    'fresh_snow_12h:cm': 'max',
    'fresh_snow_1h:cm': 'sum',
    'fresh_snow_24h:cm': 'max',
    'fresh_snow_3h:cm': 'max',
    'fresh_snow_6h:cm': 'max',
    'is_day:idx': 'max',
    'is_in_shadow:idx': 'max',
    'msl_pressure:hPa': 'mean',
    'precip_5min:mm': 'sum',
    'precip_type_5min:idx': 'sum',
    'pressure_100m:hPa': 'mean',
    'pressure_50m:hPa': 'mean',
    'prob_rime:p': 'max',
    'rain_water:kgm2': 'sum',
    'relative_humidity_1000hPa:p': 'mean',
    'sfc_pressure:hPa': 'mean',
    'snow_density:kgm3': 'mean',
    'snow_depth:cm': 'max',
    'snow_drift:idx': 'max',
    'snow_melt_10min:mm': 'sum',
    'snow_water:kgm2': 'sum',
    'sun_azimuth:d': 'first',
    'sun_elevation:d': 'sum',
    'super_cooled_liquid_water:kgm2': 'sum',
    't_1000hPa:K': 'mean',
    'total_cloud_cover:p': 'mean',
    'visibility:m': 'mean',
    'wind_speed_10m:ms': 'mean',
    'wind_speed_u_10m:ms': 'mean',
    'wind_speed_v_10m:ms': 'mean',
    'wind_speed_w_1000hPa:ms': 'mean'
}

# Read in the data for the three locations. The targets' 'time' column is renamed
# to 'date_forecast' right away so it carries the same name as the timestamp
# column of the weather frames.
target_time_rename = {'time': 'date_forecast'}

x_target_A = pd.read_parquet('./data/A/train_targets.parquet').rename(columns=target_time_rename)
x_train_obs_A = pd.read_parquet('./data/A/X_train_observed.parquet')
x_train_est_A = pd.read_parquet('./data/A/X_train_estimated.parquet')
x_test_est_A = pd.read_parquet('./data/A/X_test_estimated.parquet')

x_target_B = pd.read_parquet('./data/B/train_targets.parquet').rename(columns=target_time_rename)
x_train_obs_B = pd.read_parquet('./data/B/X_train_observed.parquet')
x_train_est_B = pd.read_parquet('./data/B/X_train_estimated.parquet')
x_test_est_B = pd.read_parquet('./data/B/X_test_estimated.parquet')

x_target_C = pd.read_parquet('./data/C/train_targets.parquet').rename(columns=target_time_rename)
x_train_obs_C = pd.read_parquet('./data/C/X_train_observed.parquet')
x_train_est_C = pd.read_parquet('./data/C/X_train_estimated.parquet')
x_test_est_C = pd.read_parquet('./data/C/X_test_estimated.parquet')

# Fix missing data for the test sets, assuming NaN means 0 in these columns
zero_filled_columns = [
    'effective_cloud_cover:p',
    'total_cloud_cover:p',
    'cloud_base_agl:m',
    'ceiling_height_agl:m',
    'snow_density:kgm3',
    'snow_drift:idx',
]

for test_frame in (x_test_est_A, x_test_est_B, x_test_est_C):
    for column in zero_filled_columns:
        test_frame[column] = test_frame[column].fillna(0)

# Resample every weather frame to one row per hour
def _to_hourly(frame):
    """
    Groups a weather frame into hourly buckets on 'date_forecast' and aggregates
    each column with the method configured in `aggregation_methods`.
    """
    hourly_buckets = pd.Grouper(key='date_forecast', freq='1H')
    return frame.groupby(hourly_buckets).aggregate(aggregation_methods)

x_train_obs_A_resampled = _to_hourly(x_train_obs_A)
x_train_est_A_resampled = _to_hourly(x_train_est_A)
x_test_est_A_resampled = _to_hourly(x_test_est_A)

x_train_obs_B_resampled = _to_hourly(x_train_obs_B)
x_train_est_B_resampled = _to_hourly(x_train_est_B)
x_test_est_B_resampled = _to_hourly(x_test_est_B)

x_train_obs_C_resampled = _to_hourly(x_train_obs_C)
x_train_est_C_resampled = _to_hourly(x_train_est_C)
x_test_est_C_resampled = _to_hourly(x_test_est_C)

# Merge the hourly features with the targets.
# The targets are split at the first timestamp of the estimated data: rows before
# it are paired with the observed features, rows from it onward with the
# estimated features.
# np.flatnonzero gives the row *position* of that timestamp, which is what .iloc
# expects; the index label only coincides with the position for a default
# 0..n-1 index. An IndexError is still raised if the timestamp is not found.
split_value = x_train_est_A['date_forecast'].iloc[0]
split_index = np.flatnonzero((x_target_A['date_forecast'] == split_value).to_numpy())[0]

x_target_obs_A = x_target_A.iloc[:split_index]
x_target_est_A = x_target_A.iloc[split_index:]

obs_A = x_train_obs_A_resampled.merge(x_target_obs_A, left_index=True, right_on='date_forecast')
est_A = x_train_est_A_resampled.merge(x_target_est_A, left_index=True, right_on='date_forecast')

split_value = x_train_est_B['date_forecast'].iloc[0]
split_index = np.flatnonzero((x_target_B['date_forecast'] == split_value).to_numpy())[0]

x_target_obs_B = x_target_B.iloc[:split_index]
x_target_est_B = x_target_B.iloc[split_index:]

obs_B = x_train_obs_B_resampled.merge(x_target_obs_B, left_index=True, right_on='date_forecast')
est_B = x_train_est_B_resampled.merge(x_target_est_B, left_index=True, right_on='date_forecast')

split_value = x_train_est_C['date_forecast'].iloc[0]
split_index = np.flatnonzero((x_target_C['date_forecast'] == split_value).to_numpy())[0]

x_target_obs_C = x_target_C.iloc[:split_index]
x_target_est_C = x_target_C.iloc[split_index:]

obs_C = x_train_obs_C_resampled.merge(x_target_obs_C, left_index=True, right_on='date_forecast')
est_C = x_train_est_C_resampled.merge(x_target_est_C, left_index=True, right_on='date_forecast')

# The test frames are the hourly test features as they are (their
# 'date_forecast' column is kept), minus every row that has a NaN in any column
test_A = x_test_est_A_resampled.dropna()
test_B = x_test_est_B_resampled.dropna()
test_C = x_test_est_C_resampled.dropna()

def preprocessing(df):
    """
    Runs the complete feature pipeline on one frame and returns the result.

    The steps run in the fixed order listed below. Each step receives a fresh
    copy of the previous step's output, so the frame passed in is not modified.
    """
    pipeline = (
        add_experimental_features,
        add_date_features,
        add_binned_features,
        add_rate_of_change_features_to_df,
        add_est_obs_feature,
        remove_constant_regions,
        add_lagged_features_to_df,
        handle_nan,
    )
    for step in pipeline:
        df = step(df.copy())

    return df

# Preprocess the observed, estimated and test frames of every location
obs_A, est_A, test_A = (preprocessing(frame.copy()) for frame in (obs_A, est_A, test_A))

obs_B, est_B, test_B = (preprocessing(frame.copy()) for frame in (obs_B, est_B, test_B))

obs_C, est_C, test_C = (preprocessing(frame.copy()) for frame in (obs_C, est_C, test_C))

# Random seeds used for reproducibility, each with the ensemble weights used for
# that run (order: xgboost, catboost, autogluon):
#   32: 0.3, 0.3, 0.4
#   24: 0.3, 0.3, 0.4
#   33 (without winter months 1 and 12): 0.2, 0.4, 0.4
#   11 (without winter months 1, 2 and 11, 12): 0.25, 0.35, 0.4
#    5: 0.4, 0.3, 0.3
# Best score is the mean prediction of all the 5 seeds mentioned above.

# Set the random seed
np.random.seed(11)

# Concatenate the observed and estimated training data of each location
A = pd.concat([obs_A, est_A])
B = pd.concat([obs_B, est_B])
C = pd.concat([obs_C, est_C])

# Remove characters unparseable for CatBoost from the column names:
# [ ] , { } ( ) " ' : and backslash
unparseable_characters = str.maketrans('', '', '[],{}()"\':\\')

def _sanitized_columns(frame):
    """Returns the frame's column names with the unparseable characters deleted."""
    return [col.translate(unparseable_characters) for col in frame.columns]

A.columns = _sanitized_columns(A)
B.columns = _sanitized_columns(B)
C.columns = _sanitized_columns(C)

test_A.columns = _sanitized_columns(test_A)
test_B.columns = _sanitized_columns(test_B)
test_C.columns = _sanitized_columns(test_C)

# Getting validation data from summer months, because the test set is from summer
# months. We experimented with excluding winter months from the training data here.
def _split_train_val(frame):
    """
    Splits one location's data into training and validation rows.

    Step 1: keep only rows from March to October.
    Step 2: collect the distinct calendar days that fall in May or June.
    Step 3: sample 40% of those days (without replacement, using the global
            numpy random state) as validation days.
    Step 4: everything else is training data.

    Returns:
    - (filtered frame, training rows, validation rows)
    """
    frame = frame[frame['date_forecast'].dt.month.isin([3, 4, 5, 6, 7, 8, 9, 10])]

    in_summer = frame['date_forecast'].dt.month.isin([5, 6])
    summer_days = frame[in_summer]['date_forecast'].dt.date.unique()

    held_out_days = np.random.choice(summer_days, size=int(len(summer_days) * 0.4), replace=False)
    is_held_out = frame['date_forecast'].dt.date.isin(held_out_days)

    return frame, frame[~is_held_out], frame[is_held_out]

# The locations are split in the order A, B, C so the random draws stay the same
A, train_A, val_A = _split_train_val(A)
B, train_B, val_B = _split_train_val(B)
C, train_C, val_C = _split_train_val(C)

# Prepare the features and target variables
X_train_A, y_train_A = train_A.drop(columns='pv_measurement'), train_A['pv_measurement']
X_val_A, y_val_A = val_A.drop(columns='pv_measurement'), val_A['pv_measurement']

X_train_B, y_train_B = train_B.drop(columns='pv_measurement'), train_B['pv_measurement']
X_val_B, y_val_B = val_B.drop(columns='pv_measurement'), val_B['pv_measurement']

X_train_C, y_train_C = train_C.drop(columns='pv_measurement'), train_C['pv_measurement']
X_val_C, y_val_C = val_C.drop(columns='pv_measurement'), val_C['pv_measurement']

# Drop the timestamp columns: the training/validation frames carry three of them,
# the test frames only 'date_forecast'
timestamp_columns = ['date_forecast', 'date_forecast_x', 'date_forecast_y']

train_A, train_B, train_C = (frame.drop(columns=timestamp_columns) for frame in (train_A, train_B, train_C))
val_A, val_B, val_C = (frame.drop(columns=timestamp_columns) for frame in (val_A, val_B, val_C))
X_train_A, X_train_B, X_train_C = (frame.drop(columns=timestamp_columns) for frame in (X_train_A, X_train_B, X_train_C))
X_val_A, X_val_B, X_val_C = (frame.drop(columns=timestamp_columns) for frame in (X_val_A, X_val_B, X_val_C))
test_A, test_B, test_C = (frame.drop(columns=['date_forecast']) for frame in (test_A, test_B, test_C))

# Name of the target column AutoGluon has to predict
auto_label = 'pv_measurement'

# Wrap the training and validation frames (target column included) for AutoGluon
train_auto_A, val_auto_A = TabularDataset(train_A), TabularDataset(val_A)
train_auto_B, val_auto_B = TabularDataset(train_B), TabularDataset(val_B)
train_auto_C, val_auto_C = TabularDataset(train_C), TabularDataset(val_C)

# Set the parameters for the XGBoost models.
# Settings shared by all three locations:
xgb_shared_params = {
    'colsample_bytree': 0.8,
    'max_depth': 15,
    'min_child_weight': 10,
    'n_estimators': 600,
    'subsample': 0.912,
    'random_state': 0,
    'booster': 'gbtree',
    'n_jobs': -1,
}

# Location A: own regularisation/learning rate and two parallel trees per round
params_xgb_A = {
    **xgb_shared_params,
    'gamma': 0.4,
    'learning_rate': 0.012,
    'reg_alpha': 0.8,
    'reg_lambda': 0.8,
    'num_parallel_tree': 2,
}

# Locations B and C use identical settings (kept as two separate dicts)
xgb_params_B_and_C = {
    **xgb_shared_params,
    'gamma': 0.8,
    'learning_rate': 0.008,
    'reg_alpha': 1,
    'reg_lambda': 3,
}
params_xgb_B = dict(xgb_params_B_and_C)
params_xgb_C = dict(xgb_params_B_and_C)

# One XGBoost regressor per location, built from its parameter dict
xgb_A, xgb_B, xgb_C = (
    xgb.XGBRegressor(**params)
    for params in (params_xgb_A, params_xgb_B, params_xgb_C)
)

def _new_catboost():
    """
    Builds one CatBoost regressor: 5000 iterations (trees), with MAE as both the
    optimised loss and the metric evaluated on the validation set. Learning
    rate, depth, random seed and verbosity are not set here.
    """
    return CatBoostRegressor(
        iterations=5000,
        loss_function='MAE',
        eval_metric='MAE',
    )

# The three locations use identically configured, independent CatBoost models
cat_A = _new_catboost()
cat_B = _new_catboost()
cat_C = _new_catboost()

# Prepare data for the XGBoost models. They worked best with fewer columns, so
# only this subset of the (sanitised) feature names is used.
xgb_columns = [
    'total_radiationW',
    'snow_accumulation',
    'super_cooled_liquid_waterkgm2',
    'average_wind_speed',
    'sun_elevationd',
    'sun_azimuthd',
    'clear_sky_radW',
    'month',
    't_1000hPaC',
    'msl_pressurehPa_scaled',
    'rain_waterkgm2',
    'cloud_base_aglm',
    'effective_cloud_coverp',
    'dew_or_rimeidx'
]
print(train_A.columns)

# Training features, training target and test features per location
X_train_xgb_A, y_train_xgb_A = train_A[xgb_columns], train_A['pv_measurement']
X_test_xgb_A = test_A[xgb_columns]

X_train_xgb_B, y_train_xgb_B = train_B[xgb_columns], train_B['pv_measurement']
X_test_xgb_B = test_B[xgb_columns]

X_train_xgb_C, y_train_xgb_C = train_C[xgb_columns], train_C['pv_measurement']
X_test_xgb_C = test_C[xgb_columns]

# Train the XGBoost models.
# No `eval_metric` is passed to fit(): that argument is deprecated in newer
# XGBoost releases, and it had no effect here anyway because no eval_set is
# supplied, so nothing is evaluated during training.
xgb_A.fit(X=X_train_xgb_A, y=y_train_xgb_A, verbose=False)

xgb_B.fit(X=X_train_xgb_B, y=y_train_xgb_B, verbose=False)

xgb_C.fit(X=X_train_xgb_C, y=y_train_xgb_C, verbose=False)

# Train the CatBoost models, each evaluated on its location's validation split;
# use_best_model=True is passed so the validation metric selects the final model
catboost_jobs = (
    (cat_A, X_train_A, y_train_A, X_val_A, y_val_A),
    (cat_B, X_train_B, y_train_B, X_val_B, y_val_B),
    (cat_C, X_train_C, y_train_C, X_val_C, y_val_C),
)

for cat_model, X_fit, y_fit, X_eval, y_eval in catboost_jobs:
    cat_model.fit(
        X_fit, y_fit,
        eval_set=(X_eval, y_eval),
        use_best_model=True
    )

def _fit_autogluon(train_data, tuning_data):
    """
    Creates and fits one AutoGluon regression predictor for `auto_label`.

    The predictor is scored with mean absolute error and fitted with the
    'medium_quality' preset, using `tuning_data` as the tuning set.

    Returns:
    - whatever TabularPredictor.fit returns
    """
    predictor = TabularPredictor(
        label=auto_label,
        eval_metric='mean_absolute_error',
        problem_type='regression',
    )
    return predictor.fit(
        train_data,
        presets='medium_quality',
        tuning_data=tuning_data,
        use_bag_holdout=True,
        ag_args_ensemble={'fold_fitting_strategy': 'sequential_local'},
    )

auto_A = _fit_autogluon(train_auto_A, val_auto_A)

auto_B = _fit_autogluon(train_auto_B, val_auto_B)

auto_C = _fit_autogluon(train_auto_C, val_auto_C)

# Ensemble weights for this seed (xgboost, catboost, autogluon)
xgb_weight = 0.25
cat_weight = 0.35
auto_weight = 0.4

# Each XGBoost model predicts on the test features of its own location
pred_xgb_A = xgb_A.predict(X_test_xgb_A)
pred_xgb_B = xgb_B.predict(X_test_xgb_B)
pred_xgb_C = xgb_C.predict(X_test_xgb_C)

pred_auto_A = auto_A.predict(test_A)
pred_auto_B = auto_B.predict(test_B)
pred_auto_C = auto_C.predict(test_C)

pred_cat_A = cat_A.predict(test_A)
pred_cat_B = cat_B.predict(test_B)
pred_cat_C = cat_C.predict(test_C)

# Ensemble that seemed the best after some experimentation: weighted sum of the
# three models' predictions per location
pred_A = (pred_xgb_A*xgb_weight + pred_cat_A*cat_weight + pred_auto_A*auto_weight)
pred_B = (pred_xgb_B*xgb_weight + pred_cat_B*cat_weight + pred_auto_B*auto_weight)
pred_C = (pred_xgb_C*xgb_weight + pred_cat_C*cat_weight + pred_auto_C*auto_weight)

# Negative predictions are raised to 0
pred_A = np.clip(pred_A, 0, None)
pred_B = np.clip(pred_B, 0, None)
pred_C = np.clip(pred_C, 0, None)

# Stack the predictions in the order A, B, C and keep them as this run's result
predictions_4 = predictions = np.concatenate([pred_A, pred_B, pred_C])

# Running ids 0..n-1, one per prediction
ids = np.arange(len(predictions_4))

# Build the two-column output table and write it to CSV without the index
df = pd.DataFrame({'id': ids, 'prediction': predictions_4})
df.to_csv('predictions_4.csv', index=False)

In [None]:
# Sequence 5

%reset -f

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
from catboost import CatBoostRegressor
from autogluon.tabular import TabularDataset, TabularPredictor
import warnings
warnings.filterwarnings('ignore')

def add_experimental_features(df):
    """
    Experimental feature engineering.

    Adds derived radiation, temperature/pressure, wind, cloud/snow, interaction
    and precipitation columns to `df`. The frame is modified in place and also
    returned. Some later columns are built from earlier derived ones
    ('total_radiation:W', 'average_wind_speed'), so the statement order matters.
    At the end every +inf/-inf value anywhere in the frame is replaced by 0.
    """

    # Radiation Features
    df['total_radiation:W'] = df['direct_rad:W'] + df['diffuse_rad:W']
    df['total_radiation_1h:J'] = df['direct_rad_1h:J'] + df['diffuse_rad_1h:J']
    df['rad_diff:W'] = df['direct_rad:W'] - df['diffuse_rad:W']
    df['rad_diff_1h:J'] = df['direct_rad_1h:J'] - df['diffuse_rad_1h:J']
    # Division by a zero direct radiation yields inf (or NaN for 0/0); the inf
    # values are replaced by 0 at the end of this function
    df['diffuse_direct_ratio'] = df['diffuse_rad:W'] / df['direct_rad:W']

    # Temperature and Pressure Features
    df['temp_dewpoint_diff'] = df['t_1000hPa:K'] - df['dew_point_2m:K']
    df['pressure_gradient'] = df['pressure_100m:hPa'] - df['pressure_50m:hPa']
    df['t_1000hPa:C'] = df['t_1000hPa:K'] - 273.15
    df['dew_point_2m:C'] = df['dew_point_2m:K'] - 273.15
    # The scaler is fitted on the frame passed in, so every frame is scaled to
    # its own minimum and maximum
    df['msl_pressure:hPa_scaled'] = MinMaxScaler().fit_transform(df['msl_pressure:hPa'].values.reshape(-1, 1))
    df['sfc_pressure:hPa_scaled'] = MinMaxScaler().fit_transform(df['sfc_pressure:hPa'].values.reshape(-1, 1))

    # Wind Features
    df['wind_vector_magnitude'] = (df['wind_speed_u_10m:ms']**2 + df['wind_speed_v_10m:ms']**2 + df['wind_speed_w_1000hPa:ms']**2)**0.5
    # NOTE(review): averages the 10 m wind speed with the u component only;
    # confirm this is the intended definition
    df['average_wind_speed'] = (df['wind_speed_10m:ms'] + df['wind_speed_u_10m:ms']) / 2

    # Cloud and Snow Features
    df['cloud_humidity_product'] = df['total_cloud_cover:p'] * df['absolute_humidity_2m:gm3']
    df['snow_accumulation'] = df[['fresh_snow_24h:cm', 'fresh_snow_12h:cm', 'fresh_snow_6h:cm', 'fresh_snow_3h:cm', 'fresh_snow_1h:cm']].sum(axis=1)

    # Interaction between radiation and cloud cover
    df['radiation_cloud_interaction'] = df['direct_rad:W'] * df['effective_cloud_cover:p']

    # Interaction between temperature and radiation (considering that high temperature may reduce efficiency)
    df['temp_rad_interaction'] = df['t_1000hPa:K'] * df['total_radiation:W']

    # Interaction between wind cooling effect and temperature
    df['wind_temp_interaction'] = df['average_wind_speed'] * df['t_1000hPa:K']

    # Interaction between humidity and temperature
    df['humidity_temp_interaction'] = df['absolute_humidity_2m:gm3'] * df['t_1000hPa:K']

    # Interaction between sun elevation and direct radiation
    df['sun_elevation_direct_rad_interaction'] = df['sun_elevation:d'] * df['direct_rad:W']

    # Precipitation Features
    df['precip'] = df['precip_5min:mm']*df['precip_type_5min:idx']

    # Safeguard in case of inf values
    df.replace([np.inf, -np.inf], 0, inplace=True)
    
    return df

def add_date_features(df):
    """
    Adds calendar features derived from the 'date_forecast' column.

    The column is converted to datetime and 'month', 'year', 'hour' and 'day'
    are extracted from it, plus 'hour_sin' and 'hour_cos', the sine and cosine
    of the hour mapped onto a 24-hour circle. The frame is modified in place.

    If 'date_forecast' is missing, a warning is printed and the frame is
    returned unchanged.
    """
    if 'date_forecast' not in df.columns:
        print("Warning: 'date_forecast' column not found in the dataframe. No date features added.")
        return df

    timestamps = pd.to_datetime(df['date_forecast'])
    df['date_forecast'] = timestamps

    df['month'] = timestamps.dt.month
    df['year'] = timestamps.dt.year
    df['hour'] = timestamps.dt.hour
    df['day'] = timestamps.dt.day

    # Position of the hour on the daily circle, in radians
    hour_angle = 2*np.pi*df['hour']/24
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)

    return df

# Adding discretized features for the continuous variables to help tree-based models

def bin_columns(dataframe, columns_to_bin, n_bins=5):
    """
    Adds a quantile-binned version of each specified column.

    For every column a new column 'binned_<column>' holds the integer bin number
    assigned by pd.qcut. Duplicate bin edges are dropped, so a column can end up
    with fewer bins than requested.

    Parameters:
    - dataframe: pd.DataFrame (left unmodified, a copy is returned)
    - columns_to_bin: list of strings, the names of the columns to bin
    - n_bins: int or dict, the number of quantile bins (if int, the same number
              is used for all columns; if dict, it maps column names to their
              number of bins and columns missing from it get 5)

    Returns:
    - pd.DataFrame, a copy of the input with the 'binned_<column>' columns added
    """
    result = dataframe.copy()

    for name in columns_to_bin:
        if isinstance(n_bins, int):
            quantile_count = n_bins
        else:
            quantile_count = n_bins.get(name, 5)

        result[f'binned_{name}'] = pd.qcut(
            result[name],
            q=quantile_count,
            labels=False,
            duplicates='drop'
        )

    return result

def add_binned_features(df):
    """
    Adds quantile-binned versions of three features.

    'effective_cloud_cover:p' is binned into 2, 'ceiling_height_agl:m' into 3
    and 'average_wind_speed' into 5 quantile bins; the results are stored in
    'binned_<column>' columns.

    Returns:
    - a new dataframe with the binned columns added
    """
    bins_per_column = {
        'effective_cloud_cover:p': 2,
        'ceiling_height_agl:m': 3,
        'average_wind_speed': 5,
    }

    # One call per column, in this order, so the new columns keep their order
    for column, n_bins in bins_per_column.items():
        df = bin_columns(df, [column], n_bins=n_bins)

    return df

def add_rate_of_change_features(df, features, second_order=False):
    """
    Adds row-to-row difference columns for the given features.

    For each feature, '<feature>_rate_of_change' holds the difference to the
    previous row (0 for the first row). With second_order=True,
    '<feature>_rate_of_change_of_change' additionally holds the difference of
    those differences. The frame is assumed to be sorted by time; it is
    modified in place and returned.
    """
    for feature in features:
        first_diff_name = f'{feature}_rate_of_change'
        # The first row has no predecessor; its NaN difference becomes 0
        df[first_diff_name] = df[feature].diff().fillna(0)

        if not second_order:
            continue

        second_diff_name = f'{feature}_rate_of_change_of_change'
        df[second_diff_name] = df[first_diff_name].diff().fillna(0)

    return df

def add_rate_of_change_features_to_df(df):
    """
    Adds first-order rate-of-change columns for the temperature, the radiation
    columns and the effective cloud cover (no second-order differences).
    """
    temperature = ['t_1000hPa:K']
    radiation = ['clear_sky_rad:W', 'diffuse_rad:W', 'direct_rad:W']
    cloud_and_total = ['effective_cloud_cover:p', 'total_radiation:W']

    # The concatenation order fixes the order of the new columns
    features_to_diff = temperature + radiation + cloud_and_total

    return add_rate_of_change_features(df, features_to_diff, second_order=False)

def add_est_obs_feature(df):
    """
    Adds an 'observed' flag telling observed data apart from estimated data.

    A frame with a 'date_calc' column is treated as estimated: 'observed' is set
    to 0 and a copy without 'date_calc' is returned. A frame without that
    column is treated as observed: 'observed' is set to 1 and the same frame is
    returned. In both cases the flag is written into the frame passed in.
    """
    is_estimated = 'date_calc' in df.columns
    df['observed'] = 0 if is_estimated else 1

    if is_estimated:
        return df.drop(columns=['date_calc'])
    return df
    
def remove_constant_regions(dataframe, column_name="pv_measurement", threshold=72):
    """
    Removes long runs of rows in which `column_name` keeps repeating the same value.

    A row counts as a repeat when its value equals the value of the row directly
    above it. Consecutive equal values form a run, and a whole run (its first row
    included) is removed when it contains more than `threshold` repeats, i.e. when
    the run is longer than `threshold + 1` rows. NaN never compares equal to NaN,
    so runs of missing values are left untouched.

    Parameters:
    - dataframe: pd.DataFrame, the frame to filter (not modified)
    - column_name: str, the column inspected for constant runs
    - threshold: int, the maximum number of repeats a run may have and still be kept

    Returns:
    - the input frame itself (after printing a warning) when `column_name` is
      missing, otherwise a new frame without the removed rows
    """

    # Check if the specified column exists in the dataframe
    if column_name not in dataframe.columns:
        print(f"Warning: '{column_name}' column not found in the dataframe. No rows removed.")
        return dataframe

    same_as_previous = dataframe[column_name].eq(dataframe[column_name].shift())
    # Every value change starts a new run id; repeats inherit the id of their run
    group_ids = (~same_as_previous).cumsum()
    to_remove = group_ids[same_as_previous].value_counts() > threshold
    group_ids_to_remove = to_remove[to_remove].index

    # Filter by row position rather than by index label: dropping by label would
    # also delete unrelated rows that happen to share a label with a removed row.
    keep_row = ~group_ids.isin(group_ids_to_remove)
    return dataframe[keep_row.to_numpy()]

def add_lagged_features(df, features_with__lags, fill_value=None):
    """
    Adds one shifted copy of each listed feature to the dataframe.

    Parameters:
    - df: pd.DataFrame, modified in place and also returned
    - features_with__lags: dict mapping a column name to the number of rows to
      shift it by. A positive shift takes the value from earlier rows, a negative
      shift from later rows. The new column is named '<feature>_lag_<shift>'.
    - fill_value: value written into the rows left empty by the shift; with the
      default None those rows stay NaN

    Returns:
    - df, with the shifted columns added
    """
    for feature, specific_lag in features_with__lags.items():
        shifted = df[feature].shift(specific_lag)
        # Series.fillna(None) raises ValueError, so only fill when a value was given
        if fill_value is not None:
            shifted = shifted.fillna(fill_value)
        df[f"{feature}_lag_{specific_lag}"] = shifted
    return df

def add_lagged_features_to_df(df):
    """
    Adds previous-row (+1) and next-row (-1) copies of the radiation features.

    For each of 'total_radiation:W', 'rad_diff:W' and 'total_radiation_1h:J' two
    columns are added, '<feature>_lag_1' and '<feature>_lag_-1'; rows left empty
    by the shift are filled with 0.

    Parameters:
    - df: pd.DataFrame containing the three radiation columns

    Returns:
    - the dataframe returned by add_lagged_features, with the six columns added
    """
    # Kept as a list of pairs: a dict literal cannot hold the same feature twice,
    # the later entry would silently replace the earlier one and drop the +1 lag.
    feature_lag_pairs = [
        ('total_radiation:W', 1),
        ('total_radiation:W', -1),
        ('rad_diff:W', 1),
        ('rad_diff:W', -1),
        ('total_radiation_1h:J', 1),
        ('total_radiation_1h:J', -1),
    ]

    # Add lagged features for specific lags
    for feature, lag in feature_lag_pairs:
        df = add_lagged_features(df, {feature: lag}, fill_value=0)
    return df

def handle_nan(df):
    """
    Drops rows without a target value and replaces every remaining NaN with 0.

    Frames that have no 'pv_measurement' column skip the row filter and are only
    zero-filled.

    Returns:
    - a new dataframe; the input is not modified
    """
    if 'pv_measurement' in df.columns:
        has_target = df['pv_measurement'].notna()
        df = df[has_target]

    return df.fillna(0)

# Aggregate the data to hourly with some aggregation methods for each column.
# Every column appears exactly once: a repeated key in a dict literal silently
# replaces the earlier entry, which hides which aggregation is really applied.
aggregation_methods = {
    'date_forecast': 'first',
    'diffuse_rad:W': 'sum',
    'direct_rad:W': 'last',
    'clear_sky_rad:W': 'sum',
    'diffuse_rad_1h:J': 'last',
    'direct_rad_1h:J': 'last',
    'clear_sky_energy_1h:J': 'last',
    'absolute_humidity_2m:gm3': 'mean',
    'air_density_2m:kgm3': 'mean',
    'ceiling_height_agl:m': 'max',
    # 'max' is the aggregation that was effectively in use for this column
    'cloud_base_agl:m': 'max',
    'dew_or_rime:idx': 'min',
    'dew_point_2m:K': 'mean',
    'effective_cloud_cover:p': 'sum',
    'elevation:m': 'first',
    'fresh_snow_12h:cm': 'max',
    'fresh_snow_1h:cm': 'sum',
    'fresh_snow_24h:cm': 'max',
    'fresh_snow_3h:cm': 'max',
    'fresh_snow_6h:cm': 'max',
    'is_day:idx': 'max',
    'is_in_shadow:idx': 'max',
    'msl_pressure:hPa': 'mean',
    'precip_5min:mm': 'sum',
    'precip_type_5min:idx': 'sum',
    'pressure_100m:hPa': 'mean',
    'pressure_50m:hPa': 'mean',
    'prob_rime:p': 'max',
    'rain_water:kgm2': 'sum',
    'relative_humidity_1000hPa:p': 'mean',
    'sfc_pressure:hPa': 'mean',
    'snow_density:kgm3': 'mean',
    'snow_depth:cm': 'max',
    'snow_drift:idx': 'max',
    'snow_melt_10min:mm': 'sum',
    'snow_water:kgm2': 'sum',
    'sun_azimuth:d': 'first',
    'sun_elevation:d': 'sum',
    'super_cooled_liquid_water:kgm2': 'sum',
    't_1000hPa:K': 'mean',
    'total_cloud_cover:p': 'mean',
    'visibility:m': 'mean',
    'wind_speed_10m:ms': 'mean',
    'wind_speed_u_10m:ms': 'mean',
    'wind_speed_v_10m:ms': 'mean',
    'wind_speed_w_1000hPa:ms': 'mean'
}

# Read in the data for the three locations. The targets' 'time' column is renamed
# to 'date_forecast' right away so it carries the same name as the timestamp
# column of the weather frames.
target_time_rename = {'time': 'date_forecast'}

x_target_A = pd.read_parquet('./data/A/train_targets.parquet').rename(columns=target_time_rename)
x_train_obs_A = pd.read_parquet('./data/A/X_train_observed.parquet')
x_train_est_A = pd.read_parquet('./data/A/X_train_estimated.parquet')
x_test_est_A = pd.read_parquet('./data/A/X_test_estimated.parquet')

x_target_B = pd.read_parquet('./data/B/train_targets.parquet').rename(columns=target_time_rename)
x_train_obs_B = pd.read_parquet('./data/B/X_train_observed.parquet')
x_train_est_B = pd.read_parquet('./data/B/X_train_estimated.parquet')
x_test_est_B = pd.read_parquet('./data/B/X_test_estimated.parquet')

x_target_C = pd.read_parquet('./data/C/train_targets.parquet').rename(columns=target_time_rename)
x_train_obs_C = pd.read_parquet('./data/C/X_train_observed.parquet')
x_train_est_C = pd.read_parquet('./data/C/X_train_estimated.parquet')
x_test_est_C = pd.read_parquet('./data/C/X_test_estimated.parquet')

# Fix missing data for the test sets, assuming NaN means 0 in these columns
zero_filled_columns = [
    'effective_cloud_cover:p',
    'total_cloud_cover:p',
    'cloud_base_agl:m',
    'ceiling_height_agl:m',
    'snow_density:kgm3',
    'snow_drift:idx',
]

for test_frame in (x_test_est_A, x_test_est_B, x_test_est_C):
    for column in zero_filled_columns:
        test_frame[column] = test_frame[column].fillna(0)

# Resample every weather frame to one row per hour
def _to_hourly(frame):
    """
    Groups a weather frame into hourly buckets on 'date_forecast' and aggregates
    each column with the method configured in `aggregation_methods`.
    """
    hourly_buckets = pd.Grouper(key='date_forecast', freq='1H')
    return frame.groupby(hourly_buckets).aggregate(aggregation_methods)

x_train_obs_A_resampled = _to_hourly(x_train_obs_A)
x_train_est_A_resampled = _to_hourly(x_train_est_A)
x_test_est_A_resampled = _to_hourly(x_test_est_A)

x_train_obs_B_resampled = _to_hourly(x_train_obs_B)
x_train_est_B_resampled = _to_hourly(x_train_est_B)
x_test_est_B_resampled = _to_hourly(x_test_est_B)

x_train_obs_C_resampled = _to_hourly(x_train_obs_C)
x_train_est_C_resampled = _to_hourly(x_train_est_C)
x_test_est_C_resampled = _to_hourly(x_test_est_C)

# Merge the hourly features with the targets.
# The targets are split at the first timestamp of the estimated data: rows before
# it are paired with the observed features, rows from it onward with the
# estimated features.
# np.flatnonzero gives the row *position* of that timestamp, which is what .iloc
# expects; the index label only coincides with the position for a default
# 0..n-1 index. An IndexError is still raised if the timestamp is not found.
split_value = x_train_est_A['date_forecast'].iloc[0]
split_index = np.flatnonzero((x_target_A['date_forecast'] == split_value).to_numpy())[0]

x_target_obs_A = x_target_A.iloc[:split_index]
x_target_est_A = x_target_A.iloc[split_index:]

obs_A = x_train_obs_A_resampled.merge(x_target_obs_A, left_index=True, right_on='date_forecast')
est_A = x_train_est_A_resampled.merge(x_target_est_A, left_index=True, right_on='date_forecast')

split_value = x_train_est_B['date_forecast'].iloc[0]
split_index = np.flatnonzero((x_target_B['date_forecast'] == split_value).to_numpy())[0]

x_target_obs_B = x_target_B.iloc[:split_index]
x_target_est_B = x_target_B.iloc[split_index:]

obs_B = x_train_obs_B_resampled.merge(x_target_obs_B, left_index=True, right_on='date_forecast')
est_B = x_train_est_B_resampled.merge(x_target_est_B, left_index=True, right_on='date_forecast')

split_value = x_train_est_C['date_forecast'].iloc[0]
split_index = np.flatnonzero((x_target_C['date_forecast'] == split_value).to_numpy())[0]

x_target_obs_C = x_target_C.iloc[:split_index]
x_target_est_C = x_target_C.iloc[split_index:]

obs_C = x_train_obs_C_resampled.merge(x_target_obs_C, left_index=True, right_on='date_forecast')
est_C = x_train_est_C_resampled.merge(x_target_est_C, left_index=True, right_on='date_forecast')

# The test frames are the hourly test features as they are (their
# 'date_forecast' column is kept), minus every row that has a NaN in any column
test_A = x_test_est_A_resampled.dropna()
test_B = x_test_est_B_resampled.dropna()
test_C = x_test_est_C_resampled.dropna()

def preprocessing(df):
    """
    Runs the complete feature-engineering pipeline on one dataframe.

    The steps are applied in a fixed order, each one receiving a fresh copy of the
    previous step's result: experimental features, date features, binned features,
    rate-of-change features, the observed/estimated flag, removal of constant target
    regions, lagged features and finally NaN handling.

    Returns the dataframe produced by the last step.
    """
    pipeline = (
        add_experimental_features,
        add_date_features,
        add_binned_features,
        add_rate_of_change_features_to_df,
        add_est_obs_feature,
        remove_constant_regions,
        add_lagged_features_to_df,
        handle_nan,
    )

    for step in pipeline:
        df = step(df.copy())

    return df

# Preprocess the observed, estimated and test frame of every location (A, then B, then C)
obs_A, est_A, test_A = [preprocessing(frame.copy()) for frame in (obs_A, est_A, test_A)]

obs_B, est_B, test_B = [preprocessing(frame.copy()) for frame in (obs_B, est_B, test_B)]

obs_C, est_C, test_C = [preprocessing(frame.copy()) for frame in (obs_C, est_C, test_C)]

# Random seeds used for reproducibility, with the ensemble weights used for each seed
# 32 weights: 0.3, 0.3, 0.4
# 24 weights: 0.3, 0.3, 0.4
# 33 (without winter months 1 and 12) weights: 0.2, 0.4, 0.4
# 11 (without winter months 1, 2 and 11, 12) weights: 0.25, 0.35, 0.4
# 5 weights: 0.4, 0.3, 0.3

# Best score is the mean prediction of all the 5 seeds mentioned above. The first weight is xgboost, the second is catboost, and the third is autogluon.

# Set the random seed of NumPy's global generator (used by the np.random.choice day sampling in this cell)
np.random.seed(5)

# Concatenate the observed and estimated rows into one training frame per location
A = pd.concat([obs_A, est_A])
B = pd.concat([obs_B, est_B])
C = pd.concat([obs_C, est_C])

# Remove characters unparseable for CatBoost
# One translation table replaces the six copy-pasted chains of str.replace calls; it deletes
# exactly the same eleven characters: [ ] , { } ( ) " ' : and the backslash.
_CATBOOST_UNSAFE_CHARS = str.maketrans('', '', '[],{}()"\':\\')


def _sanitize_column_names(columns):
    """Returns the given column names as a list with every CatBoost-unsafe character removed."""
    return [col.translate(_CATBOOST_UNSAFE_CHARS) for col in columns]


A.columns = _sanitize_column_names(A.columns)
B.columns = _sanitize_column_names(B.columns)
C.columns = _sanitize_column_names(C.columns)

test_A.columns = _sanitize_column_names(test_A.columns)
test_B.columns = _sanitize_column_names(test_B.columns)
test_C.columns = _sanitize_column_names(test_C.columns)

# Getting validation data from summer months, because the test set is from summer months. We experimented with excluding winter months
# from the training data here.

# Step 1: Month filter for A. All twelve months are listed at the moment, so no month is excluded;
# shorten the list to leave winter months out of the data.
A = A[A['date_forecast'].dt.month.isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])]

# Step 2: Identify unique days within May and June (months 5 and 6)
summer_months = A[A['date_forecast'].dt.month.isin([5, 6])]
unique_summer_days = summer_months['date_forecast'].dt.date.unique()

# Step 3: Sample 40% of these days (without replacement) for val_A; all rows of a sampled day go to validation
sampled_days = np.random.choice(unique_summer_days, size=int(len(unique_summer_days) * 0.4), replace=False)
val_A = A[A['date_forecast'].dt.date.isin(sampled_days)]

# Step 4: Define train_A as the remaining data
train_A = A[~A['date_forecast'].dt.date.isin(sampled_days)]

# Prepare your features and target variables
X_train_A = train_A.drop(columns='pv_measurement')
y_train_A = train_A['pv_measurement']
X_val_A = val_A.drop(columns='pv_measurement')
y_val_A = val_A['pv_measurement']

# Repeat for B and C
# Each location draws its own sample of validation days from the global NumPy generator,
# so the order A, B, C of the np.random.choice calls determines which days are picked.
B = B[B['date_forecast'].dt.month.isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])]
summer_months = B[B['date_forecast'].dt.month.isin([5, 6])]
unique_summer_days = summer_months['date_forecast'].dt.date.unique()
sampled_days = np.random.choice(unique_summer_days, size=int(len(unique_summer_days) * 0.4), replace=False)
val_B = B[B['date_forecast'].dt.date.isin(sampled_days)]
train_B = B[~B['date_forecast'].dt.date.isin(sampled_days)]
X_train_B = train_B.drop(columns='pv_measurement')
y_train_B = train_B['pv_measurement']
X_val_B = val_B.drop(columns='pv_measurement')
y_val_B = val_B['pv_measurement']

C = C[C['date_forecast'].dt.month.isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])]
summer_months = C[C['date_forecast'].dt.month.isin([5, 6])]
unique_summer_days = summer_months['date_forecast'].dt.date.unique()
sampled_days = np.random.choice(unique_summer_days, size=int(len(unique_summer_days) * 0.4), replace=False)
val_C = C[C['date_forecast'].dt.date.isin(sampled_days)]
train_C = C[~C['date_forecast'].dt.date.isin(sampled_days)]
X_train_C = train_C.drop(columns='pv_measurement')
y_train_C = train_C['pv_measurement']
X_val_C = val_C.drop(columns='pv_measurement')
y_val_C = val_C['pv_measurement']

# Drop date_forecast and its _x/_y variants (presumably created by the merge with the targets -- confirm)
# train_* and val_* keep the pv_measurement column; they are wrapped as AutoGluon datasets and
# indexed for the XGBoost feature subsets further down.
train_A = train_A.drop(columns=['date_forecast', 'date_forecast_x', 'date_forecast_y'])
train_B = train_B.drop(columns=['date_forecast', 'date_forecast_x', 'date_forecast_y'])
train_C = train_C.drop(columns=['date_forecast', 'date_forecast_x', 'date_forecast_y'])
val_A = val_A.drop(columns=['date_forecast', 'date_forecast_x', 'date_forecast_y'])
val_B = val_B.drop(columns=['date_forecast', 'date_forecast_x', 'date_forecast_y'])
val_C = val_C.drop(columns=['date_forecast', 'date_forecast_x', 'date_forecast_y'])
# X_* are the feature-only frames (pv_measurement already removed) used by CatBoost
X_train_A = X_train_A.drop(columns=['date_forecast', 'date_forecast_x', 'date_forecast_y'])
X_train_B = X_train_B.drop(columns=['date_forecast', 'date_forecast_x', 'date_forecast_y'])
X_train_C = X_train_C.drop(columns=['date_forecast', 'date_forecast_x', 'date_forecast_y'])
X_val_A = X_val_A.drop(columns=['date_forecast', 'date_forecast_x', 'date_forecast_y'])
X_val_B = X_val_B.drop(columns=['date_forecast', 'date_forecast_x', 'date_forecast_y'])
X_val_C = X_val_C.drop(columns=['date_forecast', 'date_forecast_x', 'date_forecast_y'])
# The test frames were never merged, so only the plain date_forecast column is dropped
test_A = test_A.drop(columns=['date_forecast'])
test_B = test_B.drop(columns=['date_forecast'])
test_C = test_C.drop(columns=['date_forecast'])

# AutoGluon datasets: training data and tuning (validation) data per location
train_auto_A = TabularDataset(train_A)
val_auto_A = TabularDataset(val_A)

train_auto_B = TabularDataset(train_B)
val_auto_B = TabularDataset(val_B)

train_auto_C = TabularDataset(train_C)
val_auto_C = TabularDataset(val_C)

# Name of the target column handed to TabularPredictor
auto_label = 'pv_measurement'

# Set the parameters for the XGBoost models
# Location A has its own settings (gamma, learning_rate, reg_alpha, reg_lambda) and is the only
# one that sets num_parallel_tree; the dictionaries for B and C are identical to each other.
params_xgb_A = {
    'colsample_bytree': 0.8, 
    'gamma': 0.4, 
    'learning_rate': 0.012, 
    'max_depth': 15, 
    'min_child_weight': 10, 
    'n_estimators': 600, 
    'reg_alpha': 0.8, 
    'reg_lambda': 0.8, 
    'subsample': 0.912,
    'random_state': 0, 
    'booster': 'gbtree',
    'n_jobs': -1,
    'num_parallel_tree': 2
}

params_xgb_B = {
    'colsample_bytree': 0.8, 
    'gamma': 0.8, 
    'learning_rate': 0.008, 
    'max_depth': 15, 
    'min_child_weight': 10, 
    'n_estimators': 600, 
    'reg_alpha': 1, 
    'reg_lambda': 3, 
    'subsample': 0.912,
    'random_state': 0, 
    'booster': 'gbtree',
    'n_jobs': -1
}

params_xgb_C = {
    'colsample_bytree': 0.8, 
    'gamma': 0.8, 
    'learning_rate': 0.008, 
    'max_depth': 15, 
    'min_child_weight': 10, 
    'n_estimators': 600, 
    'reg_alpha': 1, 
    'reg_lambda': 3, 
    'subsample': 0.912,
    'random_state': 0, 
    'booster': 'gbtree',
    'n_jobs': -1
}

# One regressor per location, each built from its own parameter dictionary
xgb_A = xgb.XGBRegressor(**params_xgb_A)
xgb_B = xgb.XGBRegressor(**params_xgb_B)
xgb_C = xgb.XGBRegressor(**params_xgb_C)

# One CatBoost regressor per location; all three use the same settings.
# The commented-out arguments are options that are currently not passed to the constructor.
cat_A = CatBoostRegressor(
    iterations=5000,         # The number of trees to build
    #learning_rate=0.09,     # The learning rate
    #depth=10,               # Depth of the tree
    loss_function='MAE',     # Loss function to be optimized (MAE here, same as the evaluation metric)
    eval_metric='MAE',       # Evaluation metric for the validation set
    #random_seed=42,         # Seed for reproducibility
    #verbose=100             # Frequency of logging the training process
)

cat_B = CatBoostRegressor(
    iterations=5000,
    #learning_rate=0.09,
    #depth=10,
    loss_function='MAE',
    eval_metric='MAE',
    #random_seed=42,
    #verbose=100
)

cat_C = CatBoostRegressor(
    iterations=5000,
    #learning_rate=0.09,
    #depth=10,
    loss_function='MAE',
    eval_metric='MAE',
    #random_seed=42,
    #verbose=100
)

# Prepare data for the XGBoost models. We got them to work the best when having fewer columns
# The names below are the sanitized column names (special characters such as ':' already removed).
xgb_columns = [
    'total_radiationW',
    'snow_accumulation',
    'super_cooled_liquid_waterkgm2',
    'average_wind_speed',
    'sun_elevationd',
    'sun_azimuthd',
    'clear_sky_radW',
    'month',
    't_1000hPaC',
    'msl_pressurehPa_scaled',
    'rain_waterkgm2',
    'cloud_base_aglm',
    'effective_cloud_coverp',
    'dew_or_rimeidx'
]
# Debug aid: lists the columns available in the training frame
print(train_A.columns)

# Per location: reduced training features, training target and reduced test features
X_train_xgb_A = train_A[xgb_columns]
y_train_xgb_A = train_A['pv_measurement']
X_test_xgb_A = test_A[xgb_columns]

X_train_xgb_B = train_B[xgb_columns]
y_train_xgb_B = train_B['pv_measurement']
X_test_xgb_B = test_B[xgb_columns]

X_train_xgb_C = train_C[xgb_columns]
y_train_xgb_C = train_C['pv_measurement']
X_test_xgb_C = test_C[xgb_columns]

# Train the XGBoost models
# The evaluation metric is configured with set_params() instead of being passed to fit():
# the fit() keyword is deprecated since XGBoost 1.6 and is rejected by newer releases.
# No eval_set is supplied, so the metric has no influence on the fitted trees either way.
xgb_A.set_params(eval_metric='mae')
xgb_A.fit(
    X=X_train_xgb_A, y=y_train_xgb_A,
    verbose=False
)

xgb_B.set_params(eval_metric='mae')
xgb_B.fit(
    X=X_train_xgb_B, y=y_train_xgb_B,
    verbose=False
)

xgb_C.set_params(eval_metric='mae')
xgb_C.fit(
    X=X_train_xgb_C, y=y_train_xgb_C,
    verbose=False
)

# Train the CatBoost models
# Each model is fitted on the full feature frame of its location and evaluated on that location's
# validation days; use_best_model=True is requested together with the eval_set.
cat_A.fit(
    X_train_A, y_train_A,
    eval_set=(X_val_A, y_val_A),
    use_best_model=True
)

cat_B.fit(
    X_train_B, y_train_B,
    eval_set=(X_val_B, y_val_B),
    use_best_model=True
)

cat_C.fit(
    X_train_C, y_train_C,
    eval_set=(X_val_C, y_val_C),
    use_best_model=True
)

def _fit_autogluon(train_data, tuning_data):
    """
    Fits one AutoGluon regressor on train_data, using tuning_data as the tuning set.

    The predictor optimises mean absolute error on the auto_label column; the value
    returned by fit() is handed back to the caller.
    """
    predictor = TabularPredictor(
        label=auto_label,
        eval_metric='mean_absolute_error',
        problem_type='regression',
    )
    return predictor.fit(
        train_data,
        presets='medium_quality',
        tuning_data=tuning_data,
        use_bag_holdout=True,
        ag_args_ensemble={'fold_fitting_strategy': 'sequential_local'},
    )


# One AutoGluon predictor per location, fitted in the order A, B, C
auto_A = _fit_autogluon(train_auto_A, val_auto_A)

auto_B = _fit_autogluon(train_auto_B, val_auto_B)

auto_C = _fit_autogluon(train_auto_C, val_auto_C)

# Ensemble weights for this seed (xgboost, catboost, autogluon)
xgb_weight = 0.4
cat_weight = 0.3
auto_weight = 0.3

# Every XGBoost model scores the reduced test features of its OWN location.
# (Bug fix: models A and B were previously fed X_test_xgb_C, i.e. the features of location C.)
pred_xgb_A = xgb_A.predict(X_test_xgb_A)
pred_xgb_B = xgb_B.predict(X_test_xgb_B)
pred_xgb_C = xgb_C.predict(X_test_xgb_C)

# AutoGluon and CatBoost predict on the full test frame of each location
pred_auto_A = auto_A.predict(test_A)
pred_auto_B = auto_B.predict(test_B)
pred_auto_C = auto_C.predict(test_C)

pred_cat_A = cat_A.predict(test_A)
pred_cat_B = cat_B.predict(test_B)
pred_cat_C = cat_C.predict(test_C)

# Ensemble that seemed the best after some experimentation: weighted sum of the three models
pred_A = (pred_xgb_A*xgb_weight + pred_cat_A*cat_weight + pred_auto_A*auto_weight)
pred_B = (pred_xgb_B*xgb_weight + pred_cat_B*cat_weight + pred_auto_B*auto_weight)
pred_C = (pred_xgb_C*xgb_weight + pred_cat_C*cat_weight + pred_auto_C*auto_weight)

# Negative predictions are raised to 0; there is no upper bound
pred_A = np.clip(pred_A, 0, None)
pred_B = np.clip(pred_B, 0, None)
pred_C = np.clip(pred_C, 0, None)

# Concatenate predictions in the location order A, B, C
predictions = np.concatenate([pred_A, pred_B, pred_C])

# Save predictions under a seed-specific name (this cell uses seed 5)
predictions_5 = predictions

# Create an id array: consecutive ids 0..n-1 following the concatenated order
ids = np.arange(0, len(predictions_5))

# Create a DataFrame with one row per prediction
df = pd.DataFrame({
    'id': ids,
    'prediction': predictions_5
})

# Save to CSV (the dataframe index is not written)
df.to_csv('predictions_5.csv', index=False)

In [None]:
# Sequence 6

%reset -f

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
from catboost import CatBoostRegressor
from autogluon.tabular import TabularDataset, TabularPredictor
import warnings
warnings.filterwarnings('ignore')

def add_experimental_features(df):
    """
    Derives hand-crafted weather features and writes them into the dataframe in place.

    The new columns cover radiation sums, differences and a ratio, temperature and
    pressure transforms, wind aggregates, cloud and snow aggregates, pairwise
    interaction terms and a precipitation product. Infinite values anywhere in the
    frame are replaced with 0 before returning.

    Returns the same dataframe object that was passed in.
    """

    def min_max_scaled(column):
        # The scaler is fitted on the frame passed to this call only
        return MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1))

    # Source columns that are used by several features
    direct = df['direct_rad:W']
    diffuse = df['diffuse_rad:W']
    direct_1h = df['direct_rad_1h:J']
    diffuse_1h = df['diffuse_rad_1h:J']
    temperature = df['t_1000hPa:K']
    dew_point = df['dew_point_2m:K']
    humidity = df['absolute_humidity_2m:gm3']

    # Radiation: totals, differences and the diffuse/direct ratio
    total_radiation = direct + diffuse
    df['total_radiation:W'] = total_radiation
    df['total_radiation_1h:J'] = direct_1h + diffuse_1h
    df['rad_diff:W'] = direct - diffuse
    df['rad_diff_1h:J'] = direct_1h - diffuse_1h
    df['diffuse_direct_ratio'] = diffuse / direct

    # Temperature and pressure: spreads, Kelvin -> Celsius shifts and min-max scaled pressures
    df['temp_dewpoint_diff'] = temperature - dew_point
    df['pressure_gradient'] = df['pressure_100m:hPa'] - df['pressure_50m:hPa']
    df['t_1000hPa:C'] = temperature - 273.15
    df['dew_point_2m:C'] = dew_point - 273.15
    df['msl_pressure:hPa_scaled'] = min_max_scaled('msl_pressure:hPa')
    df['sfc_pressure:hPa_scaled'] = min_max_scaled('sfc_pressure:hPa')

    # Wind: magnitude of the (u, v, w) components and the mean of wind speed and the u component
    squared_components = df['wind_speed_u_10m:ms']**2 + df['wind_speed_v_10m:ms']**2 + df['wind_speed_w_1000hPa:ms']**2
    df['wind_vector_magnitude'] = squared_components**0.5
    average_wind = (df['wind_speed_10m:ms'] + df['wind_speed_u_10m:ms']) / 2
    df['average_wind_speed'] = average_wind

    # Cloud and snow
    df['cloud_humidity_product'] = df['total_cloud_cover:p'] * humidity
    snow_columns = ['fresh_snow_24h:cm', 'fresh_snow_12h:cm', 'fresh_snow_6h:cm', 'fresh_snow_3h:cm', 'fresh_snow_1h:cm']
    df['snow_accumulation'] = df[snow_columns].sum(axis=1)

    # Interaction terms
    # direct radiation x effective cloud cover
    df['radiation_cloud_interaction'] = direct * df['effective_cloud_cover:p']
    # temperature x total radiation (considering that high temperature may reduce efficiency)
    df['temp_rad_interaction'] = temperature * total_radiation
    # average wind speed x temperature (wind cooling effect)
    df['wind_temp_interaction'] = average_wind * temperature
    # humidity x temperature
    df['humidity_temp_interaction'] = humidity * temperature
    # sun elevation x direct radiation
    df['sun_elevation_direct_rad_interaction'] = df['sun_elevation:d'] * direct

    # Precipitation: amount multiplied by the precipitation type index
    df['precip'] = df['precip_5min:mm'] * df['precip_type_5min:idx']

    # Safeguard in case of inf values (e.g. from the ratio above)
    df.replace([np.inf, -np.inf], 0, inplace=True)

    return df

def add_date_features(df):
    """
    Adds calendar features derived from the 'date_forecast' column.

    'date_forecast' is converted to datetime and the columns 'month', 'year', 'hour',
    'day', 'hour_sin' and 'hour_cos' are added in place (the last two encode the hour
    of the day on a 24-hour circle). When the dataframe has no 'date_forecast' column
    a warning is printed and the dataframe is returned unchanged.
    """
    # Guard clause: nothing to derive without the timestamp column
    if 'date_forecast' not in df.columns:
        print("Warning: 'date_forecast' column not found in the dataframe. No date features added.")
        return df

    df['date_forecast'] = pd.to_datetime(df['date_forecast'])
    timestamps = df['date_forecast'].dt

    # Plain calendar components
    df['month'] = timestamps.month
    df['year'] = timestamps.year
    df['hour'] = timestamps.hour
    df['day'] = timestamps.day

    # Cyclic encoding of the hour of the day
    df['hour_sin'] = np.sin(2*np.pi*df['hour']/24)
    df['hour_cos'] = np.cos(2*np.pi*df['hour']/24)

    return df

# Adding discretized features for the continuous variables to help tree-based models

def bin_columns(dataframe, columns_to_bin, n_bins=5):
    """
    Adds a quantile-binned copy of each requested column.

    Parameters:
    - dataframe: pd.DataFrame, left unmodified (the work happens on a copy)
    - columns_to_bin: list of strings, the names of the columns to bin
    - n_bins: int or dict; an int is used for every column, a dict maps column names to
              their number of bins (columns missing from the dict get 5)

    Returns:
    - pd.DataFrame, a copy of the input with one extra 'binned_<column>' column per
      requested column holding the integer bin codes from pd.qcut (duplicate bin
      edges are dropped, so fewer bins than requested may result)
    """
    result = dataframe.copy()
    same_for_all = isinstance(n_bins, int)

    for name in columns_to_bin:
        # Resolve how many quantiles to request for this column
        if same_for_all:
            quantiles = n_bins
        else:
            quantiles = n_bins.get(name, 5)

        codes = pd.qcut(result[name], q=quantiles, labels=False, duplicates='drop')
        result[f'binned_{name}'] = codes

    return result

def add_binned_features(df):
    """
    Adds quantile-binned versions of three columns via bin_columns.

    The columns and their number of bins are:
    - 'effective_cloud_cover:p': 2
    - 'ceiling_height_agl:m': 3
    - 'average_wind_speed': 5

    Returns a new dataframe with the three extra 'binned_<column>' columns.
    """
    # (column, number of quantile bins), applied in this order
    bins_per_column = [
        ('effective_cloud_cover:p', 2),
        ('ceiling_height_agl:m', 3),
        ('average_wind_speed', 5),
    ]

    for column, bins in bins_per_column:
        df = bin_columns(df, [column], n_bins=bins)

    return df

def add_rate_of_change_features(df, features, second_order=False):
    """
    Adds row-to-row difference columns for the given features, in place.

    For every feature a '<feature>_rate_of_change' column holds the difference to the
    previous row (0 for the first row). With second_order=True a
    '<feature>_rate_of_change_of_change' column with the difference of those
    differences is added as well. The rows are assumed to be sorted by time.

    Returns the same dataframe object that was passed in.
    """
    for name in features:
        first_order_column = name + '_rate_of_change'
        # The first row has no predecessor, so its difference is set to 0
        df[first_order_column] = df[name].diff().fillna(0)

        if not second_order:
            continue

        second_order_column = name + '_rate_of_change_of_change'
        df[second_order_column] = df[first_order_column].diff().fillna(0)

    return df

def add_rate_of_change_features_to_df(df):
    """
    Adds first-order rate-of-change columns for a fixed set of temperature,
    radiation and cloud-cover features.
    """
    features_to_diff = [
        't_1000hPa:K',
        'clear_sky_rad:W',
        'diffuse_rad:W',
        'direct_rad:W',
        'effective_cloud_cover:p',
        'total_radiation:W',
    ]

    # Only the first-order differences are requested
    df_with_rates = add_rate_of_change_features(df, features_to_diff, second_order=False)
    return df_with_rates

def add_est_obs_feature(df):
    """
    Adds an 'observed' flag column: 1 when the dataframe has no 'date_calc' column,
    0 when it has one. In the second case 'date_calc' is dropped from the returned
    dataframe.

    NOTE(review): aggregation_methods in this notebook has no 'date_calc' entry, so the
    resampled frames presumably never carry that column and the flag ends up as 1
    everywhere -- confirm.
    """
    has_calc_date = 'date_calc' in df.columns

    # The flag is written into the passed-in dataframe in both cases
    df['observed'] = 0 if has_calc_date else 1

    if has_calc_date:
        return df.drop(columns=['date_calc'])
    return df
    
def remove_constant_regions(dataframe, column_name="pv_measurement", threshold=72):
    """
    Drops long runs of identical consecutive values in one column.

    A run is removed entirely when the number of rows in it that repeat their
    predecessor exceeds `threshold`. When the column is missing, a warning is printed
    and the dataframe is returned as is.
    """
    if column_name not in dataframe.columns:
        print(f"Warning: '{column_name}' column not found in the dataframe. No rows removed.")
        return dataframe

    values = dataframe[column_name]

    # True for every row whose value equals the value of the row before it
    repeats_previous = values.eq(values.shift())

    # Every change of value starts a new run, so the cumulative sum numbers the runs
    run_ids = (~repeats_previous).cumsum()

    # Count the repeated rows per run and keep the ids of the runs above the threshold
    repeats_per_run = run_ids[repeats_previous].value_counts()
    long_run_ids = repeats_per_run[repeats_per_run > threshold].index

    # Drop entire rows belonging to those runs
    rows_in_long_runs = dataframe[run_ids.isin(long_run_ids)].index
    return dataframe.drop(rows_in_long_runs)

def add_lagged_features(df, features_with__lags, fill_value=None):
    """
    Adds shifted ("lagged") copies of the given columns to the dataframe, in place.

    Parameters:
    - df: pd.DataFrame holding the feature columns
    - features_with__lags: dict mapping a column name to the number of rows to shift by
      (positive values take the value of an earlier row, negative values of a later row)
    - fill_value: value for the rows left empty by the shift; with the default None
      those rows stay NaN

    Returns the same dataframe with one '<feature>_lag_<lag>' column per dict entry.
    """
    for feature, specific_lag in features_with__lags.items():
        lagged = df[feature].shift(specific_lag)

        # fillna(None) raises, so the gaps are only filled when a value was actually given
        if fill_value is not None:
            lagged = lagged.fillna(fill_value)

        df[f"{feature}_lag_{specific_lag}"] = lagged

    return df

def add_lagged_features_to_df(df):
    """
    Adds the previous-row (lag 1) and next-row (lag -1) value of the total radiation and
    radiation difference features; rows without a neighbour are filled with 0.

    Returns the dataframe with six extra '<feature>_lag_<lag>' columns.
    """
    # (feature, lag) pairs. These were previously written as one dict literal, where the
    # second entry for a feature silently replaced the first one, so only the lag -1
    # columns were ever created. A list keeps both lags of every feature.
    lagged_features = [
        ('total_radiation:W', 1),
        ('total_radiation:W', -1),
        ('rad_diff:W', 1),
        ('rad_diff:W', -1),
        ('total_radiation_1h:J', 1),
        ('total_radiation_1h:J', -1),
    ]

    # Add lagged features for specific lags, one (feature, lag) pair at a time
    for feature, lag in lagged_features:
        df = add_lagged_features(df, {feature: lag}, fill_value=0)

    return df

def handle_nan(df):
    """
    Removes the rows without a target value and sets every remaining NaN to 0.

    Dataframes without a 'pv_measurement' column keep all their rows; only the
    NaN replacement is applied to them.
    """
    # Remove the rows where target is nan (only possible when the target column exists)
    if 'pv_measurement' in df.columns:
        has_target = df['pv_measurement'].notna()
        df = df[has_target]

    # Set all remaining nans to 0
    filled = df.fillna(0)
    return filled

# Aggregate the data to hourly with some aggregation methods for each column
# Maps every column that is kept to the method used when its rows are combined into one hourly row.
# Each column is listed exactly once. 'cloud_base_agl:m' used to appear twice ('mean', then
# 'max') and 'snow_density:kgm3' twice ('mean' both times); in a dict literal the last value
# wins, so the effective methods 'max' and 'mean' are kept here.
aggregation_methods = {
    'date_forecast': 'first',
    'diffuse_rad:W': 'sum',
    'direct_rad:W': 'last',
    'clear_sky_rad:W': 'sum',
    'diffuse_rad_1h:J': 'last',
    'direct_rad_1h:J': 'last',
    'clear_sky_energy_1h:J': 'last',
    'absolute_humidity_2m:gm3': 'mean',
    'air_density_2m:kgm3': 'mean',
    'ceiling_height_agl:m': 'max',
    'cloud_base_agl:m': 'max',
    'dew_or_rime:idx': 'min',
    'dew_point_2m:K': 'mean',
    'effective_cloud_cover:p': 'sum',
    'elevation:m': 'first',
    'fresh_snow_12h:cm': 'max',
    'fresh_snow_1h:cm': 'sum',
    'fresh_snow_24h:cm': 'max',
    'fresh_snow_3h:cm': 'max',
    'fresh_snow_6h:cm': 'max',
    'is_day:idx': 'max',
    'is_in_shadow:idx': 'max',
    'msl_pressure:hPa': 'mean',
    'precip_5min:mm': 'sum',
    'precip_type_5min:idx': 'sum',
    'pressure_100m:hPa': 'mean',
    'pressure_50m:hPa': 'mean',
    'prob_rime:p': 'max',
    'rain_water:kgm2': 'sum',
    'relative_humidity_1000hPa:p': 'mean',
    'sfc_pressure:hPa': 'mean',
    'snow_density:kgm3': 'mean',
    'snow_depth:cm': 'max',
    'snow_drift:idx': 'max',
    'snow_melt_10min:mm': 'sum',
    'snow_water:kgm2': 'sum',
    'sun_azimuth:d': 'first',
    'sun_elevation:d': 'sum',
    'super_cooled_liquid_water:kgm2': 'sum',
    't_1000hPa:K': 'mean',
    'total_cloud_cover:p': 'mean',
    'visibility:m': 'mean',
    'wind_speed_10m:ms': 'mean',
    'wind_speed_u_10m:ms': 'mean',
    'wind_speed_v_10m:ms': 'mean',
    'wind_speed_w_1000hPa:ms': 'mean'
}

# Read in the data
# Per location (A, B, C): the training targets plus the observed, estimated and test feature files
x_target_A = pd.read_parquet('./data/A/train_targets.parquet')
x_train_obs_A = pd.read_parquet('./data/A/X_train_observed.parquet')
x_train_est_A = pd.read_parquet('./data/A/X_train_estimated.parquet')
x_test_est_A = pd.read_parquet('./data/A/X_test_estimated.parquet')

x_target_B = pd.read_parquet('./data/B/train_targets.parquet')
x_train_obs_B = pd.read_parquet('./data/B/X_train_observed.parquet')
x_train_est_B = pd.read_parquet('./data/B/X_train_estimated.parquet')
x_test_est_B = pd.read_parquet('./data/B/X_test_estimated.parquet')

x_target_C = pd.read_parquet('./data/C/train_targets.parquet')
x_train_obs_C = pd.read_parquet('./data/C/X_train_observed.parquet')
x_train_est_C = pd.read_parquet('./data/C/X_train_estimated.parquet')
x_test_est_C = pd.read_parquet('./data/C/X_test_estimated.parquet')

# Rename time to date_forecast in target, so targets and features use the same name for the time column
x_target_A.rename(columns={'time': 'date_forecast'}, inplace=True)
x_target_B.rename(columns={'time': 'date_forecast'}, inplace=True)
x_target_C.rename(columns={'time': 'date_forecast'}, inplace=True)

# Fix missing data for test set. Assuming NaN means 0 in these categories
test_columns_nan_as_zero = [
    'effective_cloud_cover:p',
    'total_cloud_cover:p',
    'cloud_base_agl:m',
    'ceiling_height_agl:m',
    'snow_density:kgm3',
    'snow_drift:idx',
]

# Same replacement for the test features of every location
for test_frame in (x_test_est_A, x_test_est_B, x_test_est_C):
    for column_name in test_columns_nan_as_zero:
        test_frame[column_name] = test_frame[column_name].fillna(0)

# Resample
# Every feature frame is grouped into 1-hour bins on date_forecast and each column is aggregated
# with the method listed for it in aggregation_methods.
x_train_obs_A_resampled = x_train_obs_A.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)
x_train_est_A_resampled = x_train_est_A.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)
x_test_est_A_resampled = x_test_est_A.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)

x_train_obs_B_resampled = x_train_obs_B.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)
x_train_est_B_resampled = x_train_est_B.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)
x_test_est_B_resampled = x_test_est_B.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)

x_train_obs_C_resampled = x_train_obs_C.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)
x_train_est_C_resampled = x_train_est_C.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)
x_test_est_C_resampled = x_test_est_C.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)

# Merge
# For each location the first date_forecast of the estimated features marks where the targets are cut:
# target rows before that point are joined with the observed features, the remaining rows with the
# estimated features. The join matches the hourly index of the resampled features against the
# date_forecast column of the targets.
# NOTE(review): split_index is an index label (taken from .index[0]) but is used positionally in .iloc;
# this presumably relies on x_target_* having a default 0..n-1 index -- confirm.
split_value = x_train_est_A['date_forecast'].iloc[0]
split_index = x_target_A[x_target_A['date_forecast'] == split_value].index[0]

x_target_obs_A = x_target_A.iloc[:split_index]
x_target_est_A = x_target_A.iloc[split_index:]

obs_A = x_train_obs_A_resampled.merge(x_target_obs_A, left_index=True, right_on='date_forecast')
est_A = x_train_est_A_resampled.merge(x_target_est_A, left_index=True, right_on='date_forecast')

# Same split and join for location B
split_value = x_train_est_B['date_forecast'].iloc[0]
split_index = x_target_B[x_target_B['date_forecast'] == split_value].index[0]

x_target_obs_B = x_target_B.iloc[:split_index]
x_target_est_B = x_target_B.iloc[split_index:]

obs_B = x_train_obs_B_resampled.merge(x_target_obs_B, left_index=True, right_on='date_forecast')
est_B = x_train_est_B_resampled.merge(x_target_est_B, left_index=True, right_on='date_forecast')

# Same split and join for location C
split_value = x_train_est_C['date_forecast'].iloc[0]
split_index = x_target_C[x_target_C['date_forecast'] == split_value].index[0]

x_target_obs_C = x_target_C.iloc[:split_index]
x_target_est_C = x_target_C.iloc[split_index:]

obs_C = x_train_obs_C_resampled.merge(x_target_obs_C, left_index=True, right_on='date_forecast')
est_C = x_train_est_C_resampled.merge(x_target_est_C, left_index=True, right_on='date_forecast')

# Keep date_forecast in test dfs (the test frames are not merged with any targets)
test_A = x_test_est_A_resampled
test_B = x_test_est_B_resampled
test_C = x_test_est_C_resampled

# Drop all the NaNs (any row with at least one missing value is removed)
test_A = test_A.dropna()
test_B = test_B.dropna()
test_C = test_C.dropna()

def preprocessing(df):
    """
    Runs the full feature pipeline on a frame and returns the processed result.

    Every stage receives a copy of the previous stage's output, so the frame
    passed in by the caller is never modified.
    """
    stages = (
        add_experimental_features,
        add_date_features,
        add_binned_features,
        add_rate_of_change_features_to_df,
        add_est_obs_feature,
        remove_constant_regions,
        add_lagged_features_to_df,
        handle_nan,
    )
    for stage in stages:
        df = stage(df.copy())

    return df

# Preprocess: observed, estimated and test frames of each location go through
# the same pipeline (each on its own copy).
obs_A, est_A, test_A = (preprocessing(frame.copy()) for frame in (obs_A, est_A, test_A))

obs_B, est_B, test_B = (preprocessing(frame.copy()) for frame in (obs_B, est_B, test_B))

obs_C, est_C, test_C = (preprocessing(frame.copy()) for frame in (obs_C, est_C, test_C))

# Random seeds used for reproducibility
# 32 weights: 0.3, 0.3, 0.4
# 24 weights: 0.3, 0.3, 0.4
# 33 (without winter months 1 and 12) weights: 0.2, 0.4, 0.4
# 11 (without winter months 1, 2 and 11, 12) weights: 0.25, 0.35, 0.4
# 5 weights: 0.4, 0.3, 0.3
# 6 weights 0.3, 0.4, 0.3

# Best score is the mean prediction of all the seeds mentioned above. The first weight is xgboost, the second is catboost, and the third is autogluon.

# Set the random seed
np.random.seed(6)

# Concatenate observed and estimated rows per location
A, B, C = (pd.concat(parts) for parts in ([obs_A, est_A], [obs_B, est_B], [obs_C, est_C]))

# Remove characters unparseable for CatBoost: [ ] , { } ( ) " ' : and backslash
A.columns, B.columns, C.columns = (
    [''.join(ch for ch in col if ch not in '[],{}()"\':\\') for col in frame.columns]
    for frame in (A, B, C)
)

test_A.columns, test_B.columns, test_C.columns = (
    [''.join(ch for ch in col if ch not in '[],{}()"\':\\') for col in frame.columns]
    for frame in (test_A, test_B, test_C)
)

# Getting validation data from summer months, because the test set is from summer months. We experimented with excluding winter
# months from the training data here.

# Step 1: Month filter for A. All twelve months are listed, so this keeps every row with a
# valid timestamp; shorten the list to exclude months from the data.
A = A.loc[A['date_forecast'].dt.month.isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])]

# Step 2: Collect the distinct calendar days that fall in May or June
summer_months = A.loc[A['date_forecast'].dt.month.isin([5, 6])]
unique_summer_days = summer_months['date_forecast'].dt.date.unique()

# Step 3: Randomly pick 40% of those days (without replacement) as validation days
sampled_days = np.random.choice(unique_summer_days, size=int(len(unique_summer_days) * 0.4), replace=False)

# Step 4: Rows on a sampled day form val_A, every other row stays in train_A
val_A = A.loc[A['date_forecast'].dt.date.isin(sampled_days)]
train_A = A.loc[~A['date_forecast'].dt.date.isin(sampled_days)]

# Features / target pairs
X_train_A, y_train_A = train_A.drop(columns='pv_measurement'), train_A['pv_measurement']
X_val_A, y_val_A = val_A.drop(columns='pv_measurement'), val_A['pv_measurement']

# Same split for B: month filter (all months kept), 40% of the May/June days
# sampled as validation days, remaining rows used for training.
B = B.loc[B['date_forecast'].dt.month.isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])]
summer_months = B.loc[B['date_forecast'].dt.month.isin([5, 6])]
unique_summer_days = summer_months['date_forecast'].dt.date.unique()
sampled_days = np.random.choice(unique_summer_days, size=int(len(unique_summer_days) * 0.4), replace=False)
val_B = B.loc[B['date_forecast'].dt.date.isin(sampled_days)]
train_B = B.loc[~B['date_forecast'].dt.date.isin(sampled_days)]
X_train_B, y_train_B = train_B.drop(columns='pv_measurement'), train_B['pv_measurement']
X_val_B, y_val_B = val_B.drop(columns='pv_measurement'), val_B['pv_measurement']

# Same split for C: month filter (all months kept), 40% of the May/June days
# sampled as validation days, remaining rows used for training.
C = C.loc[C['date_forecast'].dt.month.isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])]
summer_months = C.loc[C['date_forecast'].dt.month.isin([5, 6])]
unique_summer_days = summer_months['date_forecast'].dt.date.unique()
sampled_days = np.random.choice(unique_summer_days, size=int(len(unique_summer_days) * 0.4), replace=False)
val_C = C.loc[C['date_forecast'].dt.date.isin(sampled_days)]
train_C = C.loc[~C['date_forecast'].dt.date.isin(sampled_days)]
X_train_C, y_train_C = train_C.drop(columns='pv_measurement'), train_C['pv_measurement']
X_val_C, y_val_C = val_C.drop(columns='pv_measurement'), val_C['pv_measurement']

# Drop the timestamp columns: the train/validation frames carry three of them
# (date_forecast plus the _x/_y variants), the test frames only date_forecast.
train_A, train_B, train_C, val_A, val_B, val_C = (
    frame.drop(columns=['date_forecast', 'date_forecast_x', 'date_forecast_y'])
    for frame in (train_A, train_B, train_C, val_A, val_B, val_C)
)
X_train_A, X_train_B, X_train_C, X_val_A, X_val_B, X_val_C = (
    frame.drop(columns=['date_forecast', 'date_forecast_x', 'date_forecast_y'])
    for frame in (X_train_A, X_train_B, X_train_C, X_val_A, X_val_B, X_val_C)
)
test_A, test_B, test_C = (
    frame.drop(columns=['date_forecast'])
    for frame in (test_A, test_B, test_C)
)

# AutoGluon inputs: train/validation frames (target column included) wrapped
# as TabularDataset objects, plus the name of the target column.
train_auto_A, val_auto_A = TabularDataset(train_A), TabularDataset(val_A)

train_auto_B, val_auto_B = TabularDataset(train_B), TabularDataset(val_B)

train_auto_C, val_auto_C = TabularDataset(train_C), TabularDataset(val_C)

auto_label = 'pv_measurement'

# Set the parameters for the XGBoost models
# Location A has its own settings (including num_parallel_tree).
params_xgb_A = dict(
    colsample_bytree=0.8,
    gamma=0.4,
    learning_rate=0.012,
    max_depth=15,
    min_child_weight=10,
    n_estimators=600,
    reg_alpha=0.8,
    reg_lambda=0.8,
    subsample=0.912,
    random_state=0,
    booster='gbtree',
    n_jobs=-1,
    num_parallel_tree=2,
)

# Locations B and C use the same values; C gets its own copy of the dict.
params_xgb_B = dict(
    colsample_bytree=0.8,
    gamma=0.8,
    learning_rate=0.008,
    max_depth=15,
    min_child_weight=10,
    n_estimators=600,
    reg_alpha=1,
    reg_lambda=3,
    subsample=0.912,
    random_state=0,
    booster='gbtree',
    n_jobs=-1,
)

params_xgb_C = dict(params_xgb_B)

# One XGBoost regressor per location, built from that location's parameters.
xgb_A, xgb_B, xgb_C = (
    xgb.XGBRegressor(**params)
    for params in (params_xgb_A, params_xgb_B, params_xgb_C)
)

# One CatBoost regressor per location, all configured identically:
# iterations=5000 with MAE as both the optimised loss and the evaluation metric.
# Learning rate, depth, random seed and verbosity are not passed, so the
# library's own defaults apply.
cat_A, cat_B, cat_C = (
    CatBoostRegressor(iterations=5000, loss_function='MAE', eval_metric='MAE')
    for _ in range(3)
)

# Prepare data for the XGBoost models. We got them to work the best when having fewer columns
xgb_columns = [
    'total_radiationW', 'snow_accumulation', 'super_cooled_liquid_waterkgm2',
    'average_wind_speed', 'sun_elevationd', 'sun_azimuthd', 'clear_sky_radW',
    'month', 't_1000hPaC', 'msl_pressurehPa_scaled', 'rain_waterkgm2',
    'cloud_base_aglm', 'effective_cloud_coverp', 'dew_or_rimeidx',
]
print(train_A.columns)

# Per location: reduced training features, training target, reduced test features.
X_train_xgb_A, y_train_xgb_A, X_test_xgb_A = train_A[xgb_columns], train_A['pv_measurement'], test_A[xgb_columns]

X_train_xgb_B, y_train_xgb_B, X_test_xgb_B = train_B[xgb_columns], train_B['pv_measurement'], test_B[xgb_columns]

X_train_xgb_C, y_train_xgb_C, X_test_xgb_C = train_C[xgb_columns], train_C['pv_measurement'], test_C[xgb_columns]

# Train the XGBoost models on the reduced column set (MAE metric, no logging).
xgb_A.fit(X_train_xgb_A, y_train_xgb_A, eval_metric='mae', verbose=False)

xgb_B.fit(X_train_xgb_B, y_train_xgb_B, eval_metric='mae', verbose=False)

xgb_C.fit(X_train_xgb_C, y_train_xgb_C, eval_metric='mae', verbose=False)

# Train the CatBoost models. The sampled validation days are passed as the
# evaluation set and use_best_model=True is requested.
cat_A.fit(X=X_train_A, y=y_train_A, eval_set=(X_val_A, y_val_A), use_best_model=True)

cat_B.fit(X=X_train_B, y=y_train_B, eval_set=(X_val_B, y_val_B), use_best_model=True)

cat_C.fit(X=X_train_C, y=y_train_C, eval_set=(X_val_C, y_val_C), use_best_model=True)

# Train one AutoGluon predictor per location (regression, MAE), using the
# sampled validation days as tuning data. The three fits run in A, B, C order.
auto_A, auto_B, auto_C = (
    TabularPredictor(label=auto_label, eval_metric='mean_absolute_error', problem_type='regression').fit(
        train_frame,
        presets='medium_quality',
        tuning_data=tuning_frame,
        use_bag_holdout=True,
        ag_args_ensemble={'fold_fitting_strategy': 'sequential_local'},
    )
    for train_frame, tuning_frame in (
        (train_auto_A, val_auto_A),
        (train_auto_B, val_auto_B),
        (train_auto_C, val_auto_C),
    )
)

# Ensemble weights for this seed, in the order xgboost / catboost / autogluon.
xgb_weight = 0.3
cat_weight = 0.4
auto_weight = 0.3

# XGBoost predictions on the reduced column set. Each location's model scores
# that location's own test features (A and B previously scored location C's
# features, so their XGBoost share of the ensemble was computed on wrong data).
pred_xgb_A = xgb_A.predict(X_test_xgb_A)
pred_xgb_B = xgb_B.predict(X_test_xgb_B)
pred_xgb_C = xgb_C.predict(X_test_xgb_C)

# AutoGluon and CatBoost predictions on the full test frames.
pred_auto_A = auto_A.predict(test_A)
pred_auto_B = auto_B.predict(test_B)
pred_auto_C = auto_C.predict(test_C)

pred_cat_A = cat_A.predict(test_A)
pred_cat_B = cat_B.predict(test_B)
pred_cat_C = cat_C.predict(test_C)

# Ensemble that seemed the best after some experimentation
pred_A = (pred_xgb_A*xgb_weight + pred_cat_A*cat_weight + pred_auto_A*auto_weight)
pred_B = (pred_xgb_B*xgb_weight + pred_cat_B*cat_weight + pred_auto_B*auto_weight)
pred_C = (pred_xgb_C*xgb_weight + pred_cat_C*cat_weight + pred_auto_C*auto_weight)

# Negative predictions are clipped to zero.
pred_A = np.clip(pred_A, 0, None)
pred_B = np.clip(pred_B, 0, None)
pred_C = np.clip(pred_C, 0, None)

# Concatenate predictions in A, B, C order
predictions = np.concatenate([pred_A, pred_B, pred_C])

# Keep this seed's predictions under their own name
predictions_6 = predictions

# Submission table: a running id starting at 0 next to each prediction
ids = np.arange(len(predictions_6))
df = pd.DataFrame(dict(id=ids, prediction=predictions_6))

# Write the table to CSV without the index column
df.to_csv('predictions_6.csv', index=False)

In [None]:
# Sequence 7

%reset -f

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
from catboost import CatBoostRegressor
from autogluon.tabular import TabularDataset, TabularPredictor
import warnings
warnings.filterwarnings('ignore')

def add_experimental_features(df):
    """
    Derives additional weather features and stores them as new columns on `df`.

    The frame is modified in place and is also returned. After all columns are
    added, every +inf/-inf value in the frame is replaced by 0.
    """
    direct_w, diffuse_w = df['direct_rad:W'], df['diffuse_rad:W']
    direct_j, diffuse_j = df['direct_rad_1h:J'], df['diffuse_rad_1h:J']
    temp_k, dew_k = df['t_1000hPa:K'], df['dew_point_2m:K']
    humidity = df['absolute_humidity_2m:gm3']
    wind_u = df['wind_speed_u_10m:ms']

    # Radiation: sums, differences and the diffuse/direct ratio
    df['total_radiation:W'] = direct_w + diffuse_w
    df['total_radiation_1h:J'] = direct_j + diffuse_j
    df['rad_diff:W'] = direct_w - diffuse_w
    df['rad_diff_1h:J'] = direct_j - diffuse_j
    df['diffuse_direct_ratio'] = diffuse_w / direct_w

    # Temperature and pressure: spreads, Kelvin -> Celsius, min-max scaled pressures
    df['temp_dewpoint_diff'] = temp_k - dew_k
    df['pressure_gradient'] = df['pressure_100m:hPa'] - df['pressure_50m:hPa']
    df['t_1000hPa:C'] = temp_k - 273.15
    df['dew_point_2m:C'] = dew_k - 273.15
    # The scaler is fitted on the frame it is applied to, so the scaled range is
    # relative to this frame only.
    for pressure in ('msl_pressure:hPa', 'sfc_pressure:hPa'):
        df[pressure + '_scaled'] = MinMaxScaler().fit_transform(df[pressure].values.reshape(-1, 1))

    # Wind: magnitude of the (u, v, w) vector and the mean of the 10 m speed and its u component
    df['wind_vector_magnitude'] = (wind_u**2 + df['wind_speed_v_10m:ms']**2 + df['wind_speed_w_1000hPa:ms']**2)**0.5
    df['average_wind_speed'] = (df['wind_speed_10m:ms'] + wind_u) / 2

    # Cloud and snow
    df['cloud_humidity_product'] = df['total_cloud_cover:p'] * humidity
    snow_columns = ['fresh_snow_24h:cm', 'fresh_snow_12h:cm', 'fresh_snow_6h:cm', 'fresh_snow_3h:cm', 'fresh_snow_1h:cm']
    df['snow_accumulation'] = df[snow_columns].sum(axis=1)

    # Pairwise interaction terms
    df['radiation_cloud_interaction'] = direct_w * df['effective_cloud_cover:p']   # direct radiation x cloud cover
    df['temp_rad_interaction'] = temp_k * df['total_radiation:W']                  # temperature x total radiation
    df['wind_temp_interaction'] = df['average_wind_speed'] * temp_k                # wind x temperature
    df['humidity_temp_interaction'] = humidity * temp_k                            # humidity x temperature
    df['sun_elevation_direct_rad_interaction'] = df['sun_elevation:d'] * direct_w  # sun elevation x direct radiation

    # Precipitation amount multiplied by the precipitation type index
    df['precip'] = df['precip_5min:mm']*df['precip_type_5min:idx']

    # Safeguard: infinities (e.g. from the ratio above) become 0
    df.replace([np.inf, -np.inf], 0, inplace=True)

    return df

def add_date_features(df):
    """
    Adds calendar features derived from the 'date_forecast' column.

    New columns: 'month', 'year', 'hour', 'day', plus 'hour_sin' / 'hour_cos',
    a cyclic encoding of the hour over a 24 hour period. 'date_forecast' itself
    is converted to datetime and kept. The frame is modified in place and
    returned. If the column is missing, a warning is printed and the frame is
    returned unchanged.
    """
    if 'date_forecast' not in df.columns:
        print("Warning: 'date_forecast' column not found in the dataframe. No date features added.")
        return df

    # Make sure the column is datetime-typed before using the .dt accessor
    stamps = pd.to_datetime(df['date_forecast'])
    df['date_forecast'] = stamps

    df['month'] = stamps.dt.month
    df['year'] = stamps.dt.year
    df['hour'] = stamps.dt.hour
    df['day'] = stamps.dt.day

    # Position of the hour on the 24 hour circle, in radians
    hour_angle = 2*np.pi*df['hour']/24
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)

    return df

# Adding discretized features for the continuous variables to help tree-based models

def bin_columns(dataframe, columns_to_bin, n_bins=5):
    """
    Adds a quantile-binned version of each requested column.

    Parameters:
    - dataframe: pd.DataFrame, left unmodified (the work happens on a copy)
    - columns_to_bin: list of strings, names of the columns to bin
    - n_bins: int or dict; an int applies to every column, a dict maps column
              names to their own bin count (columns missing from it use 5)

    Returns:
    - a copy of the dataframe with one extra 'binned_<column>' column per
      requested column, holding the integer bin number. Duplicate bin edges
      are dropped, so fewer bins than requested may result.
    """
    result = dataframe.copy()
    same_for_all = isinstance(n_bins, int)

    for name in columns_to_bin:
        quantiles = n_bins if same_for_all else n_bins.get(name, 5)
        result[f'binned_{name}'] = pd.qcut(result[name], q=quantiles, labels=False, duplicates='drop')

    return result

def add_binned_features(df):
    """
    Adds quantile-bin columns for three features via `bin_columns`:
    'effective_cloud_cover:p' (2 bins), 'ceiling_height_agl:m' (3 bins) and
    'average_wind_speed' (5 bins). Returns the resulting frame.
    """
    bin_plan = (
        ('effective_cloud_cover:p', 2),
        ('ceiling_height_agl:m', 3),
        ('average_wind_speed', 5),
    )
    for column, bins in bin_plan:
        df = bin_columns(df, [column], n_bins=bins)

    return df

def add_rate_of_change_features(df, features, second_order=False):
    """
    Adds a '<feature>_rate_of_change' column (row-to-row difference) for each
    listed feature; the first row, which has no predecessor, gets 0.
    With second_order=True a '<feature>_rate_of_change_of_change' column holding
    the difference of that difference is added as well.
    The frame is expected to be sorted by time; it is modified in place and returned.
    """
    for name in features:
        first_diff = f'{name}_rate_of_change'
        df[first_diff] = df[name].diff().fillna(0)

        if not second_order:
            continue

        df[f'{name}_rate_of_change_of_change'] = df[first_diff].diff().fillna(0)

    return df

def add_rate_of_change_features_to_df(df):
    """
    Adds first-order rate-of-change columns for temperature, the radiation
    columns (clear sky, diffuse, direct, total) and effective cloud cover.
    """
    return add_rate_of_change_features(
        df,
        [
            't_1000hPa:K',
            'clear_sky_rad:W',
            'diffuse_rad:W',
            'direct_rad:W',
            'effective_cloud_cover:p',
            'total_radiation:W',
        ],
        second_order=False,
    )

def add_est_obs_feature(df):
    """
    Flags a frame as observed or estimated through a new 'observed' column.

    A frame without a 'date_calc' column is marked observed (1) and returned
    as is. A frame with 'date_calc' is marked estimated (0) and returned with
    'date_calc' dropped. In both cases 'observed' is written onto the frame
    that was passed in.
    """
    is_estimated = 'date_calc' in df.columns
    df['observed'] = 0 if is_estimated else 1

    if is_estimated:
        return df.drop(columns=['date_calc'])
    return df
    
def remove_constant_regions(dataframe, column_name="pv_measurement", threshold=72):
    """
    Removes long stretches of rows in which `column_name` keeps the same value.

    A stretch is a maximal run of consecutive rows with an identical value. It
    is removed as a whole when the number of rows in it that repeat their
    predecessor (i.e. the run length minus one) is greater than `threshold`.
    NaN never compares equal to NaN, so runs of NaN are not treated as constant.

    Rows are selected by position, so the result is also correct when index
    labels repeat; dropping by label would additionally remove unrelated rows
    that happen to share a label with a flagged row.

    If `column_name` is not a column of the frame, a warning is printed and the
    frame is returned unchanged. Otherwise a new frame is returned.
    """

    # Check if the specified column exists in the dataframe
    if column_name not in dataframe.columns:
        print(f"Warning: '{column_name}' column not found in the dataframe. No rows removed.")
        return dataframe

    values = dataframe[column_name]
    same_as_previous = values.eq(values.shift())

    # Every change of value starts a new run id; repeated rows inherit the id.
    group_ids = (~same_as_previous).cumsum()

    # Number of repeated rows per run, and the runs that exceed the threshold.
    repeats_per_run = group_ids[same_as_previous].value_counts()
    flagged_runs = repeats_per_run[repeats_per_run > threshold].index

    # Positional mask of the rows to keep (independent of index labels).
    keep = ~group_ids.isin(flagged_runs).to_numpy()
    return dataframe[keep]

def add_lagged_features(df, features_with__lags, fill_value=None):
    """
    Adds shifted ("lagged") copies of the given features as new columns.

    - features_with__lags: dict mapping a feature name to either a single lag
      (int) or an iterable of lags. A positive lag takes the value from that
      many rows earlier, a negative lag from that many rows later. Each lag
      produces a column named '<feature>_lag_<lag>'.
    - fill_value: replacement for the NaNs that shifting creates at the frame
      edges. With the default None the NaNs are left in place (passing None on
      to fillna would raise).

    The frame is modified in place and returned.
    """
    for feature, lags in features_with__lags.items():
        # A scalar lag is the single-lag form; anything iterable lists several lags.
        try:
            lag_list = list(lags)
        except TypeError:
            lag_list = [lags]

        for specific_lag in lag_list:
            shifted = df[feature].shift(specific_lag)
            if fill_value is not None:
                shifted = shifted.fillna(fill_value)
            df[f"{feature}_lag_{specific_lag}"] = shifted
    return df

def add_lagged_features_to_df(df):
    """
    Adds the previous-row (lag 1) and next-row (lag -1) values of the total
    radiation and radiation difference features, with edge gaps filled with 0.

    The lags are listed per feature in a tuple: a dict literal cannot hold the
    same key twice, so writing one entry per lag silently kept only the last
    one (-1) and the lag 1 columns were never created.
    """
    features_with_lags = {
        'total_radiation:W': (1, -1),
        'rad_diff:W': (1, -1),
        'total_radiation_1h:J': (1, -1),
    }

    # Add lagged features for specific lags
    return add_lagged_features(df, features_with_lags, fill_value=0)

def handle_nan(df):
    """
    Drops rows without a target and zero-fills every remaining NaN.

    Frames that have no 'pv_measurement' column (the test frames) skip the row
    filter and are only zero-filled.
    """
    if 'pv_measurement' in df.columns:
        df = df[df['pv_measurement'].notna()]

    return df.fillna(0)

# Aggregate the data to hourly with some aggregation methods for each column.
# Each key appears exactly once: in a dict literal a repeated key silently
# replaces the earlier value, which made the rule actually in use hard to see.
aggregation_methods = {
    'date_forecast': 'first',
    'diffuse_rad:W': 'sum',
    'direct_rad:W': 'last',
    'clear_sky_rad:W': 'sum',
    'diffuse_rad_1h:J': 'last',
    'direct_rad_1h:J': 'last',
    'clear_sky_energy_1h:J': 'last',
    'absolute_humidity_2m:gm3': 'mean',
    'air_density_2m:kgm3': 'mean',
    'ceiling_height_agl:m': 'max',
    # This column used to be listed twice ('mean' here, 'max' at the end of the
    # dict); 'max' was the value that took effect, so it is the one kept.
    'cloud_base_agl:m': 'max',
    'dew_or_rime:idx': 'min',
    'dew_point_2m:K': 'mean',
    'effective_cloud_cover:p': 'sum',
    'elevation:m': 'first',
    'fresh_snow_12h:cm': 'max',
    'fresh_snow_1h:cm': 'sum',
    'fresh_snow_24h:cm': 'max',
    'fresh_snow_3h:cm': 'max',
    'fresh_snow_6h:cm': 'max',
    'is_day:idx': 'max',
    'is_in_shadow:idx': 'max',
    'msl_pressure:hPa': 'mean',
    'precip_5min:mm': 'sum',
    'precip_type_5min:idx': 'sum',
    'pressure_100m:hPa': 'mean',
    'pressure_50m:hPa': 'mean',
    'prob_rime:p': 'max',
    'rain_water:kgm2': 'sum',
    'relative_humidity_1000hPa:p': 'mean',
    'sfc_pressure:hPa': 'mean',
    'snow_density:kgm3': 'mean',
    'snow_depth:cm': 'max',
    'snow_drift:idx': 'max',
    'snow_melt_10min:mm': 'sum',
    'snow_water:kgm2': 'sum',
    'sun_azimuth:d': 'first',
    'sun_elevation:d': 'sum',
    'super_cooled_liquid_water:kgm2': 'sum',
    't_1000hPa:K': 'mean',
    'total_cloud_cover:p': 'mean',
    'visibility:m': 'mean',
    'wind_speed_10m:ms': 'mean',
    'wind_speed_u_10m:ms': 'mean',
    'wind_speed_v_10m:ms': 'mean',
    'wind_speed_w_1000hPa:ms': 'mean'
}

# Read in the data: targets, observed/estimated training weather and the
# estimated test weather for each of the three locations.
x_target_A, x_train_obs_A, x_train_est_A, x_test_est_A = (
    pd.read_parquet(path)
    for path in (
        './data/A/train_targets.parquet',
        './data/A/X_train_observed.parquet',
        './data/A/X_train_estimated.parquet',
        './data/A/X_test_estimated.parquet',
    )
)

x_target_B, x_train_obs_B, x_train_est_B, x_test_est_B = (
    pd.read_parquet(path)
    for path in (
        './data/B/train_targets.parquet',
        './data/B/X_train_observed.parquet',
        './data/B/X_train_estimated.parquet',
        './data/B/X_test_estimated.parquet',
    )
)

x_target_C, x_train_obs_C, x_train_est_C, x_test_est_C = (
    pd.read_parquet(path)
    for path in (
        './data/C/train_targets.parquet',
        './data/C/X_train_observed.parquet',
        './data/C/X_train_estimated.parquet',
        './data/C/X_test_estimated.parquet',
    )
)

# Rename time to date_forecast in target so it matches the weather frames
x_target_A, x_target_B, x_target_C = (
    targets.rename(columns={'time': 'date_forecast'})
    for targets in (x_target_A, x_target_B, x_target_C)
)

# Fix missing data for test set. Assuming NaN means 0 in these categories:
# cloud cover, cloud base / ceiling height, snow density and snow drift.
x_test_est_A, x_test_est_B, x_test_est_C = (
    frame.assign(**{
        column: frame[column].fillna(0)
        for column in (
            'effective_cloud_cover:p',
            'total_cloud_cover:p',
            'cloud_base_agl:m',
            'ceiling_height_agl:m',
            'snow_density:kgm3',
            'snow_drift:idx',
        )
    })
    for frame in (x_test_est_A, x_test_est_B, x_test_est_C)
)

# Resample every weather frame to hourly rows; each column is reduced with the
# rule named for it in `aggregation_methods`.
x_train_obs_A_resampled, x_train_est_A_resampled, x_test_est_A_resampled = (
    frame.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)
    for frame in (x_train_obs_A, x_train_est_A, x_test_est_A)
)

x_train_obs_B_resampled, x_train_est_B_resampled, x_test_est_B_resampled = (
    frame.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)
    for frame in (x_train_obs_B, x_train_est_B, x_test_est_B)
)

x_train_obs_C_resampled, x_train_est_C_resampled, x_test_est_C_resampled = (
    frame.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)
    for frame in (x_train_obs_C, x_train_est_C, x_test_est_C)
)

# Merge
# Per location: the target rows are cut at the first timestamp of the estimated
# weather data, and each half is joined to the matching hourly weather frame
# (weather index against the targets' 'date_forecast' column).
# NOTE(review): split_index is an index label that is then used as an .iloc
# position, which presumes the targets carry a default 0..n-1 index -- confirm.
split_value = x_train_est_A['date_forecast'].iloc[0]
split_index = x_target_A.loc[x_target_A['date_forecast'] == split_value].index[0]
x_target_obs_A, x_target_est_A = x_target_A.iloc[:split_index], x_target_A.iloc[split_index:]
obs_A, est_A = (
    weather.merge(targets, left_index=True, right_on='date_forecast')
    for weather, targets in (
        (x_train_obs_A_resampled, x_target_obs_A),
        (x_train_est_A_resampled, x_target_est_A),
    )
)

split_value = x_train_est_B['date_forecast'].iloc[0]
split_index = x_target_B.loc[x_target_B['date_forecast'] == split_value].index[0]
x_target_obs_B, x_target_est_B = x_target_B.iloc[:split_index], x_target_B.iloc[split_index:]
obs_B, est_B = (
    weather.merge(targets, left_index=True, right_on='date_forecast')
    for weather, targets in (
        (x_train_obs_B_resampled, x_target_obs_B),
        (x_train_est_B_resampled, x_target_est_B),
    )
)

split_value = x_train_est_C['date_forecast'].iloc[0]
split_index = x_target_C.loc[x_target_C['date_forecast'] == split_value].index[0]
x_target_obs_C, x_target_est_C = x_target_C.iloc[:split_index], x_target_C.iloc[split_index:]
obs_C, est_C = (
    weather.merge(targets, left_index=True, right_on='date_forecast')
    for weather, targets in (
        (x_train_obs_C_resampled, x_target_obs_C),
        (x_train_est_C_resampled, x_target_est_C),
    )
)

# Test features: the hourly test frames (date_forecast is kept for now) with
# every row that contains a NaN removed.
test_A, test_B, test_C = (
    frame.dropna()
    for frame in (x_test_est_A_resampled, x_test_est_B_resampled, x_test_est_C_resampled)
)

def preprocessing(df):
    """
    Run the full feature pipeline on one frame and return the result.

    Every stage receives a fresh copy of the previous stage's output, so the
    caller's frame is never modified. Stage order matters: later stages read
    columns that earlier stages create.
    """
    pipeline = (
        add_experimental_features,
        add_date_features,
        add_binned_features,
        add_rate_of_change_features_to_df,
        add_est_obs_feature,
        remove_constant_regions,
        add_lagged_features_to_df,
        handle_nan,
    )
    for stage in pipeline:
        df = stage(df.copy())

    return df

# Preprocess: push the observed, estimated and test frames of each location
# through the pipeline (in that order), rebinding the same names.
obs_A, est_A, test_A = (preprocessing(frame.copy()) for frame in (obs_A, est_A, test_A))

obs_B, est_B, test_B = (preprocessing(frame.copy()) for frame in (obs_B, est_B, test_B))

obs_C, est_C, test_C = (preprocessing(frame.copy()) for frame in (obs_C, est_C, test_C))

# Random seeds used for reproducibility, with the ensemble weights used for each
# (order of the weights: xgboost, catboost, autogluon):
# 32 weights: 0.3, 0.3, 0.4
# 24 weights: 0.3, 0.3, 0.4
# 33 (without winter months 1 and 12) weights: 0.2, 0.4, 0.4
# 11 (without winter months 1, 2 and 11, 12) weights: 0.25, 0.35, 0.4
# 5 weights: 0.4, 0.3, 0.3
# 6 weights: 0.3, 0.4, 0.3
# 7 weights: 0.3, 0.4, 0.3

# The best score is the mean prediction over all the seeds listed above.

# Seed NumPy's global generator for this sequence
np.random.seed(7)

# Concatenate the observed and estimated rows of each location
A = pd.concat([obs_A, est_A])
B = pd.concat([obs_B, est_B])
C = pd.concat([obs_C, est_C])

# Strip the characters CatBoost cannot parse in feature names: [ ] , { } ( ) " ' : \
unparseable_chars = str.maketrans('', '', '[],{}()"\':\\')
for frame in (A, B, C, test_A, test_B, test_C):
    frame.columns = [col.translate(unparseable_chars) for col in frame.columns]

# Validation data is drawn from early-summer days because the test set is from
# summer months. The month filter below is where winter months were excluded in
# other experiments; here it lists all twelve months.

# --- Location A ---
# Step 1: month filter (currently every month 1-12)
A = A[A['date_forecast'].dt.month.isin(list(range(1, 13)))]

# Step 2: collect the distinct calendar days that fall in May or June
summer_months = A[A['date_forecast'].dt.month.isin([5, 6])]
unique_summer_days = summer_months['date_forecast'].dt.date.unique()

# Step 3: draw 40% of those days (without replacement) as validation days
n_val_days = int(len(unique_summer_days) * 0.4)
sampled_days = np.random.choice(unique_summer_days, size=n_val_days, replace=False)
on_val_day = A['date_forecast'].dt.date.isin(sampled_days)
val_A = A[on_val_day]

# Step 4: everything else is training data
train_A = A[~on_val_day]

# Features / target
X_train_A, y_train_A = train_A.drop(columns='pv_measurement'), train_A['pv_measurement']
X_val_A, y_val_A = val_A.drop(columns='pv_measurement'), val_A['pv_measurement']

# --- Location B (same procedure; continues the same random stream) ---
B = B[B['date_forecast'].dt.month.isin(list(range(1, 13)))]
summer_months = B[B['date_forecast'].dt.month.isin([5, 6])]
unique_summer_days = summer_months['date_forecast'].dt.date.unique()
n_val_days = int(len(unique_summer_days) * 0.4)
sampled_days = np.random.choice(unique_summer_days, size=n_val_days, replace=False)
on_val_day = B['date_forecast'].dt.date.isin(sampled_days)
val_B = B[on_val_day]
train_B = B[~on_val_day]
X_train_B, y_train_B = train_B.drop(columns='pv_measurement'), train_B['pv_measurement']
X_val_B, y_val_B = val_B.drop(columns='pv_measurement'), val_B['pv_measurement']

# --- Location C ---
C = C[C['date_forecast'].dt.month.isin(list(range(1, 13)))]
summer_months = C[C['date_forecast'].dt.month.isin([5, 6])]
unique_summer_days = summer_months['date_forecast'].dt.date.unique()
n_val_days = int(len(unique_summer_days) * 0.4)
sampled_days = np.random.choice(unique_summer_days, size=n_val_days, replace=False)
on_val_day = C['date_forecast'].dt.date.isin(sampled_days)
val_C = C[on_val_day]
train_C = C[~on_val_day]
X_train_C, y_train_C = train_C.drop(columns='pv_measurement'), train_C['pv_measurement']
X_val_C, y_val_C = val_C.drop(columns='pv_measurement'), val_C['pv_measurement']

# Drop the timestamp columns so no model sees them as features.
# The merged train/val frames carry three timestamp columns, the test frames one.
merged_date_columns = ['date_forecast', 'date_forecast_x', 'date_forecast_y']

train_A, train_B, train_C = (frame.drop(columns=merged_date_columns) for frame in (train_A, train_B, train_C))
val_A, val_B, val_C = (frame.drop(columns=merged_date_columns) for frame in (val_A, val_B, val_C))
X_train_A, X_train_B, X_train_C = (frame.drop(columns=merged_date_columns) for frame in (X_train_A, X_train_B, X_train_C))
X_val_A, X_val_B, X_val_C = (frame.drop(columns=merged_date_columns) for frame in (X_val_A, X_val_B, X_val_C))
test_A, test_B, test_C = (frame.drop(columns=['date_forecast']) for frame in (test_A, test_B, test_C))

# AutoGluon wrappers around the (target-including) train / validation frames
train_auto_A, val_auto_A = TabularDataset(train_A), TabularDataset(val_A)

train_auto_B, val_auto_B = TabularDataset(train_B), TabularDataset(val_B)

train_auto_C, val_auto_C = TabularDataset(train_C), TabularDataset(val_C)

# Name of the target column handed to AutoGluon
auto_label = 'pv_measurement'

# Hyper-parameters for the XGBoost models.
# Location A has its own settings (and two parallel trees per round);
# locations B and C use identical settings.
params_xgb_A = dict(
    colsample_bytree=0.8,
    gamma=0.4,
    learning_rate=0.012,
    max_depth=15,
    min_child_weight=10,
    n_estimators=600,
    reg_alpha=0.8,
    reg_lambda=0.8,
    subsample=0.912,
    random_state=0,
    booster='gbtree',
    n_jobs=-1,
    num_parallel_tree=2,
)

params_xgb_B = dict(
    colsample_bytree=0.8,
    gamma=0.8,
    learning_rate=0.008,
    max_depth=15,
    min_child_weight=10,
    n_estimators=600,
    reg_alpha=1,
    reg_lambda=3,
    subsample=0.912,
    random_state=0,
    booster='gbtree',
    n_jobs=-1,
)

# Separate dict object with the same content as B's
params_xgb_C = dict(params_xgb_B)

xgb_A, xgb_B, xgb_C = (
    xgb.XGBRegressor(**settings) for settings in (params_xgb_A, params_xgb_B, params_xgb_C)
)


def _new_catboost():
    """Build one CatBoost regressor: 5000 iterations, MAE as both loss and eval metric.

    learning_rate, depth, random_seed and verbose are deliberately left at the
    library defaults.
    """
    return CatBoostRegressor(iterations=5000, loss_function='MAE', eval_metric='MAE')


cat_A = _new_catboost()

cat_B = _new_catboost()

cat_C = _new_catboost()

# XGBoost inputs. These models worked best on a reduced set of columns.
xgb_columns = [
    'total_radiationW',
    'snow_accumulation',
    'super_cooled_liquid_waterkgm2',
    'average_wind_speed',
    'sun_elevationd',
    'sun_azimuthd',
    'clear_sky_radW',
    'month',
    't_1000hPaC',
    'msl_pressurehPa_scaled',
    'rain_waterkgm2',
    'cloud_base_aglm',
    'effective_cloud_coverp',
    'dew_or_rimeidx',
]
print(train_A.columns)

# Per location: training features, training target, test features
X_train_xgb_A, y_train_xgb_A, X_test_xgb_A = train_A[xgb_columns], train_A['pv_measurement'], test_A[xgb_columns]

X_train_xgb_B, y_train_xgb_B, X_test_xgb_B = train_B[xgb_columns], train_B['pv_measurement'], test_B[xgb_columns]

X_train_xgb_C, y_train_xgb_C, X_test_xgb_C = train_C[xgb_columns], train_C['pv_measurement'], test_C[xgb_columns]

# Train the XGBoost models (A, then B, then C) on the reduced column set.
# NOTE(review): newer XGBoost releases expect eval_metric on the constructor
# rather than on fit() — confirm against the installed version.
for booster, features, target in (
    (xgb_A, X_train_xgb_A, y_train_xgb_A),
    (xgb_B, X_train_xgb_B, y_train_xgb_B),
    (xgb_C, X_train_xgb_C, y_train_xgb_C),
):
    booster.fit(
        X=features, y=target,
        eval_metric='mae',
        verbose=False
    )

# Train the CatBoost models; the validation split selects the best iteration.
for cat_model, features, target, val_features, val_target in (
    (cat_A, X_train_A, y_train_A, X_val_A, y_val_A),
    (cat_B, X_train_B, y_train_B, X_val_B, y_val_B),
    (cat_C, X_train_C, y_train_C, X_val_C, y_val_C),
):
    cat_model.fit(
        features, target,
        eval_set=(val_features, val_target),
        use_best_model=True
    )


def _fit_autogluon(train_data, tuning_data):
    """Create and fit one AutoGluon regressor (MAE metric, medium-quality preset).

    `tuning_data` is passed as the tuning set with use_bag_holdout=True, and
    folds are fitted with the 'sequential_local' strategy.
    """
    predictor = TabularPredictor(label=auto_label, eval_metric='mean_absolute_error', problem_type='regression')
    return predictor.fit(
        train_data,
        presets='medium_quality',
        tuning_data=tuning_data,
        use_bag_holdout=True,
        ag_args_ensemble={'fold_fitting_strategy': 'sequential_local'},
    )


auto_A = _fit_autogluon(train_auto_A, val_auto_A)

auto_B = _fit_autogluon(train_auto_B, val_auto_B)

auto_C = _fit_autogluon(train_auto_C, val_auto_C)

# Ensemble weights for this seed (xgboost, catboost, autogluon)
xgb_weight = 0.3
cat_weight = 0.4
auto_weight = 0.3

# XGBoost predictions on the reduced column set.
# Each location's model scores its OWN test frame; previously A and B were
# predicted from location C's features (X_test_xgb_C), which blended the wrong
# rows into pred_A / pred_B.
pred_xgb_A = xgb_A.predict(X_test_xgb_A)
pred_xgb_B = xgb_B.predict(X_test_xgb_B)
pred_xgb_C = xgb_C.predict(X_test_xgb_C)

# AutoGluon predictions on the full test frames
pred_auto_A = auto_A.predict(test_A)
pred_auto_B = auto_B.predict(test_B)
pred_auto_C = auto_C.predict(test_C)

# CatBoost predictions on the full test frames
pred_cat_A = cat_A.predict(test_A)
pred_cat_B = cat_B.predict(test_B)
pred_cat_C = cat_C.predict(test_C)

# Weighted blend that seemed the best after some experimentation
pred_A = (pred_xgb_A*xgb_weight + pred_cat_A*cat_weight + pred_auto_A*auto_weight)
pred_B = (pred_xgb_B*xgb_weight + pred_cat_B*cat_weight + pred_auto_B*auto_weight)
pred_C = (pred_xgb_C*xgb_weight + pred_cat_C*cat_weight + pred_auto_C*auto_weight)

# Negative blended values are clipped to zero
pred_A = np.clip(pred_A, 0, None)
pred_B = np.clip(pred_B, 0, None)
pred_C = np.clip(pred_C, 0, None)

# Stack the per-location predictions in A, B, C order
predictions = np.concatenate([pred_A, pred_B, pred_C])

# Keep this seed's predictions under their own name
predictions_7 = predictions

# Sequential ids starting at 0, one per prediction
ids = np.arange(len(predictions_7))

# Two-column table (id, prediction), written to CSV without the index
df = pd.DataFrame({'id': ids, 'prediction': predictions_7})
df.to_csv('predictions_7.csv', index=False)

In [None]:
# Sequence 8

%reset -f

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
from catboost import CatBoostRegressor
from autogluon.tabular import TabularDataset, TabularPredictor
import warnings
warnings.filterwarnings('ignore')

def add_experimental_features(df):
    """
    Add derived weather features to ``df`` in place and return it.

    Creates radiation sums/differences/ratio, temperature and pressure
    derivatives (including min-max scaled pressures fitted on this frame only),
    wind and snow aggregates, several pairwise interaction terms and a
    precipitation product. Infinite values anywhere in the frame are finally
    replaced by 0.
    """
    direct, diffuse = df['direct_rad:W'], df['diffuse_rad:W']
    direct_1h, diffuse_1h = df['direct_rad_1h:J'], df['diffuse_rad_1h:J']
    temp_k = df['t_1000hPa:K']
    dew_point_k = df['dew_point_2m:K']
    abs_humidity = df['absolute_humidity_2m:gm3']

    # --- Radiation ---
    df['total_radiation:W'] = direct + diffuse
    df['total_radiation_1h:J'] = direct_1h + diffuse_1h
    df['rad_diff:W'] = direct - diffuse
    df['rad_diff_1h:J'] = direct_1h - diffuse_1h
    df['diffuse_direct_ratio'] = diffuse / direct  # division by zero is cleaned up at the end

    # --- Temperature and pressure ---
    df['temp_dewpoint_diff'] = temp_k - dew_point_k
    df['pressure_gradient'] = df['pressure_100m:hPa'] - df['pressure_50m:hPa']
    df['t_1000hPa:C'] = temp_k - 273.15
    df['dew_point_2m:C'] = dew_point_k - 273.15
    for pressure_col in ('msl_pressure:hPa', 'sfc_pressure:hPa'):
        as_column_vector = df[pressure_col].values.reshape(-1, 1)
        df[pressure_col + '_scaled'] = MinMaxScaler().fit_transform(as_column_vector)

    # --- Wind ---
    squared_components = (
        df['wind_speed_u_10m:ms']**2 + df['wind_speed_v_10m:ms']**2 + df['wind_speed_w_1000hPa:ms']**2
    )
    df['wind_vector_magnitude'] = squared_components**0.5
    df['average_wind_speed'] = (df['wind_speed_10m:ms'] + df['wind_speed_u_10m:ms']) / 2

    # --- Cloud and snow ---
    df['cloud_humidity_product'] = df['total_cloud_cover:p'] * abs_humidity
    snow_windows = ['fresh_snow_24h:cm', 'fresh_snow_12h:cm', 'fresh_snow_6h:cm', 'fresh_snow_3h:cm', 'fresh_snow_1h:cm']
    df['snow_accumulation'] = df[snow_windows].sum(axis=1)

    # --- Interaction terms ---
    # direct radiation x effective cloud cover
    df['radiation_cloud_interaction'] = direct * df['effective_cloud_cover:p']
    # temperature x total radiation (high temperature may reduce efficiency)
    df['temp_rad_interaction'] = temp_k * df['total_radiation:W']
    # wind (cooling effect) x temperature
    df['wind_temp_interaction'] = df['average_wind_speed'] * temp_k
    # humidity x temperature
    df['humidity_temp_interaction'] = abs_humidity * temp_k
    # sun elevation x direct radiation
    df['sun_elevation_direct_rad_interaction'] = df['sun_elevation:d'] * direct

    # --- Precipitation ---
    df['precip'] = df['precip_5min:mm']*df['precip_type_5min:idx']

    # Safeguard: turn +/- infinity into 0
    df.replace([np.inf, -np.inf], 0, inplace=True)

    return df

def add_date_features(df):
    """
    Add calendar features derived from the 'date_forecast' column.

    When the column is present it is converted to datetime and the columns
    'month', 'year', 'hour', 'day', 'hour_sin' and 'hour_cos' are added in
    place (the last two encode the hour on a 24-hour circle). When the column
    is missing a warning is printed and the frame is returned untouched.
    """
    # Guard clause: nothing to derive without the timestamp column
    if 'date_forecast' not in df.columns:
        print("Warning: 'date_forecast' column not found in the dataframe. No date features added.")
        return df

    timestamps = pd.to_datetime(df['date_forecast'])
    df['date_forecast'] = timestamps

    # Plain calendar parts
    df['month'] = timestamps.dt.month
    df['year'] = timestamps.dt.year
    df['hour'] = timestamps.dt.hour
    df['day'] = timestamps.dt.day

    # Cyclic hour encoding
    hour_angle = 2*np.pi*df['hour']/24
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)

    return df

# Adding discretized features for the continuous variables to help tree-based models

def bin_columns(dataframe, columns_to_bin, n_bins=5):
    """
    Return a copy of ``dataframe`` with a quantile-bin column per requested column.

    Parameters:
    - dataframe: pd.DataFrame, left unmodified
    - columns_to_bin: list of strings, names of the columns to discretize
    - n_bins: int or dict; an int applies to every column, a dict gives the
              number of bins per column name (columns missing from it use 5)

    Returns:
    - pd.DataFrame with an extra integer-coded 'binned_<column>' column for each
      requested column. Duplicate quantile edges are dropped, so fewer bins than
      requested may result.
    """
    result = dataframe.copy()
    shared_count = isinstance(n_bins, int)

    for name in columns_to_bin:
        # Resolve how many quantile bins this column gets
        quantiles = n_bins if shared_count else n_bins.get(name, 5)
        codes = pd.qcut(result[name], q=quantiles, labels=False, duplicates='drop')
        result[f'binned_{name}'] = codes

    return result

def add_binned_features(df):
    """
    Add quantile-binned versions of three columns and return the new frame.

    Adds 'binned_effective_cloud_cover:p' (2 bins), 'binned_ceiling_height_agl:m'
    (3 bins) and 'binned_average_wind_speed' (5 bins), in that order, via
    bin_columns. 'average_wind_speed' must already exist on ``df``.
    """
    # (column, number of quantile bins) — applied one after the other
    bin_plan = (
        ('effective_cloud_cover:p', 2),
        ('ceiling_height_agl:m', 3),
        ('average_wind_speed', 5),
    )

    for column, bins in bin_plan:
        df = bin_columns(df, [column], n_bins=bins)

    return df

def add_rate_of_change_features(df, features, second_order=False):
    """
    Add row-to-row difference columns for the given features, in place.

    For each feature a '<feature>_rate_of_change' column holds the difference to
    the previous row (0 for the first row). With ``second_order=True`` a
    '<feature>_rate_of_change_of_change' column holds the difference of those
    differences as well. The frame is assumed to be time sorted.
    """
    for name in features:
        first_order_col = f'{name}_rate_of_change'
        # diff() leaves NaN in the first row; treat it as "no change"
        df[first_order_col] = df[name].diff().fillna(0)

        if not second_order:
            continue

        df[f'{name}_rate_of_change_of_change'] = df[first_order_col].diff().fillna(0)

    return df

def add_rate_of_change_features_to_df(df):
    """
    Add first-order rate-of-change columns for temperature, the three radiation
    columns, effective cloud cover and total radiation.
    """
    radiation_columns = ['clear_sky_rad:W', 'diffuse_rad:W', 'direct_rad:W']
    features_to_diff = ['t_1000hPa:K'] + radiation_columns + ['effective_cloud_cover:p', 'total_radiation:W']

    # First-order differences only
    return add_rate_of_change_features(df, features_to_diff, second_order=False)

def add_est_obs_feature(df):
    """
    Add an 'observed' flag column telling observed from estimated data.

    A frame without a 'date_calc' column is flagged 1 and returned as is (the
    column is added in place). A frame with 'date_calc' is flagged 0 and a new
    frame without 'date_calc' is returned.
    """
    is_estimated = 'date_calc' in df.columns
    df['observed'] = 0 if is_estimated else 1

    if is_estimated:
        return df.drop(columns=['date_calc'])
    return df
    
def remove_constant_regions(dataframe, column_name="pv_measurement", threshold=72):
    """
    Drop stretches of rows in which ``column_name`` stays at one value for too long.

    Consecutive equal values form a run. A run is removed in full (its first
    row included) when the number of rows that repeat their predecessor exceeds
    ``threshold``. If the column is absent a warning is printed and the frame is
    returned unchanged.
    """
    if column_name not in dataframe.columns:
        print(f"Warning: '{column_name}' column not found in the dataframe. No rows removed.")
        return dataframe

    values = dataframe[column_name]

    # True where a row repeats the value of the row before it
    repeats_previous = values.eq(values.shift())
    # Every non-repeating row opens a new run; the running count labels the runs
    run_labels = (~repeats_previous).cumsum()

    # Count the repeating rows per run and keep the labels of over-long runs
    repeats_per_run = run_labels[repeats_previous].value_counts()
    too_long = repeats_per_run > threshold
    labels_to_drop = too_long[too_long].index

    # Remove every row that belongs to one of those runs
    in_long_run = run_labels.isin(labels_to_drop)
    return dataframe.drop(dataframe[in_long_run].index)

def add_lagged_features(df, features_with__lags, fill_value=None):
    """
    Add shifted ("lagged") copies of selected columns to ``df`` in place.

    Parameters:
    - df: pd.DataFrame, assumed to be time sorted
    - features_with__lags: either a dict mapping a column name to one lag or to
      a list/tuple of lags, or an iterable of (column, lag) pairs. A positive
      lag takes the value from that many rows earlier, a negative lag from that
      many rows later.
    - fill_value: value written into the rows left empty by the shift; with
      ``None`` (the default) those rows stay NaN.

    Returns:
    - the same DataFrame, with one '<column>_lag_<lag>' column per (column, lag).
    """
    if hasattr(features_with__lags, 'items'):
        pairs = features_with__lags.items()
    else:
        pairs = features_with__lags

    for feature, lags in pairs:
        # A single lag and a collection of lags are handled the same way
        if not isinstance(lags, (list, tuple)):
            lags = [lags]

        for specific_lag in lags:
            shifted = df[feature].shift(specific_lag)
            # fillna(None) raises, so only fill when a value was actually given
            if fill_value is not None:
                shifted = shifted.fillna(fill_value)
            df[f"{feature}_lag_{specific_lag}"] = shifted

    return df

def add_lagged_features_to_df(df):
    """
    Add previous-row (lag 1) and next-row (lag -1) copies of the total-radiation
    and radiation-difference columns, filling the edge rows with 0.
    """
    # Each feature lists ALL of its lags. Writing the same key twice in a dict
    # literal keeps only the last entry, which used to drop every lag-1 column.
    features_with_lags = {
        'total_radiation:W': [1, -1],
        'rad_diff:W': [1, -1],
        'total_radiation_1h:J': [1, -1],
    }

    # Add lagged features for specific lags
    return add_lagged_features(df, features_with_lags, fill_value=0)

def handle_nan(df):
    """
    Drop rows without a target and zero-fill every remaining NaN.

    Rows whose 'pv_measurement' is NaN are removed when that column exists;
    frames without a target column (e.g. test data) skip that step.
    """
    if 'pv_measurement' in df.columns:
        has_target = df['pv_measurement'].notna()
        df = df[has_target]

    # Whatever is still missing becomes 0
    return df.fillna(0)

# How each column is reduced when the rows of one hour are aggregated into a
# single hourly row. Every key appears exactly once: a repeated key in a dict
# literal silently overrides the earlier entry, which is how
# 'cloud_base_agl:m' previously ended up as 'max' even though it was first
# listed as 'mean'. The effective rules are unchanged.
# NOTE(review): confirm that 'max' (not 'mean') is the intended rule for
# 'cloud_base_agl:m'.
aggregation_methods = {
    'date_forecast': 'first',
    'diffuse_rad:W': 'sum',
    'direct_rad:W': 'last',
    'clear_sky_rad:W': 'sum',
    'diffuse_rad_1h:J': 'last',
    'direct_rad_1h:J': 'last',
    'clear_sky_energy_1h:J': 'last',
    'absolute_humidity_2m:gm3': 'mean',
    'air_density_2m:kgm3': 'mean',
    'ceiling_height_agl:m': 'max',
    'cloud_base_agl:m': 'max',
    'dew_or_rime:idx': 'min',
    'dew_point_2m:K': 'mean',
    'effective_cloud_cover:p': 'sum',
    'elevation:m': 'first',
    'fresh_snow_12h:cm': 'max',
    'fresh_snow_1h:cm': 'sum',
    'fresh_snow_24h:cm': 'max',
    'fresh_snow_3h:cm': 'max',
    'fresh_snow_6h:cm': 'max',
    'is_day:idx': 'max',
    'is_in_shadow:idx': 'max',
    'msl_pressure:hPa': 'mean',
    'precip_5min:mm': 'sum',
    'precip_type_5min:idx': 'sum',
    'pressure_100m:hPa': 'mean',
    'pressure_50m:hPa': 'mean',
    'prob_rime:p': 'max',
    'rain_water:kgm2': 'sum',
    'relative_humidity_1000hPa:p': 'mean',
    'sfc_pressure:hPa': 'mean',
    'snow_density:kgm3': 'mean',
    'snow_depth:cm': 'max',
    'snow_drift:idx': 'max',
    'snow_melt_10min:mm': 'sum',
    'snow_water:kgm2': 'sum',
    'sun_azimuth:d': 'first',
    'sun_elevation:d': 'sum',
    'super_cooled_liquid_water:kgm2': 'sum',
    't_1000hPa:K': 'mean',
    'total_cloud_cover:p': 'mean',
    'visibility:m': 'mean',
    'wind_speed_10m:ms': 'mean',
    'wind_speed_u_10m:ms': 'mean',
    'wind_speed_v_10m:ms': 'mean',
    'wind_speed_w_1000hPa:ms': 'mean',
}

# Read in the data. For each location: the targets (whose 'time' column is
# renamed to 'date_forecast' so it matches the feature frames), then the
# observed, estimated and test feature sets.
time_to_date_forecast = {'time': 'date_forecast'}

x_target_A = pd.read_parquet('./data/A/train_targets.parquet').rename(columns=time_to_date_forecast)
x_train_obs_A, x_train_est_A, x_test_est_A = (
    pd.read_parquet(path) for path in (
        './data/A/X_train_observed.parquet',
        './data/A/X_train_estimated.parquet',
        './data/A/X_test_estimated.parquet',
    )
)

x_target_B = pd.read_parquet('./data/B/train_targets.parquet').rename(columns=time_to_date_forecast)
x_train_obs_B, x_train_est_B, x_test_est_B = (
    pd.read_parquet(path) for path in (
        './data/B/X_train_observed.parquet',
        './data/B/X_train_estimated.parquet',
        './data/B/X_test_estimated.parquet',
    )
)

x_target_C = pd.read_parquet('./data/C/train_targets.parquet').rename(columns=time_to_date_forecast)
x_train_obs_C, x_train_est_C, x_test_est_C = (
    pd.read_parquet(path) for path in (
        './data/C/X_train_observed.parquet',
        './data/C/X_train_estimated.parquet',
        './data/C/X_test_estimated.parquet',
    )
)

# Fix missing data in the test sets, assuming NaN means 0 for these columns
zero_when_missing = [
    'effective_cloud_cover:p',
    'total_cloud_cover:p',
    'cloud_base_agl:m',
    'ceiling_height_agl:m',
    'snow_density:kgm3',
    'snow_drift:idx',
]

for test_frame in (x_test_est_A, x_test_est_B, x_test_est_C):
    for column in zero_when_missing:
        test_frame[column] = test_frame[column].fillna(0)

# Resample: bucket rows on 'date_forecast' in 1-hour bins and aggregate every
# column with the rule listed in aggregation_methods.
hourly_bins = dict(key='date_forecast', freq='1H')

x_train_obs_A_resampled = x_train_obs_A.groupby(pd.Grouper(**hourly_bins)).agg(aggregation_methods)
x_train_est_A_resampled = x_train_est_A.groupby(pd.Grouper(**hourly_bins)).agg(aggregation_methods)
x_test_est_A_resampled = x_test_est_A.groupby(pd.Grouper(**hourly_bins)).agg(aggregation_methods)

x_train_obs_B_resampled = x_train_obs_B.groupby(pd.Grouper(**hourly_bins)).agg(aggregation_methods)
x_train_est_B_resampled = x_train_est_B.groupby(pd.Grouper(**hourly_bins)).agg(aggregation_methods)
x_test_est_B_resampled = x_test_est_B.groupby(pd.Grouper(**hourly_bins)).agg(aggregation_methods)

x_train_obs_C_resampled = x_train_obs_C.groupby(pd.Grouper(**hourly_bins)).agg(aggregation_methods)
x_train_est_C_resampled = x_train_est_C.groupby(pd.Grouper(**hourly_bins)).agg(aggregation_methods)
x_test_est_C_resampled = x_test_est_C.groupby(pd.Grouper(**hourly_bins)).agg(aggregation_methods)

# Merge
# For every location: find the target row whose timestamp equals the first
# estimated-forecast timestamp, cut the targets there, and join each half onto
# the matching hourly feature frame (feature index <-> target 'date_forecast').
# NOTE(review): split_index is an index *label* that is then used positionally
# with .iloc — this assumes the target frames carry a default 0..n-1 index; confirm.
split_value = x_train_est_A['date_forecast'].iloc[0]
split_index = x_target_A.index[x_target_A['date_forecast'] == split_value][0]

x_target_obs_A, x_target_est_A = x_target_A.iloc[:split_index], x_target_A.iloc[split_index:]

obs_A = pd.merge(x_train_obs_A_resampled, x_target_obs_A, left_index=True, right_on='date_forecast')
est_A = pd.merge(x_train_est_A_resampled, x_target_est_A, left_index=True, right_on='date_forecast')

split_value = x_train_est_B['date_forecast'].iloc[0]
split_index = x_target_B.index[x_target_B['date_forecast'] == split_value][0]

x_target_obs_B, x_target_est_B = x_target_B.iloc[:split_index], x_target_B.iloc[split_index:]

obs_B = pd.merge(x_train_obs_B_resampled, x_target_obs_B, left_index=True, right_on='date_forecast')
est_B = pd.merge(x_train_est_B_resampled, x_target_est_B, left_index=True, right_on='date_forecast')

split_value = x_train_est_C['date_forecast'].iloc[0]
split_index = x_target_C.index[x_target_C['date_forecast'] == split_value][0]

x_target_obs_C, x_target_est_C = x_target_C.iloc[:split_index], x_target_C.iloc[split_index:]

obs_C = pd.merge(x_train_obs_C_resampled, x_target_obs_C, left_index=True, right_on='date_forecast')
est_C = pd.merge(x_train_est_C_resampled, x_target_est_C, left_index=True, right_on='date_forecast')

# Test frames keep their 'date_forecast' column; hourly bins containing any NaN are discarded
test_A = x_test_est_A_resampled.dropna()
test_B = x_test_est_B_resampled.dropna()
test_C = x_test_est_C_resampled.dropna()

def preprocessing(df):
    """
    Run the feature-engineering pipeline on one frame and return the result.

    The steps are applied in a fixed order; each step is handed a fresh copy
    of the previous step's output.
    """
    pipeline = (
        add_experimental_features,
        add_date_features,
        add_binned_features,
        add_rate_of_change_features_to_df,
        add_est_obs_feature,
        remove_constant_regions,
        add_lagged_features_to_df,
        handle_nan,
    )
    for step in pipeline:
        df = step(df.copy())

    return df

# Preprocess every observed / estimated / test frame, one location at a time
obs_A, est_A, test_A = (preprocessing(frame.copy()) for frame in (obs_A, est_A, test_A))

obs_B, est_B, test_B = (preprocessing(frame.copy()) for frame in (obs_B, est_B, test_B))

obs_C, est_C, test_C = (preprocessing(frame.copy()) for frame in (obs_C, est_C, test_C))

# Random seeds used for the individual runs, with the ensemble weights chosen for each.
# Weights are listed in the order: xgboost, catboost, autogluon.
#   32: 0.3, 0.3, 0.4
#   24: 0.3, 0.3, 0.4
#   33 (without winter months 1 and 12): 0.2, 0.4, 0.4
#   11 (without winter months 1, 2 and 11, 12): 0.25, 0.35, 0.4
#   5:  0.4, 0.3, 0.3
#   6:  0.3, 0.4, 0.3
#   7:  0.3, 0.4, 0.3
#   8:  0.3, 0.3, 0.4
#
# Best score is the mean prediction of all the seeds mentioned above.

# Seed NumPy's global random state for this run
np.random.seed(8)

# Concatenate observed and estimated rows per location
A, B, C = (
    pd.concat(parts)
    for parts in ([obs_A, est_A], [obs_B, est_B], [obs_C, est_C])
)

# Remove characters unparseable for CatBoost from every column name
def _strip_unparseable(column_name):
    """Return column_name with each of the listed characters removed."""
    for char in ('[', ']', ',', '{', '}', '(', ')', '"', "'", ':', '\\'):
        column_name = column_name.replace(char, '')
    return column_name

for _frame in (A, B, C, test_A, test_B, test_C):
    _frame.columns = [_strip_unparseable(col) for col in _frame.columns]

# Getting validation data from summer months, because the test set is from summer months. We experimented with excluding winter months
# from the training data here.

# Step 1: Month filter for A. All twelve months are currently listed, so no month is excluded;
# remove months from the list to leave them out of the data.
A = A[A['date_forecast'].dt.month.isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])]

# Step 2: Identify the unique calendar days within May and June
summer_months = A[A['date_forecast'].dt.month.isin([5, 6])]
unique_summer_days = summer_months['date_forecast'].dt.date.unique()

# Step 3: Randomly sample 40% of these days (without replacement) for val_A.
# np.random.choice draws from NumPy's global random state, so the result depends on the seed
# set earlier and on the A, B, C call order below.
sampled_days = np.random.choice(unique_summer_days, size=int(len(unique_summer_days) * 0.4), replace=False)
val_A = A[A['date_forecast'].dt.date.isin(sampled_days)]

# Step 4: Define train_A as the remaining data (every row whose date was not sampled)
train_A = A[~A['date_forecast'].dt.date.isin(sampled_days)]

# Prepare the feature and target variables
X_train_A = train_A.drop(columns='pv_measurement')
y_train_A = train_A['pv_measurement']
X_val_A = val_A.drop(columns='pv_measurement')
y_val_A = val_A['pv_measurement']

# Repeat the same four steps for B and C
B = B[B['date_forecast'].dt.month.isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])]
summer_months = B[B['date_forecast'].dt.month.isin([5, 6])]
unique_summer_days = summer_months['date_forecast'].dt.date.unique()
sampled_days = np.random.choice(unique_summer_days, size=int(len(unique_summer_days) * 0.4), replace=False)
val_B = B[B['date_forecast'].dt.date.isin(sampled_days)]
train_B = B[~B['date_forecast'].dt.date.isin(sampled_days)]
X_train_B = train_B.drop(columns='pv_measurement')
y_train_B = train_B['pv_measurement']
X_val_B = val_B.drop(columns='pv_measurement')
y_val_B = val_B['pv_measurement']

C = C[C['date_forecast'].dt.month.isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])]
summer_months = C[C['date_forecast'].dt.month.isin([5, 6])]
unique_summer_days = summer_months['date_forecast'].dt.date.unique()
sampled_days = np.random.choice(unique_summer_days, size=int(len(unique_summer_days) * 0.4), replace=False)
val_C = C[C['date_forecast'].dt.date.isin(sampled_days)]
train_C = C[~C['date_forecast'].dt.date.isin(sampled_days)]
X_train_C = train_C.drop(columns='pv_measurement')
y_train_C = train_C['pv_measurement']
X_val_C = val_C.drop(columns='pv_measurement')
y_val_C = val_C['pv_measurement']

# Drop the three date columns from every train/validation frame
train_A, train_B, train_C, val_A, val_B, val_C = (
    frame.drop(columns=['date_forecast', 'date_forecast_x', 'date_forecast_y'])
    for frame in (train_A, train_B, train_C, val_A, val_B, val_C)
)
X_train_A, X_train_B, X_train_C, X_val_A, X_val_B, X_val_C = (
    frame.drop(columns=['date_forecast', 'date_forecast_x', 'date_forecast_y'])
    for frame in (X_train_A, X_train_B, X_train_C, X_val_A, X_val_B, X_val_C)
)
# The test frames only carry 'date_forecast'
test_A, test_B, test_C = (
    frame.drop(columns=['date_forecast'])
    for frame in (test_A, test_B, test_C)
)

# Wrap the train/validation frames (target column included) for AutoGluon
train_auto_A, val_auto_A = TabularDataset(train_A), TabularDataset(val_A)

train_auto_B, val_auto_B = TabularDataset(train_B), TabularDataset(val_B)

train_auto_C, val_auto_C = TabularDataset(train_C), TabularDataset(val_C)

# Name of the target column AutoGluon should predict
auto_label = 'pv_measurement'

# Set the parameters for the XGBoost models (one dict per location)
params_xgb_A = dict(
    colsample_bytree=0.8,
    gamma=0.4,
    learning_rate=0.012,
    max_depth=15,
    min_child_weight=10,
    n_estimators=600,
    reg_alpha=0.8,
    reg_lambda=0.8,
    subsample=0.912,
    random_state=0,
    booster='gbtree',
    n_jobs=-1,
    num_parallel_tree=2,
)

params_xgb_B = dict(
    colsample_bytree=0.8,
    gamma=0.8,
    learning_rate=0.008,
    max_depth=15,
    min_child_weight=10,
    n_estimators=600,
    reg_alpha=1,
    reg_lambda=3,
    subsample=0.912,
    random_state=0,
    booster='gbtree',
    n_jobs=-1,
)

# Location C uses exactly the same settings as location B (independent copy)
params_xgb_C = dict(params_xgb_B)

# One XGBoost regressor per location, built from that location's parameter dict
xgb_A, xgb_B, xgb_C = (
    xgb.XGBRegressor(**params)
    for params in (params_xgb_A, params_xgb_B, params_xgb_C)
)

# One CatBoost regressor per location: up to 5000 boosting iterations,
# optimised and evaluated on MAE; every other hyperparameter is left at
# the library default.
cat_A = CatBoostRegressor(iterations=5000, loss_function='MAE', eval_metric='MAE')

cat_B = CatBoostRegressor(iterations=5000, loss_function='MAE', eval_metric='MAE')

cat_C = CatBoostRegressor(iterations=5000, loss_function='MAE', eval_metric='MAE')

# Prepare data for the XGBoost models. They worked best with a reduced column set.
xgb_columns = [
    'total_radiationW', 'snow_accumulation', 'super_cooled_liquid_waterkgm2',
    'average_wind_speed', 'sun_elevationd', 'sun_azimuthd', 'clear_sky_radW',
    'month', 't_1000hPaC', 'msl_pressurehPa_scaled', 'rain_waterkgm2',
    'cloud_base_aglm', 'effective_cloud_coverp', 'dew_or_rimeidx',
]
print(train_A.columns)

# Per location: reduced training features, training target, reduced test features
X_train_xgb_A, y_train_xgb_A = train_A.loc[:, xgb_columns], train_A['pv_measurement']
X_test_xgb_A = test_A.loc[:, xgb_columns]

X_train_xgb_B, y_train_xgb_B = train_B.loc[:, xgb_columns], train_B['pv_measurement']
X_test_xgb_B = test_B.loc[:, xgb_columns]

X_train_xgb_C, y_train_xgb_C = train_C.loc[:, xgb_columns], train_C['pv_measurement']
X_test_xgb_C = test_C.loc[:, xgb_columns]

# Train the XGBoost models on the reduced feature set (no evaluation set is passed)
xgb_A.fit(X_train_xgb_A, y_train_xgb_A, eval_metric='mae', verbose=False)

xgb_B.fit(X_train_xgb_B, y_train_xgb_B, eval_metric='mae', verbose=False)

xgb_C.fit(X_train_xgb_C, y_train_xgb_C, eval_metric='mae', verbose=False)

# Train the CatBoost models; the sampled validation days act as the eval set
# and use_best_model keeps the iteration that scored best on it.
cat_A.fit(X_train_A, y_train_A, eval_set=(X_val_A, y_val_A), use_best_model=True)

cat_B.fit(X_train_B, y_train_B, eval_set=(X_val_B, y_val_B), use_best_model=True)

cat_C.fit(X_train_C, y_train_C, eval_set=(X_val_C, y_val_C), use_best_model=True)

# Train one AutoGluon predictor per location, tuned on that location's validation days
auto_A = TabularPredictor(
    label=auto_label,
    eval_metric='mean_absolute_error',
    problem_type='regression',
).fit(
    train_data=train_auto_A,
    presets='medium_quality',
    tuning_data=val_auto_A,
    use_bag_holdout=True,
    ag_args_ensemble={'fold_fitting_strategy': 'sequential_local'},
)

auto_B = TabularPredictor(
    label=auto_label,
    eval_metric='mean_absolute_error',
    problem_type='regression',
).fit(
    train_data=train_auto_B,
    presets='medium_quality',
    tuning_data=val_auto_B,
    use_bag_holdout=True,
    ag_args_ensemble={'fold_fitting_strategy': 'sequential_local'},
)

auto_C = TabularPredictor(
    label=auto_label,
    eval_metric='mean_absolute_error',
    problem_type='regression',
).fit(
    train_data=train_auto_C,
    presets='medium_quality',
    tuning_data=val_auto_C,
    use_bag_holdout=True,
    ag_args_ensemble={'fold_fitting_strategy': 'sequential_local'},
)

# Ensemble weights for this seed (xgboost, catboost, autogluon)
xgb_weight = 0.3
cat_weight = 0.3
auto_weight = 0.4

# XGBoost predictions: each location's model predicts on that location's own
# reduced test features.
pred_xgb_A = xgb_A.predict(X_test_xgb_A)
pred_xgb_B = xgb_B.predict(X_test_xgb_B)
pred_xgb_C = xgb_C.predict(X_test_xgb_C)

# AutoGluon and CatBoost predict on the full test feature frames
pred_auto_A = auto_A.predict(test_A)
pred_auto_B = auto_B.predict(test_B)
pred_auto_C = auto_C.predict(test_C)

pred_cat_A = cat_A.predict(test_A)
pred_cat_B = cat_B.predict(test_B)
pred_cat_C = cat_C.predict(test_C)

# Ensemble that seemed the best after some experimentation: weighted sum of the three models
pred_A = (pred_xgb_A*xgb_weight + pred_cat_A*cat_weight + pred_auto_A*auto_weight)
pred_B = (pred_xgb_B*xgb_weight + pred_cat_B*cat_weight + pred_auto_B*auto_weight)
pred_C = (pred_xgb_C*xgb_weight + pred_cat_C*cat_weight + pred_auto_C*auto_weight)

# Clamp negative predictions to zero (no upper bound)
pred_A = np.clip(pred_A, 0, None)
pred_B = np.clip(pred_B, 0, None)
pred_C = np.clip(pred_C, 0, None)

# Stack the per-location predictions in A, B, C order
predictions = np.concatenate((pred_A, pred_B, pred_C))

# Keep this seed's predictions under their own name
predictions_8 = predictions

# Sequential ids 0..n-1, one per prediction
ids = np.arange(len(predictions_8))

# Assemble the submission table and write it to CSV without the index
df = pd.DataFrame({'id': ids, 'prediction': predictions_8})
df.to_csv('predictions_8.csv', index=False)

In [None]:
# Sequence 9

%reset -f

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
from catboost import CatBoostRegressor
from autogluon.tabular import TabularDataset, TabularPredictor
import warnings
warnings.filterwarnings('ignore')

def add_experimental_features(df):
    """
    Derive the hand-crafted weather features used by the models.

    New columns are written onto `df` itself: radiation totals, differences
    and ratio, temperature/pressure derivatives, wind aggregates, cloud and
    snow aggregates, several pairwise interaction terms and a precipitation
    product. Afterwards every +/-inf in the frame is replaced by 0 and the
    same `df` object is returned.

    The two `*_scaled` pressure columns are min-max scaled using only the
    rows of the frame that is passed in.
    """
    direct_w, diffuse_w = df['direct_rad:W'], df['diffuse_rad:W']
    direct_j, diffuse_j = df['direct_rad_1h:J'], df['diffuse_rad_1h:J']
    temperature_k = df['t_1000hPa:K']
    dew_point_k = df['dew_point_2m:K']
    humidity = df['absolute_humidity_2m:gm3']

    # Radiation: totals, direct-minus-diffuse differences, diffuse/direct ratio
    df['total_radiation:W'] = direct_w + diffuse_w
    df['total_radiation_1h:J'] = direct_j + diffuse_j
    df['rad_diff:W'] = direct_w - diffuse_w
    df['rad_diff_1h:J'] = direct_j - diffuse_j
    df['diffuse_direct_ratio'] = diffuse_w / direct_w

    # Temperature and pressure: spreads, Kelvin -> Celsius, scaled pressures
    df['temp_dewpoint_diff'] = temperature_k - dew_point_k
    df['pressure_gradient'] = df['pressure_100m:hPa'] - df['pressure_50m:hPa']
    df['t_1000hPa:C'] = temperature_k - 273.15
    df['dew_point_2m:C'] = dew_point_k - 273.15
    scaled_targets = (
        ('msl_pressure:hPa', 'msl_pressure:hPa_scaled'),
        ('sfc_pressure:hPa', 'sfc_pressure:hPa_scaled'),
    )
    for source_col, scaled_col in scaled_targets:
        # A new scaler is fitted per column on a single-column 2-D array
        column_vector = df[source_col].values.reshape(-1, 1)
        df[scaled_col] = MinMaxScaler().fit_transform(column_vector)

    # Wind: magnitude of the (u, v, w) components and the mean of the
    # 10 m speed with the u component
    wind_u = df['wind_speed_u_10m:ms']
    wind_v = df['wind_speed_v_10m:ms']
    wind_w = df['wind_speed_w_1000hPa:ms']
    df['wind_vector_magnitude'] = (wind_u**2 + wind_v**2 + wind_w**2)**0.5
    df['average_wind_speed'] = (df['wind_speed_10m:ms'] + wind_u) / 2

    # Cloud and snow
    df['cloud_humidity_product'] = df['total_cloud_cover:p'] * humidity
    snow_columns = [
        'fresh_snow_24h:cm', 'fresh_snow_12h:cm', 'fresh_snow_6h:cm',
        'fresh_snow_3h:cm', 'fresh_snow_1h:cm',
    ]
    df['snow_accumulation'] = df[snow_columns].sum(axis=1)

    # Pairwise interaction terms
    # direct radiation x effective cloud cover
    df['radiation_cloud_interaction'] = direct_w * df['effective_cloud_cover:p']
    # temperature x total radiation
    df['temp_rad_interaction'] = temperature_k * df['total_radiation:W']
    # average wind speed x temperature
    df['wind_temp_interaction'] = df['average_wind_speed'] * temperature_k
    # absolute humidity x temperature
    df['humidity_temp_interaction'] = humidity * temperature_k
    # sun elevation x direct radiation
    df['sun_elevation_direct_rad_interaction'] = df['sun_elevation:d'] * direct_w

    # Precipitation amount multiplied by the precipitation type index
    df['precip'] = df['precip_5min:mm']*df['precip_type_5min:idx']

    # Divisions above can yield +/-inf; neutralise them to 0
    df.replace([np.inf, -np.inf], 0, inplace=True)

    return df

def add_date_features(df):
    """
    Add calendar features derived from the 'date_forecast' column.

    When the column is present it is converted to datetime and the columns
    'month', 'year', 'hour', 'day', 'hour_sin' and 'hour_cos' are written
    onto `df` (the sine/cosine encode the hour on a 24-hour circle).
    When it is absent a warning is printed and `df` is returned unchanged.
    The same `df` object is returned in both cases.
    """
    if 'date_forecast' not in df.columns:
        print("Warning: 'date_forecast' column not found in the dataframe. No date features added.")
        return df

    # Ensure datetime dtype before using the .dt accessor
    df['date_forecast'] = pd.to_datetime(df['date_forecast'])
    stamp = df['date_forecast'].dt

    df['month'] = stamp.month
    df['year'] = stamp.year
    df['hour'] = stamp.hour
    df['day'] = stamp.day

    # Cyclical encoding of the hour of day
    hour_angle = 2*np.pi*df['hour']/24
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)

    return df

# Adding discretized features for the continuous variables to help tree-based models

def bin_columns(dataframe, columns_to_bin, n_bins=5):
    """
    Add quantile-bin columns for the requested columns.

    Works on a copy of `dataframe`; for every column in `columns_to_bin`
    a new column named 'binned_<column>' holds the integer bin index
    produced by `pd.qcut` (duplicate bin edges are dropped).

    Parameters:
    - dataframe: pd.DataFrame, left unmodified
    - columns_to_bin: list of column names to bin
    - n_bins: int applied to every column, or a dict mapping column name
              to bin count (columns missing from the dict use 5)

    Returns:
    - pd.DataFrame: the copy with the extra 'binned_*' columns
    """
    result = dataframe.copy()
    same_for_all = isinstance(n_bins, int)

    for name in columns_to_bin:
        quantile_count = n_bins if same_for_all else n_bins.get(name, 5)
        result[f'binned_{name}'] = pd.qcut(
            result[name], q=quantile_count, labels=False, duplicates='drop'
        )

    return result

def add_binned_features(df):
    """
    Add quantile-binned versions of three columns via `bin_columns`.

    The columns and their bin counts are:
    'effective_cloud_cover:p' (2), 'ceiling_height_agl:m' (3) and
    'average_wind_speed' (5). Returns the frame produced by the last
    `bin_columns` call.
    """
    binning_plan = (
        ('effective_cloud_cover:p', 2),
        ('ceiling_height_agl:m', 3),
        ('average_wind_speed', 5),
    )

    # Each column is binned separately because the bin counts differ
    for column, bin_count in binning_plan:
        df = bin_columns(df, [column], n_bins=bin_count)

    return df

def add_rate_of_change_features(df, features, second_order=False):
    """
    Add row-to-row difference columns for the given features.

    For each feature a '<feature>_rate_of_change' column holds the
    difference to the previous row (0 for the first row). With
    `second_order=True` a '<feature>_rate_of_change_of_change' column holds
    the difference of those differences. Rows are differenced in their
    existing order, so the frame is expected to be time sorted.
    Columns are written onto `df`, which is also returned.
    """
    for name in features:
        first_diff = df[name].diff().fillna(0)  # first row has no predecessor
        df[name + '_rate_of_change'] = first_diff

        if not second_order:
            continue

        df[name + '_rate_of_change_of_change'] = first_diff.diff().fillna(0)

    return df

def add_rate_of_change_features_to_df(df):
    """
    Add first-order rate-of-change columns for a fixed set of temperature,
    radiation and cloud-cover features by delegating to
    `add_rate_of_change_features`.
    """
    return add_rate_of_change_features(
        df,
        [
            't_1000hPa:K',
            'clear_sky_rad:W',
            'diffuse_rad:W',
            'direct_rad:W',
            'effective_cloud_cover:p',
            'total_radiation:W',
        ],
        second_order=False,
    )

def add_est_obs_feature(df):
    """
    Flag whether a frame holds observed or estimated data.

    An 'observed' column is written onto `df`: 1 when the frame has no
    'date_calc' column, 0 when it has one. In the latter case a new frame
    without 'date_calc' is returned; otherwise `df` itself is returned.

    NOTE(review): the aggregation_methods mapping in this file has no
    'date_calc' entry, so frames coming out of the hourly aggregation may
    never carry that column - verify that this flag is not constant.
    """
    is_estimated = 'date_calc' in df.columns
    df['observed'] = 0 if is_estimated else 1

    if is_estimated:
        return df.drop(columns=['date_calc'])
    return df
    
def remove_constant_regions(dataframe, column_name="pv_measurement", threshold=72):
    """
    Drop long runs of identical consecutive values in `column_name`.

    A run is removed in full (its first row included) when it contains more
    than `threshold` rows that equal their predecessor. NaN never compares
    equal, so runs of NaN are not treated as constant. If the column is
    missing, a warning is printed and the frame is returned untouched.
    """
    if column_name not in dataframe.columns:
        print(f"Warning: '{column_name}' column not found in the dataframe. No rows removed.")
        return dataframe

    values = dataframe[column_name]

    # True wherever a row starts a new run (differs from the row above)
    starts_new_run = values.ne(values.shift())
    run_id = starts_new_run.cumsum()

    # Count, per run, the rows that repeat the previous value
    repeats_per_run = run_id[~starts_new_run].value_counts()
    long_runs = repeats_per_run[repeats_per_run > threshold].index

    # Remove every row (by index label) that belongs to a long run
    rows_to_drop = dataframe[run_id.isin(long_runs)].index
    return dataframe.drop(rows_to_drop)

def add_lagged_features(df, features_with__lags, fill_value=None):
    """
    Add shifted ("lagged") copies of selected columns to `df`.

    Parameters:
    - df: pd.DataFrame, modified in place and returned
    - features_with__lags: dict mapping a column name to either a single
      shift (int) or a list/tuple of shifts. A positive shift takes the value
      from earlier rows, a negative shift from later rows.
    - fill_value: value used for the rows left empty by the shift; when None
      those rows stay NaN.

    Each new column is named '<feature>_lag_<shift>'.
    """
    for feature, lags in features_with__lags.items():
        # Accept a single shift as well as several shifts per feature
        if not isinstance(lags, (list, tuple)):
            lags = [lags]

        for specific_lag in lags:
            shifted = df[feature].shift(specific_lag)
            # fillna() rejects a None value, so only fill when one was given
            if fill_value is not None:
                shifted = shifted.fillna(fill_value)
            df[f"{feature}_lag_{specific_lag}"] = shifted
    return df

def add_lagged_features_to_df(df):
    """
    Add previous-row (shift 1) and next-row (shift -1) versions of the
    radiation features, filling the rows without a neighbour with 0.
    """
    # Each feature maps to all of its shifts; a dict literal with repeated
    # keys would keep only the last shift per feature.
    features_with_lags = {
        'total_radiation:W': (1, -1),
        'rad_diff:W': (1, -1),
        'total_radiation_1h:J': (1, -1),
    }

    # Add lagged features for specific lags
    return add_lagged_features(df, features_with_lags, fill_value=0)

def handle_nan(df):
    """
    Drop rows without a target and zero-fill every remaining NaN.

    Rows whose 'pv_measurement' is NaN are removed when that column exists;
    frames without the column skip that step. All NaN left in the frame are
    then replaced by 0 and the resulting frame is returned.
    """
    if 'pv_measurement' in df.columns:
        has_target = df['pv_measurement'].notna()
        df = df[has_target]

    return df.fillna(0)

# Aggregate the data to hourly with some aggregation methods for each column.
# Maps each column name to the aggregation applied to it within an hourly group.
# Every key appears exactly once: a repeated key in a dict literal silently
# overrides the earlier entry.
aggregation_methods = {
    'date_forecast': 'first',
    'diffuse_rad:W': 'sum',
    'direct_rad:W': 'last',
    'clear_sky_rad:W': 'sum',
    'diffuse_rad_1h:J': 'last',
    'direct_rad_1h:J': 'last',
    'clear_sky_energy_1h:J': 'last',
    'absolute_humidity_2m:gm3': 'mean',
    'air_density_2m:kgm3': 'mean',
    'ceiling_height_agl:m': 'max',
    'cloud_base_agl:m': 'max',
    'dew_or_rime:idx': 'min',
    'dew_point_2m:K': 'mean',
    'effective_cloud_cover:p': 'sum',
    'elevation:m': 'first',
    'fresh_snow_12h:cm': 'max',
    'fresh_snow_1h:cm': 'sum',
    'fresh_snow_24h:cm': 'max',
    'fresh_snow_3h:cm': 'max',
    'fresh_snow_6h:cm': 'max',
    'is_day:idx': 'max',
    'is_in_shadow:idx': 'max',
    'msl_pressure:hPa': 'mean',
    'precip_5min:mm': 'sum',
    'precip_type_5min:idx': 'sum',
    'pressure_100m:hPa': 'mean',
    'pressure_50m:hPa': 'mean',
    'prob_rime:p': 'max',
    'rain_water:kgm2': 'sum',
    'relative_humidity_1000hPa:p': 'mean',
    'sfc_pressure:hPa': 'mean',
    'snow_density:kgm3': 'mean',
    'snow_depth:cm': 'max',
    'snow_drift:idx': 'max',
    'snow_melt_10min:mm': 'sum',
    'snow_water:kgm2': 'sum',
    'sun_azimuth:d': 'first',
    'sun_elevation:d': 'sum',
    'super_cooled_liquid_water:kgm2': 'sum',
    't_1000hPa:K': 'mean',
    'total_cloud_cover:p': 'mean',
    'visibility:m': 'mean',
    'wind_speed_10m:ms': 'mean',
    'wind_speed_u_10m:ms': 'mean',
    'wind_speed_v_10m:ms': 'mean',
    'wind_speed_w_1000hPa:ms': 'mean'
}

# Read in the data for each location. The targets' 'time' column is renamed to
# 'date_forecast' on load so it matches the feature frames.
x_target_A = pd.read_parquet('./data/A/train_targets.parquet').rename(columns={'time': 'date_forecast'})
x_train_obs_A = pd.read_parquet('./data/A/X_train_observed.parquet')
x_train_est_A = pd.read_parquet('./data/A/X_train_estimated.parquet')
x_test_est_A = pd.read_parquet('./data/A/X_test_estimated.parquet')

x_target_B = pd.read_parquet('./data/B/train_targets.parquet').rename(columns={'time': 'date_forecast'})
x_train_obs_B = pd.read_parquet('./data/B/X_train_observed.parquet')
x_train_est_B = pd.read_parquet('./data/B/X_train_estimated.parquet')
x_test_est_B = pd.read_parquet('./data/B/X_test_estimated.parquet')

x_target_C = pd.read_parquet('./data/C/train_targets.parquet').rename(columns={'time': 'date_forecast'})
x_train_obs_C = pd.read_parquet('./data/C/X_train_observed.parquet')
x_train_est_C = pd.read_parquet('./data/C/X_train_estimated.parquet')
x_test_est_C = pd.read_parquet('./data/C/X_test_estimated.parquet')

# Fix missing data for the test set, assuming NaN means 0 in these columns
for _frame in (x_test_est_A, x_test_est_B, x_test_est_C):
    for _column in (
        'effective_cloud_cover:p',
        'total_cloud_cover:p',
        'cloud_base_agl:m',
        'ceiling_height_agl:m',
        'snow_density:kgm3',
        'snow_drift:idx',
    ):
        _frame[_column] = _frame[_column].fillna(0)

# Resample every feature frame to hourly buckets, applying the per-column
# rules in aggregation_methods.
x_train_obs_A_resampled = (
    x_train_obs_A
    .groupby(pd.Grouper(key='date_forecast', freq='1H'))
    .agg(aggregation_methods)
)
x_train_est_A_resampled = (
    x_train_est_A
    .groupby(pd.Grouper(key='date_forecast', freq='1H'))
    .agg(aggregation_methods)
)
x_test_est_A_resampled = (
    x_test_est_A
    .groupby(pd.Grouper(key='date_forecast', freq='1H'))
    .agg(aggregation_methods)
)

x_train_obs_B_resampled = (
    x_train_obs_B
    .groupby(pd.Grouper(key='date_forecast', freq='1H'))
    .agg(aggregation_methods)
)
x_train_est_B_resampled = (
    x_train_est_B
    .groupby(pd.Grouper(key='date_forecast', freq='1H'))
    .agg(aggregation_methods)
)
x_test_est_B_resampled = (
    x_test_est_B
    .groupby(pd.Grouper(key='date_forecast', freq='1H'))
    .agg(aggregation_methods)
)

x_train_obs_C_resampled = (
    x_train_obs_C
    .groupby(pd.Grouper(key='date_forecast', freq='1H'))
    .agg(aggregation_methods)
)
x_train_est_C_resampled = (
    x_train_est_C
    .groupby(pd.Grouper(key='date_forecast', freq='1H'))
    .agg(aggregation_methods)
)
x_test_est_C_resampled = (
    x_test_est_C
    .groupby(pd.Grouper(key='date_forecast', freq='1H'))
    .agg(aggregation_methods)
)

# Merge
# For each location: find where the estimated features begin, cut the targets
# at that point, and join each half to the matching hourly feature frame.
# NOTE(review): split_index is an index *label* that is then used positionally
# with iloc; this presumably relies on the targets having a default 0..n-1
# index - confirm.

# Location A
split_value = x_train_est_A['date_forecast'].iloc[0]
split_index = x_target_A.loc[x_target_A['date_forecast'] == split_value].index[0]

x_target_obs_A, x_target_est_A = x_target_A.iloc[:split_index], x_target_A.iloc[split_index:]

obs_A = pd.merge(x_train_obs_A_resampled, x_target_obs_A, left_index=True, right_on='date_forecast')
est_A = pd.merge(x_train_est_A_resampled, x_target_est_A, left_index=True, right_on='date_forecast')

# Location B
split_value = x_train_est_B['date_forecast'].iloc[0]
split_index = x_target_B.loc[x_target_B['date_forecast'] == split_value].index[0]

x_target_obs_B, x_target_est_B = x_target_B.iloc[:split_index], x_target_B.iloc[split_index:]

obs_B = pd.merge(x_train_obs_B_resampled, x_target_obs_B, left_index=True, right_on='date_forecast')
est_B = pd.merge(x_train_est_B_resampled, x_target_est_B, left_index=True, right_on='date_forecast')

# Location C
split_value = x_train_est_C['date_forecast'].iloc[0]
split_index = x_target_C.loc[x_target_C['date_forecast'] == split_value].index[0]

x_target_obs_C, x_target_est_C = x_target_C.iloc[:split_index], x_target_C.iloc[split_index:]

obs_C = pd.merge(x_train_obs_C_resampled, x_target_obs_C, left_index=True, right_on='date_forecast')
est_C = pd.merge(x_train_est_C_resampled, x_target_est_C, left_index=True, right_on='date_forecast')

# The test frames are the resampled test features (date_forecast is kept),
# with every row that contains a NaN dropped.
test_A = x_test_est_A_resampled.dropna()
test_B = x_test_est_B_resampled.dropna()
test_C = x_test_est_C_resampled.dropna()

def preprocessing(df):
    """Run the feature-engineering steps in their fixed order and return the result.

    Every step receives a copy of the previous step's output, so no step works
    directly on the frame produced by the one before it.
    """
    pipeline = (
        add_experimental_features,
        add_date_features,
        add_binned_features,
        add_rate_of_change_features_to_df,
        add_est_obs_feature,
        remove_constant_regions,
        add_lagged_features_to_df,
        handle_nan,
    )
    for step in pipeline:
        df = step(df.copy())

    return df

# Preprocess every frame (observed, estimated, test) location by location
obs_A, est_A, test_A = (preprocessing(frame.copy()) for frame in (obs_A, est_A, test_A))

obs_B, est_B, test_B = (preprocessing(frame.copy()) for frame in (obs_B, est_B, test_B))

obs_C, est_C, test_C = (preprocessing(frame.copy()) for frame in (obs_C, est_C, test_C))

# Seeds used for the individual runs, each followed by the ensemble weights used
# with it (weight order: xgboost, catboost, autogluon):
# 32 weights: 0.3, 0.3, 0.4
# 24 weights: 0.3, 0.3, 0.4
# 33 (without winter months 1 and 12) weights: 0.2, 0.4, 0.4
# 11 (without winter months 1, 2 and 11, 12) weights: 0.25, 0.35, 0.4
# 5 weights: 0.4, 0.3, 0.3
# 6 weights 0.3, 0.4, 0.3
# 7 weights 0.3, 0.4, 0.3
# 8 weights 0.3, 0.3, 0.4
# 9 weights 0.3, 0.3, 0.4

# Best score is the mean prediction of all the seeds mentioned above.

# Seed NumPy's global RNG for this run; it drives the np.random.choice sampling of
# validation days further down in this cell.
np.random.seed(9)

# Concatenate observed and estimated rows per location
A = pd.concat([obs_A, est_A])
B = pd.concat([obs_B, est_B])
C = pd.concat([obs_C, est_C])

# Remove characters unparseable for CatBoost: [ ] , { } ( ) " ' : and backslash
_catboost_unsafe = str.maketrans('', '', '[],{}()"\':\\')

for _frame in (A, B, C, test_A, test_B, test_C):
    _frame.columns = [col.translate(_catboost_unsafe) for col in _frame.columns]

# Validation data is drawn from early-summer days because the test set is from summer
# months. Excluding winter months from the training data was tried by shrinking
# TRAIN_MONTHS; this run keeps all twelve months.
TRAIN_MONTHS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
VAL_MONTHS = [5, 6]        # validation days are sampled from May and June only
VAL_DAY_FRACTION = 0.4     # share of the May/June days that is held out


def _split_by_sampled_days(frame):
    """Restrict `frame` to TRAIN_MONTHS and hold out a random sample of whole days.

    The held-out days are drawn without replacement from the distinct calendar days
    of VAL_MONTHS using NumPy's global RNG, so the order in which locations are
    split (A, B, C) determines which days each of them gets.

    Returns:
    - (filtered_frame, train_part, val_part)
    """
    frame = frame[frame['date_forecast'].dt.month.isin(TRAIN_MONTHS)]

    candidates = frame[frame['date_forecast'].dt.month.isin(VAL_MONTHS)]
    candidate_days = candidates['date_forecast'].dt.date.unique()
    held_out_days = np.random.choice(
        candidate_days,
        size=int(len(candidate_days) * VAL_DAY_FRACTION),
        replace=False,
    )

    is_held_out = frame['date_forecast'].dt.date.isin(held_out_days)
    return frame, frame[~is_held_out], frame[is_held_out]


# Location A
A, train_A, val_A = _split_by_sampled_days(A)
X_train_A = train_A.drop(columns='pv_measurement')
y_train_A = train_A['pv_measurement']
X_val_A = val_A.drop(columns='pv_measurement')
y_val_A = val_A['pv_measurement']

# Location B
B, train_B, val_B = _split_by_sampled_days(B)
X_train_B = train_B.drop(columns='pv_measurement')
y_train_B = train_B['pv_measurement']
X_val_B = val_B.drop(columns='pv_measurement')
y_val_B = val_B['pv_measurement']

# Location C
C, train_C, val_C = _split_by_sampled_days(C)
X_train_C = train_C.drop(columns='pv_measurement')
y_train_C = train_C['pv_measurement']
X_val_C = val_C.drop(columns='pv_measurement')
y_val_C = val_C['pv_measurement']

# Drop the timestamp columns: the train/validation frames carry three of them after
# the merge, the test frames only 'date_forecast'.
_merged_date_cols = ['date_forecast', 'date_forecast_x', 'date_forecast_y']

train_A, train_B, train_C = (f.drop(columns=_merged_date_cols) for f in (train_A, train_B, train_C))
val_A, val_B, val_C = (f.drop(columns=_merged_date_cols) for f in (val_A, val_B, val_C))
X_train_A, X_train_B, X_train_C = (f.drop(columns=_merged_date_cols) for f in (X_train_A, X_train_B, X_train_C))
X_val_A, X_val_B, X_val_C = (f.drop(columns=_merged_date_cols) for f in (X_val_A, X_val_B, X_val_C))
test_A, test_B, test_C = (f.drop(columns=['date_forecast']) for f in (test_A, test_B, test_C))

# AutoGluon inputs: train/validation frames (target column included) per location
train_auto_A, val_auto_A = TabularDataset(train_A), TabularDataset(val_A)
train_auto_B, val_auto_B = TabularDataset(train_B), TabularDataset(val_B)
train_auto_C, val_auto_C = TabularDataset(train_C), TabularDataset(val_C)

# Name of the target column handed to TabularPredictor
auto_label = 'pv_measurement'

# XGBoost hyper-parameters. Settings shared by all three locations live in one dict;
# each location then adds its own regularisation / learning-rate values.
_xgb_shared = {
    'colsample_bytree': 0.8,
    'max_depth': 15,
    'min_child_weight': 10,
    'n_estimators': 600,
    'subsample': 0.912,
    'random_state': 0,
    'booster': 'gbtree',
    'n_jobs': -1,
}

# Location A: weaker regularisation, higher learning rate, two parallel trees per round
params_xgb_A = {
    **_xgb_shared,
    'gamma': 0.4,
    'learning_rate': 0.012,
    'reg_alpha': 0.8,
    'reg_lambda': 0.8,
    'num_parallel_tree': 2,
}

# Locations B and C use identical settings
params_xgb_B = {
    **_xgb_shared,
    'gamma': 0.8,
    'learning_rate': 0.008,
    'reg_alpha': 1,
    'reg_lambda': 3,
}
params_xgb_C = dict(params_xgb_B)

xgb_A, xgb_B, xgb_C = (
    xgb.XGBRegressor(**params) for params in (params_xgb_A, params_xgb_B, params_xgb_C)
)

# CatBoost models: identical settings for every location. MAE is both the loss being
# optimised and the metric reported on the validation set. learning_rate, depth,
# random_seed and verbose are not passed.
_catboost_settings = {
    'iterations': 5000,       # The number of trees to build
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
}

cat_A = CatBoostRegressor(**_catboost_settings)

cat_B = CatBoostRegressor(**_catboost_settings)

cat_C = CatBoostRegressor(**_catboost_settings)

# XGBoost inputs. These models worked best on a reduced set of columns; the order of
# the list is kept as-is because it fixes the feature order seen by the models.
xgb_columns = [
    'total_radiationW', 'snow_accumulation', 'super_cooled_liquid_waterkgm2',
    'average_wind_speed', 'sun_elevationd', 'sun_azimuthd', 'clear_sky_radW',
    'month', 't_1000hPaC', 'msl_pressurehPa_scaled', 'rain_waterkgm2',
    'cloud_base_aglm', 'effective_cloud_coverp', 'dew_or_rimeidx',
]
print(train_A.columns)

X_train_xgb_A, y_train_xgb_A = train_A[xgb_columns], train_A['pv_measurement']
X_test_xgb_A = test_A[xgb_columns]

X_train_xgb_B, y_train_xgb_B = train_B[xgb_columns], train_B['pv_measurement']
X_test_xgb_B = test_B[xgb_columns]

X_train_xgb_C, y_train_xgb_C = train_C[xgb_columns], train_C['pv_measurement']
X_test_xgb_C = test_C[xgb_columns]

# Train the XGBoost models, one per location, on the reduced feature set.
# NOTE(review): newer XGBoost releases deprecate passing eval_metric to fit();
# confirm against the installed version.
for _model, _features, _target in (
    (xgb_A, X_train_xgb_A, y_train_xgb_A),
    (xgb_B, X_train_xgb_B, y_train_xgb_B),
    (xgb_C, X_train_xgb_C, y_train_xgb_C),
):
    _model.fit(X=_features, y=_target, eval_metric='mae', verbose=False)

# Train the CatBoost models; each one is evaluated on its location's validation split
# and keeps the best iteration found there (use_best_model=True).
for _model, _train_xy, _val_xy in (
    (cat_A, (X_train_A, y_train_A), (X_val_A, y_val_A)),
    (cat_B, (X_train_B, y_train_B), (X_val_B, y_val_B)),
    (cat_C, (X_train_C, y_train_C), (X_val_C, y_val_C)),
):
    _model.fit(*_train_xy, eval_set=_val_xy, use_best_model=True)

def _fit_autogluon(train_data, tuning_data):
    """Create a MAE regression TabularPredictor for `auto_label` and fit it with the
    'medium_quality' preset, using `tuning_data` as the held-out validation set."""
    predictor = TabularPredictor(
        label=auto_label,
        eval_metric='mean_absolute_error',
        problem_type='regression',
    )
    return predictor.fit(
        train_data,
        presets='medium_quality',
        tuning_data=tuning_data,
        use_bag_holdout=True,
        ag_args_ensemble={'fold_fitting_strategy': 'sequential_local'},
    )

auto_A = _fit_autogluon(train_auto_A, val_auto_A)

auto_B = _fit_autogluon(train_auto_B, val_auto_B)

auto_C = _fit_autogluon(train_auto_C, val_auto_C)

# Ensemble weights for this seed (xgboost, catboost, autogluon)
xgb_weight = 0.3
cat_weight = 0.3
auto_weight = 0.4

# Each location's XGBoost model scores that location's own test features.
# (Previously the A and B models were both fed location C's test features.)
pred_xgb_A = xgb_A.predict(X_test_xgb_A)
pred_xgb_B = xgb_B.predict(X_test_xgb_B)
pred_xgb_C = xgb_C.predict(X_test_xgb_C)

pred_auto_A = auto_A.predict(test_A)
pred_auto_B = auto_B.predict(test_B)
pred_auto_C = auto_C.predict(test_C)

pred_cat_A = cat_A.predict(test_A)
pred_cat_B = cat_B.predict(test_B)
pred_cat_C = cat_C.predict(test_C)

# Ensemble that seemed the best after some experimentation: weighted sum of the
# three model families per location
pred_A = (pred_xgb_A*xgb_weight + pred_cat_A*cat_weight + pred_auto_A*auto_weight)
pred_B = (pred_xgb_B*xgb_weight + pred_cat_B*cat_weight + pred_auto_B*auto_weight)
pred_C = (pred_xgb_C*xgb_weight + pred_cat_C*cat_weight + pred_auto_C*auto_weight)

# Negative predictions are clipped to zero
pred_A = np.clip(pred_A, 0, None)
pred_B = np.clip(pred_B, 0, None)
pred_C = np.clip(pred_C, 0, None)

# Stack the per-location predictions in A, B, C order and keep them under this
# sequence's name
predictions = np.concatenate([pred_A, pred_B, pred_C])
predictions_9 = predictions

# Submission table: a running id next to each prediction
ids = np.arange(0, len(predictions_9))
df = pd.DataFrame({'id': ids, 'prediction': predictions_9})

# Save to CSV
df.to_csv('predictions_9.csv', index=False)

In [None]:
# Sequence 10

%reset -f

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
from catboost import CatBoostRegressor
from autogluon.tabular import TabularDataset, TabularPredictor
import warnings
warnings.filterwarnings('ignore')

def add_experimental_features(df):
    """
    Add derived weather features to `df` in place and return it.

    New columns cover radiation sums/differences/ratio, temperature and pressure
    derivatives (including per-frame min-max scaled pressures), wind, cloud/snow,
    several pairwise interaction products and a precipitation product. Infinite
    values anywhere in the frame are replaced with 0 at the end.
    """
    direct_w, diffuse_w = df['direct_rad:W'], df['diffuse_rad:W']
    direct_j, diffuse_j = df['direct_rad_1h:J'], df['diffuse_rad_1h:J']
    temp_k, dew_k = df['t_1000hPa:K'], df['dew_point_2m:K']
    humidity = df['absolute_humidity_2m:gm3']

    # --- Radiation ---
    df['total_radiation:W'] = direct_w + diffuse_w
    df['total_radiation_1h:J'] = direct_j + diffuse_j
    df['rad_diff:W'] = direct_w - diffuse_w
    df['rad_diff_1h:J'] = direct_j - diffuse_j
    # Division by zero yields inf here; it is zeroed by the final replace
    df['diffuse_direct_ratio'] = diffuse_w / direct_w

    # --- Temperature and pressure ---
    df['temp_dewpoint_diff'] = temp_k - dew_k
    df['pressure_gradient'] = df['pressure_100m:hPa'] - df['pressure_50m:hPa']
    df['t_1000hPa:C'] = temp_k - 273.15
    df['dew_point_2m:C'] = dew_k - 273.15
    # The scaler is fitted on this frame alone
    for pressure_col in ('msl_pressure:hPa', 'sfc_pressure:hPa'):
        as_column_vector = df[pressure_col].values.reshape(-1, 1)
        df[pressure_col + '_scaled'] = MinMaxScaler().fit_transform(as_column_vector)

    # --- Wind ---
    wind_u = df['wind_speed_u_10m:ms']
    df['wind_vector_magnitude'] = (wind_u**2 + df['wind_speed_v_10m:ms']**2 + df['wind_speed_w_1000hPa:ms']**2)**0.5
    df['average_wind_speed'] = (df['wind_speed_10m:ms'] + wind_u) / 2

    # --- Cloud and snow ---
    df['cloud_humidity_product'] = df['total_cloud_cover:p'] * humidity
    snow_columns = ['fresh_snow_24h:cm', 'fresh_snow_12h:cm', 'fresh_snow_6h:cm', 'fresh_snow_3h:cm', 'fresh_snow_1h:cm']
    df['snow_accumulation'] = df[snow_columns].sum(axis=1)

    # --- Pairwise interactions ---
    # direct radiation x effective cloud cover
    df['radiation_cloud_interaction'] = direct_w * df['effective_cloud_cover:p']
    # temperature x total radiation (high temperature may reduce efficiency)
    df['temp_rad_interaction'] = temp_k * df['total_radiation:W']
    # wind (cooling effect) x temperature
    df['wind_temp_interaction'] = df['average_wind_speed'] * temp_k
    # humidity x temperature
    df['humidity_temp_interaction'] = humidity * temp_k
    # sun elevation x direct radiation
    df['sun_elevation_direct_rad_interaction'] = df['sun_elevation:d'] * direct_w

    # --- Precipitation ---
    df['precip'] = df['precip_5min:mm']*df['precip_type_5min:idx']

    # Safeguard in case of inf values
    df.replace([np.inf, -np.inf], 0, inplace=True)

    return df

def add_date_features(df):
    """
    Derive calendar features from the 'date_forecast' column.

    When the column is present it is converted to datetime and the columns 'month',
    'year', 'hour', 'day', 'hour_sin' and 'hour_cos' (hour mapped onto a 24-hour
    circle) are added to `df` in place. When it is absent a warning is printed and
    `df` is returned unchanged.
    """
    if 'date_forecast' not in df.columns:
        print("Warning: 'date_forecast' column not found in the dataframe. No date features added.")
        return df

    df['date_forecast'] = pd.to_datetime(df['date_forecast'])
    stamp = df['date_forecast'].dt

    # Plain calendar components
    for part in ('month', 'year', 'hour', 'day'):
        df[part] = getattr(stamp, part)

    # Cyclical encoding of the hour of day
    hour_angle = 2*np.pi*df['hour']/24
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)

    return df

# Adding discretized features for the continuous variables to help tree-based models

def bin_columns(dataframe, columns_to_bin, n_bins=5):
    """
    Add a quantile-bin index column for each requested column.

    Parameters:
    - dataframe: pd.DataFrame, left unmodified (the work is done on a copy)
    - columns_to_bin: list of strings, names of the columns to discretise
    - n_bins: int or dict; an int applies to every column, a dict maps column names
              to their bin count (columns missing from the dict get 5)

    Returns:
    - pd.DataFrame: the copy, with one extra 'binned_<column>' column per entry
      holding the integer bin index from pd.qcut. Duplicate bin edges are dropped,
      so fewer bins than requested can come out.
    """
    result = dataframe.copy()
    same_count_for_all = isinstance(n_bins, int)

    for name in columns_to_bin:
        quantiles = n_bins if same_count_for_all else n_bins.get(name, 5)
        result[f'binned_{name}'] = pd.qcut(result[name], q=quantiles, labels=False, duplicates='drop')

    return result

def add_binned_features(df):
    """
    Add quantile-bin index columns for a fixed set of features.

    Adds 'binned_effective_cloud_cover:p' (2 bins), 'binned_ceiling_height_agl:m'
    (3 bins) and 'binned_average_wind_speed' (5 bins) through `bin_columns` and
    returns the resulting frame.
    """
    bins_per_column = (
        ('effective_cloud_cover:p', 2),
        ('ceiling_height_agl:m', 3),
        ('average_wind_speed', 5),
    )

    for column, bin_count in bins_per_column:
        df = bin_columns(df, [column], n_bins=bin_count)

    return df

def add_rate_of_change_features(df, features, second_order=False):
    """
    Add row-to-row difference columns for the given features, in place.

    For every feature a '<feature>_rate_of_change' column holds the difference to
    the previous row (0 for the first row). With second_order=True a
    '<feature>_rate_of_change_of_change' column holds the difference of that
    difference. The frame is expected to be sorted by time already.
    """
    for name in features:
        first_diff = df[name].diff().fillna(0)  # first row has no predecessor -> 0
        df[name + '_rate_of_change'] = first_diff

        if not second_order:
            continue
        df[name + '_rate_of_change_of_change'] = first_diff.diff().fillna(0)

    return df

def add_rate_of_change_features_to_df(df):
    """Add first-order '<feature>_rate_of_change' columns for the temperature,
    radiation and cloud-cover features listed below."""
    tracked_features = [
        't_1000hPa:K',
        'clear_sky_rad:W',
        'diffuse_rad:W',
        'direct_rad:W',
        'effective_cloud_cover:p',
        'total_radiation:W',
    ]
    return add_rate_of_change_features(df, tracked_features, second_order=False)

def add_est_obs_feature(df):
    """
    Flag the rows of `df` as observed (1) or estimated (0) in a new 'observed' column.

    A frame that has a 'date_calc' column is treated as estimated: 'observed' is set
    to 0 and 'date_calc' is dropped from the returned frame. Any other frame gets
    'observed' = 1 and is returned as-is.

    NOTE(review): `aggregation_methods` in this notebook has no 'date_calc' entry, so
    hourly-aggregated frames probably never reach the estimated branch — confirm.
    """
    is_estimated = 'date_calc' in df.columns
    df['observed'] = 0 if is_estimated else 1
    return df.drop(columns=['date_calc']) if is_estimated else df
    
def remove_constant_regions(dataframe, column_name="pv_measurement", threshold=72):
    """
    Drop long runs of identical consecutive values in `column_name`.

    A run is removed in full (its first row included) when more than `threshold`
    of its rows repeat the value of the row before them. If the column is missing,
    a warning is printed and the frame is returned untouched.
    """
    if column_name not in dataframe.columns:
        print(f"Warning: '{column_name}' column not found in the dataframe. No rows removed.")
        return dataframe

    values = dataframe[column_name]
    repeats_previous = values.eq(values.shift())

    # A new run starts at every row that differs from its predecessor
    run_id = (~repeats_previous).cumsum()

    # Number of repeated rows per run, then the runs exceeding the threshold
    repeats_per_run = run_id[repeats_previous].value_counts()
    long_runs = repeats_per_run[repeats_per_run > threshold].index

    rows_in_long_runs = dataframe[run_id.isin(long_runs)].index
    return dataframe.drop(rows_in_long_runs)

def add_lagged_features(df, features_with__lags, fill_value=None):
    """
    Add shifted copies of selected columns to `df` in place and return it.

    Parameters:
    - df: pd.DataFrame, expected to be sorted by time
    - features_with__lags: dict mapping a column name to either a single lag (int) or
      a list/tuple of lags. A positive lag takes the value from earlier rows, a
      negative lag from later rows.
    - fill_value: value for the positions left empty by the shift; with None they
      stay NaN.

    Each new column is named '<feature>_lag_<lag>'.
    """
    for feature, lags in features_with__lags.items():
        lag_values = lags if isinstance(lags, (list, tuple)) else [lags]
        for specific_lag in lag_values:
            shifted = df[feature].shift(specific_lag)
            if fill_value is not None:
                # fillna(None) raises, so only fill when a value was actually given
                shifted = shifted.fillna(fill_value)
            df[f"{feature}_lag_{specific_lag}"] = shifted
    return df

def add_lagged_features_to_df(df):
    """
    Add the previous-row (lag 1) and next-row (lag -1) values of the radiation
    features, with the edge positions filled with 0.
    """
    # A dict literal cannot repeat a key (the later entry silently replaces the
    # earlier one), so both lags of a feature are listed under a single key.
    features_with_lags = {
        'total_radiation:W': [1, -1],
        'rad_diff:W': [1, -1],
        'total_radiation_1h:J': [1, -1],
    }

    # Add lagged features for specific lags
    return add_lagged_features(df, features_with_lags, fill_value=0)

def handle_nan(df):
    """
    Drop rows without a target and zero-fill everything else.

    Rows whose 'pv_measurement' is NaN are removed when that column exists (frames
    without it, such as test data, skip this step); all remaining NaNs become 0.
    """
    if 'pv_measurement' in df.columns:
        has_target = df['pv_measurement'].notna()
        df = df[has_target]

    return df.fillna(0)

# How each weather column is collapsed when the data is aggregated to hourly rows.
# Every key appears exactly once: a repeated key in a dict literal silently replaces
# the earlier entry, which previously hid a 'mean' for 'cloud_base_agl:m' behind the
# 'max' that was actually applied.
aggregation_methods = {
    'date_forecast': 'first',

    # Radiation
    'diffuse_rad:W': 'sum',
    'direct_rad:W': 'last',
    'clear_sky_rad:W': 'sum',
    'diffuse_rad_1h:J': 'last',
    'direct_rad_1h:J': 'last',
    'clear_sky_energy_1h:J': 'last',

    'absolute_humidity_2m:gm3': 'mean',
    'air_density_2m:kgm3': 'mean',
    'ceiling_height_agl:m': 'max',
    'cloud_base_agl:m': 'max',  # the effective method (a later duplicate used to override 'mean')
    'dew_or_rime:idx': 'min',
    'dew_point_2m:K': 'mean',
    'effective_cloud_cover:p': 'sum',
    'elevation:m': 'first',

    # Fresh snow
    'fresh_snow_12h:cm': 'max',
    'fresh_snow_1h:cm': 'sum',
    'fresh_snow_24h:cm': 'max',
    'fresh_snow_3h:cm': 'max',
    'fresh_snow_6h:cm': 'max',

    'is_day:idx': 'max',
    'is_in_shadow:idx': 'max',
    'msl_pressure:hPa': 'mean',
    'precip_5min:mm': 'sum',
    'precip_type_5min:idx': 'sum',
    'pressure_100m:hPa': 'mean',
    'pressure_50m:hPa': 'mean',
    'prob_rime:p': 'max',
    'rain_water:kgm2': 'sum',
    'relative_humidity_1000hPa:p': 'mean',
    'sfc_pressure:hPa': 'mean',

    # Snow state
    'snow_density:kgm3': 'mean',
    'snow_depth:cm': 'max',
    'snow_drift:idx': 'max',
    'snow_melt_10min:mm': 'sum',
    'snow_water:kgm2': 'sum',

    # Sun position
    'sun_azimuth:d': 'first',
    'sun_elevation:d': 'sum',

    'super_cooled_liquid_water:kgm2': 'sum',
    't_1000hPa:K': 'mean',
    'total_cloud_cover:p': 'mean',
    'visibility:m': 'mean',

    # Wind
    'wind_speed_10m:ms': 'mean',
    'wind_speed_u_10m:ms': 'mean',
    'wind_speed_v_10m:ms': 'mean',
    'wind_speed_w_1000hPa:ms': 'mean',
}

# Read in the data. The targets' 'time' column is renamed to 'date_forecast' as part
# of loading, so it carries the same name as the timestamp of the weather frames.
_target_time_rename = {'time': 'date_forecast'}

# Location A
x_target_A = pd.read_parquet('./data/A/train_targets.parquet').rename(columns=_target_time_rename)
x_train_obs_A = pd.read_parquet('./data/A/X_train_observed.parquet')
x_train_est_A = pd.read_parquet('./data/A/X_train_estimated.parquet')
x_test_est_A = pd.read_parquet('./data/A/X_test_estimated.parquet')

# Location B
x_target_B = pd.read_parquet('./data/B/train_targets.parquet').rename(columns=_target_time_rename)
x_train_obs_B = pd.read_parquet('./data/B/X_train_observed.parquet')
x_train_est_B = pd.read_parquet('./data/B/X_train_estimated.parquet')
x_test_est_B = pd.read_parquet('./data/B/X_test_estimated.parquet')

# Location C
x_target_C = pd.read_parquet('./data/C/train_targets.parquet').rename(columns=_target_time_rename)
x_train_obs_C = pd.read_parquet('./data/C/X_train_observed.parquet')
x_train_est_C = pd.read_parquet('./data/C/X_train_estimated.parquet')
x_test_est_C = pd.read_parquet('./data/C/X_test_estimated.parquet')

# Fix missing data for the test sets, assuming NaN means 0 in these columns
_zero_filled_columns = [
    'effective_cloud_cover:p',
    'total_cloud_cover:p',
    'cloud_base_agl:m',
    'ceiling_height_agl:m',
    'snow_density:kgm3',
    'snow_drift:idx',
]

for _test_frame in (x_test_est_A, x_test_est_B, x_test_est_C):
    for _column in _zero_filled_columns:
        _test_frame[_column] = _test_frame[_column].fillna(0)

# Resample every weather frame to hourly rows
def _resample_hourly(frame):
    """Bin `frame` into 1-hour groups on 'date_forecast' and aggregate each column
    with the method listed for it in `aggregation_methods`."""
    hourly_bins = pd.Grouper(key='date_forecast', freq='1H')  # fresh Grouper per frame
    return frame.groupby(hourly_bins).aggregate(aggregation_methods)

x_train_obs_A_resampled = _resample_hourly(x_train_obs_A)
x_train_est_A_resampled = _resample_hourly(x_train_est_A)
x_test_est_A_resampled = _resample_hourly(x_test_est_A)

x_train_obs_B_resampled = _resample_hourly(x_train_obs_B)
x_train_est_B_resampled = _resample_hourly(x_train_est_B)
x_test_est_B_resampled = _resample_hourly(x_test_est_B)

x_train_obs_C_resampled = _resample_hourly(x_train_obs_C)
x_train_est_C_resampled = _resample_hourly(x_train_est_C)
x_test_est_C_resampled = _resample_hourly(x_test_est_C)

# Merge: for each location, split the targets at the first timestamp of the estimated
# weather data, then join the two halves to the hourly observed / estimated frames
# (weather frame index against the targets' 'date_forecast' column).
# NOTE(review): split_index is an index *label* that is then used positionally with
# iloc; this presumably relies on the targets having a default 0..n-1 index — confirm.

# Location A
split_value = x_train_est_A['date_forecast'].iloc[0]
split_index = x_target_A[x_target_A['date_forecast'] == split_value].index[0]

x_target_obs_A = x_target_A.iloc[:split_index]
x_target_est_A = x_target_A.iloc[split_index:]

obs_A = x_train_obs_A_resampled.merge(x_target_obs_A, left_index=True, right_on='date_forecast')
est_A = x_train_est_A_resampled.merge(x_target_est_A, left_index=True, right_on='date_forecast')

# Location B (split_value / split_index are reused as scratch names)
split_value = x_train_est_B['date_forecast'].iloc[0]
split_index = x_target_B[x_target_B['date_forecast'] == split_value].index[0]

x_target_obs_B = x_target_B.iloc[:split_index]
x_target_est_B = x_target_B.iloc[split_index:]

obs_B = x_train_obs_B_resampled.merge(x_target_obs_B, left_index=True, right_on='date_forecast')
est_B = x_train_est_B_resampled.merge(x_target_est_B, left_index=True, right_on='date_forecast')

# Location C
split_value = x_train_est_C['date_forecast'].iloc[0]
split_index = x_target_C[x_target_C['date_forecast'] == split_value].index[0]

x_target_obs_C = x_target_C.iloc[:split_index]
x_target_est_C = x_target_C.iloc[split_index:]

obs_C = x_train_obs_C_resampled.merge(x_target_obs_C, left_index=True, right_on='date_forecast')
est_C = x_train_est_C_resampled.merge(x_target_est_C, left_index=True, right_on='date_forecast')

# Test frames: start from the hourly-resampled test features (date_forecast stays in
# as a column) and discard every row that contains a NaN.
test_A = x_test_est_A_resampled.dropna()
test_B = x_test_est_B_resampled.dropna()
test_C = x_test_est_C_resampled.dropna()

def preprocessing(df):
    """Run the full feature-engineering pipeline on `df` and return the result.

    The steps are applied in a fixed order; every step is handed a fresh copy
    of the frame returned by the step before it.
    """
    pipeline = (
        add_experimental_features,
        add_date_features,
        add_binned_features,
        add_rate_of_change_features_to_df,
        add_est_obs_feature,
        remove_constant_regions,
        add_lagged_features_to_df,
        handle_nan,
    )
    for step in pipeline:
        df = step(df.copy())

    return df

# Run the preprocessing pipeline on the observed, estimated and test frames of
# each dataset (A, B, C); every call works on a copy of its input.
obs_A, est_A, test_A = (preprocessing(frame.copy()) for frame in (obs_A, est_A, test_A))

obs_B, est_B, test_B = (preprocessing(frame.copy()) for frame in (obs_B, est_B, test_B))

obs_C, est_C, test_C = (preprocessing(frame.copy()) for frame in (obs_C, est_C, test_C))

# Random seeds used for reproducibility
# 32 weights: 0.3, 0.3, 0.4
# 24 weights: 0.3, 0.3, 0.4
# 33 (without winter months 1 and 12) weights: 0.2, 0.4, 0.4
# 11 (without winter months 1, 2 and 11, 12) weights: 0.25, 0.35, 0.4
# 5 weights: 0.4, 0.3, 0.3
# 6 weights 0.3, 0.4, 0.3
# 7 weights 0.3, 0.4, 0.3
# 8 weights 0.3, 0.3, 0.4
# 9 weights 0.3, 0.3, 0.4
# 10 weights 0.3, 0.3, 0.4

# Best score is the mean prediction of all the seeds mentioned above. The first weight is xgboost, the second is catboost, and the third is autogluon.

# Seed NumPy's global random generator for this run
np.random.seed(10)

# Concatenate the observed and estimated rows of each dataset
A = pd.concat([obs_A, est_A])
B = pd.concat([obs_B, est_B])
C = pd.concat([obs_C, est_C])

# Strip the characters CatBoost cannot parse in column names:
# [ ] , { } ( ) " ' : and backslash
catboost_unsafe_chars = str.maketrans('', '', '[],{}()"\':\\')
for frame in (A, B, C, test_A, test_B, test_C):
    frame.columns = [name.translate(catboost_unsafe_chars) for name in frame.columns]

# Train/validation split.
# Validation data is taken from summer months because the test set is from
# summer months. We also experimented with excluding winter months from the
# training data: TRAIN_MONTHS is where that is controlled, and it currently
# keeps every month.
TRAIN_MONTHS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
VAL_MONTHS = [5, 6]        # validation days are drawn from May and June only
VAL_DAY_FRACTION = 0.4     # share of the May/June days held out for validation


def split_train_val_by_day(df, train_months=TRAIN_MONTHS, val_months=VAL_MONTHS,
                           val_fraction=VAL_DAY_FRACTION):
    """Hold out whole calendar days as a validation set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime 'date_forecast' column.
    train_months : list of int
        Months (1-12) to keep at all; rows from other months are dropped.
    val_months : list of int
        Months whose days are candidates for the validation set.
    val_fraction : float
        Fraction of the candidate days to sample (rounded down).

    Returns
    -------
    tuple
        (filtered_df, train_df, val_df). `val_df` holds every row that falls on
        a sampled day, `train_df` all remaining rows of `filtered_df`.

    Notes
    -----
    Days are sampled without replacement from NumPy's global random generator,
    so the result depends on the seed set beforehand and on the call order.
    """
    df = df[df['date_forecast'].dt.month.isin(train_months)]

    candidate_rows = df[df['date_forecast'].dt.month.isin(val_months)]
    candidate_days = candidate_rows['date_forecast'].dt.date.unique()

    n_val_days = int(len(candidate_days) * val_fraction)
    val_days = np.random.choice(candidate_days, size=n_val_days, replace=False)

    val_df = df[df['date_forecast'].dt.date.isin(val_days)]
    train_df = df[~df['date_forecast'].dt.date.isin(val_days)]
    return df, train_df, val_df


# Split each dataset (order A, B, C matters for the random stream) and separate
# the features from the target.
A, train_A, val_A = split_train_val_by_day(A)
X_train_A = train_A.drop(columns='pv_measurement')
y_train_A = train_A['pv_measurement']
X_val_A = val_A.drop(columns='pv_measurement')
y_val_A = val_A['pv_measurement']

B, train_B, val_B = split_train_val_by_day(B)
X_train_B = train_B.drop(columns='pv_measurement')
y_train_B = train_B['pv_measurement']
X_val_B = val_B.drop(columns='pv_measurement')
y_val_B = val_B['pv_measurement']

C, train_C, val_C = split_train_val_by_day(C)
X_train_C = train_C.drop(columns='pv_measurement')
y_train_C = train_C['pv_measurement']
X_val_C = val_C.drop(columns='pv_measurement')
y_val_C = val_C['pv_measurement']

# Remove the timestamp columns before modelling. The train/validation frames
# carry three of them; the test frames only have 'date_forecast'.
merged_date_columns = ['date_forecast', 'date_forecast_x', 'date_forecast_y']

train_A, train_B, train_C = (
    frame.drop(columns=merged_date_columns) for frame in (train_A, train_B, train_C)
)
val_A, val_B, val_C = (
    frame.drop(columns=merged_date_columns) for frame in (val_A, val_B, val_C)
)
X_train_A, X_train_B, X_train_C = (
    frame.drop(columns=merged_date_columns) for frame in (X_train_A, X_train_B, X_train_C)
)
X_val_A, X_val_B, X_val_C = (
    frame.drop(columns=merged_date_columns) for frame in (X_val_A, X_val_B, X_val_C)
)
test_A, test_B, test_C = (
    frame.drop(columns=['date_forecast']) for frame in (test_A, test_B, test_C)
)

# AutoGluon inputs: the train/validation frames (target column included)
train_auto_A, val_auto_A = TabularDataset(train_A), TabularDataset(val_A)

train_auto_B, val_auto_B = TabularDataset(train_B), TabularDataset(val_B)

train_auto_C, val_auto_C = TabularDataset(train_C), TabularDataset(val_C)

# Name of the target column handed to AutoGluon
auto_label = 'pv_measurement'

# XGBoost hyper-parameters.
# Settings that are the same for all three models live in one base dict; each
# model then adds its own regularisation / learning-rate values on top.
xgb_shared_params = {
    'colsample_bytree': 0.8,
    'max_depth': 15,
    'min_child_weight': 10,
    'n_estimators': 600,
    'subsample': 0.912,
    'random_state': 0,
    'booster': 'gbtree',
    'n_jobs': -1,
}

# Model A: weaker regularisation, higher learning rate, two parallel trees
params_xgb_A = {
    **xgb_shared_params,
    'gamma': 0.4,
    'learning_rate': 0.012,
    'reg_alpha': 0.8,
    'reg_lambda': 0.8,
    'num_parallel_tree': 2,
}

# Model B
params_xgb_B = {
    **xgb_shared_params,
    'gamma': 0.8,
    'learning_rate': 0.008,
    'reg_alpha': 1,
    'reg_lambda': 3,
}

# Model C uses the same values as B but keeps its own dict object
params_xgb_C = dict(params_xgb_B)

# One XGBoost regressor per dataset, each built from its own parameter dict
xgb_A, xgb_B, xgb_C = (
    xgb.XGBRegressor(**model_params)
    for model_params in (params_xgb_A, params_xgb_B, params_xgb_C)
)

# One CatBoost regressor per dataset, all configured identically:
# up to 5000 boosting iterations, MAE both as the optimised loss and as the
# metric reported on the validation set. Every other setting is left at the
# CatBoost default.
cat_A, cat_B, cat_C = (
    CatBoostRegressor(iterations=5000, loss_function='MAE', eval_metric='MAE')
    for _ in range(3)
)

# Feature subset for the XGBoost models — they performed best with fewer columns
xgb_columns = [
    'total_radiationW', 'snow_accumulation', 'super_cooled_liquid_waterkgm2',
    'average_wind_speed', 'sun_elevationd', 'sun_azimuthd', 'clear_sky_radW',
    'month', 't_1000hPaC', 'msl_pressurehPa_scaled', 'rain_waterkgm2',
    'cloud_base_aglm', 'effective_cloud_coverp', 'dew_or_rimeidx',
]
print(train_A.columns)

# Training features, training target and test features for each dataset
X_train_xgb_A, y_train_xgb_A, X_test_xgb_A = (
    train_A[xgb_columns], train_A['pv_measurement'], test_A[xgb_columns]
)

X_train_xgb_B, y_train_xgb_B, X_test_xgb_B = (
    train_B[xgb_columns], train_B['pv_measurement'], test_B[xgb_columns]
)

X_train_xgb_C, y_train_xgb_C, X_test_xgb_C = (
    train_C[xgb_columns], train_C['pv_measurement'], test_C[xgb_columns]
)

# Train the XGBoost models
# The evaluation metric is set on the estimator rather than passed to fit():
# the fit(eval_metric=...) keyword has been deprecated since XGBoost 1.6 and is
# rejected by newer releases. No eval_set is supplied, so the metric does not
# influence the fitted trees either way.
for xgb_model, xgb_features, xgb_target in (
    (xgb_A, X_train_xgb_A, y_train_xgb_A),
    (xgb_B, X_train_xgb_B, y_train_xgb_B),
    (xgb_C, X_train_xgb_C, y_train_xgb_C),
):
    xgb_model.set_params(eval_metric='mae')
    xgb_model.fit(X=xgb_features, y=xgb_target, verbose=False)

# Train the CatBoost models. Each one is evaluated on its dataset's validation
# split while training, and use_best_model=True is requested.
catboost_jobs = (
    (cat_A, X_train_A, y_train_A, X_val_A, y_val_A),
    (cat_B, X_train_B, y_train_B, X_val_B, y_val_B),
    (cat_C, X_train_C, y_train_C, X_val_C, y_val_C),
)
for cat_model, cat_X, cat_y, cat_X_val, cat_y_val in catboost_jobs:
    cat_model.fit(
        cat_X, cat_y,
        eval_set=(cat_X_val, cat_y_val),
        use_best_model=True
    )

def _fit_autogluon(train_data, tuning_data):
    """Create an MAE regression TabularPredictor for `auto_label` and fit it.

    `train_data` is the training set and `tuning_data` the held-out validation
    set; the fitted predictor is returned.
    """
    predictor = TabularPredictor(
        label=auto_label,
        eval_metric='mean_absolute_error',
        problem_type='regression',
    )
    return predictor.fit(
        train_data,
        presets='medium_quality',
        tuning_data=tuning_data,
        use_bag_holdout=True,
        ag_args_ensemble={'fold_fitting_strategy': 'sequential_local'},
    )


auto_A = _fit_autogluon(train_auto_A, val_auto_A)

auto_B = _fit_autogluon(train_auto_B, val_auto_B)

auto_C = _fit_autogluon(train_auto_C, val_auto_C)

# Ensemble weights for XGBoost, CatBoost and AutoGluon (a weighted average, so
# they are expected to sum to 1)
xgb_weight = 0.3
cat_weight = 0.3
auto_weight = 0.4
assert abs(xgb_weight + cat_weight + auto_weight - 1.0) < 1e-9, "ensemble weights must sum to 1"

# XGBoost predictions: each model predicts on the test features of its OWN
# dataset (models A and B were previously fed the C test features).
pred_xgb_A = xgb_A.predict(X_test_xgb_A)
pred_xgb_B = xgb_B.predict(X_test_xgb_B)
pred_xgb_C = xgb_C.predict(X_test_xgb_C)

# AutoGluon predictions
pred_auto_A = auto_A.predict(test_A)
pred_auto_B = auto_B.predict(test_B)
pred_auto_C = auto_C.predict(test_C)

# CatBoost predictions
pred_cat_A = cat_A.predict(test_A)
pred_cat_B = cat_B.predict(test_B)
pred_cat_C = cat_C.predict(test_C)

# Ensemble that seemed the best after some experimentation
pred_A = (pred_xgb_A*xgb_weight + pred_cat_A*cat_weight + pred_auto_A*auto_weight)
pred_B = (pred_xgb_B*xgb_weight + pred_cat_B*cat_weight + pred_auto_B*auto_weight)
pred_C = (pred_xgb_C*xgb_weight + pred_cat_C*cat_weight + pred_auto_C*auto_weight)

# The target is never negative, so negative predictions are raised to zero
pred_A = np.clip(pred_A, 0, None)
pred_B = np.clip(pred_B, 0, None)
pred_C = np.clip(pred_C, 0, None)

# Concatenate predictions in the order A, B, C
predictions = np.concatenate([pred_A, pred_B, pred_C])

# Save predictions of this seed run
predictions_10 = predictions

# Create an id array
ids = np.arange(0, len(predictions_10))

# Create a DataFrame
df = pd.DataFrame({
    'id': ids,
    'prediction': predictions_10
})

# Save to CSV
df.to_csv('predictions_10.csv', index=False)

# **Create submission**

In [None]:
# Load the 'prediction' column of each of the ten saved runs
# (predictions_1.csv ... predictions_10.csv), in that order.
run_predictions = [
    pd.read_csv(f'predictions_{run_number}.csv')['prediction'].values
    for run_number in range(1, 11)
]

# Element-wise mean over the ten runs (summed left to right, then divided by 10)
predictions = sum(run_predictions[1:], run_predictions[0]) / 10

output_file = 'submission.csv'

# Build the submission table: a running id next to the averaged prediction
df = pd.DataFrame({
    'id': np.arange(0, len(predictions)),
    'prediction': predictions
})

# Write the submission file
df.to_csv(output_file, index=False)
print(f"Submission saved to {output_file}")