In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import matplotlib
matplotlib.use('TkAgg')  # should fix matplotlib issues
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
import os
# Set WANDB_NOTEBOOK_NAME to this notebook's file name.
os.environ.update({"WANDB_NOTEBOOK_NAME": "Catboost_Multi.ipynb"})

# Show up to 100 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 100)

In [2]:
# Read the four parquet files of each site; the paths are listed in the same
# order as the names they are unpacked into.
(
    A_X_train_observed,
    A_X_train_estimated,
    A_X_test_estimated,
    A_train_targets,
) = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        './data/raw/A/X_train_observed.parquet',
        './data/raw/A/X_train_estimated.parquet',
        './data/raw/A/X_test_estimated.parquet',
        './data/raw/A/train_targets.parquet',
    )
)
(
    B_X_train_observed,
    B_X_train_estimated,
    B_X_test_estimated,
    B_train_targets,
) = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        './data/raw/B/X_train_observed.parquet',
        './data/raw/B/X_train_estimated.parquet',
        './data/raw/B/X_test_estimated.parquet',
        './data/raw/B/train_targets.parquet',
    )
)
(
    C_X_train_observed,
    C_X_train_estimated,
    C_X_test_estimated,
    C_train_targets,
) = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        './data/raw/C/X_train_observed.parquet',
        './data/raw/C/X_train_estimated.parquet',
        './data/raw/C/X_test_estimated.parquet',
        './data/raw/C/train_targets.parquet',
    )
)

# Frame listing the rows that have to be predicted.
y_predictons = pd.read_csv('data/raw/test.csv')


In [3]:
# Parse the 'time' column of the frame read from test.csv into datetimes.
# It is cast back to str further down, right before it is used as a merge key.
y_predictons['time'] = pd.to_datetime(y_predictons['time'])

In [4]:
# remove noisy features
def remove_night_light_discrepancies(df: pd.DataFrame, min_run_length: int = 6):
    """Drop rows that belong to a long run of one repeated, non-zero measurement.

    A run is a stretch of consecutive rows whose ``pv_measurement`` does not
    change. Every row of a run with at least ``min_run_length`` identical
    non-zero values is removed. Zero readings are never removed, however
    long the run is.

    Args:
        df: Frame with a ``pv_measurement`` column, in the row order in which
            runs should be detected.
        min_run_length: Smallest run length that is removed (default 6).

    Returns:
        A new DataFrame without the removed rows. ``df`` itself is not
        modified; in particular no helper column is left behind in it.
    """
    measurements = df["pv_measurement"]

    # A new run starts whenever the value differs from the previous row.
    # Zeros always start a new run, so each zero ends up in a run of its own.
    run_id = (
        (measurements != measurements.shift())
        | (measurements == 0)
    ).cumsum()

    # Size of the run every row belongs to ("count" ignores NaN readings).
    run_length = measurements.groupby(run_id).transform("count")

    in_long_run = (run_length >= min_run_length) & (measurements != 0)

    # .copy() hands the caller an independent frame that can be assigned to
    # without a SettingWithCopyWarning.
    return df[~in_long_run].copy()

def set_sun_elevation_to_zero(df: pd.DataFrame):
    """Replace every negative ``sun_elevation:d`` value with 0.

    The frame is modified in place and the same object is returned, so the
    call can be used either as a statement or as an expression.
    """
    negative_elevation = df['sun_elevation:d'] < 0
    df.loc[negative_elevation, 'sun_elevation:d'] = 0
    return df

def add_sun_elevation_is_day_feature(df: pd.DataFrame):
    """Fold three columns into a single ``sun_elevation_is_day`` feature.

    The feature is ``sun_elevation:d * (is_day:idx - is_in_shadow:idx)``.
    The three source columns are removed from the result.

    Args:
        df: Frame containing ``sun_elevation:d``, ``is_day:idx`` and
            ``is_in_shadow:idx``.

    Returns:
        A new DataFrame with the source columns dropped and
        ``sun_elevation_is_day`` appended. ``df`` itself is not modified.
    """
    # Computed up front so the caller's frame never receives the new column.
    sun_elevation_is_day = df['sun_elevation:d'] * (df['is_day:idx'] - df['is_in_shadow:idx'])
    return (
        df.drop(columns=['is_in_shadow:idx', 'is_day:idx', 'sun_elevation:d'])
        .assign(sun_elevation_is_day=sun_elevation_is_day)
    )

def resample_and_pivot(df, time_col='date_forecast'):
    """Reshape sub-hourly rows into one row per hour with one column per (feature, minute).

    Each output column is named ``<feature>_<minute>`` (for example
    ``diffuse_rad:W_15``) and holds the first value found for that feature in
    that minute slot of the hour.

    Args:
        df: Frame with a timestamp column ``time_col`` and the feature columns
            to spread out.
        time_col: Name of the timestamp column (parsed with ``pd.to_datetime``).

    Returns:
        A new DataFrame whose first column is ``time_col`` floored to the hour,
        followed by the flattened feature columns. ``df`` itself is not
        modified: the timestamp column is not converted and no ``minutes``
        column is added to it.
    """
    timestamps = pd.to_datetime(df[time_col])

    # Hour each row belongs to; the Series keeps the name `time_col`, which
    # becomes the index name of the pivot and hence the column name after
    # reset_index(). Lower-case 'h' is used because the upper-case 'H' alias
    # is deprecated in recent pandas releases.
    hour_start = timestamps.dt.floor('h')

    # Work on a new frame: drop the timestamp and add the minute of the hour.
    long_frame = df.drop(columns=[time_col]).assign(minutes=timestamps.dt.minute)
    value_columns = [col for col in long_frame.columns if col != 'minutes']

    # NOTE: pivot_table's default dropna=True omits columns that are entirely NaN.
    df_pivoted = long_frame.pivot_table(
        index=[hour_start],
        columns='minutes',
        values=value_columns,
        aggfunc='first',
    )

    # Flatten the (feature, minute) MultiIndex into "<feature>_<minute>".
    df_pivoted.columns = ['_'.join(map(str, col)).strip() for col in df_pivoted.columns.values]

    # Turn the hourly timestamps back into a regular column.
    return df_pivoted.reset_index()

def resample_and_max(pivoted_df, features):
    """Collapse the per-minute columns of selected features into an hourly maximum.

    For every name in ``features`` the columns whose name contains that text
    are replaced by one column ``<feature>_max`` holding their row-wise
    maximum. Matching is by substring, so pass the full feature name.

    The ``<feature>_max`` column itself is never treated as an input, and a
    feature without any matching column is skipped. This makes it safe to call
    the function again on an already processed frame: the existing maximum is
    neither overwritten nor dropped.

    Args:
        pivoted_df: Frame with columns such as ``<feature>_<minute>``.
        features: Iterable of feature names to aggregate.

    Returns:
        ``pivoted_df`` itself, which is modified in place.
    """
    for feature in features:
        max_feature_col_name = feature + '_max'

        # Input columns for this feature, excluding the output column so that
        # a repeated call cannot consume and then drop its own result.
        feature_cols = [
            col for col in pivoted_df.columns
            if isinstance(col, str) and feature in col and col != max_feature_col_name
        ]
        if not feature_cols:
            # Nothing to aggregate; avoid creating an all-NaN column.
            continue

        # Maximum across the selected columns for each row.
        pivoted_df[max_feature_col_name] = pivoted_df[feature_cols].max(axis=1)

        # The per-minute columns are superseded by the maximum.
        pivoted_df.drop(columns=feature_cols, inplace=True)

    return pivoted_df

In [5]:
# Columns retained for site A; every other column of the raw frames is dropped.
columns_to_keep_A = [
    # Directly related to Solar Energy
    "clear_sky_energy_1h:J",
    "clear_sky_rad:W",
    "diffuse_rad:W",
    "direct_rad:W",  # reduced to its hourly maximum below
    "is_day:idx",
    "sun_azimuth:d",
    "sun_elevation:d",

    # Weather Conditions affecting Solar Panels
    "absolute_humidity_2m:gm3",
    "cloud_base_agl:m",
    "effective_cloud_cover:p",
    "is_in_shadow:idx",

    # Temperature and Snow
    "dew_point_2m:K",
    "t_1000hPa:K",

    # Wind
    "wind_speed_10m:ms",

    # Miscellaneous
    "sfc_pressure:hPa",
    "date_forecast",
    "visibility:m",
]
# Currently excluded candidates: total_cloud_cover:p, fresh_snow_{1h,3h,6h,12h,24h}:cm, snow_depth:cm

columns_to_drop_A = [
    column
    for column in A_X_train_observed.columns
    if column not in columns_to_keep_A
]
print(columns_to_drop_A)

# Feature frames for solar panel A, each run through the same steps:
# drop unused columns -> clamp negative sun elevation to 0 -> build the
# sun_elevation_is_day feature -> pivot to one row per hour -> hourly max of
# direct_rad:W. The estimated frames additionally lose 'date_calc'.
A_X_train_observed = (
    A_X_train_observed
    .drop(columns=columns_to_drop_A)
    .pipe(set_sun_elevation_to_zero)
    .pipe(add_sun_elevation_is_day_feature)
    .pipe(resample_and_pivot)
    .pipe(resample_and_max, features=['direct_rad:W'])
)
A_X_train_estimated = (
    A_X_train_estimated
    .drop(columns=columns_to_drop_A + ['date_calc'])
    .pipe(set_sun_elevation_to_zero)
    .pipe(add_sun_elevation_is_day_feature)
    .pipe(resample_and_pivot)
    .pipe(resample_and_max, features=['direct_rad:W'])
)
A_X_test_estimated = (
    A_X_test_estimated
    .drop(columns=columns_to_drop_A + ['date_calc'])
    .pipe(set_sun_elevation_to_zero)
    .pipe(add_sun_elevation_is_day_feature)
    .pipe(resample_and_pivot)
    .pipe(resample_and_max, features=['direct_rad:W'])
)

# Targets for solar panel A: remove long runs of a repeated non-zero value.
A_train_targets = remove_night_light_discrepancies(A_train_targets)

['air_density_2m:kgm3', 'ceiling_height_agl:m', 'dew_or_rime:idx', 'diffuse_rad_1h:J', 'direct_rad_1h:J', 'elevation:m', 'fresh_snow_12h:cm', 'fresh_snow_1h:cm', 'fresh_snow_24h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'msl_pressure:hPa', 'precip_5min:mm', 'precip_type_5min:idx', 'pressure_100m:hPa', 'pressure_50m:hPa', 'prob_rime:p', 'rain_water:kgm2', 'relative_humidity_1000hPa:p', 'snow_density:kgm3', 'snow_depth:cm', 'snow_drift:idx', 'snow_melt_10min:mm', 'snow_water:kgm2', 'super_cooled_liquid_water:kgm2', 'total_cloud_cover:p', 'wind_speed_u_10m:ms', 'wind_speed_v_10m:ms', 'wind_speed_w_1000hPa:ms']


In [6]:
# Columns retained for site B; every other column of the raw frames is dropped.
columns_to_keep_B = [
    # Directly related to Solar Energy
    "clear_sky_energy_1h:J",
    "clear_sky_rad:W",
    "diffuse_rad:W",
    "direct_rad:W",  # reduced to its hourly maximum below
    "is_day:idx",
    "sun_azimuth:d",
    "sun_elevation:d",

    # Weather Conditions affecting Solar Panels
    "absolute_humidity_2m:gm3",
    "cloud_base_agl:m",
    "effective_cloud_cover:p",
    "is_in_shadow:idx",

    # Temperature and Snow
    "dew_point_2m:K",
    "t_1000hPa:K",

    # Wind
    "wind_speed_10m:ms",

    # Miscellaneous
    "sfc_pressure:hPa",
    "date_forecast",
    "visibility:m",
]
# Currently excluded candidates: total_cloud_cover:p, fresh_snow_{1h,3h,6h,12h,24h}:cm, snow_depth:cm

columns_to_drop_B = [
    column
    for column in B_X_train_observed.columns
    if column not in columns_to_keep_B
]
print(columns_to_drop_B)

# Feature frames for solar panel B, each run through the same steps:
# drop unused columns -> clamp negative sun elevation to 0 -> build the
# sun_elevation_is_day feature -> pivot to one row per hour -> hourly max of
# direct_rad:W. The estimated frames additionally lose 'date_calc'.
B_X_train_observed = (
    B_X_train_observed
    .drop(columns=columns_to_drop_B)
    .pipe(set_sun_elevation_to_zero)
    .pipe(add_sun_elevation_is_day_feature)
    .pipe(resample_and_pivot)
    .pipe(resample_and_max, features=['direct_rad:W'])
)
B_X_train_estimated = (
    B_X_train_estimated
    .drop(columns=columns_to_drop_B + ['date_calc'])
    .pipe(set_sun_elevation_to_zero)
    .pipe(add_sun_elevation_is_day_feature)
    .pipe(resample_and_pivot)
    .pipe(resample_and_max, features=['direct_rad:W'])
)
B_X_test_estimated = (
    B_X_test_estimated
    .drop(columns=columns_to_drop_B + ['date_calc'])
    .pipe(set_sun_elevation_to_zero)
    .pipe(add_sun_elevation_is_day_feature)
    .pipe(resample_and_pivot)
    .pipe(resample_and_max, features=['direct_rad:W'])
)

# Targets for solar panel B: remove long runs of a repeated non-zero value.
B_train_targets = remove_night_light_discrepancies(B_train_targets)

['air_density_2m:kgm3', 'ceiling_height_agl:m', 'dew_or_rime:idx', 'diffuse_rad_1h:J', 'direct_rad_1h:J', 'elevation:m', 'fresh_snow_12h:cm', 'fresh_snow_1h:cm', 'fresh_snow_24h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'msl_pressure:hPa', 'precip_5min:mm', 'precip_type_5min:idx', 'pressure_100m:hPa', 'pressure_50m:hPa', 'prob_rime:p', 'rain_water:kgm2', 'relative_humidity_1000hPa:p', 'snow_density:kgm3', 'snow_depth:cm', 'snow_drift:idx', 'snow_melt_10min:mm', 'snow_water:kgm2', 'super_cooled_liquid_water:kgm2', 'total_cloud_cover:p', 'wind_speed_u_10m:ms', 'wind_speed_v_10m:ms', 'wind_speed_w_1000hPa:ms']


In [7]:
# Columns retained for site C; every other column of the raw frames is dropped.
columns_to_keep_C = [
    # Directly related to Solar Energy
    "clear_sky_energy_1h:J",
    "clear_sky_rad:W",
    "diffuse_rad:W",
    "direct_rad:W",  # reduced to its hourly maximum below
    "is_day:idx",
    "sun_azimuth:d",
    "sun_elevation:d",

    # Weather Conditions affecting Solar Panels
    "absolute_humidity_2m:gm3",
    "cloud_base_agl:m",
    "effective_cloud_cover:p",
    "is_in_shadow:idx",

    # Temperature and Snow
    "dew_point_2m:K",
    "t_1000hPa:K",

    # Wind
    "wind_speed_10m:ms",

    # Miscellaneous
    "sfc_pressure:hPa",
    "date_forecast",
    "visibility:m",
]
# Currently excluded candidates: total_cloud_cover:p, fresh_snow_{1h,3h,6h,12h,24h}:cm, snow_depth:cm

columns_to_drop_C = [
    column
    for column in C_X_train_observed.columns
    if column not in columns_to_keep_C
]
print(columns_to_drop_C)

# Feature frames for solar panel C, each run through the same steps:
# drop unused columns -> clamp negative sun elevation to 0 -> build the
# sun_elevation_is_day feature -> pivot to one row per hour -> hourly max of
# direct_rad:W. The estimated frames additionally lose 'date_calc'.
C_X_train_observed = (
    C_X_train_observed
    .drop(columns=columns_to_drop_C)
    .pipe(set_sun_elevation_to_zero)
    .pipe(add_sun_elevation_is_day_feature)
    .pipe(resample_and_pivot)
    .pipe(resample_and_max, features=['direct_rad:W'])
)
C_X_train_estimated = (
    C_X_train_estimated
    .drop(columns=columns_to_drop_C + ['date_calc'])
    .pipe(set_sun_elevation_to_zero)
    .pipe(add_sun_elevation_is_day_feature)
    .pipe(resample_and_pivot)
    .pipe(resample_and_max, features=['direct_rad:W'])
)
C_X_test_estimated = (
    C_X_test_estimated
    .drop(columns=columns_to_drop_C + ['date_calc'])
    .pipe(set_sun_elevation_to_zero)
    .pipe(add_sun_elevation_is_day_feature)
    .pipe(resample_and_pivot)
    .pipe(resample_and_max, features=['direct_rad:W'])
)

# Targets for solar panel C: remove long runs of a repeated non-zero value.
C_train_targets = remove_night_light_discrepancies(C_train_targets)

['air_density_2m:kgm3', 'ceiling_height_agl:m', 'dew_or_rime:idx', 'diffuse_rad_1h:J', 'direct_rad_1h:J', 'elevation:m', 'fresh_snow_12h:cm', 'fresh_snow_1h:cm', 'fresh_snow_24h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'msl_pressure:hPa', 'precip_5min:mm', 'precip_type_5min:idx', 'pressure_100m:hPa', 'pressure_50m:hPa', 'prob_rime:p', 'rain_water:kgm2', 'relative_humidity_1000hPa:p', 'snow_density:kgm3', 'snow_depth:cm', 'snow_drift:idx', 'snow_melt_10min:mm', 'snow_water:kgm2', 'super_cooled_liquid_water:kgm2', 'total_cloud_cover:p', 'wind_speed_u_10m:ms', 'wind_speed_v_10m:ms', 'wind_speed_w_1000hPa:ms']


In [8]:
# Tag every feature frame and target frame with the letter of its site in a
# 'solar_pannel_location' column.
for _site_code, _site_frames in (
    ('A', (A_X_train_observed, A_X_train_estimated, A_X_test_estimated, A_train_targets)),
    ('B', (B_X_train_observed, B_X_train_estimated, B_X_test_estimated, B_train_targets)),
    ('C', (C_X_train_observed, C_X_train_estimated, C_X_test_estimated, C_train_targets)),
):
    for _site_frame in _site_frames:
        _site_frame['solar_pannel_location'] = _site_code


In [9]:
# Record where each training row comes from: False for the observed frames,
# True for the estimated ones.
# NOTE(review): the *_X_test_estimated frames get no 'is_estimated' column
# here - confirm that whatever consumes the test frames does not expect it.
for _observed_frame in (A_X_train_observed, B_X_train_observed, C_X_train_observed):
    _observed_frame['is_estimated'] = False
for _estimated_frame in (A_X_train_estimated, B_X_train_estimated, C_X_train_estimated):
    _estimated_frame['is_estimated'] = True

In [10]:
# Per site, stack the observed rows on top of the estimated rows and renumber
# the index from 0.
A_X_TRAIN = pd.concat([A_X_train_observed, A_X_train_estimated], ignore_index=True)
B_X_TRAIN = pd.concat([B_X_train_observed, B_X_train_estimated], ignore_index=True)
C_X_TRAIN = pd.concat([C_X_train_observed, C_X_train_estimated], ignore_index=True)


In [11]:
# Add "distance from apex" features to every train and test frame.

# Apex time of day (11:00:00); only its hour is used below.
apex_time = pd.to_datetime('11:00:00').time()

for _apex_frame in (
    A_X_TRAIN, B_X_TRAIN, C_X_TRAIN,
    A_X_test_estimated, B_X_test_estimated, C_X_test_estimated,
):
    # Make sure 'date_forecast' is a datetime column.
    _apex_frame['date_forecast'] = pd.to_datetime(_apex_frame['date_forecast'])

    # Days away from day 171 of the year (20 June in a non-leap year),
    # wrapping around the year end.
    _day_offset = (_apex_frame['date_forecast'].dt.dayofyear - 171).abs()
    _apex_frame['days_from_apex'] = _day_offset.apply(lambda x: min(x, 365 - x))

    # Whole hours between the row's hour and the apex hour.
    _apex_frame['hours_from_apex'] = (abs(_apex_frame['date_forecast'].dt.hour - apex_time.hour)) % 24

In [12]:
# Cast the timestamp columns to str; they are used as merge keys below.
for _feature_frame in (
    A_X_TRAIN, B_X_TRAIN, C_X_TRAIN,
    A_X_test_estimated, B_X_test_estimated, C_X_test_estimated,
):
    _feature_frame['date_forecast'] = _feature_frame['date_forecast'].astype(str)

for _target_frame in (A_train_targets, B_train_targets, C_train_targets):
    _target_frame['time'] = _target_frame['time'].astype(str)

A_X_TRAIN.columns

Index(['date_forecast', 'absolute_humidity_2m:gm3_0',
       'absolute_humidity_2m:gm3_15', 'absolute_humidity_2m:gm3_30',
       'absolute_humidity_2m:gm3_45', 'clear_sky_energy_1h:J_0',
       'clear_sky_energy_1h:J_15', 'clear_sky_energy_1h:J_30',
       'clear_sky_energy_1h:J_45', 'clear_sky_rad:W_0', 'clear_sky_rad:W_15',
       'clear_sky_rad:W_30', 'clear_sky_rad:W_45', 'cloud_base_agl:m_0',
       'cloud_base_agl:m_15', 'cloud_base_agl:m_30', 'cloud_base_agl:m_45',
       'dew_point_2m:K_0', 'dew_point_2m:K_15', 'dew_point_2m:K_30',
       'dew_point_2m:K_45', 'diffuse_rad:W_0', 'diffuse_rad:W_15',
       'diffuse_rad:W_30', 'diffuse_rad:W_45', 'effective_cloud_cover:p_0',
       'effective_cloud_cover:p_15', 'effective_cloud_cover:p_30',
       'effective_cloud_cover:p_45', 'sfc_pressure:hPa_0',
       'sfc_pressure:hPa_15', 'sfc_pressure:hPa_30', 'sfc_pressure:hPa_45',
       'sun_azimuth:d_0', 'sun_azimuth:d_15', 'sun_azimuth:d_30',
       'sun_azimuth:d_45', 'sun_elevation_

In [13]:

# Join each site's features with its targets on (timestamp, site); the inner
# join keeps only the rows present on both sides.
_train_merge_options = dict(
    left_on=['date_forecast', 'solar_pannel_location'],
    right_on=['time', 'solar_pannel_location'],
    how='inner',
)
A_X_AND_Y_TRAIN = A_X_TRAIN.merge(A_train_targets, **_train_merge_options)
B_X_AND_Y_TRAIN = B_X_TRAIN.merge(B_train_targets, **_train_merge_options)
C_X_AND_Y_TRAIN = C_X_TRAIN.merge(C_train_targets, **_train_merge_options)
A_X_AND_Y_TRAIN.columns

Index(['date_forecast', 'absolute_humidity_2m:gm3_0',
       'absolute_humidity_2m:gm3_15', 'absolute_humidity_2m:gm3_30',
       'absolute_humidity_2m:gm3_45', 'clear_sky_energy_1h:J_0',
       'clear_sky_energy_1h:J_15', 'clear_sky_energy_1h:J_30',
       'clear_sky_energy_1h:J_45', 'clear_sky_rad:W_0', 'clear_sky_rad:W_15',
       'clear_sky_rad:W_30', 'clear_sky_rad:W_45', 'cloud_base_agl:m_0',
       'cloud_base_agl:m_15', 'cloud_base_agl:m_30', 'cloud_base_agl:m_45',
       'dew_point_2m:K_0', 'dew_point_2m:K_15', 'dew_point_2m:K_30',
       'dew_point_2m:K_45', 'diffuse_rad:W_0', 'diffuse_rad:W_15',
       'diffuse_rad:W_30', 'diffuse_rad:W_45', 'effective_cloud_cover:p_0',
       'effective_cloud_cover:p_15', 'effective_cloud_cover:p_30',
       'effective_cloud_cover:p_45', 'sfc_pressure:hPa_0',
       'sfc_pressure:hPa_15', 'sfc_pressure:hPa_30', 'sfc_pressure:hPa_45',
       'sun_azimuth:d_0', 'sun_azimuth:d_15', 'sun_azimuth:d_30',
       'sun_azimuth:d_45', 'sun_elevation_

In [14]:
# Build the per-site frames to predict on by joining the test features with
# the rows listed in y_predictons.

# Both sides of the join use string timestamps as key.
for _test_frame in (A_X_test_estimated, B_X_test_estimated, C_X_test_estimated):
    _test_frame['date_forecast'] = _test_frame['date_forecast'].astype(str)
y_predictons['time'] = y_predictons['time'].astype(str)

_test_merge_options = dict(
    left_on=['solar_pannel_location', 'date_forecast'],
    right_on=['location', 'time'],
    how='inner',
)
A_X_test_estimated = A_X_test_estimated.merge(y_predictons, **_test_merge_options)
B_X_test_estimated = B_X_test_estimated.merge(y_predictons, **_test_merge_options)
C_X_test_estimated = C_X_test_estimated.merge(y_predictons, **_test_merge_options)

# Independent copies indexed by the 'id' column that came from y_predictons.
A_X_TEST_ENCODED = A_X_test_estimated.set_index('id')
B_X_TEST_ENCODED = B_X_test_estimated.set_index('id')
C_X_TEST_ENCODED = C_X_test_estimated.set_index('id')

B_X_TEST_ENCODED.head(5)


Unnamed: 0_level_0,date_forecast,absolute_humidity_2m:gm3_0,absolute_humidity_2m:gm3_15,absolute_humidity_2m:gm3_30,absolute_humidity_2m:gm3_45,clear_sky_energy_1h:J_0,clear_sky_energy_1h:J_15,clear_sky_energy_1h:J_30,clear_sky_energy_1h:J_45,clear_sky_rad:W_0,clear_sky_rad:W_15,clear_sky_rad:W_30,clear_sky_rad:W_45,cloud_base_agl:m_0,cloud_base_agl:m_15,cloud_base_agl:m_30,cloud_base_agl:m_45,dew_point_2m:K_0,dew_point_2m:K_15,dew_point_2m:K_30,dew_point_2m:K_45,diffuse_rad:W_0,diffuse_rad:W_15,diffuse_rad:W_30,diffuse_rad:W_45,effective_cloud_cover:p_0,effective_cloud_cover:p_15,effective_cloud_cover:p_30,effective_cloud_cover:p_45,sfc_pressure:hPa_0,sfc_pressure:hPa_15,sfc_pressure:hPa_30,sfc_pressure:hPa_45,sun_azimuth:d_0,sun_azimuth:d_15,sun_azimuth:d_30,sun_azimuth:d_45,sun_elevation_is_day_0,sun_elevation_is_day_15,sun_elevation_is_day_30,sun_elevation_is_day_45,t_1000hPa:K_0,t_1000hPa:K_15,t_1000hPa:K_30,t_1000hPa:K_45,visibility:m_0,visibility:m_15,visibility:m_30,visibility:m_45,wind_speed_10m:ms_0,wind_speed_10m:ms_15,wind_speed_10m:ms_30,wind_speed_10m:ms_45,direct_rad:W_max,solar_pannel_location,days_from_apex,hours_from_apex,time,prediction,location
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1
720,2023-05-01 00:00:00,4.3,4.3,4.3,4.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1039.900024,1053.099976,1066.400024,1079.599976,271.700012,271.700012,271.600006,271.600006,0.0,0.0,0.0,0.0,80.699997,77.099998,73.199997,69.0,1013.200012,1013.0,1012.900024,1012.799988,10.521,14.204,17.870001,21.514,-0.0,-0.0,-0.0,-0.0,273.799988,273.799988,273.799988,273.799988,31329.5,31181.599609,31033.599609,30885.699219,4.0,4.0,3.9,3.9,0.0,B,50,11,2023-05-01 00:00:00,0,B
721,2023-05-01 01:00:00,4.3,4.3,4.2,4.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1092.900024,1080.099976,1067.300049,1054.5,271.600006,271.5,271.399994,271.299988,0.0,0.0,0.0,0.0,64.599998,74.099998,82.300003,89.0,1012.700012,1012.5,1012.400024,1012.299988,25.135,28.73,32.294998,35.831001,-0.0,-0.0,-0.0,-0.0,273.799988,273.799988,273.799988,273.799988,30737.800781,30519.0,30300.300781,30081.699219,3.9,3.8,3.8,3.8,0.0,B,50,10,2023-05-01 01:00:00,0,B
722,2023-05-01 02:00:00,4.2,4.2,4.1,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1041.699951,1147.300049,1252.900024,1358.5,271.200012,271.100006,271.0,270.899994,0.0,0.0,0.0,0.0,94.300003,90.699997,86.300003,81.099998,1012.200012,1012.0,1011.900024,1011.700012,39.335999,42.810001,46.255001,49.669998,-0.0,-0.0,-0.0,-0.0,273.799988,273.799988,273.899994,273.899994,29863.199219,30849.400391,31835.699219,32822.300781,3.7,3.7,3.6,3.6,0.0,B,50,9,2023-05-01 02:00:00,0,B
723,2023-05-01 03:00:00,4.1,4.0,4.0,4.0,0.0,1331.099976,10260.400391,28918.699219,0.0,4.8,14.9,27.0,1464.0,1274.0,1084.0,894.0,270.799988,270.700012,270.600006,270.5,0.0,3.1,12.9,21.5,75.099998,70.800003,66.599998,62.599998,1011.599976,1011.5,1011.299988,1011.200012,53.058998,56.422001,59.764,63.088001,-0.0,0.0,2.101,3.577,273.899994,273.899994,273.899994,273.899994,33809.0,34253.300781,34697.398438,35142.398438,3.6,3.5,3.5,3.4,5.8,B,50,8,2023-05-01 03:00:00,0,B
724,2023-05-01 04:00:00,3.9,3.9,3.9,3.9,59786.398438,106073.203125,164697.703125,236499.796875,43.0,63.299999,87.199997,114.099998,703.900024,847.400024,990.900024,1134.400024,270.299988,270.399994,270.399994,270.399994,31.4,42.099998,52.299999,63.799999,58.700001,63.799999,68.900002,74.0,1011.099976,1011.0,1010.900024,1010.799988,66.397003,69.695999,72.989998,76.281998,5.098,6.657,8.25,9.871,273.899994,273.899994,273.899994,274.0,35603.398438,35241.898438,34891.101562,34539.898438,3.4,3.3,3.3,3.3,42.5,B,50,7,2023-05-01 04:00:00,0,B


In [15]:

# Remove the merge key and the columns that came from y_predictons.
# NOTE(review): the string column 'time' is not in this list and therefore
# stays in the test frames - confirm that this is intended.
_test_columns_to_drop = ['date_forecast', 'location', 'prediction']
A_X_TEST_ENCODED = A_X_TEST_ENCODED.drop(columns=_test_columns_to_drop)
B_X_TEST_ENCODED = B_X_TEST_ENCODED.drop(columns=_test_columns_to_drop)
C_X_TEST_ENCODED = C_X_TEST_ENCODED.drop(columns=_test_columns_to_drop)


# Drop all rows in the training data that have no target value.
A_X_AND_Y_TRAIN = A_X_AND_Y_TRAIN.dropna(subset=['pv_measurement'])
B_X_AND_Y_TRAIN = B_X_AND_Y_TRAIN.dropna(subset=['pv_measurement'])
C_X_AND_Y_TRAIN = C_X_AND_Y_TRAIN.dropna(subset=['pv_measurement'])

# Load our

In [16]:
%pip install PyWavelets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from src.data.data_fetcher import get_raw_data
from src.features.preprocess_data import fetch_preprocessed_data, get_preprocessed_test_data

from src.visualization.plotting import plot_correlation_matrix


# Raw frames for locations a, b and c, unpacked in the order get_raw_data()
# returns them.
(
    train_a, train_b, train_c,
    X_train_estimated_a, X_train_estimated_b, X_train_estimated_c,
    X_train_observed_a, X_train_observed_b, X_train_observed_c,
    X_test_estimated_a, X_test_estimated_b, X_test_estimated_c,
) = get_raw_data()

# Preprocessed train/validation pieces, kept separate for the "obs" and "est"
# parts, plus the preprocessed test features.
(
    X_train_obs_combined, X_val_obs_combined,
    y_train_obs_combined, y_val_obs_combined,
    X_train_est_combined, X_val_est_combined,
    y_train_est_combined, y_val_est_combined,
) = fetch_preprocessed_data()
X_test_est_combined = get_preprocessed_test_data()


# Re-assemble the full feature set and the full target set by stacking the
# four pieces row-wise (obs train, obs val, est train, est val).
x_whole = pd.concat(
    [X_train_obs_combined, X_val_obs_combined, X_train_est_combined, X_val_est_combined]
)
y_whole = pd.concat(
    [y_train_obs_combined, y_val_obs_combined, y_train_est_combined, y_val_est_combined]
)
# NOTE(review): this stacks the "est" feature objects and the "est" target
# objects along the row axis, not side by side -- confirm this is the intended
# layout for estimated_data.
estimated_data = pd.concat(
    [X_train_est_combined, X_val_est_combined, y_train_est_combined, y_val_est_combined]
)
# Index resetting is currently disabled:
# x_whole.reset_index(drop=True, inplace=True)
# y_whole.reset_index(drop=True, inplace=True)

# Show every column when displaying wide frames.
pd.set_option('display.max_columns', None)


[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.
train_observed contains: time                                  0
pv_measurement                     6064
date_forecast                     10999
absolute_humidity_2m:gm3          10999
air_density_2m:kgm3               10999
clear_sky_energy_1h:J             10999
clear_sky_rad:W                   10999
cloud_base_agl:m                      0
dew_or_rime:idx                   10999
dew_point_2m:K                    10999
diffuse_rad:W                     10999
diffuse_rad_1h:J                  10999
direct_rad:W                      10999
direct_rad_1h:J                   10999
effective_cloud_cover:p           10999
fresh_snow_12h:cm                 10999
fresh_snow_1h:cm                  10999
fresh_snow_24h:cm                 10999
fresh_snow_3h:cm                  10999
fresh_snow_6h:cm                  10999
is_day:idx                        10999
is_in_shadow:idx                  10999
msl_pressure:hPa             

ValueError: NaTType does not support timetuple

# Differences in training data

In [None]:
# Summary statistics (count, mean, std, quantiles, min/max) for every numeric
# column of x_whole, shown as the cell's rich output.
x_whole.describe()

Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,is_day:idx,is_in_shadow:idx,precip_5min:mm,precip_type_5min:idx,pressure_50m:hPa,prob_rime:p,rain_water:kgm2,relative_humidity_1000hPa:p,sun_azimuth:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,location_a,location_b,location_c,sin_day_of_year,cos_day_of_year,sin_hour,cos_hour,sun_product,modified_solar_elevation,effective_radiation,time_since_prediction,cloud_ratio,cloud_cover_over_30%,sun_addition,direct_rad_cloud_interaction,modified_solar_elevation_squared,is_freezing,is_snow,is_rain
count,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0,77044.0
mean,6.241166,1.25229,535037.6,148.533371,1639.446899,0.007885,275.821411,39.974041,144098.9,52.259331,188197.2,66.943527,0.482267,0.565437,0.005829,0.084221,1001.689941,0.744772,0.010283,73.923294,179.718307,0.059241,279.985352,73.753952,32999.046875,3.012385,0.712736,0.666015,3.4e-05,0.384871,0.326034,0.289095,-0.0113,0.027707,-0.005012,0.01618773,6231.929688,0.155913,0.162052,0.0,0.863935,0.805176,92.23336,3015.051025,0.252298,0.124448,0.102967,0.099021
std,2.728918,0.036447,843304.5,234.835953,1794.949219,0.237469,6.771035,61.64719,219356.5,116.097191,412553.2,34.011616,0.485604,0.48336,0.030001,0.322178,12.852131,5.278304,0.042884,14.194102,98.448479,0.107867,6.549464,33.702427,17984.40625,1.743064,2.751968,1.884144,0.006123,0.486568,0.468763,0.453345,0.708126,0.705461,0.709319,0.7046936,14842.071289,0.225102,0.261199,0.0,0.230534,0.396068,159.890182,8610.323242,0.303743,0.330095,0.303918,0.298693
min,0.5,1.13925,0.0,0.0,0.0,-1.0,247.425003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,935.75,0.0,0.0,19.575001,6.983,0.0,258.024994,0.0,132.375,0.025,-7.225,-8.4,-0.1,0.0,0.0,0.0,-0.999999,-0.999979,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.2,1.2275,0.0,0.0,512.737503,0.0,271.299988,0.0,0.0,0.0,0.0,42.150002,0.0,0.0,0.0,0.0,993.900024,0.0,0.0,64.599998,93.378752,0.0,275.399994,53.700001,16443.712402,1.66875,-1.3,-0.6,0.0,0.0,0.0,0.0,-0.726225,-0.673275,-0.707107,-0.7071068,0.0,0.0,0.0,0.0,0.809969,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.7,1.251,10084.28,1.55,1109.150024,0.0,275.725006,0.85,9893.438,0.0,0.0,79.362503,0.25,1.0,0.0,0.0,1002.700012,0.0,0.0,76.337502,180.095375,0.0,279.375,92.599998,37146.837891,2.675,0.35,0.7,0.0,0.0,0.0,0.0,-0.004301,0.052658,0.0,6.123234000000001e-17,0.0,0.0,0.0,0.0,0.9915,1.0,0.875,0.0,0.0,0.0,0.0,0.0
75%,8.075001,1.27525,851276.5,234.399994,2000.993744,0.0,281.0,66.425003,238933.4,31.556251,122187.5,98.324997,1.0,1.0,0.0,0.0,1010.650024,0.0,0.0,85.325005,266.856567,0.1,284.5,99.850006,48388.320312,4.0,2.5,1.85,0.0,1.0,1.0,1.0,0.686676,0.735762,0.707107,0.7071068,2214.619385,0.278437,0.265127,0.0,1.0,1.0,119.275002,482.624847,0.527671,0.0,0.0,0.0
max,17.35,1.441,2990596.0,835.650024,11673.625,1.0,293.625,334.75,1198315.0,683.400024,2441810.0,100.0,1.0,1.0,0.6225,5.0,1037.25,96.775002,1.1,100.0,348.487518,1.375,303.25,100.0,72160.703125,13.275,11.2,8.825,0.1,1.0,1.0,1.0,0.999986,1.0,1.0,1.0,104149.828125,0.765413,3.270251,0.0,1.0,1.0,773.275024,68032.5,0.874879,1.0,1.0,1.0


In [None]:
# Stack the per-location training frames (A, B, C) row-wise and summarise the
# result, so it can be compared with the statistics of x_whole.
other_train = pd.concat(
    objs=[A_X_AND_Y_TRAIN, B_X_AND_Y_TRAIN, C_X_AND_Y_TRAIN],
    axis=0,
)
other_train.describe()


Unnamed: 0,absolute_humidity_2m:gm3_0,absolute_humidity_2m:gm3_15,absolute_humidity_2m:gm3_30,absolute_humidity_2m:gm3_45,clear_sky_energy_1h:J_0,clear_sky_energy_1h:J_15,clear_sky_energy_1h:J_30,clear_sky_energy_1h:J_45,clear_sky_rad:W_0,clear_sky_rad:W_15,clear_sky_rad:W_30,clear_sky_rad:W_45,cloud_base_agl:m_0,cloud_base_agl:m_15,cloud_base_agl:m_30,cloud_base_agl:m_45,dew_point_2m:K_0,dew_point_2m:K_15,dew_point_2m:K_30,dew_point_2m:K_45,diffuse_rad:W_0,diffuse_rad:W_15,diffuse_rad:W_30,diffuse_rad:W_45,effective_cloud_cover:p_0,effective_cloud_cover:p_15,effective_cloud_cover:p_30,effective_cloud_cover:p_45,sfc_pressure:hPa_0,sfc_pressure:hPa_15,sfc_pressure:hPa_30,sfc_pressure:hPa_45,sun_azimuth:d_0,sun_azimuth:d_15,sun_azimuth:d_30,sun_azimuth:d_45,sun_elevation_is_day_0,sun_elevation_is_day_15,sun_elevation_is_day_30,sun_elevation_is_day_45,t_1000hPa:K_0,t_1000hPa:K_15,t_1000hPa:K_30,t_1000hPa:K_45,visibility:m_0,visibility:m_15,visibility:m_30,visibility:m_45,wind_speed_10m:ms_0,wind_speed_10m:ms_15,wind_speed_10m:ms_30,wind_speed_10m:ms_45,direct_rad:W_max,days_from_apex,hours_from_apex,pv_measurement
count,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,81386.0,83142.0,83136.0,83127.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0,89736.0
mean,5.953523,5.953322,5.953226,5.953159,507115.2,507107.8,507093.8,507082.1,140.862686,140.849243,140.850327,140.851059,1695.173462,1739.129883,1737.034058,1734.952271,275.085907,275.08609,275.08609,275.085846,38.97765,38.896389,38.725594,38.901039,66.906639,66.99617,67.02491,66.993721,1007.935913,1007.935974,1007.935913,1007.935974,182.450546,180.720367,176.390564,179.161469,8.919071,8.922294,8.921758,8.918437,279.31604,279.315887,279.315491,279.315216,33197.964844,33196.117188,33193.707031,33190.753906,3.031766,3.03196,3.032194,3.032326,56.376804,97.122838,5.99756,292.99465
std,2.688669,2.686662,2.685953,2.68662,816350.1,816350.6,816353.1,816351.5,227.341827,227.347717,227.343414,227.338608,1782.952637,1812.47168,1796.203735,1804.134399,6.813097,6.809192,6.807894,6.809395,60.378723,60.464146,60.671577,60.457279,35.113735,34.500549,34.295116,34.497101,13.185225,13.182809,13.182015,13.182457,102.822189,102.914253,102.787804,102.736809,13.520752,13.518727,13.518906,13.52043,6.507071,6.503327,6.502243,6.503626,18292.039062,17969.498047,17861.337891,17971.714844,1.76814,1.753842,1.74871,1.753654,120.571304,51.539308,3.488807,779.495311
min,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.4,27.5,27.5,27.5,247.300003,247.5,247.399994,247.300003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,941.400024,941.599976,941.5,941.5,0.008,0.002,0.003,4.038,-0.0,-0.0,-0.0,-0.0,257.899994,258.0,258.0,258.0,130.600006,131.800003,133.0,134.100006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0
25%,4.0,4.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,577.799988,595.325012,598.375,595.400024,270.600006,270.600006,270.600006,270.600006,0.0,0.0,0.0,0.0,41.0,41.599998,41.599998,41.299999,999.799988,999.799988,999.799988,999.799988,93.0665,91.009003,87.676001,90.112999,-0.0,-0.0,-0.0,-0.0,274.799988,274.799988,274.799988,274.799988,16122.175049,16912.875488,17276.775391,16858.525391,1.7,1.7,1.7,1.7,0.0,55.0,3.0,0.0
50%,5.4,5.4,5.4,5.4,2899.65,2467.9,2599.0,3360.95,0.0,0.0,0.0,0.0,1135.699951,1168.099976,1173.800049,1164.5,274.799988,274.799988,274.799988,274.799988,0.0,0.0,0.0,0.0,80.699997,80.300003,79.900002,80.300003,1008.799988,1008.799988,1008.799988,1008.799988,179.551994,179.413498,179.570503,182.709,-0.0,-0.0,-0.0,-0.0,278.5,278.5,278.5,278.5,37486.851562,37178.75,36865.449219,37107.050781,2.7,2.7,2.7,2.7,0.0,100.0,6.0,0.0
75%,7.7,7.7,7.7,7.7,754988.6,767355.3,767527.8,745540.4,213.199997,207.325001,209.600006,213.399994,2022.050049,2074.300049,2082.625,2075.449951,280.299988,280.299988,280.299988,280.299988,64.900002,64.300003,63.299999,64.400002,99.300003,98.800003,98.599998,98.699997,1017.099976,1017.099976,1017.099976,1017.099976,271.384003,270.854256,265.995491,268.43026,15.1135,14.846,14.966,15.17025,283.799988,283.799988,283.799988,283.799988,48810.025391,48480.548828,48443.151367,48534.201172,4.1,4.1,4.1,4.1,40.400002,142.0,9.0,159.5625
max,17.5,17.4,17.4,17.5,3006697.0,2997998.0,3001742.0,3007722.0,835.299988,837.0,836.700012,834.299988,11688.700195,11678.700195,11668.599609,11658.5,293.799988,293.700012,293.700012,293.700012,340.100006,338.200012,345.700012,341.0,100.0,100.0,100.0,100.0,1043.800049,1043.699951,1043.699951,1043.699951,359.996002,359.998993,359.998993,355.540985,49.917999,50.036999,50.019001,49.855,303.299988,303.299988,303.299988,303.299988,76737.796875,75586.203125,75537.0,76123.0,15.2,13.9,12.6,13.6,689.799988,182.0,12.0,5733.42


In [None]:
# (rows, columns) of both training sets, one per line.
print(x_whole.shape, other_train.shape, sep="\n")

(77044, 48)
(89736, 60)


In [None]:
# Number of columns in each training set (one per line), followed by the
# column names of x_whole as the cell's displayed value.
print(x_whole.columns.shape, other_train.columns.shape, sep="\n")
x_whole.columns

(48,)
(60,)


Index(['absolute_humidity_2m:gm3', 'air_density_2m:kgm3',
       'clear_sky_energy_1h:J', 'clear_sky_rad:W', 'cloud_base_agl:m',
       'dew_or_rime:idx', 'dew_point_2m:K', 'diffuse_rad:W',
       'diffuse_rad_1h:J', 'direct_rad:W', 'direct_rad_1h:J',
       'effective_cloud_cover:p', 'is_day:idx', 'is_in_shadow:idx',
       'precip_5min:mm', 'precip_type_5min:idx', 'pressure_50m:hPa',
       'prob_rime:p', 'rain_water:kgm2', 'relative_humidity_1000hPa:p',
       'sun_azimuth:d', 'super_cooled_liquid_water:kgm2', 't_1000hPa:K',
       'total_cloud_cover:p', 'visibility:m', 'wind_speed_10m:ms',
       'wind_speed_u_10m:ms', 'wind_speed_v_10m:ms', 'wind_speed_w_1000hPa:ms',
       'location_a', 'location_b', 'location_c', 'sin_day_of_year',
       'cos_day_of_year', 'sin_hour', 'cos_hour', 'sun_product',
       'modified_solar_elevation', 'effective_radiation',
       'time_since_prediction', 'cloud_ratio', 'cloud_cover_over_30%',
       'sun_addition', 'direct_rad_cloud_interaction',
  

# Differences in test data

In [None]:
# Summary statistics for every numeric column of the preprocessed test
# features, shown as the cell's rich output.
X_test_est_combined.describe()

Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,is_day:idx,is_in_shadow:idx,precip_5min:mm,precip_type_5min:idx,pressure_50m:hPa,prob_rime:p,rain_water:kgm2,relative_humidity_1000hPa:p,sun_azimuth:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,location_a,location_b,location_c,sin_day_of_year,cos_day_of_year,sin_hour,cos_hour,sun_product,modified_solar_elevation,effective_radiation,time_since_prediction,cloud_ratio,cloud_cover_over_30%,sun_addition,direct_rad_cloud_interaction,modified_solar_elevation_squared,is_freezing,is_snow,is_rain
count,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0
mean,8.202084,1.232774,1227747.0,341.040741,1681.780151,0.036111,280.778992,84.755867,305120.1,114.341972,411630.6,64.090988,0.793519,0.241204,0.007663,0.065972,1008.982666,0.016435,0.013067,70.843506,179.180542,0.064525,284.749359,69.259583,33348.164062,2.945822,1.652465,-0.182477,0.000359,0.333333,0.333333,0.333333,0.431752,-0.855224,-1.562536e-17,-5.5099960000000005e-17,14912.679688,0.340791,0.262975,28.49425,0.871397,0.75,199.097855,7167.174316,0.487801,0.0,0.001852,0.08287
std,2.191513,0.032043,1101274.0,306.855469,2062.581299,0.182849,4.36416,78.432159,277755.5,171.784576,610705.1,37.460999,0.385461,0.413672,0.031816,0.23524,9.738119,0.46843,0.051688,15.604269,103.925461,0.109135,5.830846,37.764587,15336.857422,1.714616,2.569777,1.490788,0.005502,0.471514,0.471514,0.471514,0.252718,0.135475,0.7072705,0.7072705,23051.986328,0.278666,0.285936,6.923792,0.250668,0.433113,214.731079,13825.104492,0.320762,0.0,0.043003,0.27575
min,3.2,1.14275,0.0,0.0,0.0,-1.0,268.075012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,977.575012,0.0,0.0,24.599998,13.4395,0.0,273.700012,0.0,1532.175049,0.1,-4.3,-4.3,0.0,0.0,0.0,0.0,-0.006451,-0.999979,-1.0,-1.0,0.0,0.0,0.0,16.976944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.65,1.209437,48741.78,17.80625,402.631256,0.0,277.924988,7.975,34179.26,0.0,492.2562,30.131249,1.0,0.0,0.0,0.0,1003.75,0.0,0.0,60.275002,89.979689,0.0,279.849976,36.625001,19856.712891,1.55,-0.2,-1.3,0.0,0.0,0.0,0.0,0.232243,-0.972658,-0.7071068,-0.7071068,0.0,0.042327,0.001233,22.7325,0.886944,0.75,8.5125,0.0,0.205706,0.0,0.0,0.0
50%,8.05,1.238,933375.3,299.512512,852.362518,0.0,281.049988,75.224998,264930.8,13.475,65480.56,77.5,1.0,0.0,0.0,0.0,1012.25,0.0,0.0,73.587502,180.959869,0.0,284.799988,92.237499,37066.824219,2.75,1.6,-0.25,0.0,0.0,0.0,0.0,0.480303,-0.87706,6.123234000000001e-17,-6.123234000000001e-17,1170.831604,0.338697,0.113782,28.488056,1.0,1.0,119.412502,38.066195,0.581975,0.0,0.0,0.0
75%,10.0,1.2595,2276978.0,662.018738,2258.043701,0.0,284.375,134.418751,488068.5,187.493748,655836.9,99.974998,1.0,0.25,0.0,0.0,1016.200012,0.0,0.0,83.699997,271.257004,0.1,288.25,100.0,45345.832031,3.95,3.55,0.775,0.0,1.0,1.0,1.0,0.643337,-0.765584,0.7071068,0.7071068,23020.048828,0.627653,0.529628,34.243611,1.0,1.0,342.081245,6606.237305,0.792246,0.0,0.0,0.0
max,14.125,1.3005,2989497.0,835.525024,11467.799805,1.0,290.049988,307.599976,1099236.0,667.799988,2391037.0,100.0,1.0,1.0,0.3175,1.25,1022.5,17.5,0.525,98.400002,348.119263,0.6,302.125,100.0,63868.226562,9.1,9.05,3.8,0.1,1.0,1.0,1.0,0.880683,-0.473706,1.0,1.0,95239.476562,0.765388,1.577809,39.999167,1.0,1.0,765.550049,66292.5,0.874864,0.0,1.0,1.0


In [None]:
# Stack the per-location test frames (A, B, C) row-wise and summarise the
# result, so it can be compared with the statistics of X_test_est_combined.
other_tests = pd.concat(
    objs=[A_X_test_estimated, B_X_test_estimated, C_X_test_estimated],
    axis=0,
)
other_tests.describe()

Unnamed: 0,absolute_humidity_2m:gm3_0,absolute_humidity_2m:gm3_15,absolute_humidity_2m:gm3_30,absolute_humidity_2m:gm3_45,clear_sky_energy_1h:J_0,clear_sky_energy_1h:J_15,clear_sky_energy_1h:J_30,clear_sky_energy_1h:J_45,clear_sky_rad:W_0,clear_sky_rad:W_15,clear_sky_rad:W_30,clear_sky_rad:W_45,cloud_base_agl:m_0,cloud_base_agl:m_15,cloud_base_agl:m_30,cloud_base_agl:m_45,dew_point_2m:K_0,dew_point_2m:K_15,dew_point_2m:K_30,dew_point_2m:K_45,diffuse_rad:W_0,diffuse_rad:W_15,diffuse_rad:W_30,diffuse_rad:W_45,effective_cloud_cover:p_0,effective_cloud_cover:p_15,effective_cloud_cover:p_30,effective_cloud_cover:p_45,sfc_pressure:hPa_0,sfc_pressure:hPa_15,sfc_pressure:hPa_30,sfc_pressure:hPa_45,sun_azimuth:d_0,sun_azimuth:d_15,sun_azimuth:d_30,sun_azimuth:d_45,sun_elevation_is_day_0,sun_elevation_is_day_15,sun_elevation_is_day_30,sun_elevation_is_day_45,t_1000hPa:K_0,t_1000hPa:K_15,t_1000hPa:K_30,t_1000hPa:K_45,visibility:m_0,visibility:m_15,visibility:m_30,visibility:m_45,wind_speed_10m:ms_0,wind_speed_10m:ms_15,wind_speed_10m:ms_30,wind_speed_10m:ms_45,direct_rad:W_max,days_from_apex,hours_from_apex,id,prediction
count,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,1879.0,1937.0,1937.0,1937.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0
mean,8.206482,8.203751,8.200879,8.197222,1227747.0,1227748.0,1227744.0,1227751.0,341.056641,341.017426,341.027771,341.061157,1797.160156,1882.277466,1868.517578,1854.756348,280.783203,280.781799,280.777832,280.773163,84.915688,84.771759,84.511574,84.82444,64.113792,64.143097,64.105553,64.00148,1015.070374,1015.052795,1015.034851,1015.019653,183.166199,178.425781,175.685333,179.444855,20.981165,20.999121,20.982639,20.991322,284.737732,284.746521,284.751953,284.761292,33304.636719,33332.203125,33361.90625,33393.902344,2.946759,2.94625,2.945278,2.945,127.607353,18.566667,6.0,1079.5,0.0
std,2.201396,2.194498,2.189956,2.188581,1104469.0,1104466.0,1104470.0,1104464.0,307.729095,307.772369,307.762604,307.724518,2046.394409,2119.177002,2074.760498,2062.21167,4.378817,4.369216,4.361317,4.359707,78.422508,78.761269,79.59977,78.715187,37.947498,37.571831,37.476212,37.674469,9.840412,9.839953,9.837538,9.836187,109.193207,109.361115,109.129372,109.135262,17.795698,17.775604,17.793594,17.783812,5.839595,5.833704,5.82964,5.833142,15624.633789,15395.229492,15317.976562,15397.651367,1.733865,1.719152,1.712798,1.721526,181.000122,12.473342,3.488883,623.682612,0.0
min,3.2,3.2,3.2,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.799999,30.200001,30.200001,30.200001,268.0,268.100006,268.100006,268.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,983.5,983.400024,983.400024,983.299988,8.27,0.048,2.25,5.716,-0.0,-0.0,-0.0,-0.0,273.700012,273.700012,273.700012,273.700012,874.400024,1424.800049,1400.400024,1137.300049,0.0,0.1,0.1,0.1,0.0,1.0,0.0,0.0,0.0
25%,6.6,6.6,6.6,6.7,64338.12,48387.37,52083.6,63123.4,13.65,16.4,16.75,12.7,486.899994,512.0,506.899994,508.100006,277.899994,277.899994,277.899994,278.0,6.925,6.575,8.325,7.1,30.700001,30.5,31.1,29.799999,1009.799988,1009.799988,1009.700012,1009.700012,85.359253,78.783001,78.7785,82.055502,1.36875,2.059,1.455,1.336,279.799988,279.899994,279.899994,279.899994,19635.100098,20313.749512,19487.049805,19225.750488,1.5,1.5,1.5,1.5,0.0,9.0,3.0,539.75,0.0
50%,8.0,8.0,8.1,8.1,1056303.0,949686.7,987528.3,1065608.0,273.849991,295.699997,293.049988,263.299988,997.799988,1056.900024,1043.699951,1057.199951,281.0,281.0,281.0,281.100006,73.700001,74.599998,74.599998,74.549999,77.75,78.599998,78.099998,77.299999,1018.299988,1018.299988,1018.299988,1018.25,184.236,179.280998,174.079506,179.142502,18.54,19.641001,19.5195,17.9905,284.799988,284.799988,284.799988,284.799988,37623.050781,37315.5,37144.300781,37311.25,2.7,2.8,2.8,2.7,23.4,16.5,6.0,1079.5,0.0
75%,10.0,10.0,10.0,10.0,2372038.0,2291194.0,2325298.0,2383465.0,646.874985,663.099976,659.849976,637.324997,2298.300049,2407.0,2435.699951,2390.399902,284.299988,284.399994,284.324989,284.299988,135.600006,134.0,131.699997,134.224995,100.0,100.0,100.0,100.0,1022.299988,1022.299988,1022.299988,1022.299988,279.576248,279.797508,272.95826,276.282494,38.102499,38.919751,38.83375,37.694501,288.299988,288.200012,288.299988,288.200012,45378.099609,45347.201172,45315.225586,45393.625,4.0,4.0,4.0,4.0,226.0,28.0,9.0,1619.25,0.0
max,14.2,14.2,14.1,14.1,3005707.0,2996150.0,3000894.0,3007376.0,835.099976,836.900024,836.400024,833.799988,11467.799805,11467.799805,11467.799805,11467.799805,290.200012,290.100006,290.0,290.100006,312.600006,306.100006,321.899994,317.899994,100.0,100.0,100.0,100.0,1028.699951,1028.699951,1028.699951,1028.699951,356.984009,359.924011,349.873993,353.417999,49.902,50.033001,50.007,49.824001,302.200012,302.100006,302.100006,302.200012,63863.800781,63871.199219,63871.800781,63866.101562,8.8,9.0,9.2,9.4,668.400024,50.0,12.0,2159.0,0.0


In [None]:
# Per-column count of missing values for each raw feature frame, grouped under
# a heading per data source and listed in location order a, b, c.
_missing_value_report = {
    "Estimated": (X_train_estimated_a, X_train_estimated_b, X_train_estimated_c),
    "Observed": (X_train_observed_a, X_train_observed_b, X_train_observed_c),
    "Test": (X_test_estimated_a, X_test_estimated_b, X_test_estimated_c),
}
for _heading, _frames in _missing_value_report.items():
    print(_heading)
    for _frame in _frames:
        # isna() is the canonical name of isnull(); both give the same mask.
        print(_frame.isna().sum())


Estimated
date_calc                             0
date_forecast                         0
absolute_humidity_2m:gm3              0
air_density_2m:kgm3                   0
ceiling_height_agl:m               3919
clear_sky_energy_1h:J                 0
clear_sky_rad:W                       0
cloud_base_agl:m                   2094
dew_or_rime:idx                       0
dew_point_2m:K                        0
diffuse_rad:W                         0
diffuse_rad_1h:J                      0
direct_rad:W                          0
direct_rad_1h:J                       0
effective_cloud_cover:p               0
elevation:m                           0
fresh_snow_12h:cm                     0
fresh_snow_1h:cm                      0
fresh_snow_24h:cm                     0
fresh_snow_3h:cm                      0
fresh_snow_6h:cm                      0
is_day:idx                            0
is_in_shadow:idx                      0
msl_pressure:hPa                      0
precip_5min:mm                