# Team name: Group 6
* Full name: Sverre Nystad, StudentNr: 56882,  Kandidatnr: 10003, Kaggle Name: Sverre Nystad
* Full name: Gunnar Nystad, StudentNr: 527760, Kandidatnr: 10344, Kaggle Name: Gunnar Nystad
* Full name: Peter Skoland, StudentNr: 528091, Kandidatnr: 10307,  Kaggle Name: Peter Skoland

# Catboost stack
This stack corresponds to the following CSV submission on Kaggle: "average cat cloud interaction 144.9 cat mod elevation squared 144.8.csv"

## Step 0: Importing libraries

In [1]:

import math
from datetime import datetime
from typing import List, Tuple
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 200)

%matplotlib inline
import matplotlib.pyplot as plt

from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from scipy.stats import skew

import warnings
warnings.filterwarnings('ignore')

## Step 1: Preprocessing of data

### Step 1.1: Loading data

In [2]:
PATH_RAW_DATA_LOCATION = "data/raw/"

def get_raw_data():
    """
    Read every raw parquet file below PATH_RAW_DATA_LOCATION.

    For each file kind, the files of locations A, B and C are read in that
    order. The kinds are read (and returned) in this order: training
    targets, estimated training features, observed training features,
    estimated test features.

    Returns:
        A 12-tuple of pd.DataFrame, in this exact order:
        train_a, train_b, train_c,
        X_train_estimated_a, X_train_estimated_b, X_train_estimated_c,
        X_train_observed_a, X_train_observed_b, X_train_observed_c,
        X_test_estimated_a, X_test_estimated_b, X_test_estimated_c.
    """
    locations = ("A", "B", "C")
    file_stems = (
        "train_targets",
        "X_train_estimated",
        "X_train_observed",
        "X_test_estimated",
    )
    # The outer loop runs over the file kind so the flat result keeps the
    # "all targets, then all estimated, ..." layout callers unpack.
    return tuple(
        pd.read_parquet(f'{PATH_RAW_DATA_LOCATION}{location}/{stem}.parquet')
        for stem in file_stems
        for location in locations
    )


### Step 1.2: Feature engineering

In [3]:
def prepare_data(
    train_observed: pd.DataFrame,
    train_estimated: pd.DataFrame,
    test_size=0.2,
    random_state=42,
    drop_features: bool = True,
) -> Tuple[
    pd.DataFrame,
    pd.DataFrame,
    pd.Series,
    pd.Series,
    pd.DataFrame,
    pd.DataFrame,
    pd.Series,
    pd.Series,
]:
    """
    Clean, feature-engineer and split the observed and the estimated training frames.

    Both frames go through the same pipeline: unusable columns are removed,
    rows without "visibility:m" or "pv_measurement" are dropped, suspicious
    PV readings are filtered out, features are engineered, and the result is
    split into a training and a validation part.

    Args:
    train_observed (pd.DataFrame): The aligned training DataFrame with observed features.
    train_estimated (pd.DataFrame): The aligned training DataFrame with estimated features.
    test_size (float): The proportion of the dataset to include in the validation split.
    random_state (int): Controls the shuffling applied to the data before applying the split.
    drop_features (bool): When True, the "time", "pv_measurement", "date_forecast" and
        "date_calc" columns are removed from the feature frames (missing ones are ignored).
        When False, the feature frames keep every column, including the target.

    Returns:
    X_train_obs (pd.DataFrame): The training features with observed data.
    X_val_obs (pd.DataFrame): The validation features with observed data.
    y_train_obs (pd.Series): The training target with observed data.
    y_val_obs (pd.Series): The validation target with observed data.
    X_train_est (pd.DataFrame): The training features with estimated data.
    X_val_est (pd.DataFrame): The validation features with estimated data.
    y_train_est (pd.Series): The training target with estimated data.
    y_val_est (pd.Series): The validation target with estimated data.
    """
    required_columns = ["visibility:m", "pv_measurement"]
    non_feature_columns = ["time", "pv_measurement", "date_forecast", "date_calc"]

    splits = []
    # Observed first, estimated second: this fixes the order of the returned tuple.
    for raw_frame in (train_observed, train_estimated):
        cleaned = remove_missing_features(raw_frame)
        cleaned = cleaned.dropna(subset=required_columns)
        cleaned = clean_pv_data(cleaned)
        cleaned = feature_engineer(cleaned)

        target = cleaned["pv_measurement"]
        if drop_features:
            features = cleaned.drop(columns=non_feature_columns, errors="ignore")
        else:
            features = cleaned

        # train_test_split yields X_train, X_val, y_train, y_val in that order.
        splits.extend(
            train_test_split(
                features, target, test_size=test_size, random_state=random_state
            )
        )

    return tuple(splits)


def get_location_datasets(
    df: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.Series]:
    """
    Split a combined frame into per-location feature frames and targets.

    Rows are assigned to a location through the one-hot columns
    "location_a", "location_b" and "location_c". Those three columns and the
    "pv_measurement" target are removed from every returned feature frame.

    Args:
        df (pd.DataFrame): Frame holding the three location columns and "pv_measurement".

    Returns:
        (x_a, x_b, x_c, y_a, y_b, y_c): the feature frames followed by the
        matching "pv_measurement" series for locations a, b and c.

    Raises:
        KeyError: If a location column or "pv_measurement" is missing.
    """
    location_columns = ["location_a", "location_b", "location_c"]

    def _split(location_column: str) -> Tuple[pd.DataFrame, pd.Series]:
        """Return (features, target) for the rows flagged by `location_column`."""
        rows = df[df[location_column] == 1].drop(location_columns, axis=1)
        return rows.drop("pv_measurement", axis=1), rows["pv_measurement"]

    x_a, y_a = _split("location_a")
    x_b, y_b = _split("location_b")
    # The previous copy-pasted branch dropped the target from x_b a second
    # time, which left "pv_measurement" inside x_c.
    x_c, y_c = _split("location_c")

    return (x_a, x_b, x_c, y_a, y_b, y_c)


def remove_missing_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop the columns judged unusable and fill gaps in the cloud base height.

    "snow_density:kgm3", "ceiling_height_agl:m" and "elevation:m" are removed
    (features with more than 50% missing values, or constant features), and
    missing "cloud_base_agl:m" values are replaced by 0. The input frame is
    left untouched; a new frame is returned.

    Raises:
        KeyError: If one of the columns to drop is absent.
    """
    unusable_columns = ["snow_density:kgm3", "ceiling_height_agl:m", "elevation:m"]
    trimmed = df.drop(columns=unusable_columns)
    trimmed["cloud_base_agl:m"] = trimmed["cloud_base_agl:m"].fillna(0)
    return trimmed


def clean_pv_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the PV-measurement cleaning filters one after another.

    The filters are applied in this order: night-time readings, constant
    readings, zero readings under significant radiation.

    Args:
    df (pd.DataFrame): DataFrame containing PV measurement data.

    Returns:
    pd.DataFrame: The frame that remains after all three filters.
    """
    cleaning_steps = (
        filter_pv_measurements_at_night,
        filter_constant_pv_measurements,
        filter_zero_pv_measurements,
    )
    for apply_filter in cleaning_steps:
        df = apply_filter(df)
    return df


def filter_pv_measurements_at_night(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop rows that report positive PV production while it is dark.

    A row is removed when its "pv_measurement" is positive and either
    * "is_day:idx" is 0 and the measurement equals the one in the previous row, or
    * "sun_elevation:d" is below -10 degrees.
    Such readings are treated as measurement errors.

    Args:
    df (pd.DataFrame): DataFrame containing PV measurement and time-of-day data.

    Returns:
    pd.DataFrame: A new DataFrame without the offending rows.
    """
    sun_elevation_threshold = -10

    pv = df["pv_measurement"]
    is_producing = pv > 0

    # Night-time production that merely repeats the previous reading.
    repeated_at_night = df["is_day:idx"].eq(0) & is_producing & pv.eq(pv.shift(1))

    # Production while the sun is well below the horizon.
    sun_far_below_horizon = df["sun_elevation:d"].lt(sun_elevation_threshold) & is_producing

    rows_to_drop = df[repeated_at_night | sun_far_below_horizon].index
    return df.drop(rows_to_drop)


def filter_constant_pv_measurements(
    df: pd.DataFrame, min_run_length: int = 6
) -> pd.DataFrame:
    """
    Filter out rows where PV measurement is constant and non-zero for a long run.

    Consecutive rows with the same non-zero "pv_measurement" form a run; every
    row of a run with at least `min_run_length` entries is removed. Zero
    measurements are never removed here.

    The input frame is not modified: the run ids are kept in a local Series
    instead of being written to a temporary column on `df`.

    Args:
    df (pd.DataFrame): DataFrame containing PV measurement data.
    min_run_length (int): Smallest run length that is considered a
        discrepancy. Defaults to 6 consecutive timesteps.

    Returns:
    pd.DataFrame: DataFrame with specified discrepancies removed.
    """
    pv = df["pv_measurement"]

    # A new run starts whenever the value changes; zeros always start a new
    # run so that they never accumulate into a long "constant" run.
    starts_new_run = (pv != pv.shift()) | (pv == 0)
    run_id = starts_new_run.cumsum()

    # Length of the run each row belongs to ("count" ignores missing values).
    run_length = pv.groupby(run_id).transform("count")

    stuck_measurement = (run_length >= min_run_length) & (pv != 0)
    return df[~stuck_measurement]



def filter_zero_pv_measurements(
    un_filtered_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    Drop rows whose PV measurement is zero although radiation is significant.

    Radiation counts as significant when "diffuse_rad:W" + "direct_rad:W" is
    at least 30. That threshold was chosen by trial and error (0.5, 5 and 30
    were tried).

    Args:
    un_filtered_df (pd.DataFrame): DataFrame containing radiation and PV measurement data.

    Returns:
    pd.DataFrame: Filtered DataFrame.
    """
    total_radiation_threshold = 30

    total_radiation = un_filtered_df["diffuse_rad:W"] + un_filtered_df["direct_rad:W"]
    suspicious_zero = total_radiation.ge(total_radiation_threshold) & un_filtered_df[
        "pv_measurement"
    ].eq(0)

    return un_filtered_df.loc[~suspicious_zero]


def feature_engineer(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Add the engineered model features and return the result as a new frame.

    The cyclical time features are added first (via
    create_time_features_from_date), followed by radiation, cloud, solar
    elevation and weather-flag features derived from existing columns. The
    source columns "sun_elevation:d" and "snow_drift:idx" are dropped.

    The caller's frame is not modified: all work happens on a copy. Earlier
    versions wrote the new columns into the frame that was passed in.

    Args:
        data_frame (pd.DataFrame): Frame holding the weather columns read
            below and a "date_forecast" column.

    Returns:
        pd.DataFrame: A new frame containing the additional feature columns.
    """
    # Work on a private copy so the caller's frame (often a slice of a larger
    # frame) is never mutated.
    data_frame = data_frame.copy()

    data_frame = create_time_features_from_date(data_frame)
    data_frame["solar_radiation_interaction"] = data_frame["diffuse_rad:W"] * data_frame["direct_rad:W"]

    # sin(elevation) while the sun is above the horizon, 0 otherwise.
    data_frame["effective_solar_elevation"] = np.where(
        data_frame["sun_elevation:d"] <= 0,
        0,
        np.sin(np.radians(data_frame["sun_elevation:d"])),
    )
    data_frame = data_frame.drop("sun_elevation:d", axis=1)

    # Share of the clear-sky energy that arrives as direct radiation;
    # defined as 0 when there is no clear-sky energy to avoid inf/NaN.
    data_frame["effective_radiation"] = np.where(
        data_frame["clear_sky_energy_1h:J"] == 0,
        0,
        data_frame["direct_rad_1h:J"] / data_frame["clear_sky_energy_1h:J"],
    )

    data_frame["net_clear_sky_residual"] = (
        data_frame["clear_sky_rad:W"]
        - data_frame["direct_rad:W"]
        - data_frame["diffuse_rad:W"]
    )

    # Effective / total cloud cover, defined as 0 when there are no clouds.
    data_frame["cloud_ratio"] = np.where(
        data_frame["total_cloud_cover:p"] == 0,
        0,
        data_frame["effective_cloud_cover:p"] / data_frame["total_cloud_cover:p"],
    )

    # NOTE(review): the threshold here is 0.3 while the other cloud features
    # below treat "effective_cloud_cover:p" as a 0-100 percentage (> 30 and
    # 100 - x). It looks like 30 may have been intended; left unchanged
    # because altering it changes the trained model's inputs - confirm.
    data_frame["low_cloud_diffuse_rad"] = data_frame[
        "diffuse_rad:W"
    ].where(data_frame["effective_cloud_cover:p"] < 0.3, 0)

    data_frame["cloud_cover_over_30%"] = np.where(
        data_frame["effective_cloud_cover:p"] > 30, 1, 0
    )

    data_frame["global_horizontal_irradiation"] = (
        data_frame["diffuse_rad:W"] + data_frame["direct_rad:W"]
    )

    data_frame["direct_rad_cloud_adjustment"] = data_frame["direct_rad:W"] * (
        100 - data_frame["effective_cloud_cover:p"]
    )

    # NOTE(review): despite the "_squared" name this is the square root
    # (exponent 0.5). The column name is part of the trained feature set, so
    # neither the name nor the value is changed here - confirm the intent.
    data_frame["effective_solar_elevation_squared"] = (
        data_frame["effective_solar_elevation"] ** 0.5
    )

    snow_columns = [
        "snow_depth:cm",
        "fresh_snow_12h:cm",
        "fresh_snow_1h:cm",
        "fresh_snow_24h:cm",
        "fresh_snow_3h:cm",
        "fresh_snow_6h:cm",
    ]

    # Binary weather flags.
    data_frame["is_freezing"] = (data_frame["t_1000hPa:K"] < 273).astype(int)

    data_frame["is_snow"] = (data_frame[snow_columns] > 0).any(axis=1).astype(int)
    data_frame["is_rain"] = (data_frame["precip_5min:mm"] > 0).astype(int)

    data_frame = data_frame.drop("snow_drift:idx", axis=1)

    return data_frame


def create_time_features_from_date(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `df` with cyclical time features derived from date_forecast.

    The day of the year and the hour of the day are each encoded as a
    sine/cosine pair, which gives the model a representation without a jump
    between the last and the first value of a cycle. Four float columns are
    added: sin_day_of_year, cos_day_of_year, sin_hour, cos_hour.

    Args:
        df (pd.DataFrame): Data frame with date_forecast column.
    Returns:
        pd.DataFrame: Data frame copy with new features; `df` itself is left
        unchanged.

    """
    # Copy first so the function honours its "returns a copy" contract instead
    # of silently adding columns to the caller's frame.
    df = df.copy()
    forecast_times = df["date_forecast"]

    # The scalar helpers are applied element-wise on purpose: later steps
    # compare sin_hour / cos_hour with values from the same math-based
    # formula using exact equality.
    df["sin_day_of_year"] = forecast_times.apply(get_sin_day)
    df["cos_day_of_year"] = forecast_times.apply(get_cos_day)
    df["sin_hour"] = forecast_times.apply(get_sin_hour)
    df["cos_hour"] = forecast_times.apply(get_cos_hour)
    return df


def get_sin_hour(date: datetime) -> float:
    """Return the sine of the hour-of-day angle (one full turn per 24 hours)."""
    hours_per_day = 24
    angle = math.tau * date.hour / hours_per_day
    return math.sin(angle)


def get_cos_hour(date: datetime) -> float:
    """Return the cosine of the hour-of-day angle (one full turn per 24 hours)."""
    hours_per_day = 24
    angle = math.tau * date.hour / hours_per_day
    return math.cos(angle)


def get_sin_day(date: datetime) -> float:
    """Return the sine of the day-of-year angle; 1 January maps to angle 0."""
    days_per_year = 365.25  # the extra 0.25 accounts for leap years
    zero_based_day = date.timetuple().tm_yday - 1
    return math.sin(math.tau * zero_based_day / days_per_year)


def get_cos_day(date: datetime) -> float:
    """Return the cosine of the day-of-year angle; 1 January maps to angle 0."""
    days_per_year = 365.25  # the extra 0.25 accounts for leap years
    zero_based_day = date.timetuple().tm_yday - 1
    return math.cos(math.tau * zero_based_day / days_per_year)


def add_location(data_frame: pd.DataFrame, location: str):
    """
    One-hot encode the location into location_a / location_b / location_c.

    The three columns are written into `data_frame` itself, which is also
    returned. The comparison is case-insensitive; a location other than
    "a", "b" or "c" leaves all three columns at 0.
    """
    requested = location.lower()
    for site in ("a", "b", "c"):
        data_frame[f"location_{site}"] = 1 if requested == site else 0
    return data_frame


# Define a function to align the temporal resolution of the datasets
def temporal_alignment(
    train: pd.DataFrame, observed: pd.DataFrame, estimated: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Aligns the temporal resolution of the datasets by aggregating the 15-min interval weather data to hourly intervals.

    The weather frames are resampled to hourly means on "date_forecast" and
    then left-joined onto the targets on "time", so every target row is kept
    even when no weather data exists for its hour.

    None of the three inputs is modified. Earlier versions converted the time
    columns in place and moved "date_forecast" into the index of the weather
    frames, which made a second call with the same frames fail.

    Args:
        train (pd.DataFrame): The training targets DataFrame.
        observed (pd.DataFrame): The observed training features DataFrame.
        estimated (pd.DataFrame): The estimated training features DataFrame.

    Returns:
        train_observed (pd.DataFrame): The aligned training DataFrame with observed features.
        train_estimated (pd.DataFrame): The aligned training DataFrame with estimated features.
    """

    def _hourly_mean(weather: pd.DataFrame) -> pd.DataFrame:
        """Return `weather` averaged per hour, with date_forecast as a column."""
        parsed = weather.assign(date_forecast=pd.to_datetime(weather["date_forecast"]))
        # "1h" instead of "1H": the upper-case alias is deprecated in recent
        # pandas releases, while the lower-case form is accepted by old and
        # new versions alike.
        return parsed.set_index("date_forecast").resample("1h").mean().reset_index()

    # assign() returns a new frame, so the caller's targets keep their dtype.
    train = train.assign(time=pd.to_datetime(train["time"]))

    observed_resampled = _hourly_mean(observed)
    estimated_resampled = _hourly_mean(estimated)

    # Merge the aggregated weather data with the solar production data based on the timestamp
    train_observed = pd.merge(
        train, observed_resampled, how="left", left_on="time", right_on="date_forecast"
    )
    train_estimated = pd.merge(
        train, estimated_resampled, how="left", left_on="time", right_on="date_forecast"
    )

    return train_observed, train_estimated


def temporal_alignment_tests(test: pd.DataFrame) -> pd.DataFrame:
    """
    Bring the test features to the resolution used for training.

    Thin wrapper around aggregate_rows, which averages every four consecutive
    rows. It returns a single DataFrame (the previous annotation wrongly
    declared a tuple).

    Args:
        test (pd.DataFrame): The estimated test features.

    Returns:
        pd.DataFrame: The aggregated test features.
    """
    return aggregate_rows(test)


def aggregate_rows(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse every four consecutive rows of `df` into one row.

    All columns are averaged, except "date_forecast", for which the first
    value of each group of four is kept. A trailing group with fewer than
    four rows is aggregated over the rows it has.

    Rows are grouped by position, so the result does not depend on the index
    labels of `df`, and `df` itself is not modified (earlier versions left a
    helper "group" column behind in the caller's frame).

    Args:
        df (pd.DataFrame): Frame with a "date_forecast" column.

    Returns:
        pd.DataFrame: The aggregated frame with a fresh 0..n-1 index. The
        averaged columns keep their original order and "date_forecast" is
        the last column.

    Raises:
        KeyError: If `df` has no "date_forecast" column.
    """
    # Position-based group id: rows 0-3 -> 0, rows 4-7 -> 1, ...
    group_ids = np.arange(len(df)) // 4

    # Define the aggregation functions
    aggregation = {col: "mean" for col in df.columns if col != "date_forecast"}
    aggregation["date_forecast"] = "first"

    return df.groupby(group_ids).agg(aggregation).reset_index(drop=True)


In [4]:

def fetch_preprocessed_data(drop_features: bool = True) -> Tuple[
    pd.DataFrame,
    pd.DataFrame,
    pd.Series,
    pd.Series,
    pd.DataFrame,
    pd.DataFrame,
    pd.Series,
    pd.Series,
]:
    """
    Fetch the preprocessed data for training and validation.

    The raw data of the three locations is aligned to hourly resolution,
    tagged with one-hot location columns, concatenated (a, then b, then c)
    and passed through prepare_data.

    Args:
        drop_features: Forwarded to prepare_data. When True the time and
            target columns are removed from the feature frames; when False
            the feature frames keep every column.

    Returns:
        X_train_obs_combined: The observed data for training
        X_val_obs_combined: The observed data for validation
        y_train_obs_combined: The observed labels for training
        y_val_obs_combined: The observed labels for validation
        X_train_est_combined: The estimated data for training
        X_val_est_combined: The estimated data for validation
        y_train_est_combined: The estimated labels for training
        y_val_est_combined: The estimated labels for validation
    """
    raw = get_raw_data()
    # get_raw_data returns the targets, the estimated features and the
    # observed features for locations a, b, c (followed by the test features,
    # which are not needed here).
    targets, estimated, observed = raw[0:3], raw[3:6], raw[6:9]

    observed_frames = []
    estimated_frames = []
    for site, site_targets, site_observed, site_estimated in zip(
        ("a", "b", "c"), targets, observed, estimated
    ):
        # Temporally align the weather data with the hourly targets.
        aligned_observed, aligned_estimated = temporal_alignment(
            site_targets, site_observed, site_estimated
        )
        # Add location data
        observed_frames.append(add_location(aligned_observed, site))
        estimated_frames.append(add_location(aligned_estimated, site))

    # Combine the temporally aligned datasets from all three locations
    train_observed_combined = pd.concat(observed_frames, ignore_index=True)
    train_estimated_combined = pd.concat(estimated_frames, ignore_index=True)

    # Prepare the combined dataset by handling missing values and splitting the data
    (
        X_train_obs_combined,
        X_val_obs_combined,
        y_train_obs_combined,
        y_val_obs_combined,
        X_train_est_combined,
        X_val_est_combined,
        y_train_est_combined,
        y_val_est_combined,
    ) = prepare_data(
        train_observed_combined, train_estimated_combined, drop_features=drop_features
    )

    return (
        X_train_obs_combined,
        X_val_obs_combined,
        y_train_obs_combined,
        y_val_obs_combined,
        X_train_est_combined,
        X_val_est_combined,
        y_train_est_combined,
        y_val_est_combined,
    )

def get_preprocessed_test_data() -> pd.DataFrame:
    """
    Build the model-ready test features for all three locations.

    Each location's estimated test features are aggregated to the training
    resolution, stripped of the unusable columns, tagged with the one-hot
    location columns and feature-engineered. The 'date_calc' and
    'date_forecast' columns are then removed (if present) and the three
    frames are stacked in the order a, b, c with a fresh index.
    """
    # The last three frames returned by get_raw_data are the test features
    # of locations a, b and c.
    raw_test_frames = get_raw_data()[9:]

    processed_frames = []
    for site, raw_frame in zip(("a", "b", "c"), raw_test_frames):
        # Align the test data to the same time as the training data
        hourly = temporal_alignment_tests(raw_frame)
        hourly = remove_missing_features(hourly)
        hourly = add_location(hourly, site)
        engineered = feature_engineer(hourly)
        processed_frames.append(
            engineered.drop(columns=["date_calc", "date_forecast"], errors='ignore')
        )

    return pd.concat(processed_frames, ignore_index=True)


In [5]:
(
    X_train_obs_combined,
    X_val_obs_combined,
    y_train_obs_combined,
    y_val_obs_combined,
    X_train_est_combined,
    X_val_est_combined,
    y_train_est_combined,
    y_val_est_combined,
) = fetch_preprocessed_data()
x_test_whole = get_preprocessed_test_data()

# Flag the origin of the weather data: 0 = observed, 1 = estimated.
# The test set only contains estimated weather.
for observed_frame in (X_train_obs_combined, X_val_obs_combined):
    observed_frame["estimated_flag"] = 0
for estimated_frame in (X_train_est_combined, X_val_est_combined, x_test_whole):
    estimated_frame["estimated_flag"] = 1

# Stack every split back into one data set with a fresh 0..n-1 index;
# features and labels are concatenated in the same order so rows stay paired.
x_whole = pd.concat(
    [X_train_obs_combined, X_val_obs_combined, X_train_est_combined, X_val_est_combined],
    ignore_index=True,
)
y_whole = pd.concat(
    [y_train_obs_combined, y_val_obs_combined, y_train_est_combined, y_val_est_combined],
    ignore_index=True,
)

In [6]:
# Re-attach the target so that features and labels are shuffled together.
x_whole["pv_measurement"] = y_whole
df_shuffled = x_whole.sample(frac=1, random_state=42).reset_index(drop=True)

# One training set per location, selected through the one-hot location columns.
x_whole_a = df_shuffled[df_shuffled['location_a'] == 1]
x_whole_b = df_shuffled[df_shuffled['location_b'] == 1]
x_whole_c = df_shuffled[df_shuffled['location_c'] == 1]

# Separate the target from the features; the location columns are constant
# within each per-location set, so they are removed as well.
y_whole_a = x_whole_a["pv_measurement"]
x_whole_a = x_whole_a.drop(
    columns=["pv_measurement", "location_a", "location_b", "location_c"]
)

y_whole_b = x_whole_b["pv_measurement"]
x_whole_b = x_whole_b.drop(
    columns=["pv_measurement", "location_a", "location_b", "location_c"]
)

y_whole_c = x_whole_c["pv_measurement"]
x_whole_c = x_whole_c.drop(
    columns=["pv_measurement", "location_a", "location_b", "location_c"]
)

# Columns handed to CatBoost as categorical features.
cat_features = ["estimated_flag"]

### Helper functions

In [7]:
RES_PATH = 'results/output/'


def save_predictions(test: pd.DataFrame, filename: str) -> None:
    """
    Write predictions to '<RES_PATH><filename>.csv' as an 'id,prediction' table.

    The 'id' column is the index of `test`, negative predictions are replaced
    by 0, and the first rows of the saved table are printed. The output
    directory is created when it does not exist yet.

    Parameters:
        test (pd.DataFrame): One-dimensional predictions (a Series or a
            single-column DataFrame); its index provides the ids.
        filename (str): Name of the output file, without the '.csv' extension.
    """
    import os

    submission = pd.DataFrame()

    submission["prediction"] = test
    submission['id'] = submission.index

    # Negative predictions are replaced by 0.
    submission['prediction'] = submission['prediction'].apply(lambda x: max(0, x))

    # Reorder the columns to ensure 'id' comes before 'prediction'
    submission = submission[['id', 'prediction']]

    output_path = f'{RES_PATH}{filename}.csv'
    # Create the target directory on first use instead of failing when it is missing.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the resulting DataFrame to a CSV file
    submission.to_csv(output_path, index=False)

    # Display the first few rows of the saved DataFrame
    print(submission.head())

## Training

#### Model 1

In [8]:
# Location A model: CatBoost regressor optimised for mean absolute error,
# with "estimated_flag" treated as a categorical feature. It is fitted on all
# location-A rows (observed and estimated data, no held-out validation set).
# verbose=100 prints the learning error every 100 iterations.
best_model_a = CatBoostRegressor(
    max_depth=9,
    cat_features=cat_features,
    loss_function="MAE",
    verbose = 100
)
best_model_a.fit(x_whole_a, y_whole_a)

0:	learn: 615.2289223	total: 251ms	remaining: 4m 10s
100:	learn: 201.4072982	total: 9.25s	remaining: 1m 22s
200:	learn: 186.0497580	total: 19s	remaining: 1m 15s
300:	learn: 181.1303390	total: 28.3s	remaining: 1m 5s
400:	learn: 177.2485579	total: 38.9s	remaining: 58.1s
500:	learn: 172.6623044	total: 49.2s	remaining: 49s
600:	learn: 163.7204714	total: 58.5s	remaining: 38.8s
700:	learn: 154.4498268	total: 1m 6s	remaining: 28.5s
800:	learn: 146.6204772	total: 1m 15s	remaining: 18.8s
900:	learn: 139.9470005	total: 1m 24s	remaining: 9.26s
999:	learn: 134.5556506	total: 1m 31s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1ace6e51280>

In [9]:
# Location B model: same configuration as the location A model (depth 9,
# MAE loss, "estimated_flag" as categorical feature), fitted on all
# location-B rows without a held-out validation set.
best_model_b = CatBoostRegressor(
    max_depth=9,
    cat_features=cat_features,
    loss_function="MAE",
    verbose = 100
)
best_model_b.fit(x_whole_b, y_whole_b)

0:	learn: 94.2791670	total: 138ms	remaining: 2m 17s
100:	learn: 25.1394341	total: 8.47s	remaining: 1m 15s
200:	learn: 21.7299397	total: 18.9s	remaining: 1m 15s
300:	learn: 20.2992389	total: 27.2s	remaining: 1m 3s
400:	learn: 19.0627633	total: 35.3s	remaining: 52.8s
500:	learn: 17.9527669	total: 43.3s	remaining: 43.1s
600:	learn: 17.1682651	total: 51.3s	remaining: 34s
700:	learn: 16.6932226	total: 59s	remaining: 25.1s
800:	learn: 15.8845163	total: 1m 6s	remaining: 16.6s
900:	learn: 15.3506132	total: 1m 14s	remaining: 8.21s
999:	learn: 14.9515366	total: 1m 22s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1ace6e51190>

In [10]:
# Location C model: same configuration as the location A and B models
# (depth 9, MAE loss, "estimated_flag" as categorical feature), fitted on all
# location-C rows without a held-out validation set.
best_model_c = CatBoostRegressor(
    max_depth=9,
    cat_features=cat_features,
    loss_function = "MAE",
    verbose = 100
)
best_model_c.fit(x_whole_c, y_whole_c)

0:	learn: 78.8208115	total: 93.2ms	remaining: 1m 33s
100:	learn: 21.4396391	total: 8.3s	remaining: 1m 13s
200:	learn: 18.5151414	total: 16.4s	remaining: 1m 5s
300:	learn: 17.0987782	total: 24.1s	remaining: 56.1s
400:	learn: 16.0406782	total: 32.3s	remaining: 48.2s
500:	learn: 15.0533016	total: 40.4s	remaining: 40.2s
600:	learn: 14.2314945	total: 49s	remaining: 32.6s
700:	learn: 13.6329705	total: 57.5s	remaining: 24.5s
800:	learn: 13.1507093	total: 1m 7s	remaining: 16.8s
900:	learn: 12.7614489	total: 1m 16s	remaining: 8.39s
999:	learn: 12.4840959	total: 1m 24s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1ace6e51100>

##### Postprocessing

In [11]:

def find_time_sin(hour):
    """Return the sin_hour feature value that corresponds to `hour` (0-23)."""
    return math.sin(2 * math.pi * (hour) / 24)


def find_time_cos(hour):
    """Return the cos_hour feature value that corresponds to `hour` (0-23)."""
    return math.cos(2 * math.pi * (hour) / 24)

def postprocess_data(x_test: pd.DataFrame, y_pred: pd.DataFrame) -> pd.DataFrame:
    """
    Apply the prediction post-processing: hourly capping, then night-time zeroing.

    Predictions are first clipped to the per-location, per-hour range seen in
    the training data. Afterwards the predictions for the hours 22, 23 and 0
    are set to 0 for every location, as the PV measurements almost always are
    0 at these times.
    """
    night_hours = [22, 23, 0]

    adjusted = cap_min_max_values(x_test, y_pred)
    for site in ("a", "b", "c"):
        adjusted = set_0_pv_at_times(x_test, adjusted, site, night_hours)
    return adjusted

def cap_min_max_values(x_test: pd.DataFrame, y_pred: pd.DataFrame) -> pd.DataFrame:
    """
    Clip the predictions of every location and hour to the range seen in training.

    For each hour of the day the training minimum and maximum are looked up
    per location and printed, and the matching predictions are clipped to
    that range.
    """
    sites = ("a", "b", "c")
    for hour in range(24):
        # Look up the training range of all three locations for this hour.
        (
            (min_value_a, max_value_a),
            (min_value_b, max_value_b),
            (min_value_c, max_value_c),
        ) = (get_min_max_values_for_location_at_hour(site, hour) for site in sites)
        print(f"hour: {hour}, min_value_a: {min_value_a}, max_value_a: {max_value_a}, min_value_b: {min_value_b}, max_value_b: {max_value_b}, min_value_c: {min_value_c}, max_value_c: {max_value_c}")

        # Clip this hour's predictions, one location at a time.
        bounds_per_site = (
            ("a", min_value_a, max_value_a),
            ("b", min_value_b, max_value_b),
            ("c", min_value_c, max_value_c),
        )
        for site, lower, upper in bounds_per_site:
            y_pred = cap_min_max_values_for_hour(x_test, y_pred, site, hour, lower, upper)
    return y_pred

# Reload the training data with drop_features=False so the frames keep their
# "pv_measurement" column next to sin_hour / cos_hour; the capping helpers
# read the hourly minimum and maximum from it.
(
    X_train_obs_combined,
    X_val_obs_combined,
    y_train_obs_combined,
    y_val_obs_combined,
    X_train_est_combined,
    X_val_est_combined,
    y_train_est_combined,
    y_val_est_combined,
) = fetch_preprocessed_data(drop_features=False)
x_whole_with_time = pd.concat(
    [
        X_train_obs_combined,
        X_val_obs_combined,
        X_train_est_combined,
        X_val_est_combined,
    ]
)

def get_min_max_values_for_location_at_hour(location: str, hour: int) -> tuple[float, float]:
    """
    Return the smallest and largest training PV measurement for a location and hour.

    The rows are taken from the module-level frame x_whole_with_time; the hour
    is matched through its sin_hour / cos_hour encoding.
    """
    at_location = x_whole_with_time["location_" + location] == 1
    at_hour = (x_whole_with_time["sin_hour"] == find_time_sin(hour)) & (
        x_whole_with_time["cos_hour"] == find_time_cos(hour)
    )

    measurements = x_whole_with_time[at_location & at_hour]["pv_measurement"]
    return (measurements.min(), measurements.max())

def cap_min_max_values_for_hour(x_test: pd.DataFrame, y_pred: pd.DataFrame, location: str, hour: int, min_value: float, max_value: float) -> pd.DataFrame:
    """
    Clip the predictions of one location at one hour to [min_value, max_value].

    The rows are located in `x_test` through the location flag and the
    sin_hour / cos_hour encoding of `hour`; the predictions with the same
    index labels are clipped in `y_pred`, which is modified and returned.
    """
    is_target_row = (
        (x_test["location_" + location] == 1)
        & (x_test["sin_hour"] == find_time_sin(hour))
        & (x_test["cos_hour"] == find_time_cos(hour))
    )
    indices = x_test.loc[is_target_row].index

    clipped = y_pred.loc[indices].clip(lower=min_value, upper=max_value)
    y_pred.loc[indices] = clipped

    return y_pred

def set_0_pv_at_times(x_test: pd.DataFrame, y_pred: pd.DataFrame, location: str, hours: list[int]) -> pd.DataFrame:
    """Set the predictions of one location to 0 at the given hours.

    A row of ``x_test`` is selected when its ``location_<location>`` flag
    equals 1 and its ``(sin_hour, cos_hour)`` pair is exactly equal to
    ``(find_time_sin(h), find_time_cos(h))`` for at least one ``h`` in
    ``hours``. The index labels of those rows are then used to address
    ``y_pred``.

    Args:
        x_test: Feature frame providing the location flags and hour encoding.
        y_pred: Predictions, addressed by the index labels of ``x_test``.
        location: Suffix of the one-hot location column (e.g. ``"a"``).
        hours: Hours whose predictions are overwritten with 0. An empty list
            leaves ``y_pred`` untouched.

    Returns:
        The same ``y_pred`` object; it is modified in place.
    """
    at_location = x_test["location_" + location] == 1

    # Match sin and cos as a pair, one hour at a time. Testing the two columns
    # independently against the full lists would also select hours that are
    # not requested: a row can take its sine from one listed hour and its
    # cosine from another (e.g. hour 11 when hours 1 and 13 are listed).
    at_listed_hour = pd.Series(False, index=x_test.index)
    for hour in hours:
        at_listed_hour |= (x_test["sin_hour"] == find_time_sin(hour)) & (
            x_test["cos_hour"] == find_time_cos(hour)
        )

    # One label-based assignment instead of a Python-level loop over rows.
    indices = x_test.index[at_location & at_listed_hour]
    y_pred.loc[indices] = 0
    return y_pred


##### Predictions

In [12]:
# Split the test features per location and remove the one-hot location
# columns from each subset. The x_whole_* names are reused here for the test
# subsets that are passed to predict() in the next cell.
x_whole_a = x_test_whole[x_test_whole['location_a'] == 1].drop(
    columns=['location_a', 'location_b', 'location_c']
)
x_whole_b = x_test_whole[x_test_whole['location_b'] == 1].drop(
    columns=['location_a', 'location_b', 'location_c']
)
x_whole_c = x_test_whole[x_test_whole['location_c'] == 1].drop(
    columns=['location_a', 'location_b', 'location_c']
)

In [13]:
# Predict each location with its own model, then stack the three result
# vectors in a, b, c order under a fresh 0..n-1 index.
y_predictions_a = best_model_a.predict(x_whole_a)
y_predictions_b = best_model_b.predict(x_whole_b)
y_predictions_c = best_model_c.predict(x_whole_c)
y_predictions = pd.concat(
    [pd.Series(y_predictions_a), pd.Series(y_predictions_b), pd.Series(y_predictions_c)],
    ignore_index=True,
)

# Post-process the stacked predictions and keep them as the first of the two
# prediction sets that are averaged at the end of the notebook.
y_predictions_catboost_1 = postprocess_data(x_test_whole, pd.DataFrame(y_predictions))

hour: 0, min_value_a: 0.0, max_value_a: 3.3, min_value_b: -0.0, max_value_b: -0.0, min_value_c: 0.0, max_value_c: 0.0
hour: 1, min_value_a: 0.0, max_value_a: 53.68, min_value_b: -0.0, max_value_b: 12.075, min_value_c: 0.0, max_value_c: 9.8
hour: 2, min_value_a: 0.0, max_value_a: 233.64000000000001, min_value_b: -0.0, max_value_b: 68.1375, min_value_c: 0.0, max_value_c: 39.2
hour: 3, min_value_a: 0.0, max_value_a: 439.12, min_value_b: -0.0, max_value_b: 138.0, min_value_c: 0.0, max_value_c: 88.2
hour: 4, min_value_a: 0.0, max_value_a: 1046.98, min_value_b: -0.0, max_value_b: 307.05, min_value_c: 0.0, max_value_c: 176.4
hour: 5, min_value_a: 0.0, max_value_a: 2049.08, min_value_b: -0.0, max_value_b: 452.8125, min_value_c: 0.0, max_value_c: 264.6
hour: 6, min_value_a: 0.0, max_value_a: 3244.78, min_value_b: -0.0, max_value_b: 681.375, min_value_c: 0.0, max_value_c: 499.8
hour: 7, min_value_a: 0.0, max_value_a: 4266.46, min_value_b: -0.0, max_value_b: 865.0875, min_value_c: 0.0, max_value_

Second CatBoost model, using the "cloud_interaction" feature (the "effective_solar_elevation_squared" feature is dropped)

In [15]:
# Remove the squared-elevation feature from both the training and the test
# features before the second set of models is trained.
x_whole = x_whole.drop(columns=["effective_solar_elevation_squared"])
x_test_whole = x_test_whole.drop(columns=["effective_solar_elevation_squared"])

In [16]:
# Attach the target to the features so both are shuffled together, then
# shuffle the rows reproducibly (random_state=42) and renumber them.
x_whole["pv_measurement"] = y_whole
df_shuffled = x_whole.sample(frac=1, random_state=42).reset_index(drop=True)

# Location A: take its rows, split off the target, and drop the target and
# the one-hot location columns from the features.
x_whole_a = df_shuffled[df_shuffled['location_a'] == 1]
y_whole_a = x_whole_a["pv_measurement"]
x_whole_a = x_whole_a.drop(columns=["pv_measurement", 'location_a', 'location_b', 'location_c'])

# Location B: same treatment.
x_whole_b = df_shuffled[df_shuffled['location_b'] == 1]
y_whole_b = x_whole_b["pv_measurement"]
x_whole_b = x_whole_b.drop(columns=["pv_measurement", 'location_a', 'location_b', 'location_c'])

# Location C: same treatment.
x_whole_c = df_shuffled[df_shuffled['location_c'] == 1]
y_whole_c = x_whole_c["pv_measurement"]
x_whole_c = x_whole_c.drop(columns=["pv_measurement", 'location_a', 'location_b', 'location_c'])

# Columns that are passed to CatBoost as categorical features.
cat_features = ["estimated_flag"]

In [17]:
# Location A model: depth-9 CatBoost regressor optimised for MAE, with a
# progress line every 100 iterations.
best_model_a = CatBoostRegressor(max_depth=9, cat_features=cat_features, loss_function="MAE", verbose=100)
best_model_a.fit(x_whole_a, y_whole_a)

0:	learn: 616.9363093	total: 145ms	remaining: 2m 24s
100:	learn: 199.9656401	total: 10.6s	remaining: 1m 34s
200:	learn: 185.3356686	total: 18.3s	remaining: 1m 12s
300:	learn: 180.5070612	total: 28.8s	remaining: 1m 6s
400:	learn: 176.7196537	total: 42.9s	remaining: 1m 4s
500:	learn: 172.6197441	total: 53.6s	remaining: 53.3s
600:	learn: 163.7581476	total: 1m 3s	remaining: 42.4s
700:	learn: 153.8238135	total: 1m 12s	remaining: 31s
800:	learn: 147.6203702	total: 1m 20s	remaining: 20s
900:	learn: 141.6572310	total: 1m 28s	remaining: 9.73s
999:	learn: 136.7908527	total: 1m 36s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1ac876f7fd0>

In [18]:
# Location B model: depth-9 CatBoost regressor optimised for MAE, with a
# progress line every 100 iterations.
best_model_b = CatBoostRegressor(max_depth=9, cat_features=cat_features, loss_function="MAE", verbose=100)
best_model_b.fit(x_whole_b, y_whole_b)

0:	learn: 94.1836286	total: 139ms	remaining: 2m 19s
100:	learn: 25.3949714	total: 8.3s	remaining: 1m 13s
200:	learn: 22.1209997	total: 16.4s	remaining: 1m 5s
300:	learn: 20.7702607	total: 24s	remaining: 55.7s
400:	learn: 19.4111036	total: 32.5s	remaining: 48.5s
500:	learn: 18.1584056	total: 39.9s	remaining: 39.7s
600:	learn: 17.4038913	total: 48.2s	remaining: 32s
700:	learn: 16.7939747	total: 56s	remaining: 23.9s
800:	learn: 16.1720278	total: 1m 3s	remaining: 15.8s
900:	learn: 15.6136213	total: 1m 10s	remaining: 7.77s
999:	learn: 15.1423810	total: 1m 18s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1ac876f7b20>

In [19]:
# Location C model: depth-9 CatBoost regressor optimised for MAE, with a
# progress line every 100 iterations.
best_model_c = CatBoostRegressor(max_depth=9, cat_features=cat_features, loss_function="MAE", verbose=100)
best_model_c.fit(x_whole_c, y_whole_c)

0:	learn: 78.7273693	total: 115ms	remaining: 1m 55s
100:	learn: 21.3078061	total: 8.2s	remaining: 1m 13s
200:	learn: 18.2258783	total: 17.8s	remaining: 1m 10s
300:	learn: 17.0883950	total: 25.8s	remaining: 60s
400:	learn: 16.1611729	total: 34.1s	remaining: 51s
500:	learn: 15.2436549	total: 42.9s	remaining: 42.7s
600:	learn: 14.4853484	total: 53.5s	remaining: 35.5s
700:	learn: 13.7033024	total: 1m 3s	remaining: 27.1s
800:	learn: 13.1848967	total: 1m 11s	remaining: 17.8s
900:	learn: 12.7181220	total: 1m 21s	remaining: 8.97s
999:	learn: 12.3918690	total: 1m 30s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1ac876f79d0>

In [20]:
# Split the test features per location and remove the one-hot location
# columns from each subset. The x_whole_* names are reused here for the test
# subsets that are passed to predict() in the next cell.
x_whole_a = x_test_whole[x_test_whole['location_a'] == 1].drop(
    columns=['location_a', 'location_b', 'location_c']
)
x_whole_b = x_test_whole[x_test_whole['location_b'] == 1].drop(
    columns=['location_a', 'location_b', 'location_c']
)
x_whole_c = x_test_whole[x_test_whole['location_c'] == 1].drop(
    columns=['location_a', 'location_b', 'location_c']
)

In [21]:
# Predict each location with its own model, then stack the three result
# vectors in a, b, c order under a fresh 0..n-1 index.
y_predictions_a = best_model_a.predict(x_whole_a)
y_predictions_b = best_model_b.predict(x_whole_b)
y_predictions_c = best_model_c.predict(x_whole_c)
y_predictions = pd.concat(
    [pd.Series(y_predictions_a), pd.Series(y_predictions_b), pd.Series(y_predictions_c)],
    ignore_index=True,
)

# Post-process the stacked predictions and keep them as the second of the two
# prediction sets that are averaged in the next cell.
y_predictions_catboost_2 = postprocess_data(x_test_whole, pd.DataFrame(y_predictions))

hour: 0, min_value_a: 0.0, max_value_a: 3.3, min_value_b: -0.0, max_value_b: -0.0, min_value_c: 0.0, max_value_c: 0.0
hour: 1, min_value_a: 0.0, max_value_a: 53.68, min_value_b: -0.0, max_value_b: 12.075, min_value_c: 0.0, max_value_c: 9.8
hour: 2, min_value_a: 0.0, max_value_a: 233.64000000000001, min_value_b: -0.0, max_value_b: 68.1375, min_value_c: 0.0, max_value_c: 39.2
hour: 3, min_value_a: 0.0, max_value_a: 439.12, min_value_b: -0.0, max_value_b: 138.0, min_value_c: 0.0, max_value_c: 88.2
hour: 4, min_value_a: 0.0, max_value_a: 1046.98, min_value_b: -0.0, max_value_b: 307.05, min_value_c: 0.0, max_value_c: 176.4
hour: 5, min_value_a: 0.0, max_value_a: 2049.08, min_value_b: -0.0, max_value_b: 452.8125, min_value_c: 0.0, max_value_c: 264.6
hour: 6, min_value_a: 0.0, max_value_a: 3244.78, min_value_b: -0.0, max_value_b: 681.375, min_value_c: 0.0, max_value_c: 499.8
hour: 7, min_value_a: 0.0, max_value_a: 4266.46, min_value_b: -0.0, max_value_b: 865.0875, min_value_c: 0.0, max_value_

In [22]:
# Blend the two post-processed CatBoost prediction sets with equal weight and
# hand the result to save_predictions.
average_prediction = y_predictions_catboost_1.add(y_predictions_catboost_2).div(2)
save_predictions(average_prediction, "short_notebook_1")

   id    prediction
0   0  0.000000e+00
1   1  2.646294e-07
2   2  2.654392e-07
3   3  5.696562e+01
4   4  3.288108e+02
