In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
def create_submission(pred_A, pred_B, pred_C, output_file="submission.csv"):
    """
    Write three prediction arrays to one submission CSV.

    The arrays are joined in the order A, B, C and each row gets a
    zero-based running ``id``, so the id of a row is its position in the
    concatenated sequence.

    Parameters:
    - pred_A, pred_B, pred_C: Array-likes of predictions, concatenated
      in that order with ``np.concatenate``.
    - output_file: Path of the CSV file to write.

    Returns:
    - None. The file is written with the columns ``id`` and
      ``prediction`` (no index column) and a confirmation is printed.
    """
    stacked = np.concatenate((pred_A, pred_B, pred_C))

    # Row ids simply enumerate the stacked predictions: 0 .. n-1.
    submission = pd.DataFrame({
        'id': np.arange(len(stacked)),
        'prediction': stacked,
    })

    submission.to_csv(output_file, index=False)
    print(f"Submission saved to {output_file}")

In [None]:
# Directory (relative path) that holds the preprocessed parquet files.
data_path = '../../preprocessing/data'


# Column selections per location. Every list starts with "pv_measurement"
# (presumably the prediction target -- confirm) followed by that location's
# feature columns. The order of the names is the order in which the columns
# are selected from the loaded frames, so it must not be changed casually.
# None of the names contain whitespace, which is what makes the
# one-name-per-line + .split() notation below safe.
columns_A = """
    pv_measurement
    direct_rad:W
    diffuse_rad:W
    t_1000hPa:K
    sun_elevation:d_rate_of_change_of_change
    direct_rad_1h:J_rate_of_change
    clear_sky_rad:W
    t_1000hPa:K_rate_of_change
    wind_speed_v_10m:ms
    clear_sky_rad:W_rate_of_change_of_change
    diffuse_rad_1h:J_rate_of_change
    clear_sky_energy_1h:J_rate_of_change
    direct_rad:W_rate_of_change
    direct_rad:W_rate_of_change_of_change
    direct_rad_1h:J_rate_of_change_of_change
    effective_cloud_cover:p
    wind_speed_u_10m:ms
    fresh_snow_24h:cm
    total_cloud_cover:p
    wind_speed_10m:ms
    visibility:m
""".split()

columns_B = """
    pv_measurement
    direct_rad:W
    t_1000hPa:K
    diffuse_rad:W
    clear_sky_rad:W
    sun_elevation:d
    clear_sky_rad:W_rate_of_change
    direct_rad:W_rate_of_change
    direct_rad_1h:J
    clear_sky_energy_1h:J_rate_of_change
    clear_sky_rad:W_rate_of_change_of_change
    direct_rad_1h:J_rate_of_change
    diffuse_rad_1h:J
    visibility:m
    sun_elevation:d_rate_of_change_of_change
    effective_cloud_cover:p
    diffuse_rad:W_rate_of_change_of_change
    clear_sky_energy_1h:J
    t_1000hPa:K_rate_of_change
    wind_speed_u_10m:ms
    wind_speed_v_10m:ms
""".split()

columns_C = """
    pv_measurement
    direct_rad:W
    t_1000hPa:K
    sun_elevation:d
    diffuse_rad:W
    direct_rad_1h:J
    clear_sky_rad:W
    clear_sky_energy_1h:J
    direct_rad_1h:J_rate_of_change
    fresh_snow_24h:cm
    visibility:m
    wind_speed_u_10m:ms
    clear_sky_energy_1h:J_rate_of_change_of_change
    total_cloud_cover:p
    direct_rad:W_rate_of_change
    diffuse_rad_1h:J
    effective_cloud_cover:p_rate_of_change
    direct_rad:W_rate_of_change_of_change
    diffuse_rad_1h:J_rate_of_change
    clear_sky_rad:W_rate_of_change
    diffuse_rad_1h:J_rate_of_change_of_change
""".split()

def _load_training_frame(location, columns):
    """
    Load the two training parquet files of one location and combine them.

    Parameters:
    - location: Suffix used in the file names, e.g. "A" reads
      ``obs_A.parquet`` and ``est_A.parquet`` from ``data_path``.
    - columns: Column names to keep in the combined frame, in that order.

    Returns:
    - Tuple ``(obs, est, combined)``: the two frames exactly as read, and
      their row-wise concatenation (obs rows first, then est rows)
      restricted to ``columns``. The original row indices are kept, so
      index labels in ``combined`` may repeat.
    """
    obs = pd.read_parquet(f'{data_path}/obs_{location}.parquet')
    est = pd.read_parquet(f'{data_path}/est_{location}.parquet')
    combined = pd.concat([obs, est])[columns]
    return obs, est, combined


# One training frame per location; the unfiltered obs_*/est_* frames stay
# available under their own names for later cells.
obs_A, est_A, A = _load_training_frame("A", columns_A)
obs_B, est_B, B = _load_training_frame("B", columns_B)
obs_C, est_C, C = _load_training_frame("C", columns_C)


# From here on the column lists hold the input features only: the same
# names in the same order as before, minus "pv_measurement" (a column the
# test selections below must not request). They are derived from the
# existing lists instead of being typed out a second time, so the two
# versions cannot drift apart.
columns_A = [name for name in columns_A if name != "pv_measurement"]
columns_B = [name for name in columns_B if name != "pv_measurement"]
columns_C = [name for name in columns_C if name != "pv_measurement"]

# Test sets: one frame per location, restricted to the columns currently
# listed in columns_A / columns_B / columns_C, with every row that has a
# missing value removed.
# NOTE(review): dropna() can reduce the number of test rows; if the
# submission must contain one prediction per original test row, confirm
# that no rows are actually dropped here.
test_A, test_B, test_C = (
    pd.read_parquet(f'{data_path}/test_{location}.parquet')[features].dropna()
    for location, features in (
        ("A", columns_A),
        ("B", columns_B),
        ("C", columns_C),
    )
)


# Show which columns ended up in the training frame for location A.
print(A.columns)