In [3]:
# Data libraries
import pandas as pd
import numpy as np
from tqdm import tqdm  # for the progress bar

# Metrics and model selection
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [4]:
def create_submission(pred_A, pred_B, pred_C, output_file="submission.csv"):
    """
    Write the predictions for the three locations to a single submission CSV.

    The three prediction arrays are joined in the order A, B, C and each row is
    given a consecutive integer id starting at 0.

    Parameters:
    - pred_A, pred_B, pred_C: Array-likes holding the predictions per location.
    - output_file: Path of the CSV file to write.

    Returns:
    - None. The CSV is written to `output_file` and a confirmation is printed.
    """
    # Stack the per-location predictions end to end (A first, then B, then C)
    stacked = np.concatenate((pred_A, pred_B, pred_C))

    # Ids are purely positional: 0 .. n-1 over the stacked predictions
    submission = pd.DataFrame({
        'id': np.arange(len(stacked)),
        'prediction': stacked,
    })

    # Persist without the DataFrame index so only the two columns are written
    submission.to_csv(output_file, index=False)
    print(f"Submission saved to {output_file}")


# Read in the data
data_path = '../preprocessing/data'

# Training frames per location (A, B, C).
# NOTE(review): the obs_* / est_* prefixes presumably mean "observed" vs
# "estimated" weather -- confirm against the preprocessing step.
obs_A = pd.read_parquet(f'{data_path}/obs_A.parquet')
est_A = pd.read_parquet(f'{data_path}/est_A.parquet')
obs_B = pd.read_parquet(f'{data_path}/obs_B.parquet')
est_B = pd.read_parquet(f'{data_path}/est_B.parquet')
obs_C = pd.read_parquet(f'{data_path}/obs_C.parquet')
est_C = pd.read_parquet(f'{data_path}/est_C.parquet')


def _read_test_features(location):
    """Load the test features for one location and drop rows containing NaNs.

    Submission ids are assigned positionally over the predictions, so any row
    removed here shifts every later id. The number of dropped rows is therefore
    reported instead of being discarded silently.
    """
    raw = pd.read_parquet(f'{data_path}/test_{location}.parquet')
    cleaned = raw.dropna()
    n_dropped = len(raw) - len(cleaned)
    if n_dropped:
        print(
            f"WARNING: dropped {n_dropped} of {len(raw)} test rows with missing values "
            f"for location {location}; submission ids will not line up with the raw test rows."
        )
    return cleaned


test_A = _read_test_features('A')
test_B = _read_test_features('B')
test_C = _read_test_features('C')


In [6]:
# List the columns of the location-A frame: the 'pv_measurement' target plus the feature columns
obs_A.columns

Index(['pv_measurement', 'absolute_humidity_2m:gm3', 'air_density_2m:kgm3',
       'clear_sky_rad:W', 'dew_point_2m:K', 'diffuse_rad:W', 'direct_rad:W',
       'effective_cloud_cover:p', 'msl_pressure:hPa', 'pressure_100m:hPa',
       'pressure_50m:hPa', 'relative_humidity_1000hPa:p', 'sfc_pressure:hPa',
       'snow_water:kgm2', 'sun_azimuth:d', 'sun_elevation:d', 't_1000hPa:K',
       'total_cloud_cover:p', 'visibility:m', 'wind_speed_10m:ms',
       'wind_speed_u_10m:ms', 'wind_speed_v_10m:ms', 'clear_sky_energy_1h:J',
       'diffuse_rad_1h:J', 'direct_rad_1h:J', 'month', 'year', 'time_of_day'],
      dtype='object')

In [41]:
# Fixed seed so the stochastic estimators give the same result on every run
RANDOM_STATE = 42

# Level-0 learners of the stacking ensemble, as (name, estimator) pairs.
# LinearRegression is deterministic and takes no seed; the others are seeded.
base_models = [
    ('lr', LinearRegression(n_jobs=-1)),
    ('rf', RandomForestRegressor(n_estimators=100, criterion='absolute_error',
                                 n_jobs=-1, random_state=RANDOM_STATE)),
    ('ada', AdaBoostRegressor(n_estimators=50, random_state=RANDOM_STATE)),
    ('xgb', XGBRegressor(n_estimators=100, learning_rate=0.1, n_jobs=-1,
                         random_state=RANDOM_STATE)),
]

# Meta-learner trained on the base models' predictions.
# n_jobs=-1 matches the base random forest; it does not change the fitted model.
final_estimator = RandomForestRegressor(n_estimators=100, criterion='absolute_error',
                                        n_jobs=-1, random_state=RANDOM_STATE)


In [42]:
A = est_A

# Split into features and labels
X_A = A.drop(columns=['pv_measurement'])
y_A = A['pv_measurement']

# shuffle=False keeps row order: the first 80% of rows train, the last 20% are held out
# (presumably the rows are time-ordered, making this a chronological split -- confirm)
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(X_A, y_A, test_size=0.2, shuffle=False)

# Build and fit the stacking ensemble for location A
stack_A = StackingRegressor(estimators=base_models, final_estimator=final_estimator)
stack_A.fit(X_train_A, y_train_A)

# Score the held-out slice, applying the same clipping as the submission predictions
holdout_pred_A = np.clip(stack_A.predict(X_test_A), 0, None)
mae_A = mean_absolute_error(y_test_A, holdout_pred_A)
print(f"Location A hold-out MAE: {mae_A:.3f}")

# Predict on the test features, selecting the training columns in training order
# so extra or reordered columns in test_A cannot break the predict call
pred_A = stack_A.predict(test_A[X_A.columns])

# Clip negative predictions to zero
pred_A = np.clip(pred_A, 0, None)

In [43]:
B = est_B

# Split into features and labels
X_B = B.drop(columns=['pv_measurement'])
y_B = B['pv_measurement']

# shuffle=False keeps row order: the first 80% of rows train, the last 20% are held out
# (presumably the rows are time-ordered, making this a chronological split -- confirm)
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split(X_B, y_B, test_size=0.2, shuffle=False)

# Build and fit the stacking ensemble for location B
stack_B = StackingRegressor(estimators=base_models, final_estimator=final_estimator)
stack_B.fit(X_train_B, y_train_B)

# Score the held-out slice, applying the same clipping as the submission predictions
holdout_pred_B = np.clip(stack_B.predict(X_test_B), 0, None)
mae_B = mean_absolute_error(y_test_B, holdout_pred_B)
print(f"Location B hold-out MAE: {mae_B:.3f}")

# Predict on the test features, selecting the training columns in training order
# so extra or reordered columns in test_B cannot break the predict call
pred_B = stack_B.predict(test_B[X_B.columns])

# Clip negative predictions to zero
pred_B = np.clip(pred_B, 0, None)


In [44]:
C = est_C

# Split into features and labels
X_C = C.drop(columns=['pv_measurement'])
y_C = C['pv_measurement']

# shuffle=False keeps row order: the first 80% of rows train, the last 20% are held out
# (presumably the rows are time-ordered, making this a chronological split -- confirm)
X_train_C, X_test_C, y_train_C, y_test_C = train_test_split(X_C, y_C, test_size=0.2, shuffle=False)

# Build and fit the stacking ensemble for location C
stack_C = StackingRegressor(estimators=base_models, final_estimator=final_estimator)
stack_C.fit(X_train_C, y_train_C)

# Score the held-out slice, applying the same clipping as the submission predictions
holdout_pred_C = np.clip(stack_C.predict(X_test_C), 0, None)
mae_C = mean_absolute_error(y_test_C, holdout_pred_C)
print(f"Location C hold-out MAE: {mae_C:.3f}")

# Predict on the test features, selecting the training columns in training order
# so extra or reordered columns in test_C cannot break the predict call
pred_C = stack_C.predict(test_C[X_C.columns])

# Clip negative predictions to zero
pred_C = np.clip(pred_C, 0, None)


In [45]:
# Write the combined submission one directory up; predictions are concatenated in
# A, B, C order and ids are assigned positionally, so the argument order matters.
create_submission(pred_A, pred_B, pred_C, output_file="../submission.csv")

Submission saved to ../submission.csv
