# Model Prototyping

Comparing distributions between the observed and estimated weather. We have to use the label as a brigde for comparing the estimated and observed weather, because we don't have data of these in the same time period, and can therefore not compare them directly. The plan is to train two models, one on the estimated weather and one on the observed. If the model trained on the observed weather works fine for the labels contained in the estimated time-period and vice versa, it implies that the weather predictions are accurate.  

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from mlxtend.regressor import StackingCVRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

In [29]:
x_target_A = pd.read_parquet('../data/A/train_targets.parquet')
x_train_obs_A = pd.read_parquet('../data/A/X_train_observed.parquet')
x_train_est_A = pd.read_parquet('../data/A/X_train_estimated.parquet')

x_target_B = pd.read_parquet('../data/B/train_targets.parquet')
x_train_obs_B = pd.read_parquet('../data/B/X_train_observed.parquet')
x_train_est_B = pd.read_parquet('../data/B/X_train_estimated.parquet')

x_target_C = pd.read_parquet('../data/C/train_targets.parquet')
x_train_obs_C = pd.read_parquet('../data/C/X_train_observed.parquet')
x_train_est_C = pd.read_parquet('../data/C/X_train_estimated.parquet')

In [30]:
# Data preprocessing

def keep_columns(df, columns):
    df = df[columns]
    return df

columns = [ 'time', 'clear_sky_rad:W', 'clear_sky_energy_1h:J', 'sun_elevation:d', 'is_day:idx', 'direct_rad_1h:J',
            'pv_measurement', 'diffuse_rad_1h:J', 'pressure_100m:hPa' ]


# Location A
x_train_obs_A_resampled = x_train_obs_A.set_index('date_forecast').resample('1H').mean()
x_train_est_A_resampled = x_train_est_A.set_index('date_calc').resample('1H').mean()

split_value = x_train_est_A['date_forecast'].iloc[0]
split_index = x_target_A[x_target_A['time'] == split_value].index[0]

x_target_obs_A = x_target_A.iloc[:split_index]
x_target_est_A = x_target_A.iloc[split_index:]

x_train_obs_A_resampled = x_train_obs_A_resampled.merge(x_target_obs_A, left_index=True, right_on='time')
x_train_est_A_resampled = x_train_est_A_resampled.merge(x_target_est_A, left_index=True, right_on='time')

x_train_obs_A_resampled = keep_columns(x_train_obs_A_resampled, columns)
x_train_est_A_resampled = keep_columns(x_train_est_A_resampled, columns)

x_train_obs_A_resampled = x_train_obs_A_resampled.dropna()
x_train_est_A_resampled = x_train_est_A_resampled.dropna()

# Location B
x_train_obs_B_resampled = x_train_obs_B.set_index('date_forecast').resample('1H').mean()
x_train_est_B_resampled = x_train_est_B.set_index('date_calc').resample('1H').mean()

split_value = x_train_est_B['date_forecast'].iloc[0]
split_index = x_target_B[x_target_B['time'] == split_value].index[0]

x_target_obs_B = x_target_B.iloc[:split_index]
x_target_est_B = x_target_B.iloc[split_index:]

x_train_obs_B_resampled = x_train_obs_B_resampled.merge(x_target_obs_B, left_index=True, right_on='time')
x_train_est_B_resampled = x_train_est_B_resampled.merge(x_target_est_B, left_index=True, right_on='time')

x_train_obs_B_resampled = keep_columns(x_train_obs_B_resampled, columns)
x_train_est_B_resampled = keep_columns(x_train_est_B_resampled, columns)

x_train_obs_B_resampled = x_train_obs_B_resampled.dropna()
x_train_est_B_resampled = x_train_est_B_resampled.dropna()

# Location C
x_train_obs_C_resampled = x_train_obs_C.set_index('date_forecast').resample('1H').mean()
x_train_est_C_resampled = x_train_est_C.set_index('date_calc').resample('1H').mean()

split_value = x_train_est_C['date_forecast'].iloc[0]
split_index = x_target_C[x_target_C['time'] == split_value].index[0]

x_target_obs_C = x_target_C.iloc[:split_index]
x_target_est_C = x_target_C.iloc[split_index:]

x_train_obs_C_resampled = x_train_obs_C_resampled.merge(x_target_obs_C, left_index=True, right_on='time')
x_train_est_C_resampled = x_train_est_C_resampled.merge(x_target_est_C, left_index=True, right_on='time')

x_train_obs_C_resampled = keep_columns(x_train_obs_C_resampled, columns)
x_train_est_C_resampled = keep_columns(x_train_est_C_resampled, columns)

x_train_obs_C_resampled = x_train_obs_C_resampled.dropna()
x_train_est_C_resampled = x_train_est_C_resampled.dropna()

print(f'Location A obs shape: {x_train_obs_A_resampled.shape}')
print(f'Location A est shape: {x_train_est_A_resampled.shape}')
print(f'Location B obs shape: {x_train_obs_B_resampled.shape}')
print(f'Location B est shape: {x_train_est_B_resampled.shape}')
print(f'Location C obs shape: {x_train_obs_C_resampled.shape}')
print(f'Location C est shape: {x_train_est_C_resampled.shape}')


Location A obs shape: (29667, 9)
Location A est shape: (182, 9)
Location B obs shape: (29218, 9)
Location B est shape: (150, 9)
Location C obs shape: (23141, 9)
Location C est shape: (121, 9)


# Models

In [32]:
def evaluate_model_performance(y_true, y_pred, model_name, location):
    # Calculate metrics
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    # Print metrics
    print(f"Performance for {model_name} at {location}:")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"R-squared: {r2:.2f}")
    print("-" * 50)

# Splitting data into features and target for Location A
X_train_A = x_train_obs_A_resampled.drop(columns=['time', 'pv_measurement'])
y_train_A = x_train_obs_A_resampled['pv_measurement']

X_test_A = x_train_est_A_resampled.drop(columns=['time', 'pv_measurement'])
y_test_A = x_train_est_A_resampled['pv_measurement']

# Linear Regression for Location A
lr_A = LinearRegression()
lr_A.fit(X_train_A, y_train_A)
lr_pred_A = lr_A.predict(X_test_A)
evaluate_model_performance(y_test_A, lr_pred_A, "Linear Regression", "Location A")

# Random Forest for Location A
rf_A = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=0)
rf_A.fit(X_train_A, y_train_A)
rf_pred_A = rf_A.predict(X_test_A)
evaluate_model_performance(y_test_A, rf_pred_A, "Random Forest", "Location A")

# XGBoost for Location A
xg_reg_A = xgb.XGBRegressor(objective ='reg:absoluteerror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 100)
xg_reg_A.fit(X_train_A, y_train_A)
xg_pred_A = xg_reg_A.predict(X_test_A)
evaluate_model_performance(y_test_A, xg_pred_A, "XGBoost", "Location A")



Performance for Linear Regression at Location A:
Mean Absolute Error (MAE): 424.27
R-squared: 0.34
--------------------------------------------------
Performance for Random Forest at Location A:
Mean Absolute Error (MAE): 418.33
R-squared: 0.48
--------------------------------------------------
Performance for XGBoost at Location A:
Mean Absolute Error (MAE): 604.33
R-squared: -0.36
--------------------------------------------------


In [33]:
# Location B
# Splitting data into features and target for Location B
X_train_B = x_train_obs_B_resampled.drop(columns=['time', 'pv_measurement'])
y_train_B = x_train_obs_B_resampled['pv_measurement']

X_test_B = x_train_est_B_resampled.drop(columns=['time', 'pv_measurement'])
y_test_B = x_train_est_B_resampled['pv_measurement']

# Linear Regression for Location B
lr_B = LinearRegression()
lr_B.fit(X_train_B, y_train_B)
lr_pred_B = lr_B.predict(X_test_B)
evaluate_model_performance(y_test_B, lr_pred_B, "Linear Regression", "Location B")

# Random Forest for Location B
rf_B = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=0)
rf_B.fit(X_train_B, y_train_B)
rf_pred_B = rf_B.predict(X_test_B)
evaluate_model_performance(y_test_B, rf_pred_B, "Random Forest", "Location B")

# XGBoost for Location B
xg_reg_B = xgb.XGBRegressor(objective ='reg:absoluteerror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 100)
xg_reg_B.fit(X_train_B, y_train_B)
xg_pred_B = xg_reg_B.predict(X_test_B)
evaluate_model_performance(y_test_B, xg_pred_B, "XGBoost", "Location B")

# Location C
# Splitting data into features and target for Location C
X_train_C = x_train_obs_C_resampled.drop(columns=['time', 'pv_measurement'])
y_train_C = x_train_obs_C_resampled['pv_measurement']

X_test_C = x_train_est_C_resampled.drop(columns=['time', 'pv_measurement'])
y_test_C = x_train_est_C_resampled['pv_measurement']

# Linear Regression for Location C
lr_C = LinearRegression()
lr_C.fit(X_train_C, y_train_C)
lr_pred_C = lr_C.predict(X_test_C)
evaluate_model_performance(y_test_C, lr_pred_C, "Linear Regression", "Location C")

# Random Forest for Location C
rf_C = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=0)
rf_C.fit(X_train_C, y_train_C)
rf_pred_C = rf_C.predict(X_test_C)
evaluate_model_performance(y_test_C, rf_pred_C, "Random Forest", "Location C")

# XGBoost for Location C
xg_reg_C = xgb.XGBRegressor(objective ='reg:absoluteerror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 100)
xg_reg_C.fit(X_train_C, y_train_C)
xg_pred_C = xg_reg_C.predict(X_test_C)
evaluate_model_performance(y_test_C, xg_pred_C, "XGBoost", "Location C")

Performance for Linear Regression at Location B:
Mean Absolute Error (MAE): 55.22
R-squared: 0.49
--------------------------------------------------
Performance for Random Forest at Location B:
Mean Absolute Error (MAE): 70.57
R-squared: 0.05
--------------------------------------------------
Performance for XGBoost at Location B:
Mean Absolute Error (MAE): 81.01
R-squared: -0.31
--------------------------------------------------
Performance for Linear Regression at Location C:
Mean Absolute Error (MAE): 34.07
R-squared: 0.06
--------------------------------------------------
Performance for Random Forest at Location C:
Mean Absolute Error (MAE): 23.19
R-squared: 0.39
--------------------------------------------------
Performance for XGBoost at Location C:
Mean Absolute Error (MAE): 28.85
R-squared: -0.18
--------------------------------------------------


In [31]:
test = pd.read_csv('../data/test.csv')
x_test_est_A = pd.read_parquet('../data/A/X_test_estimated.parquet')
x_test_est_B = pd.read_parquet('../data/B/X_test_estimated.parquet')
x_test_est_C = pd.read_parquet('../data/C/X_test_estimated.parquet')

def keep_columns(df, columns):
    df = df[columns]
    return df

columns = [ 'clear_sky_rad:W', 'clear_sky_energy_1h:J', 'sun_elevation:d', 'is_day:idx', 'direct_rad_1h:J',
            'diffuse_rad_1h:J', 'pressure_100m:hPa', 'is_in_shadow:idx', 't_1000hPa:K', 'effective_cloud_cover:p' ]

# Location A
x_test_est_A_resampled = x_test_est_A.set_index('date_forecast').resample('1H').mean()
x_test_est_A_resampled = keep_columns(x_test_est_A_resampled, columns)
x_test_est_A_resampled = x_test_est_A_resampled.dropna()

# Location B
x_test_est_B_resampled = x_test_est_B.set_index('date_forecast').resample('1H').mean()
x_test_est_B_resampled = keep_columns(x_test_est_B_resampled, columns)
x_test_est_B_resampled = x_test_est_B_resampled.dropna()

# Location C
x_test_est_C_resampled = x_test_est_C.set_index('date_forecast').resample('1H').mean()
x_test_est_C_resampled = keep_columns(x_test_est_C_resampled, columns)
x_test_est_C_resampled = x_test_est_C_resampled.dropna()

In [8]:
# Predictions
y_pred_A = rf_A.predict(x_test_est_A_resampled)
y_pred_B = rf_B.predict(x_test_est_B_resampled)
y_pred_C = rf_C.predict(x_test_est_C_resampled)

# Convert predictions to Pandas Series
pred_series_A = pd.Series(y_pred_A, name="prediction")
pred_series_B = pd.Series(y_pred_B, name="prediction")
pred_series_C = pd.Series(y_pred_C, name="prediction")

# Concatenate the series
all_predictions = pd.concat([pred_series_A, pred_series_B, pred_series_C], ignore_index=True)

# Create an 'id' column
all_predictions = all_predictions.reset_index()
all_predictions.rename(columns={'index': 'id'}, inplace=True)

# Save to CSV
all_predictions.to_csv('predictions.csv', index=False)

# Stacked meta model

In [22]:
# Training on all data

# Location A
X_train_A = x_train_obs_A_resampled.drop(columns=['time', 'pv_measurement'])
y_train_A = x_train_obs_A_resampled['pv_measurement']

X_test_A = x_train_est_A_resampled.drop(columns=['time', 'pv_measurement'])
y_test_A = x_train_est_A_resampled['pv_measurement']

X_A = pd.concat([X_train_A, X_test_A])
y_A = pd.concat([y_train_A, y_test_A])

# Location B
X_train_B = x_train_obs_B_resampled.drop(columns=['time', 'pv_measurement'])
y_train_B = x_train_obs_B_resampled['pv_measurement']

X_test_B = x_train_est_B_resampled.drop(columns=['time', 'pv_measurement'])
y_test_B = x_train_est_B_resampled['pv_measurement']

X_B = pd.concat([X_train_B, X_test_B])
y_B = pd.concat([y_train_B, y_test_B])

# Location C
X_train_C = x_train_obs_C_resampled.drop(columns=['time', 'pv_measurement'])
y_train_C = x_train_obs_C_resampled['pv_measurement']

X_test_C = x_train_est_C_resampled.drop(columns=['time', 'pv_measurement'])
y_test_C = x_train_est_C_resampled['pv_measurement']

X_C = pd.concat([X_train_C, X_test_C])
y_C = pd.concat([y_train_C, y_test_C])


In [None]:
# Finding optimal random forest hyperparameters
# Define the hyperparameters and their possible values
# param_dist = {
#     'n_estimators': [10, 50, 100, 200, 500],
#     'max_depth': [None, 10, 20, 30, 40, 50],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'bootstrap': [True, False]
# }

# # Initialize the random search
# random_search = RandomizedSearchCV(
#     RandomForestRegressor(random_state=0),
#     param_distributions=param_dist,
#     n_iter=100,
#     cv=3,
#     verbose=2,
#     random_state=0,
#     n_jobs=-1
# )

# # Fit the random search model
# random_search.fit(X_A, y_A)
# random_search.fit(X_B, y_B)
# random_search.fit(X_C, y_C)

# # Get the best parameters
# best_params_A = random_search.best_params_
# best_params_B = random_search.best_params_
# best_params_C = random_search.best_params_

In [16]:
print(f"Best parameters for Location A: {best_params_A}")
print(f"Best parameters for Location B: {best_params_B}")
print(f"Best parameters for Location C: {best_params_C}")

Best parameters for Location A: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10, 'bootstrap': True}
Best parameters for Location B: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10, 'bootstrap': True}
Best parameters for Location C: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10, 'bootstrap': True}


In [34]:
# Training on all data
# Best parameters from algorithm
# n_estimators=500, min_samples_split=2, min_samples_leaf=1, max_features='log2', max_depth=10, bootstrap=True, random_state=0

# Location A
X_train_A = x_train_obs_A_resampled.drop(columns=['time', 'pv_measurement'])
y_train_A = x_train_obs_A_resampled['pv_measurement']

X_test_A = x_train_est_A_resampled.drop(columns=['time', 'pv_measurement'])
y_test_A = x_train_est_A_resampled['pv_measurement']

X_A = pd.concat([X_train_A, X_test_A])
y_A = pd.concat([y_train_A, y_test_A])

# Random Forest for Location A
rf_A = RandomForestRegressor(n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features='log2', max_depth=10, bootstrap=True, random_state=0)
rf_A.fit(X_A, y_A)

# Location B
X_train_B = x_train_obs_B_resampled.drop(columns=['time', 'pv_measurement'])
y_train_B = x_train_obs_B_resampled['pv_measurement']

X_test_B = x_train_est_B_resampled.drop(columns=['time', 'pv_measurement'])
y_test_B = x_train_est_B_resampled['pv_measurement']

X_B = pd.concat([X_train_B, X_test_B])
y_B = pd.concat([y_train_B, y_test_B])

# Random Forest for Location B
rf_B = RandomForestRegressor(n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features='log2', max_depth=10, bootstrap=True, random_state=0)
rf_B.fit(X_B, y_B)

# Location C
X_train_C = x_train_obs_C_resampled.drop(columns=['time', 'pv_measurement'])
y_train_C = x_train_obs_C_resampled['pv_measurement']

X_test_C = x_train_est_C_resampled.drop(columns=['time', 'pv_measurement'])
y_test_C = x_train_est_C_resampled['pv_measurement']

X_C = pd.concat([X_train_C, X_test_C])
y_C = pd.concat([y_train_C, y_test_C])

# Random Forest for Location C
rf_C = RandomForestRegressor(n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features='log2', max_depth=10, bootstrap=True, random_state=0)
rf_C.fit(X_C, y_C)


In [35]:
# Stacking models into meta modell 

# Split your training data into a new training set and a validation set for Location A
X_train_A_new, X_val_A, y_train_A_new, y_val_A = train_test_split(X_train_A, y_train_A, test_size=0.2, random_state=42)
X_train_B_new, X_val_B, y_train_B_new, y_val_B = train_test_split(X_train_B, y_train_B, test_size=0.2, random_state=42)
X_train_C_new, X_val_C, y_train_C_new, y_val_C = train_test_split(X_train_C, y_train_C, test_size=0.2, random_state=42)

# Define the stacking regressor for Location A
stacked_A = StackingCVRegressor(regressors=(rf_A, lr_A, xg_reg_A),
                                meta_regressor=LinearRegression(),
                                use_features_in_secondary=True)

stacked_B = StackingCVRegressor(regressors=(rf_B, lr_B, xg_reg_B),
                                meta_regressor=LinearRegression(),
                                use_features_in_secondary=True)

stacked_C = StackingCVRegressor(regressors=(rf_C, lr_C, xg_reg_C),
                                meta_regressor=LinearRegression(),
                                use_features_in_secondary=True)

# Train the stacking regressor on the new training data
stacked_A.fit(X_train_A_new.values, y_train_A_new.values)
stacked_B.fit(X_train_B_new.values, y_train_B_new.values)
stacked_C.fit(X_train_C_new.values, y_train_C_new.values)

# Predictions
stacked_pred_A = stacked_A.predict(X_val_A.values)
stacked_pred_B = stacked_B.predict(X_val_B.values)
stacked_pred_C = stacked_C.predict(X_val_C.values)
evaluate_model_performance(y_val_A, stacked_pred_A, "Stacked Regressor", "Location A")
evaluate_model_performance(y_val_B, stacked_pred_B, "Stacked Regressor", "Location B")
evaluate_model_performance(y_val_C, stacked_pred_C, "Stacked Regressor", "Location C")




Performance for Stacked Regressor at Location A:
Mean Absolute Error (MAE): 223.30
R-squared: 0.84
--------------------------------------------------
Performance for Stacked Regressor at Location B:
Mean Absolute Error (MAE): 46.91
R-squared: 0.79
--------------------------------------------------
Performance for Stacked Regressor at Location C:
Mean Absolute Error (MAE): 25.63
R-squared: 0.88
--------------------------------------------------


In [39]:
# Predictions
y_pred_A = stacked_A.predict(x_test_est_A_resampled.drop(columns=['is_in_shadow:idx', 't_1000hPa:K', 'effective_cloud_cover:p']))
y_pred_B = stacked_B.predict(x_test_est_B_resampled.drop(columns=['is_in_shadow:idx', 't_1000hPa:K', 'effective_cloud_cover:p']))
y_pred_C = stacked_C.predict(x_test_est_C_resampled.drop(columns=['is_in_shadow:idx', 't_1000hPa:K', 'effective_cloud_cover:p']))

# Convert predictions to Pandas Series
pred_series_A = pd.Series(y_pred_A, name="prediction")
pred_series_B = pd.Series(y_pred_B, name="prediction")
pred_series_C = pd.Series(y_pred_C, name="prediction")

# Concatenate the series
all_predictions = pd.concat([pred_series_A, pred_series_B, pred_series_C], ignore_index=True)

# Create an 'id' column
all_predictions = all_predictions.reset_index()
all_predictions.rename(columns={'index': 'id'}, inplace=True)

# Save to CSV
all_predictions.to_csv('predictions.csv', index=False)

