# Using XGBoost with Preprocessed Features for Wind Production

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from wattsquad.ml_logic import preproc
import matplotlib.pyplot as plt
from xgboost import XGBRegressor


### Importing and preprocessing data

In [2]:
# Load the raw training set; the path is relative to the notebook's working directory.
train = pd.read_csv("../../raw_data/train.csv")

In [3]:
# Run the project's preprocessing on the raw training frame to build the model features.
# NOTE(review): transform_data is also called on the test frame below; presumably it
# fits its encoders/scalers on whatever frame it receives — confirm train and test
# end up on the same scale.
X_train_transformed = preproc.transform_data(train)

➡️ preprocessing done


In [4]:
# Load the raw held-out test set; the path is relative to the notebook's working directory.
test = pd.read_csv("../../raw_data/test.csv")

In [5]:
# Apply the same project preprocessing entry point to the raw test frame.
# NOTE(review): the column order of the result is adjusted further down before
# prediction — verify that train and test features share the same columns.
X_test_transformed = preproc.transform_data(test)

➡️ preprocessing done


In [6]:
# Training target taken directly from the raw frame.
# Note: y_train is reassigned further down from `wind_train`, after the outlier
# replacement, so this first assignment is only a provisional value.
y_train = train['wind_production']

In [7]:
# Test target taken directly from the raw test frame (no outlier handling is applied to it).
y_test = test['wind_production']

### Removing wind outliers

In [8]:
# Work on a copy so that the outlier replacement below does not modify `train`.
wind_train = train.copy()

In [9]:
# Overwrite implausible target readings (anything below -100) in the working copy
# with the fixed value 21; all other rows are left as they are.
# NOTE(review): 21 looks like an approximation of the column mean — confirm.
wind_train["wind_production"] = wind_train["wind_production"].mask(
    wind_train["wind_production"] < -100, 21
)

In [10]:
# Sanity check: summary statistics of the target after the outlier replacement
# (the minimum should no longer be below -100).
wind_train["wind_production"].describe()

count    9515.000000
mean       21.379504
std        37.237138
min        -1.280000
25%        -0.260000
50%         2.380000
75%        26.040000
max       225.500000
Name: wind_production, dtype: float64

In [11]:
# Replace the provisional y_train with the outlier-corrected target from `wind_train`;
# this is the target the model is fitted on below.
y_train = wind_train['wind_production']

In [12]:
# Confirm that y_train now carries the corrected values (same statistics as wind_train above).
y_train.describe()

count    9515.000000
mean       21.379504
std        37.237138
min        -1.280000
25%        -0.260000
50%         2.380000
75%        26.040000
max       225.500000
Name: wind_production, dtype: float64

### Creating X_val and y_val for wind production

In [13]:
# from sklearn.model_selection import train_test_split

# # Use the same function above for the validation set
# X_train_transformed, X_val, y_train, y_val = train_test_split(
#     X_train_transformed, y_train, test_size = 0.1, random_state = 42  # val = 10%
# )

In [14]:
# X_train_transformed

## Randomized search

In [15]:
# from sklearn.model_selection import RandomizedSearchCV
# from xgboost import XGBRegressor
# import numpy as np

# # Define the parameter grid
# param_dist = {
#     'max_depth': [5, 7, 9],                     # Maximum depth of trees
#     'n_estimators': [100, 300, 500, 700],       # Number of boosting rounds
#     'learning_rate': [0.01, 0.05, 0.1],         # Learning rate (shrinkage)
#     'reg_alpha': [0, 0.01, 0.05, 0.1],          # L1 regularization term
#     'reg_lambda': [1, 10, 20, 50],              # L2 regularization term
#     'subsample': [0.7, 0.8, 0.9, 1.0],          # Fraction of samples per tree
#     'colsample_bytree': [0.7, 0.8, 0.9, 1.0],   # Fraction of features per tree
#     'min_child_weight': [1, 3, 5],              # Minimum sum of weights for child nodes
#     'gamma': [0, 1, 5],                         # Minimum loss reduction for split
# }

# # Initialize the model
# xgb_model = XGBRegressor(
#     objective='reg:squarederror',
#     eval_metric='mae',
#     random_state=42  # Ensuring reproducibility
# )

# # Set up RandomizedSearchCV
# random_search = RandomizedSearchCV(
#     estimator=xgb_model,
#     param_distributions=param_dist,
#     n_iter=50,                # Number of parameter combinations to try
#     scoring='neg_mean_absolute_error',  # Metric to optimize
#     cv=3,                     # 3-fold cross-validation
#     verbose=2,                # Print progress
#     random_state=42,          # Reproducibility
#     n_jobs=-1                 # Use all available cores
# )

# # Perform the search
# random_search.fit(X_train_transformed, y_train)

# # Display the best parameters and score
# print("Best Parameters:", random_search.best_params_)
# print("Best MAE Score:", -random_search.best_score_)

# # Retrieve the best model
# best_model = random_search.best_estimator_

# # # Save the best model if necessary
# # import joblib
# # joblib.dump(best_model, 'best_xgb_model.pkl')


In [16]:
# Summary statistics of the test target, for comparison with the training target above.
y_test.describe()

count    841.000000
mean      20.086468
std       27.797933
min       -0.850000
25%       -0.260000
50%        9.440000
75%       30.890000
max      195.570000
Name: wind_production, dtype: float64

# Stacking models

In [17]:
# X_train_transformed.info()

In [18]:
# X_test_transformed.info()

In [19]:

# Align the test features with the training features by column NAME.
#
# Moving a column by hard-coded position (take index 52, insert at index 2) is
# only correct the first time the cell runs: every re-run moves whichever column
# now sits at index 52 and silently scrambles the feature order. Selecting with
# the training frame's column index is idempotent, yields exactly the order the
# model is fitted on, and raises a KeyError if the test frame lacks a feature.
X_test_transformed = X_test_transformed[X_train_transformed.columns]

# Check result
X_test_transformed.head()


Unnamed: 0,onehotencoder__precip_type:idx_0.0,onehotencoder__precip_type:idx_1.0,onehotencoder__precip_type:idx_2.0,onehotencoder__precip_type:idx_3.0,minmaxscaler__precip_1h:mm,minmaxscaler__prob_precip_1h:p,minmaxscaler__clear_sky_rad:W,minmaxscaler__clear_sky_energy_1h:J,minmaxscaler__diffuse_rad:W,minmaxscaler__diffuse_rad_1h:Wh,...,minmaxscaler__sin_sun_azimuth:d,minmaxscaler__cos_sun_azimuth:d,minmaxscaler__sin_wind_dir_2m:d,minmaxscaler__cos_wind_dir_2m:d,minmaxscaler__sin_wind_dir_10m:d,minmaxscaler__cos_wind_dir_10m:d,minmaxscaler__sin_wind_dir_50m:d,minmaxscaler__cos_wind_dir_50m:d,minmaxscaler__sin_wind_dir_100m:d,minmaxscaler__cos_wind_dir_100m:d
0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.567417,0.999371,0.278462,0.051756,0.278462,0.051756,0.221852,0.086114,0.157726,0.137618
1,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.727718,0.948728,0.343002,0.025287,0.343002,0.025287,0.283957,0.050017,0.214643,0.090813
2,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.8538,0.856272,0.352978,0.022103,0.352978,0.022103,0.298227,0.043328,0.234301,0.077625
3,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.938619,0.742218,0.373119,0.016365,0.373119,0.016365,0.315127,0.036107,0.246983,0.06981
4,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.985423,0.62122,0.422643,0.006019,0.422643,0.006019,0.373965,0.016452,0.314317,0.036312


In [20]:
# Imports for the stacking experiment (same names are bound as before).
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Meta-model: a plain linear regression that combines the base models' predictions.
meta_model = LinearRegression()

# Base models: a ridge regression, a random forest and a gradient-boosted tree model.
ridge_model = Ridge(alpha=1.0)
rf_model = RandomForestRegressor(random_state=42, n_estimators=200)
# NOTE(review): every XGBoost value below lies on the grid of the commented-out
# randomized search earlier in this notebook — presumably its best result; confirm.
xgb_model = xgb.XGBRegressor(
    random_state=42,
    learning_rate=0.05,
    n_estimators=500,
    max_depth=9,
    subsample=0.9,
    colsample_bytree=1.0,
    gamma=5,
    reg_alpha=0.05,
    reg_lambda=10,
)

# Stack the three base models under the linear meta-model, using all available cores.
stacking_regressor = StackingRegressor(
    n_jobs=-1,
    final_estimator=meta_model,
    estimators=[('xgb', xgb_model), ('rf', rf_model), ('ridge', ridge_model)],
)

# Fit on the training features/target, then predict the test features
# (fit returns the fitted estimator, so the two calls can be chained).
y_pred = stacking_regressor.fit(X_train_transformed, y_train).predict(X_test_transformed)

# Score the predictions against the test target.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Absolute Error: {mae:.4f}")


Mean Absolute Error: 19.6543


In [None]:
# Naive reference: the MAE obtained by always predicting the mean of the training target.
# NOTE(review): this is measured on the training targets, whereas the stacking MAE
# printed above is measured on the test set — the two are not strictly like-for-like.
baseline_mae = (y_train - y_train.mean()).abs().mean()
baseline_mae

In [None]:
# MAE of the stacked model on the held-out test set.
# `y_pred` was produced from X_test_transformed, so it must be scored against
# `y_test`. The validation split earlier in the notebook is commented out, which
# means `y_val` is never defined and referencing it raises a NameError.
model_mae = (y_test - y_pred).abs().mean()
model_mae

In [None]:
# Overlay the model's test-set predictions on the observed wind production.
fig, ax = plt.subplots(figsize=(10, 6))

# Predicted wind production (y_pred), drawn as a solid blue line.
ax.plot(y_pred, label='Wind Forecast', color='blue', linestyle='-')

# Observed wind production from the test frame, drawn as a dashed orange line.
ax.plot(test['wind_production'], label='Wind Production', color='orange', linestyle='--')

# Axis labels, title and legend so the figure can be read on its own.
ax.set_xlabel('Time (hours)')
ax.set_ylabel('Wind production (kWh/h)')
ax.set_title('Wind Forecast vs Wind Production')
ax.legend()

# Render the figure.
plt.show()

# XGBoost
## Setting up the model 

In [None]:
# # nb optimal values refer to solar

# # Initialize the model with the best parameters from grid search
# xgb_reg = XGBRegressor(
#     max_depth=7,                # Optimal value found
#     n_estimators=300,           # Optimal value found
#     learning_rate=0.05,         # Optimal value found
#     reg_alpha=0.05,             # Optimal value found
#     reg_lambda=20,              # Optimal value found
#     subsample=0.8,              # Optimal value found
#     colsample_bytree=0.8,       # Optimal value found
#     objective='reg:squarederror',
#     eval_metric="mae",
#     random_state=42             # Ensuring reproducibility
# )

# # Fit the model on the training data
# xgb_reg.fit(
#     X_train_transformed, 
#     y_train,
#     eval_set=[(X_train_transformed, y_train), (X_val, y_val)],
#     early_stopping_rounds=5     # Retain early stopping
# )

# # Make predictions
# y_pred = xgb_reg.predict(X_test_transformed) # predicted wind production

## Plotting best model forecasts vs. actual wind production

In [None]:
# plt.figure(figsize=(10, 6))  # Set the figure size

# # Plot the 'pv_forecast' column
# plt.plot(y_pred, label='Wind Forecast', color='blue', linestyle='-')

# # Plot the 'pv_production' column
# plt.plot(test['wind_production'], label='Wind Production', color='orange', linestyle='--')

# # Add labels, legend, and title
# plt.xlabel('Time (hours)')
# plt.ylabel('Wind production (kWh/h)')
# plt.title('Wind Forecast vs Wind Production')
# plt.legend()

# # Display the plot
# plt.show()

In [None]:
# train["wind_production"].describe()

In [None]:
# sns.histplot(data=y_train)

In [None]:
# len(X_train_transformed)

In [None]:
# from sklearn.model_selection import train_test_split

# # Use the same function above for the validation set
# X_train_transformed, X_val, y_train, y_val = train_test_split(
#     X_train_transformed, y_train, test_size = 0.1, random_state = 42  # val = 10%
# )

In [None]:
# # nb optimal values refer to solar

# # Initialize the model with the best parameters from grid search
# xgb_reg = XGBRegressor(
#     max_depth=7,                # Optimal value found
#     n_estimators=300,           # Optimal value found
#     learning_rate=0.05,         # Optimal value found
#     reg_alpha=0.05,             # Optimal value found
#     reg_lambda=20,              # Optimal value found
#     subsample=0.8,              # Optimal value found
#     colsample_bytree=0.8,       # Optimal value found
#     objective='reg:squarederror',
#     eval_metric="mae",
#     random_state=42             # Ensuring reproducibility
# )

# # Fit the model on the training data
# xgb_reg.fit(
#     X_train_transformed, 
#     y_train,
#     eval_set=[(X_train_transformed, y_train), (X_val, y_val)],
#     early_stopping_rounds=20     # Retain early stopping
# )

# # Make predictions
# y_pred = xgb_reg.predict(X_test_transformed) # predicted wind production

In [None]:
# plt.figure(figsize=(10, 6))  # Set the figure size

# # Plot the 'pv_forecast' column
# plt.plot(y_pred, label='Wind Forecast', color='blue', linestyle='-')

# # Plot the 'pv_production' column
# plt.plot(test['wind_production'], label='Wind Production', color='orange', linestyle='--')

# # Add labels, legend, and title
# plt.xlabel('Time (hours)')
# plt.ylabel('Wind production (kWh/h)')
# plt.title('Wind Forecast vs Wind Production')
# plt.legend()

# # Display the plot
# plt.show()