In [64]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import median_absolute_error
from src.data.data_fetcher import get_raw_data
from src.features.feature_engineering import prepare_data
import pandas as pd

In [65]:


train_a, train_b, train_c, X_train_estimated_a, X_train_estimated_b, X_train_estimated_c, X_train_observed_a, X_train_observed_b, X_train_observed_c, X_test_estimated_a, X_test_estimated_b, X_test_estimated_c = get_raw_data()

In [66]:

# Call the function with the file paths
X_train_a, y_train_a, X_val_a, y_val_a, X_test_a, train_data_a, val_data_a = prepare_data(train_a, 
                                                                            X_train_observed_a, 
                                                                            X_train_estimated_a, 
                                                                            X_test_estimated_a)
X_train_b, y_train_b, X_val_b, y_val_b, X_test_b, train_data_b, val_data_b = prepare_data(train_b, 
                                                                            X_train_observed_b, 
                                                                            X_train_estimated_b, 
                                                                            X_test_estimated_b)

X_train_c, y_train_c, X_val_c, y_val_c, X_test_c, train_data_c, val_data_c = prepare_data(train_c,
                                                                            X_train_observed_c, 
                                                                            X_train_estimated_c, 
                                                                            X_test_estimated_c)


# Check the prepared DataFrames
X_train_a.head(1), y_train_a.head(1), X_val_a.head(1), y_val_a.head(1), X_test_a.head(1), train_data_a.head(1), val_data_a.head(1)


(                     absolute_humidity_2m:gm3  air_density_2m:kgm3  \
 2019-12-01 09:00:00                       4.8                1.272   
 
                      ceiling_height_agl:m  clear_sky_energy_1h:J  \
 2019-12-01 09:00:00                456.25                27331.5   
 
                      clear_sky_rad:W  cloud_base_agl:m  dew_or_rime:idx  \
 2019-12-01 09:00:00        17.549999            456.25              0.0   
 
                      dew_point_2m:K  diffuse_rad:W  diffuse_rad_1h:J  ...  \
 2019-12-01 09:00:00      273.150024          5.175       9751.150391  ...   
 
                      sun_azimuth:d  sun_elevation:d  \
 2019-12-01 09:00:00     155.780746          2.32725   
 
                      super_cooled_liquid_water:kgm2  t_1000hPa:K  \
 2019-12-01 09:00:00                            0.25   274.350006   
 
                      total_cloud_cover:p  visibility:m  wind_speed_10m:ms  \
 2019-12-01 09:00:00                100.0   7155.975098               4.

In [67]:
# X_val contains features not present in X_train
# Drop those features from X_val
X_val_a = X_val_a.drop(['date_calc'], axis=1)
X_val_b = X_val_b.drop(['date_calc'], axis=1)
X_val_c = X_val_c.drop(['date_calc'], axis=1)

# Initialize a Gradient Boosting regressor
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Train the Gradient Boosting model on the cleaned training data
gb_model.fit(X_train_a, y_train_a)

# Predict on the cleaned validation set
y_val_pred_gb_a = gb_model.predict(X_val_a)
y_val_pred_gb_b = gb_model.predict(X_val_b)
y_val_pred_gb_c = gb_model.predict(X_val_c)

# Evaluate the Gradient Boosting model using MAE on the cleaned validation set
mae_gb_a = median_absolute_error(y_val_a, y_val_pred_gb_a)
mae_gb_b = median_absolute_error(y_val_b, y_val_pred_gb_b)
mae_gb_c = median_absolute_error(y_val_c, y_val_pred_gb_c)
print("Validation MAE for Gradient Boosting Model: {:.2f}".format(mae_gb_a))
print("Validation MAE for Gradient Boosting Model: {:.2f}".format(mae_gb_b))
print("Validation MAE for Gradient Boosting Model: {:.2f}".format(mae_gb_c))

# All Predictions
predictions = pd.concat([pd.DataFrame(y_val_pred_gb_a), pd.DataFrame(y_val_pred_gb_b), pd.DataFrame(y_val_pred_gb_c)], axis=0)
# Add predictions together

# Save the predictions to a CSV file
predictions.to_csv('data/processed/predictions.csv', index=False, header=False)




Validation MAE for Gradient Boosting Model: 2.16
Validation MAE for Gradient Boosting Model: 27.70
Validation MAE for Gradient Boosting Model: 7.48
