In [12]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import pickle
from datetime import datetime
import numpy as np
import pandas as pd

In [13]:
# --- Data Collection and Preparation ---
# Read the merged dataset from its location relative to this notebook,
# then preview the first five rows as the cell output.
df = pd.read_csv(
    '../power_predict/data/merged_dataset.csv',
)
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Month_year,Country,Balance,Combustible Renewables,Hydro,Other Renewables,Solar,"Total Renewables (Hydro, Geo, Solar, Wind, Other)",Wind,value_CDD_18,value_CDD_21,value_Global_Horizontal_Irrandiance,value_HDD_16,value_HDD_18,value_Heat_index,value_Relative_Humidty,value_Temperature,value_Total_Precipitation,"total target (wind, solar, hydro)"
0,0,2010-04-01,Australia,Net Electricity Production,216.287,1044.406,0.0,26.811,1638.098,350.511,0.139958,-0.151031,0.523027,-0.874051,-0.913969,0.754759,-1.661187,1.3505,-0.775001,1421.728
1,1,2010-04-01,Austria,Net Electricity Production,350.383,2504.13,0.0,9.965,2995.696,131.107,-0.559862,-0.445247,0.573405,0.311842,0.37124,-0.473498,-0.406227,-0.518854,-0.144525,2645.202
2,2,2010-04-01,Belgium,Net Electricity Production,383.177,141.898,0.0,80.419,702.509,97.015,-0.546221,-0.445247,0.530337,0.126186,0.196441,-0.330161,-0.602314,-0.226562,-0.961121,319.332
3,3,2010-04-01,Canada,Net Electricity Production,712.988,28243.738,2.44,24.184,29709.301,725.951,-0.559654,-0.445247,0.38662,0.41168,0.46556,-0.548791,-0.305669,-1.479081,-0.778387,28993.873
4,4,2010-04-01,Chile,Net Electricity Production,152.826,1740.416,0.0,0.0,1913.601,20.359,-0.540116,-0.44431,0.049241,-0.338232,-0.241326,0.021489,-0.735051,-0.186817,0.516427,1760.775


In [14]:
# df.columns

In [15]:
# df.dtypes

## --- Data Preprocessing ---

In [16]:
# --- Data Preprocessing ---

# Label every row as "<Country>_<Month_year>" and use that label as the index.
df = df.assign(
    Country_Month=df['Country'] + '_' + df['Month_year'].astype(str)
).set_index('Country_Month')

# Feature matrix: everything except the bookkeeping columns and the
# production columns (the latter are targets or are derived from them).
X = df.drop(
    columns=[
        'Unnamed: 0', 'Month_year', 'Balance',
        'Combustible Renewables', 'Hydro', 'Other Renewables', 'Solar',
        'Total Renewables (Hydro, Geo, Solar, Wind, Other)', 'Wind',
        'total target (wind, solar, hydro)',
    ]
)

# Single-target alternative, kept for reference because TPOT can only handle
# one target at a time:
# y = df[['total target (wind, solar, hydro)']]

# Multi-output target: the three individual sources plus their combined total.
y = df[['Solar', 'Hydro', 'Wind', 'total target (wind, solar, hydro)']]

In [22]:
# Show the feature matrix as the cell's output (bare last expression -> rich
# DataFrame display) to check that only 'Country' and the value_* columns remain.
X

Unnamed: 0_level_0,Country,value_CDD_18,value_CDD_21,value_Global_Horizontal_Irrandiance,value_HDD_16,value_HDD_18,value_Heat_index,value_Relative_Humidty,value_Temperature,value_Total_Precipitation
Country_Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Australia_2010-04-01,Australia,0.139958,-0.151031,0.523027,-0.874051,-0.913969,0.754759,-1.661187,1.350500,-0.775001
Austria_2010-04-01,Austria,-0.559862,-0.445247,0.573405,0.311842,0.371240,-0.473498,-0.406227,-0.518854,-0.144525
Belgium_2010-04-01,Belgium,-0.546221,-0.445247,0.530337,0.126186,0.196441,-0.330161,-0.602314,-0.226562,-0.961121
Canada_2010-04-01,Canada,-0.559654,-0.445247,0.386620,0.411680,0.465560,-0.548791,-0.305669,-1.479081,-0.778387
Chile_2010-04-01,Chile,-0.540116,-0.444310,0.049241,-0.338232,-0.241326,0.021489,-0.735051,-0.186817,0.516427
...,...,...,...,...,...,...,...,...,...,...
Spain_2022-09-01,Spain,1.081340,0.683360,0.615489,-0.940186,-1.034073,1.117095,-1.372586,1.007562,-0.594901
Sweden_2022-09-01,Sweden,-0.560656,-0.445247,-0.826916,-0.203618,-0.094207,-0.102565,0.653651,-0.237122,-0.303429
Switzerland_2022-09-01,Switzerland,-0.502473,-0.445247,0.005211,-0.358424,-0.296175,0.075533,0.545049,-0.112098,1.646443
United Kingdom_2022-09-01,United Kingdom,-0.519125,-0.445247,-0.467897,-0.603702,-0.523260,0.252402,0.666723,0.281941,0.359655


In [17]:
# Names of all numeric-dtype columns in X; these are the ones that get scaled.
num_features = list(X.select_dtypes(include=[np.number]).columns)

# Column-wise preprocessing:
#   'num' -> min-max scale the numeric columns
#   'cat' -> one-hot encode 'Country'; categories unseen during fit are
#            ignored at transform time instead of raising
preprocessing_pipeline = ColumnTransformer(transformers=[
    ('num', Pipeline(steps=[('scaler', MinMaxScaler())]), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['Country']),
])

In [18]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## --- Building Model ---

In [20]:
# Model Building with Grid Search
# Random Forest regressor chained behind the preprocessing step, so that
# scaling/encoding are fitted inside each cross-validation fold.
rf = RandomForestRegressor(random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessing_pipeline),
    ('regressor', rf),
])

# Hyper-parameter grid. The 'regressor__' prefix routes each entry to the
# pipeline step named 'regressor'. 3 * 4 * 3 * 3 = 108 combinations.
param_grid = dict(
    regressor__n_estimators=[100, 200, 300],
    regressor__max_depth=[10, 20, 30, None],
    regressor__min_samples_split=[2, 5, 10],
    regressor__min_samples_leaf=[1, 2, 4],
)

# Exhaustive search with 5-fold cross-validation (108 * 5 = 540 fits),
# using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [19]:
# Run the grid search on the raw training split; preprocessing happens
# inside the pipeline, so X_train is passed as-is.
grid_search.fit(X=X_train, y=y_train)

## --- Model Evaluation ---

In [None]:
# --- Model Evaluation ---
# Evaluate the best model found by Grid Search on the held-out test split.
#
# `best_estimator_` is the complete pipeline (preprocessor + regressor), refit
# on the full training split with the best parameters. Scaling and one-hot
# encoding therefore happen inside `predict`, so it must be given the raw
# `X_test` -- there is no separately pre-processed copy of the test features.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# y has several target columns; with scikit-learn's default
# multioutput='uniform_average' each metric is the mean over those columns.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [None]:
# Report the winning hyper-parameters and the test-set error metrics.
for label, value in (
    ("Best Model Parameters:", grid_search.best_params_),
    ("Mean Squared Error:", mse),
    ("Mean Absolute Error:", mae),
):
    print(label, value)

In [None]:
# Pull the fitted Random Forest out of the best pipeline and read its
# per-feature importances (one value per column produced by the preprocessor).
fitted_regressor = best_model.named_steps['regressor']
feature_importances = fitted_regressor.feature_importances_
feature_importances

In [34]:
# Create a bar chart of feature importances.
#
# The importances refer to the columns the regressor actually saw, i.e. the
# output of the fitted preprocessor (scaled numeric columns followed by the
# one-hot encoded 'Country' columns), not to the raw columns of X. Take the
# tick labels from the preprocessor so they line up one-to-one with the bars.
feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()
bar_positions = range(len(feature_importances))

# One bar per encoded feature: grow the figure height with the number of bars
# so the labels stay readable.
plt.figure(figsize=(8, max(4, 0.25 * len(feature_names))))
plt.barh(bar_positions, feature_importances, align='center')
plt.yticks(bar_positions, feature_names)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

## --- Saving Trained Model, Params, Metrics ---

In [None]:
# Persist the whole fitted GridSearchCV object (it includes the refit best model).
with open('grid_search_model.pkl', 'wb') as model_file:
    pickle.dump(grid_search, model_file)

# Best hyper-parameters and the best mean cross-validated score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Write each of them, as plain text, to its own file.
for out_name, payload in (
    ('best_params.txt', best_params),
    ('best_score.txt', best_score),
):
    with open(out_name, 'w') as text_file:
        text_file.write(str(payload))

# Full table of cross-validation results, one row per parameter combination
results = pd.DataFrame(grid_search.cv_results_)
results.to_csv('grid_search_full_results.csv', index=False)