In [35]:
import pickle
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
# --- Data Collection and Preparation ---
# Read the merged dataset from disk and preview its first five rows.
dataset_path = '../power_predict/data/merged_dataset.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Month_year,Country,Balance,Combustible Renewables,Hydro,Other Renewables,Solar,"Total Renewables (Hydro, Geo, Solar, Wind, Other)",Wind,value_CDD_18,value_CDD_21,value_Global_Horizontal_Irrandiance,value_HDD_16,value_HDD_18,value_Heat_index,value_Relative_Humidty,value_Temperature,value_Total_Precipitation,"total target (wind, solar, hydro)"
0,0,2010-04-01,Australia,Net Electricity Production,216.287,1044.406,0.0,26.811,1638.098,350.511,48.13,12.62,720994.0,17.47,37.08,18.41,57.37,23.21,0.05483,1421.728
1,1,2010-04-01,Austria,Net Electricity Production,350.383,2504.13,0.0,9.965,2995.696,131.107,0.05449,0.0,736161.0,228.9,287.8,8.41,69.85,6.748,0.1051,2645.202
2,2,2010-04-01,Belgium,Net Electricity Production,383.177,141.898,0.0,80.419,702.509,97.015,0.9916,0.0,723195.0,195.8,253.7,9.577,67.9,9.322,0.03999,319.332
3,3,2010-04-01,Canada,Net Electricity Production,712.988,28243.738,2.44,24.184,29709.301,725.951,0.06883,2.3e-05,679927.0,246.7,306.2,7.797,70.85,-1.708,0.05456,28993.873
4,4,2010-04-01,Chile,Net Electricity Production,152.826,1740.416,0.0,0.0,1913.601,20.359,1.411,0.04019,578355.0,113.0,168.3,12.44,66.58,9.672,0.1578,1760.775


In [3]:
# df.columns

In [4]:
# df.dtypes

## --- Data Preprocessing ---

In [5]:
# --- Data Preprocessing ---

# Setting Country + Month year as Index
df['Country_Month'] = df['Country'] + '_' + df['Month_year'].astype(str)
df = df.set_index('Country_Month')

# Target variables (multi-output regression).
# NOTE: TPOT can only handle one target at a time! For TPOT, use only
# 'total target (wind, solar, hydro)'.
target_columns = ['Solar', 'Hydro', 'Wind', 'total target (wind, solar, hydro)']

# Columns excluded from the feature matrix: the date, the balance label and
# every production column (including the targets themselves).
non_feature_columns = [
    'Month_year', 'Balance',
    'Combustible Renewables', 'Hydro', 'Other Renewables', 'Solar',
    'Total Renewables (Hydro, Geo, Solar, Wind, Other)', 'Wind',
    'total target (wind, solar, hydro)',
]

# The loaded frame carries row-counter columns named 'Unnamed: 0' and
# 'Unnamed: 0.1' (presumably left over from saving the CSV with its index).
# All of them must be dropped, otherwise they end up in X and are scaled
# and used as numeric features.
index_artifact_columns = [
    col for col in df.columns if str(col).startswith('Unnamed:')
]

# Separating features and target variables
X = df.drop(columns=non_feature_columns + index_artifact_columns)
y = df[target_columns]

In [6]:
# Names of all columns in X that have a numeric dtype
num_features = list(X.select_dtypes(include=[np.number]).columns)

# Numerical branch: scale features using MinMaxScaler
numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])

# Categorical branch: one-hot encode 'Country'; categories not seen during
# fit are ignored at transform time instead of raising
country_encoder = OneHotEncoder(handle_unknown='ignore')

# Combine both branches into a single preprocessing transformer
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', country_encoder, ['Country']),
    ])

preprocessing_pipeline

In [7]:
# X

In [8]:
# y

In [9]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible.
# NOTE(review): rows look like country-month observations, so a random split
# mixes time periods between train and test -- confirm this is intended.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [29]:
# X_test.head(20)

In [31]:
# Fit the preprocessing pipeline on the training data only, then apply the
# already-fitted transformation to the testing data.
X_train_transformed = preprocessing_pipeline.fit_transform(X_train)
X_test_transformed = preprocessing_pipeline.transform(X_test)

# type(X_test_transformed)


def _as_dense(matrix):
    """Return `matrix` as a dense array.

    ColumnTransformer stacks its outputs as a SciPy sparse matrix only when
    the overall density is below its `sparse_threshold`; otherwise it already
    returns a dense ndarray, which has no `.toarray()` method.
    """
    return matrix.toarray() if hasattr(matrix, 'toarray') else matrix


# Converting outputs from sparse to dense np arrays (no-op if already dense)
X_train_processed = _as_dense(X_train_transformed)
X_test_processed = _as_dense(X_test_transformed)

In [34]:
# type(X_test_processed_dense)

## --- Model Building with Grid Search ---

In [13]:
# --- Model Building with Grid Search ---
# Base estimator: Random Forest regressor with a fixed seed
# (open question from the author: try another random state?)
rf = RandomForestRegressor(random_state=42)

# Chain the preprocessing transformer and the regressor into one estimator
pipeline_steps = [
    ('preprocessor', preprocessing_pipeline),
    ('regressor', rf),
]
pipeline = Pipeline(steps=pipeline_steps)

pipeline

In [15]:
# Hyper-parameter grid for the 'regressor' step of the pipeline
# (3 * 4 * 3 * 3 = 108 candidates; adjust given initial results and iterate)
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [10, 20, 30, None],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
}

# Exhaustive search with 5-fold cross-validation on all available cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

## --- Model Training ---

In [16]:
# --- Model Training ---

# Training the model using Grid Search.
# `pipeline` already has `preprocessing_pipeline` as its first step, so the
# search is fitted on the raw feature DataFrame: the ColumnTransformer selects
# columns by name (which requires a DataFrame), and passing the separately
# preprocessed array would apply the preprocessing twice. Fitting the
# preprocessor inside each CV fold also keeps validation data out of scaling.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


KeyboardInterrupt: 

## --- Model Evaluation ---

In [None]:
# --- Model Evaluation ---
# Evaluate the best model found by Grid Search.
# `best_model` is the full pipeline (preprocessor + regressor), so it must be
# given the raw test DataFrame; it applies the fitted preprocessing itself.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Error metrics on the held-out test set
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [None]:
# Report the selected hyper-parameters and the test-set error metrics
metric_report = [
    ("Best Model Parameters:", grid_search.best_params_),
    ("Mean Squared Error:", mse),
    ("Mean Absolute Error:", mae),
]
for label, value in metric_report:
    print(label, value)

In [None]:
# Pull the fitted Random Forest out of the best pipeline and read its
# per-feature importances
fitted_forest = best_model.named_steps['regressor']
feature_importances = fitted_forest.feature_importances_
feature_importances

In [34]:
# Create a bar chart of feature importances.
# The regressor sees the preprocessor's output (scaled numeric columns plus
# one-hot encoded 'Country' columns), so the labels are taken from the fitted
# preprocessor rather than from the raw input columns.
transformed_feature_names = (
    best_model.named_steps['preprocessor'].get_feature_names_out()
)
bar_positions = list(range(len(feature_importances)))

fig, ax = plt.subplots()
ax.barh(bar_positions, feature_importances, align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(transformed_feature_names)
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature')
plt.show()

In [None]:
# Persist the fitted GridSearchCV object
with open('grid_search_model.pkl', 'wb') as model_file:
    pickle.dump(grid_search, model_file)

# Persist best params and best score as plain-text files
best_params = grid_search.best_params_
best_score = grid_search.best_score_

text_outputs = (
    ('best_params.txt', best_params),
    ('best_score.txt', best_score),
)
for out_name, out_value in text_outputs:
    with open(out_name, 'w') as text_file:
        text_file.write(str(out_value))


# Persist the complete grid search results table
results = pd.DataFrame(grid_search.cv_results_)
results.to_csv('grid_search_full_results.csv', index=False)