In [2]:
!pip install --upgrade tpot



In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from tpot import TPOTRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import pickle
from joblib import dump
from datetime import datetime


In [4]:
# --- Data Collection and Preparation ---
# Read the merged dataset (path is relative to the notebook's working directory)
# and preview the first five rows.
dataset_path = '../power_predict/data/merged_dataset2023-11-29 16:33:32.960189.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Month_year,Country,Balance,Combustible_Renewables,Hydro,Other_Renewables,Solar,Total_Renewables__Hydro__Geo__Solar__Wind__Other_,Wind,value_CDD_18,value_CDD_21,value_Global_Horizontal_Irrandiance,value_HDD_16,value_HDD_18,value_Heat_index,value_Relative_Humidty,value_Temperature,value_Total_Precipitation,total_sol_wind_hyd
0,0,2010-04-01,Australia,Net Electricity Production,216.287,1044.406,0.0,26.811,1638.098,350.511,48.13,12.62,720994.0,17.47,37.08,18.41,57.37,23.21,0.05483,1421.728
1,1,2010-04-01,Austria,Net Electricity Production,350.383,2504.13,0.0,9.965,2995.696,131.107,0.05449,0.0,736161.0,228.9,287.8,8.41,69.85,6.748,0.1051,2645.202
2,2,2010-04-01,Belgium,Net Electricity Production,383.177,141.898,0.0,80.419,702.509,97.015,0.9916,0.0,723195.0,195.8,253.7,9.577,67.9,9.322,0.03999,319.332
3,3,2010-04-01,Canada,Net Electricity Production,712.988,28243.738,2.44,24.184,29709.301,725.951,0.06883,2.3e-05,679927.0,246.7,306.2,7.797,70.85,-1.708,0.05456,28993.873
4,4,2010-04-01,Chile,Net Electricity Production,152.826,1740.416,0.0,0.0,1913.601,20.359,1.411,0.04019,578355.0,113.0,168.3,12.44,66.58,9.672,0.1578,1760.775


In [None]:
# df.columns

In [None]:
# df.dtypes

## --- Data Preprocessing ---

In [None]:
# --- Data Preprocessing ---

# Setting Country + Month year as Index so every row is labelled "<Country>_<Month_year>"
df['Country_Month'] = df['Country'] + '_' + df['Month_year'].astype(str)
df = df.set_index('Country_Month')

# Leftover row counters written by earlier to_csv() round-trips. The loaded frame
# contains both 'Unnamed: 0.1' and 'Unnamed: 0'; neither carries any signal, and
# leaving one in X would make it a numeric "feature" for the model.
index_artifact_cols = [col for col in df.columns if str(col).startswith('Unnamed:')]

# Columns that must not be used as features: the date (already in the index),
# the constant-looking 'Balance' label, the individual production figures and
# the target itself.
non_feature_cols = ['Month_year', 'Balance',
       'Combustible_Renewables', 'Hydro', 'Other_Renewables', 'Solar',
       'Total_Renewables__Hydro__Geo__Solar__Wind__Other_', 'Wind',
       'total_sol_wind_hyd']

# Separating features and target variables
X = df.drop(index_artifact_cols + non_feature_cols, axis=1)

# Target as a flat 1-D numpy array
y = df['total_sol_wind_hyd'].values.ravel()


## --- Model Building with TPOT ---

In [None]:
# Names of every numeric column in X, as a plain Python list
numeric_only = X.select_dtypes(include=[np.number])
num_features = list(numeric_only.columns)

In [None]:
# Preprocessing: min-max scale the numeric columns, one-hot encode 'Country'.
# Countries not seen during fit are ignored at transform time instead of raising.
numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
country_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessing_pipeline = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_features),
    ('cat', country_encoder, ['Country']),
])

# Hold out 20% of the rows for evaluation; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Search settings for the TPOT regressor
tpot_settings = dict(
    generations=5,
    population_size=50,
    cv=5,
    config_dict='TPOT sparse',
    random_state=42,
    verbosity=2,
)
tpot_regressor = TPOTRegressor(**tpot_settings)

# Chain the preprocessing step and the TPOT search into one estimator,
# so raw feature frames can be passed straight to fit/predict.
pipeline_steps = [
    ('preprocessor', preprocessing_pipeline),
    ('tpot_regressor', tpot_regressor),
]
pipeline = Pipeline(steps=pipeline_steps)

pipeline

## --- Model Training ---

In [None]:
# Fit preprocessing and the TPOT search in one call: the pipeline fits the
# preprocessor on X_train and hands its output to TPOT, so X_train is passed raw.
pipeline.fit(X_train, y=y_train)

## --- Model Evaluation and Saving ---

In [None]:
# --- Model Evaluation ---
# Evaluate the best model found by TPOT on the held-out split
y_pred = pipeline.predict(X_test)        # X_test will be automatically preprocessed by the pipeline
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Output performance metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)

# Feature importance: only available when the final estimator TPOT selected exposes
# `feature_importances_` (e.g. tree ensembles), so check before plotting.
best_estimator = pipeline.named_steps['tpot_regressor'].fitted_pipeline_[-1]
feature_importances = getattr(best_estimator, 'feature_importances_', None)

if feature_importances is not None:
    # Label bars with the column names produced by the fitted preprocessor.
    try:
        feature_names = list(pipeline.named_steps['preprocessor'].get_feature_names_out())
    except AttributeError:
        # Older scikit-learn without get_feature_names_out
        feature_names = []

    # TPOT may place transformers/selectors before the final estimator, which changes
    # the number of columns it sees; in that case the preprocessor names no longer
    # line up with the importances, so fall back to positional labels.
    if len(feature_names) != len(feature_importances):
        feature_names = [f'feature_{i}' for i in range(len(feature_importances))]

    # Create a bar chart of feature importances
    positions = range(len(feature_importances))
    plt.barh(positions, feature_importances, align='center')
    plt.yticks(positions, feature_names)
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature')
    plt.show()
else:
    print("Selected model does not support feature importances.")

In [None]:
# Generate a timestamp so each run writes its own set of artifact files
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Extract the best model from TPOT
final_model = pipeline.named_steps['tpot_regressor'].fitted_pipeline_

# Save the best fitted model using joblib
dump(final_model, f'final_model_{timestamp}.joblib')

# final_model was fitted on the output of the 'preprocessor' step, so on its own it
# cannot score raw rows. Also save it chained after the fitted preprocessor so the
# artifact can be loaded and called directly on frames shaped like X.
inference_pipeline = Pipeline(steps=[
    ('preprocessor', pipeline.named_steps['preprocessor']),
    ('model', final_model)
])
dump(inference_pipeline, f'final_inference_pipeline_{timestamp}.joblib')

# Save Performance Metrics with Timestamp
with open(f'tpot_performance_metrics_{timestamp}.txt', 'w') as file:
    file.write(f'Mean Squared Error: {mse}\n')
    file.write(f'Mean Absolute Error: {mae}\n')

# Save the Best Parameters with Timestamp
best_params = final_model.get_params()
with open(f'tpot_best_params_{timestamp}.txt', 'w') as file:
    file.write(str(best_params))


In [None]:
# Display the best pipeline TPOT selected (the object saved to the .joblib file above)
final_model
