In [None]:
!pip install --upgrade tpot

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from tpot import TPOTRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import pickle


In [2]:
# --- Data Collection and Preparation ---
df = pd.read_csv('../power_predict/data/merged_dataset.csv') 
df.head(5)

Unnamed: 0.1,Unnamed: 0,Month_year,Country,Balance,Combustible Renewables,Hydro,Other Renewables,Solar,"Total Renewables (Hydro, Geo, Solar, Wind, Other)",Wind,value_CDD_18,value_CDD_21,value_Global_Horizontal_Irrandiance,value_HDD_16,value_HDD_18,value_Heat_index,value_Relative_Humidty,value_Temperature,value_Total_Precipitation,"total target (wind, solar, hydro)"
0,0,2010-04-01,Australia,Net Electricity Production,216.287,1044.406,0.0,26.811,1638.098,350.511,48.13,12.62,720994.0,17.47,37.08,18.41,57.37,23.21,0.05483,1421.728
1,1,2010-04-01,Austria,Net Electricity Production,350.383,2504.13,0.0,9.965,2995.696,131.107,0.05449,0.0,736161.0,228.9,287.8,8.41,69.85,6.748,0.1051,2645.202
2,2,2010-04-01,Belgium,Net Electricity Production,383.177,141.898,0.0,80.419,702.509,97.015,0.9916,0.0,723195.0,195.8,253.7,9.577,67.9,9.322,0.03999,319.332
3,3,2010-04-01,Canada,Net Electricity Production,712.988,28243.738,2.44,24.184,29709.301,725.951,0.06883,2.3e-05,679927.0,246.7,306.2,7.797,70.85,-1.708,0.05456,28993.873
4,4,2010-04-01,Chile,Net Electricity Production,152.826,1740.416,0.0,0.0,1913.601,20.359,1.411,0.04019,578355.0,113.0,168.3,12.44,66.58,9.672,0.1578,1760.775


In [6]:
# df.columns

In [7]:
# df.dtypes

## --- Data Preprocessing ---

In [8]:
# --- Data Preprocessing ---

# Setting Country + Month year as Index
df['Country_Month'] = df['Country'] + '_' + df['Month_year'].astype(str)
df = df.set_index('Country_Month')

# Separating features and target variables
X = df.drop(['Unnamed: 0', 'Month_year', 'Balance',
       'Combustible Renewables', 'Hydro', 'Other Renewables', 'Solar',
       'Total Renewables (Hydro, Geo, Solar, Wind, Other)', 'Wind',
       'total target (wind, solar, hydro)'], axis=1) 
y = df['total target (wind, solar, hydro)'].values.ravel()  ### TPOT can only handle one target at a time, expects 1D array!

# y = df[['Solar', 'Hydro', 'Wind','total target (wind, solar, hydro)']] 

In [9]:
# Init list of numerical columns
num_features = X.select_dtypes(include=[np.number]).columns.tolist()

# Create a preprocessing pipeline
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        # Preprocessing for numerical features
        ('num', Pipeline(steps=[
            ('scaler', MinMaxScaler())  # Scale features using MinMaxScaler
        ]), num_features),
        
        # Preprocessing for categorical feature 'country'
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Country'])
    ])

preprocessing_pipeline

In [10]:
# X

In [11]:
# y

In [12]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Fit the preprocessing pipeline on the training data
X_train_processed = preprocessing_pipeline.fit_transform(X_train)

# Transform the testing data using the fitted pipeline
X_test_processed = preprocessing_pipeline.transform(X_test)

## --- Model Building with TPOT ---

In [14]:
# --- Model Building with TPOT ---
# TPOT regressor for multi-target regression
tpot_regressor = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=42,
                               config_dict='TPOT sparse', cv=5)

# Pipeline including preprocessing and TPOT regressor
pipeline = Pipeline(steps=[('preprocessor', preprocessing_pipeline),
                           ('tpot_regressor', tpot_regressor)])

pipeline

## --- Model Training and Saving ---

In [15]:
# --- Model Training and Saving ---
# Train the TPOT model
pipeline.fit(X_train_processed, y_train)

Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -28648681.68566545

Generation 2 - Current best internal CV score: -28283781.20621275

Generation 3 - Current best internal CV score: -28283781.20621275

Generation 4 - Current best internal CV score: -28283781.20621275

Generation 5 - Current best internal CV score: -28283781.20621275

Best pipeline: KNeighborsRegressor(input_matrix, n_neighbors=20, p=2, weights=uniform)


In [16]:
# Save fitted pipe
with open("tpot_model.pkl", "wb") as file:
    pickle.dump(tpot.fitted_pipeline_, file)

In [17]:
# Save TPOT's Internal Model Configuration
tpot_config = tpot_regressor.config_dict
with open('tpot_config.pkl', 'wb') as file:
    pickle.dump(tpot_config, file)


In [None]:
# Save params and metrics
tpot_config = tpot_regressor.config_dict
with open('tpot_config.pkl', 'wb') as file:
    pickle.dump(tpot_config, file)


## --- Model Evaluation ---

In [None]:
# --- Model Evaluation ---
# Evaluate the best model found by TPOT
y_pred = pipeline.predict(X_test_processed)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Output performance metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)

# Feature importance (TPOT might select a model where feature importance is not available)
# Here, we're assuming TPOT selected a Random Forest model. If not, this part would need to be adjusted.
if 'feature_importances_' in dir(pipeline.named_steps['tpot_regressor'].fitted_pipeline_[-1]):
    feature_importances = pipeline.named_steps['tpot_regressor'].fitted_pipeline_[-1].feature_importances_

    # Create a bar chart of feature importances
    plt.barh(range(len(feature_importances)), feature_importances, align='center')
    plt.yticks(range(len(feature_importances)), features.columns)
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature')
    plt.show()
else:
    print("Selected model does not support feature importances.")