In [5]:
%pip install mlflow

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.2
[notice] To update, run: C:\Users\Pratham Jain\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [6]:
# energy_predictor.py

import os
import logging
from pathlib import Path
from datetime import datetime
from typing import Tuple, Dict

import pandas as pd
import numpy as np
from pycaret.regression import setup, compare_models, predict_model
from sklearn.model_selection import train_test_split
import yaml

In [7]:
class EnergyPredictor:
    """
    A class to predict Time of Use (TOU) rates and Solar Energy Generation
    using machine learning models from PyCaret.
    """
    
    def __init__(self, config_path: str = 'config.yaml'):
        self.setup_logging()
        self.load_config(config_path)
        self.models = {}
        self.predictions = {}
        self.column_mapping = {}  # Store column name mapping
        
    def setup_logging(self) -> None:
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('energy_predictor.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)
        
    def load_config(self, config_path: str) -> None:
        try:
            with open(config_path, 'r') as file:
                self.config = yaml.safe_load(file)
            self.logger.info("Configuration loaded successfully")
        except Exception as e:
            self.logger.error(f"Error loading configuration: {str(e)}")
            raise
            
    def convert_time_to_hours(self, time_str: str) -> float:
        """
        Convert time string in HH:MM format to decimal hours.
        
        Args:
            time_str (str): Time string in HH:MM format
            
        Returns:
            float: Time in decimal hours
        """
        try:
            # Split hours and minutes
            hours, minutes = map(int, time_str.split(':'))
            # Convert to decimal hours
            decimal_hours = hours + (minutes / 60)
            return decimal_hours
        except Exception as e:
            self.logger.error(f"Error converting time {time_str}: {str(e)}")
            raise
            
    def standardize_column_names(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Standardize column names by removing spaces and special characters.
        
        Args:
            df (pd.DataFrame): Input DataFrame
            
        Returns:
            pd.DataFrame: DataFrame with standardized column names
        """
        try:
            # Create a copy to avoid modifying the original
            df = df.copy()
            
            # Create a mapping of old to new column names
            self.column_mapping = {}
            for col in df.columns:
                # Replace spaces and special characters with underscores
                new_col = col.strip().replace(' ', '_').replace('(', '').replace(')', '').replace('.', '_')
                # Remove any double underscores
                new_col = '_'.join(filter(None, new_col.split('_')))
                self.column_mapping[col] = new_col
            
            # Rename the columns
            df = df.rename(columns=self.column_mapping)
            
            # Log the column name changes
            self.logger.info("Column names standardized:")
            for old, new in self.column_mapping.items():
                if old != new:
                    self.logger.info(f"  {old} -> {new}")
            
            return df
            
        except Exception as e:
            self.logger.error(f"Error standardizing column names: {str(e)}")
            raise
            
    
    def get_standardized_column_name(self, original_name: str) -> str:
        """
        Get the standardized version of a column name.
        
        Args:
            original_name (str): Original column name
            
        Returns:
            str: Standardized column name
        """
        return self.column_mapping.get(original_name, original_name)


    def load_data(self) -> pd.DataFrame:
        """
        Load and preprocess the energy data.
        
        Returns:
            pd.DataFrame: Preprocessed DataFrame
        """
        try:
            # Load the CSV file
            df = pd.read_csv(self.config['data_path'])
            
            # Standardize column names
            df = self.standardize_column_names(df)
            
            # Convert Senddate to datetime
            df['Senddate'] = pd.to_datetime(df['Senddate'], format='%d-%m-%Y')
            
            # Convert Hours from HH:MM format to decimal hours
            df['Hours_decimal'] = df['Hours'].apply(self.convert_time_to_hours)
            
            # Convert decimal hours to timedelta
            df['Hours_timedelta'] = pd.to_timedelta(df['Hours_decimal'], unit='h')
            
            # Create Timestamp by adding date and hours
            df['Timestamp'] = df['Senddate'] + df['Hours_timedelta']
            
            # Drop intermediate columns
            df = df.drop(['Hours_timedelta', 'Hours_decimal'], axis=1)
            
            # Sort by timestamp
            df = df.sort_values('Timestamp').reset_index(drop=True)
            
            # Add time-based features
            df['Hour'] = df['Timestamp'].dt.hour
            df['Minute'] = df['Timestamp'].dt.minute
            df['Day'] = df['Timestamp'].dt.day
            df['Month'] = df['Timestamp'].dt.month
            df['DayOfWeek'] = df['Timestamp'].dt.dayofweek
            df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)
            
            # Add cyclical time features
            df['Hour_sin'] = np.sin(2 * np.pi * df['Hour']/24)
            df['Hour_cos'] = np.cos(2 * np.pi * df['Hour']/24)
            df['Month_sin'] = np.sin(2 * np.pi * df['Month']/12)
            df['Month_cos'] = np.cos(2 * np.pi * df['Month']/12)
            
            self.logger.info("Data loaded and preprocessed successfully")
            self.logger.info(f"DataFrame shape: {df.shape}")
            return df
            
        except Exception as e:
            self.logger.error(f"Error loading data: {str(e)}")
            raise


    def prepare_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Prepare features for model training.
        
        Args:
            df (pd.DataFrame): Input DataFrame
            
        Returns:
            pd.DataFrame: DataFrame with prepared features
        """
        try:
            # Create a copy to avoid modifying the original
            df_prepared = df.copy()
            
            # List of columns to exclude from features
            exclude_cols = [
                'Timestamp', 
                'Senddate', 
                'Hours',  # Original time string
            ]
            
            # Remove any columns that shouldn't be used as features
            feature_cols = [col for col in df_prepared.columns if col not in exclude_cols]
            df_prepared = df_prepared[feature_cols]
            
            # Log the features being used
            self.logger.info(f"Features used: {feature_cols}")
            
            return df_prepared
            
        except Exception as e:
            self.logger.error(f"Error preparing features: {str(e)}")
            raise
            
    def train_model(self, df: pd.DataFrame, target: str, session_id: int) -> object:
        """
        Train a model for the specified target variable.
        
        Args:
            df (pd.DataFrame): Input DataFrame
            target (str): Target variable name
            session_id (int): Session ID for reproducibility
            
        Returns:
            object: Trained model
        """
        try:
            # Map the target name to the standardized version
            standardized_target = self.get_standardized_column_name(target)
            
            # Check if the standardized target column is in DataFrame
            if standardized_target not in df.columns:
                self.logger.error(f"Column '{target}' not found in data. Available columns: {df.columns}")
                raise ValueError(f"Invalid value for the target parameter. Column '{target}' not found in the data.")
            
            self.logger.info(f"Training model for target: {target}")

            # Prepare features
            df_prepared = self.prepare_features(df)
            
            # Setup PyCaret with additional parameters
            reg = setup(
                df_prepared,
                target=standardized_target,
                session_id=session_id,
                fold_strategy='timeseries',
                data_split_shuffle=False,
                fold_shuffle=False,
                log_experiment=True,
                experiment_name=f"{target}_experiment",
                feature_selection=True,
                remove_multicollinearity=True,
                multicollinearity_threshold=0.9,
                normalize=True
            )
            
            # Train and select best model
            best_model = compare_models(n_select=1)
            
            self.logger.info(f"Model training completed for {target}")
            return best_model
            
        except ValueError as ve:
            self.logger.error(f"Error training model for {target}: {str(ve)}")
            raise
        except Exception as e:
            self.logger.error(f"Error training model for {target}: {str(e)}")
            raise

            
    def generate_predictions(self, model: object, df: pd.DataFrame, target: str) -> pd.Series:
        """
        Generate predictions using the trained model.
        
        Args:
            model (object): Trained model
            df (pd.DataFrame): Input DataFrame
            target (str): Target variable name
            
        Returns:
            pd.Series: Predictions
        """
        try:
            # Prepare features for prediction
            df_prepared = self.prepare_features(df)
            
            # Generate predictions using the trained model
            predictions = predict_model(model, data=df_prepared)
            
            # Check for either 'Label' or 'prediction_label' in predictions
            if 'Label' in predictions.columns:
                label_column = 'Label'
            elif 'prediction_label' in predictions.columns:
                label_column = 'prediction_label'
            else:
                self.logger.error(f"Neither 'Label' nor 'prediction_label' column found in predictions. Available columns: {predictions.columns}")
                raise KeyError("Neither 'Label' nor 'prediction_label' column found in predictions DataFrame")
            
            self.logger.info(f"Predictions generated for {target}")
            return predictions[label_column]
            
        except KeyError as ke:
            self.logger.error(f"Prediction column error for {target}: {str(ke)}")
            raise
        except Exception as e:
            self.logger.error(f"Error generating predictions for {target}: {str(e)}")
            raise

            
    def run_prediction_pipeline(self) -> pd.DataFrame:
        """
        Run the complete prediction pipeline.
        
        Returns:
            pd.DataFrame: DataFrame with predictions
        """
        try:
            # Load data
            df = self.load_data()

            original_targets = {
                'TOU_rates': {
                    'column': 'TOU_rates_INR',
                    'session_id': 123
                },
                'Solar_generation': {
                    'column': 'Solar_energy_Generation_kWh',
                    'session_id': 456
                }
            }
            
            # Train models and generate predictions for each target
            targets = {
                name: {
                    'column': self.get_standardized_column_name(params['column']),
                    'session_id': params['session_id']
                }
                for name, params in original_targets.items()
            }
            
            for name, params in targets.items():
                # Train model
                self.models[name] = self.train_model(
                    df, 
                    params['column'],
                    params['session_id']
                )
                
                # Generate predictions
                df[f'Predicted_{params["column"]}'] = self.generate_predictions(
                    self.models[name],
                    df,
                    params['column']
                )
            
            # Save predictions
            output_path = Path(self.config['output_path'])
            output_path.parent.mkdir(parents=True, exist_ok=True)
            df.to_csv(output_path, index=False)
            
            self.logger.info(f"Predictions saved to {output_path}")
            return df
            
        except Exception as e:
            self.logger.error(f"Error in prediction pipeline: {str(e)}")
            raise


In [8]:
def main():
    """Main function to run the energy prediction pipeline."""
    try:
        predictor = EnergyPredictor()
        results = predictor.run_prediction_pipeline()
        print("Prediction pipeline completed successfully")
        
    except Exception as e:
        print(f"Error running prediction pipeline: {str(e)}")
        raise

if __name__ == "__main__":
    main()

2024-10-26 12:29:51,869 - __main__ - INFO - Configuration loaded successfully
2024-10-26 12:29:51,890 - __main__ - INFO - Column names standardized:
2024-10-26 12:29:51,891 - __main__ - INFO -   Solar energy Generation  (kWh) -> Solar_energy_Generation_kWh
2024-10-26 12:29:51,892 - __main__ - INFO -   consumptionValue (kW) -> consumptionValue_kW
2024-10-26 12:29:51,893 - __main__ - INFO -   Device_1_Consumption (kW) -> Device_1_Consumption_kW
2024-10-26 12:29:51,894 - __main__ - INFO -   Device_2_Consumption (kW) -> Device_2_Consumption_kW
2024-10-26 12:29:51,895 - __main__ - INFO -   Device_3_Consumption (kW) -> Device_3_Consumption_kW
2024-10-26 12:29:51,896 - __main__ - INFO -   Device_4_Consumption (kW) -> Device_4_Consumption_kW
2024-10-26 12:29:51,897 - __main__ - INFO -   TOU_rates (INR) -> TOU_rates_INR
2024-10-26 12:29:51,898 - __main__ - INFO -   Cummulative Energy Consumption -> Cummulative_Energy_Consumption
2024-10-26 12:29:51,912 - __main__ - INFO - Data loaded and prepro

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000196 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1251
[LightGBM] [Info] Number of data points in the train set: 3515, number of used features: 13
[LightGBM] [Info] Start training from score 17.824005


Unnamed: 0,Description,Value
0,Session id,123
1,Target,TOU_rates_INR
2,Target type,Regression
3,Original data shape,"(5022, 18)"
4,Transformed data shape,"(5022, 4)"
5,Transformed train set shape,"(3515, 4)"
6,Transformed test set shape,"(1507, 4)"
7,Numeric features,17
8,Rows with missing values,7.5%
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,1.2335,11.3174,1.9288,0.6652,0.114,0.1063,0.122
lightgbm,Light Gradient Boosting Machine,1.2908,11.56,1.991,0.654,0.1163,0.1076,0.16
et,Extra Trees Regressor,1.3632,11.8618,1.99,0.6393,0.1172,0.1126,0.139
rf,Random Forest Regressor,1.3503,12.4517,2.0952,0.6174,0.121,0.1091,0.152
dt,Decision Tree Regressor,1.3467,12.5071,2.1411,0.6158,0.1233,0.1091,0.087
ada,AdaBoost Regressor,2.098,14.0836,3.0453,0.5937,0.1608,0.1446,0.098
knn,K Neighbors Regressor,2.8617,21.6687,4.1877,0.3355,0.2456,0.2122,0.09
dummy,Dummy Regressor,5.6815,40.5182,6.3546,-0.1138,0.3826,0.4178,0.091
llar,Lasso Least Angle Regression,5.7707,41.9349,6.4473,-0.1575,0.3873,0.4265,0.086
lasso,Lasso Regression,5.7707,41.9349,6.4473,-0.1575,0.3873,0.4265,0.09


2024-10-26 12:30:30,327 - __main__ - INFO - Model training completed for TOU_rates_INR
2024-10-26 12:30:30,331 - __main__ - INFO - Features used: ['Solar_energy_Generation_kWh', 'consumptionValue_kW', 'Device_1_Consumption_kW', 'Device_2_Consumption_kW', 'Device_3_Consumption_kW', 'Device_4_Consumption_kW', 'TOU_rates_INR', 'Cummulative_Energy_Consumption', 'Hour', 'Minute', 'Day', 'Month', 'DayOfWeek', 'IsWeekend', 'Hour_sin', 'Hour_cos', 'Month_sin', 'Month_cos']


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,0.2383,0.3707,0.6089,0.9906,0.0327,0.0142


2024-10-26 12:30:30,600 - __main__ - INFO - Predictions generated for TOU_rates_INR
2024-10-26 12:30:30,602 - __main__ - INFO - Training model for target: Solar_energy_Generation_kWh
2024-10-26 12:30:30,605 - __main__ - INFO - Features used: ['Solar_energy_Generation_kWh', 'consumptionValue_kW', 'Device_1_Consumption_kW', 'Device_2_Consumption_kW', 'Device_3_Consumption_kW', 'Device_4_Consumption_kW', 'TOU_rates_INR', 'Cummulative_Energy_Consumption', 'Hour', 'Minute', 'Day', 'Month', 'DayOfWeek', 'IsWeekend', 'Hour_sin', 'Hour_cos', 'Month_sin', 'Month_cos', 'Predicted_TOU_rates_INR']


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000195 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1438
[LightGBM] [Info] Number of data points in the train set: 3515, number of used features: 13
[LightGBM] [Info] Start training from score 0.677411


Unnamed: 0,Description,Value
0,Session id,456
1,Target,Solar_energy_Generation_kWh
2,Target type,Regression
3,Original data shape,"(5022, 19)"
4,Transformed data shape,"(5022, 4)"
5,Transformed train set shape,"(3515, 4)"
6,Transformed test set shape,"(1507, 4)"
7,Numeric features,18
8,Rows with missing values,7.5%
9,Preprocess,True


2024/10/26 12:30:32 INFO mlflow.tracking.fluent: Experiment with name 'Solar_energy_Generation_kWh_experiment' does not exist. Creating a new experiment.


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dt,Decision Tree Regressor,1.1027,6.2215,1.2971,-1.1294,0.4687,,0.095
et,Extra Trees Regressor,1.0346,5.6096,1.2367,-1.225,0.4592,,0.134
rf,Random Forest Regressor,1.0731,5.9458,1.2627,-1.306,0.4584,,0.176
knn,K Neighbors Regressor,1.1118,5.6694,1.3807,-1.3544,0.5204,,0.108
lightgbm,Light Gradient Boosting Machine,1.2387,6.2642,1.4458,-1.4362,0.5449,,0.141
ada,AdaBoost Regressor,1.1345,5.474,1.3713,-6.7232,0.5463,,0.098
gbr,Gradient Boosting Regressor,1.0648,5.6522,1.2803,-19.089,0.4704,,0.138
huber,Huber Regressor,1.8303,19.0092,2.0902,-34.3913,0.6269,,0.093
br,Bayesian Ridge,2.1355,18.1406,2.4422,-63.3702,0.7837,,0.106
ridge,Ridge Regression,2.1462,18.1829,2.4562,-64.0387,0.787,,0.085


2024-10-26 12:31:03,068 - __main__ - INFO - Model training completed for Solar_energy_Generation_kWh
2024-10-26 12:31:03,071 - __main__ - INFO - Features used: ['Solar_energy_Generation_kWh', 'consumptionValue_kW', 'Device_1_Consumption_kW', 'Device_2_Consumption_kW', 'Device_3_Consumption_kW', 'Device_4_Consumption_kW', 'TOU_rates_INR', 'Cummulative_Energy_Consumption', 'Hour', 'Minute', 'Day', 'Month', 'DayOfWeek', 'IsWeekend', 'Hour_sin', 'Hour_cos', 'Month_sin', 'Month_cos', 'Predicted_TOU_rates_INR']


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Decision Tree Regressor,0.047,0.1378,0.3713,0.9249,0.1693,0.0595


2024-10-26 12:31:03,297 - __main__ - INFO - Predictions generated for Solar_energy_Generation_kWh
2024-10-26 12:31:03,386 - __main__ - INFO - Predictions saved to predictions_output.csv


Prediction pipeline completed successfully
