In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow
import mlflow.sklearn
from azureml.core import Workspace
from datetime import datetime

In [20]:
class DataPreparation:
    """
    Load a CSV dataset, separate features from a binary target, and split
    the result into training and testing sets.
    This is part of the Data Preparation stage of the ML lifecycle.

    Attributes:
        data_path: Location of the CSV file, passed unchanged to ``pd.read_csv``.
    """
    def __init__(self, data_path):
        self.data_path = data_path

    def load_data(self):
        """Load the dataset from ``self.data_path``.

        Returns:
            pandas.DataFrame: The parsed CSV contents.
        """
        df = pd.read_csv(self.data_path)
        print(f"Data loaded successfully from {self.data_path}")
        return df

    def preprocess_data(self, df, target_column="Energy_Requirement", positive_label="Yes"):
        """Separate the feature columns from a 0/1-encoded target.

        Args:
            df: DataFrame holding both the features and the target column.
            target_column: Name of the target column. Defaults to
                "Energy_Requirement".
            positive_label: Value of the target that is encoded as 1; every
                other value (including missing values) is encoded as 0.
                Defaults to "Yes". The comparison is exact (case-sensitive).

        Returns:
            tuple: ``(X, y)`` where ``X`` is ``df`` without the target column
            and ``y`` is an int64 Series of 0/1 labels. ``df`` is not modified.

        Raises:
            KeyError: If ``target_column`` is not a column of ``df``.
        """
        if target_column not in df.columns:
            raise KeyError(
                f"Target column '{target_column}' not found; "
                f"available columns: {list(df.columns)}"
            )

        # Separate features and target
        X = df.drop(columns=[target_column])
        # Vectorized comparison instead of a per-row Python lambda.
        y = df[target_column].eq(positive_label).astype("int64")
        print("Data preprocessing completed.")
        return X, y

    def split_data(self, X, y, test_size=0.2, random_state=42):
        """Split features and target into training and testing sets.

        Args:
            X: Feature matrix.
            y: Target values aligned with ``X``.
            test_size: Forwarded to ``train_test_split``. Defaults to 0.2.
            random_state: Seed forwarded to ``train_test_split`` so the split
                is reproducible. Defaults to 42.

        Returns:
            tuple: ``(X_train, X_test, y_train, y_test)``.
        """
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
        print("Data split into training and testing sets.")
        return X_train, X_test, y_train, y_test

In [24]:
class ExperimentManager:
    """
    Class for managing ML experiments with multiple models.
    This is part of the Model Training and Experimentation stage of the ML lifecycle.

    Attributes:
        experiment_name: Name of the MLflow experiment the runs are logged to.
        ws: Azure ML ``Workspace`` loaded from the given configuration.
    """
    def __init__(self, experiment_name, workspace_config):
        """Connect MLflow tracking to an Azure ML workspace.

        Args:
            experiment_name: MLflow experiment to activate.
            workspace_config: Passed to ``Workspace.from_config`` to locate
                the workspace configuration.
        """
        self.experiment_name = experiment_name
        # Load the Azure ML workspace
        self.ws = Workspace.from_config(workspace_config)
        # Point MLflow at the workspace's tracking server; this is global
        # MLflow state, so it affects every later mlflow call in the kernel.
        mlflow.set_tracking_uri(self.ws.get_mlflow_tracking_uri())
        mlflow.set_experiment(self.experiment_name)
        print(f"Experiment '{self.experiment_name}' is set up in MLflow.")

    def train_and_log_models(self, models, X_train, y_train, X_test, y_test):
        """
        Train each model, evaluate it on the test set, and log one MLflow run per model.

        Each run records the model's display name, the estimator's
        hyperparameters (``model.get_params()``), the accuracy / precision /
        recall / F1 scores computed on the test set, and the fitted model
        under the "models" artifact path.

        Args:
            models: Mapping of display name -> unfitted scikit-learn estimator.
                The estimators are fitted in place.
            X_train, y_train: Training features and labels.
            X_test, y_test: Held-out features and labels used for evaluation.

        Returns:
            dict: ``{model_name: {"accuracy": ..., "precision": ...,
            "recall": ..., "f1_score": ...}}`` for every model trained.
        """
        results = {}
        for model_name, model in models.items():
            # Dynamically name the run based on the model and current date
            run_name = f"{model_name}-{datetime.now().strftime('%Y-%m-%d_%H-%M')}"
            with mlflow.start_run(run_name=run_name):
                print(f"Training and logging model: {model_name}")
                # Log the display name plus the estimator's hyperparameters so
                # runs can be compared and reproduced from the tracking UI.
                mlflow.log_param("model_name", model_name)
                mlflow.log_params(model.get_params())

                # Train the model
                model.fit(X_train, y_train)

                # Predict on test data
                y_pred = model.predict(X_test)

                # Evaluate the model
                metrics = {
                    "accuracy": accuracy_score(y_test, y_pred),
                    "precision": precision_score(y_test, y_pred),
                    "recall": recall_score(y_test, y_pred),
                    "f1_score": f1_score(y_test, y_pred)
                }

                # Log all metrics in a single call
                mlflow.log_metrics(metrics)

                # Log the trained model
                mlflow.sklearn.log_model(model, artifact_path="models")

                # Print logged metrics for debugging
                print(f"Model: {model_name}, Metrics: {metrics}")

            results[model_name] = metrics

        print(f"Experiment '{self.experiment_name}' completed. Check Azure ML Studio for results.")
        return results


In [25]:
class DataSaver:
    """
    Helper for persisting feature data so it can be reused later for batch
    prediction or testing.
    This is part of the Deployment/Inference Preparation stage of the ML lifecycle.
    """
    @staticmethod
    def save_data(X_test, save_path):
        """Write ``X_test`` to ``save_path`` as CSV, omitting the index column.

        Args:
            X_test: Object exposing ``to_csv`` (e.g. a pandas DataFrame).
            save_path: Destination passed to ``to_csv``.
        """
        X_test.to_csv(path_or_buf=save_path, index=False)
        confirmation = f"Prediction data saved at: {save_path}"
        print(confirmation)

In [26]:
# Main script: prepare the data, persist the hold-out features, then train
# and log four classifiers as separate MLflow runs.
if __name__ == "__main__":
    # Paths and configurations (relative to the notebook's working directory)
    DATA_PATH = "../data/energy_data/input_data/energy_data.csv"
    PREDICTION_FILE = "../data/energy_data/input_data/prediction_data.csv"
    WORKSPACE_CONFIG = "./config.json"
    EXPERIMENT_NAME = "energy-requirement-prediction"

    # Step 1: Data Preparation
    # Load the CSV, split off the "Energy_Requirement" target (encoded 0/1),
    # and make the train/test split with split_data's defaults
    # (test_size=0.2, random_state=42).
    data_prep = DataPreparation(DATA_PATH)
    df = data_prep.load_data()
    X, y = data_prep.preprocess_data(df)
    X_train, X_test, y_train, y_test = data_prep.split_data(X, y)

    # Step 2: Save prediction data
    # Only the test-set features are written; the labels (y_test) are not saved.
    DataSaver.save_data(X_test, PREDICTION_FILE)

    # Step 3: Experimentation
    # Keys become the MLflow run-name prefix and the "model_name" parameter.
    # The estimators use seed 44, which differs from the split seed (42).
    # NOTE(review): train_and_log_models only calls predict(), so SVC's
    # probability=True is not needed here — presumably kept for probability
    # scoring with the logged model later; confirm.
    models = {
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=44),
        "Logistic Regression": LogisticRegression(max_iter=200, random_state=44),
        "Decision Tree": DecisionTreeClassifier(random_state=44),
        "Support Vector Machine": SVC(probability=True, random_state=44)
    }

    # Constructing the manager connects to the Azure ML workspace and sets the
    # active MLflow experiment; training then logs one run per model.
    experiment_manager = ExperimentManager(EXPERIMENT_NAME, WORKSPACE_CONFIG)
    experiment_manager.train_and_log_models(models, X_train, y_train, X_test, y_test)

Data loaded successfully from ../data/energy_data/input_data/energy_data.csv
Data preprocessing completed.
Data split into training and testing sets.
Prediction data saved at: ../data/energy_data/input_data/prediction_data.csv
Experiment 'energy-requirement-prediction' is set up in MLflow.
Training and logging model: Random Forest
Model: Random Forest, Metrics: {'accuracy': 0.95, 'precision': 0.9431818181818182, 'recall': 0.9431818181818182, 'f1_score': 0.9431818181818182}
Training and logging model: Logistic Regression
Model: Logistic Regression, Metrics: {'accuracy': 0.835, 'precision': 0.8089887640449438, 'recall': 0.8181818181818182, 'f1_score': 0.8135593220338984}
Training and logging model: Decision Tree
Model: Decision Tree, Metrics: {'accuracy': 0.895, 'precision': 0.8850574712643678, 'recall': 0.875, 'f1_score': 0.88}
Training and logging model: Support Vector Machine
Model: Support Vector Machine, Metrics: {'accuracy': 0.92, 'precision': 0.9090909090909091, 'recall': 0.909090

