## Section A: Imports

In [None]:
import random
import time
import numpy as np
from deap import base, creator, tools, algorithms
import pm4py
import os
from pm4py.algo.discovery.inductive import variants as im_variants
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.algo.discovery.inductive import variants
from pm4py.objects.conversion.process_tree import converter as pt_converter
from pm4py.objects.conversion.heuristics_net import converter as hn_converter
from pm4py.algo.filtering.log.variants import variants_filter
from pm4py.algo.filtering.log.attributes import attributes_filter

# For reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

## Section B: Load Data and Define Evaluation Functions

In [None]:
from pm4py.algo.filtering.log.start_activities import start_activities_filter
from pm4py.algo.filtering.log.end_activities import end_activities_filter

def filter_log_by_start_and_end(log, start_activity, end_activity):
    """
    Keep only the traces that begin with `start_activity` and finish with `end_activity`.

    :param log: The event log to filter (PM4Py log object)
    :param start_activity: Activity every retained trace must start with (string)
    :param end_activity: Activity every retained trace must end with (string)
    :return: The log after both filters have been applied
    """
    admitted_starts = {start_activity}
    admitted_ends = {end_activity}

    # The end-activity filter runs on the output of the start-activity filter,
    # so a trace has to satisfy both conditions to be kept.
    return end_activities_filter.apply(
        start_activities_filter.apply(log, admitted_starts),
        admitted_ends,
    )

In [None]:
# Location of the XES event log. Exactly one EVENT_LOG_PATH assignment is active;
# the commented-out lines are alternative locations (sample file, Google Drive copies).
# EVENT_LOG_PATH = "../data/PermitLog_Sample.xes"  # Update this with the actual path
EVENT_LOG_PATH = "../data/PermitLog.xes"  # Update this with the actual path
# EVENT_LOG_PATH = "/content/drive/MyDrive/Masters & PhD/PhD/05_Courses/01_AutoML_Course/Project 2/PermitLog_Sample.xes"  # Update this with the actual path
# EVENT_LOG_PATH = "/content/drive/MyDrive/Masters & PhD/PhD/05_Courses/01_AutoML_Course/Project 2/PermitLog.xes"  # Update this with the actual path

# Load the event log into the module-level `log`, which discover_model() and the
# evaluation functions read directly.
log = pm4py.read_xes(EVENT_LOG_PATH)
# Restrict the log to traces that start with the permit submission and end with the payment.
log = filter_log_by_start_and_end(log, "Permit SUBMITTED by EMPLOYEE", "Payment Handled")
# Optional extra filtering (currently disabled):
# Keep only frequent variants (e.g., top 80%)
#log = variants_filter.apply(log, parameters={"decreasingFactor": 0.8})
# Remove rare activities (e.g., occurring in less than 5% of cases)
#log = attributes_filter.filter_event_attribute_values(log, "concept:name", retainPercentage=0.95)

def discover_model(algorithm, params):
    """
    Mine a Petri net from the module-level `log` with the selected discovery algorithm.

    :param algorithm: One of "alpha", "heuristic" or "inductive"
    :param params: Dict of decoded hyperparameters. Only "dependency_thresh" and
        "and_thresh" (heuristic) and "variant" (inductive) are read here; any
        other keys are ignored.
    :return: Tuple (net, initial marking, final marking)
    :raises ValueError: If `algorithm` is not one of the supported names
    """
    if algorithm == "alpha":
        from pm4py.algo.discovery.alpha import algorithm as alpha_miner

        # The alpha miner is run with its defaults; `params` is not consulted.
        petri_net, initial_marking, final_marking = alpha_miner.apply(log)
        return petri_net, initial_marking, final_marking

    if algorithm == "heuristic":
        from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner

        classic_parameters = heuristics_miner.Variants.CLASSIC.value.Parameters
        miner_parameters = {
            classic_parameters.DEPENDENCY_THRESH: params.get("dependency_thresh", 0.5),
            classic_parameters.AND_MEASURE_THRESH: params.get("and_thresh", 0.5),
        }
        heuristics_net = heuristics_miner.apply_heu(log, parameters=miner_parameters)

        # The miner yields a HeuristicsNet; convert it to a Petri net.
        petri_net, initial_marking, final_marking = hn_converter.apply(heuristics_net)
        return petri_net, initial_marking, final_marking

    if algorithm == "inductive":
        from pm4py.algo.discovery.inductive import algorithm as inductive_miner
        from pm4py.algo.discovery.inductive import variants
        from pm4py.objects.conversion.process_tree import converter as pt_converter

        chosen_variant = params.get("variant", variants.imf)
        tree = inductive_miner.apply(log, variant=chosen_variant)

        # The miner yields a process tree; convert it to a Petri net.
        petri_net, initial_marking, final_marking = pt_converter.apply(
            tree, variant=pt_converter.Variants.TO_PETRI_NET
        )
        return petri_net, initial_marking, final_marking

    raise ValueError("Unknown algorithm specified.")


In [None]:
from pm4py.algo.conformance.tokenreplay import algorithm as token_replay
from pm4py.algo.evaluation.generalization import algorithm as generalization_algorithm
from pm4py.algo.evaluation.simplicity import algorithm as simplicity_algorithm

def evaluate_model(net, im, fm, log):
    """
    Score a discovered Petri net on fitness, generalization and simplicity.

    :param net: Petri net to evaluate
    :param im: Initial marking of the net
    :param fm: Final marking of the net
    :param log: Event log the model is replayed against
    :return: Tuple (fitness, generalization, simplicity)
    """
    # Fitness: token-based replay of the log on the model.
    replay_results = token_replay.apply(log, net, im, fm)

    # Average the per-trace fitness. Entries without a "trace_fitness" key add
    # nothing to the sum but still count in the denominator; a non-list result
    # or an empty list yields 0.0.
    fitness = 0.0
    if isinstance(replay_results, list):
        trace_count = len(replay_results)
        fitness_sum = sum(
            entry["trace_fitness"] for entry in replay_results if "trace_fitness" in entry
        )
        if trace_count > 0:
            fitness = fitness_sum / trace_count

    # Generalization, computed with the library's default variant.
    generalization_value = generalization_algorithm.apply(log, net, im, fm)

    # Simplicity depends on the net only.
    simplicity_value = simplicity_algorithm.apply(net)

    return fitness, generalization_value, simplicity_value


## Section C: Hyper-parameter Optimization Setup

In [None]:
# Define the search space
# An individual is a list of integer genes; every gene is an index into one of the
# value lists below. The tools.initCycle generators registered on the toolboxes and
# the bounds given to the mutation operator use this gene order:
#   0: algorithm                      -> algos
#   1: alpha remove_loops             -> alpha_remove_loops_values
#   2: alpha ignore_noise             -> alpha_ignore_noise_values
#   3: heuristic dependency threshold -> heuristic_dep_values
#   4: heuristic AND threshold        -> heuristic_and_values
#   5: heuristic noise threshold      -> heuristic_noise_values
#   6: inductive noise threshold      -> inductive_noise_values
#   7: inductive variant              -> inductive_variant_values
#   8: inductive activity frequency   -> inductive_activity_frequency_values

# Algorithm selection (gene 0): 0 = alpha, 1 = heuristic, 2 = inductive
algos = ["alpha", "heuristic", "inductive"]

# Alpha Miner hyperparameters
# discover_model() runs the alpha miner without parameters, so these two genes
# act as placeholders only.
alpha_remove_loops_values = [True, False]
alpha_ignore_noise_values = [True, False]

# Heuristic Miner hyperparameters
heuristic_dep_values = [0.0, 0.2, 0.5, 0.9]
heuristic_and_values = [0.0, 0.2, 0.5, 0.9]
heuristic_noise_values = [0.0, 0.1, 0.3, 0.5]

# Inductive Miner hyperparameters
inductive_noise_values = [0.0, 0.1, 0.2, 0.3, 0.5]
inductive_activity_frequency_values = [0.0, 0.1, 0.3, 0.5, 0.7, 1.0]

# Inductive variants - you need to adapt as per PM4Py variants.
# The list holds the variant objects themselves (not their names) because the
# decoded value is handed straight to the miner as `variant`.
from pm4py.algo.discovery.inductive.variants import imf, imd
inductive_variant_values = [imf, imd]

# For alpha miner, parameters won't matter, but we keep them as placeholders.

# Fitness definitions:
# creator.create() registers the new class as an attribute of the `creator` module,
# which is why the classes are referenced below as creator.<Name>.
# For single objective: maximize fitness
creator.create("FitnessSingle", base.Fitness, weights=(1.0,))
# Individuals are plain lists of genes that carry a FitnessSingle instance.
creator.create("IndividualSingle", list, fitness=creator.FitnessSingle)

# For multi-objective: maximize fitness, generalization, simplicity
# (one positive weight per objective, in that order)
creator.create("FitnessMulti", base.Fitness, weights=(1.0, 1.0, 1.0))  # Adjust weights if needed
creator.create("IndividualMulti", list, fitness=creator.FitnessMulti)

def init_individual_single(icls):
    """
    Build one random individual for the single-objective scenario.

    The individual carries one gene per search-space list, in the same order as
    the `tools.initCycle` generators and the bounds given to the mutation
    operator: algorithm, alpha remove_loops, alpha ignore_noise, heuristic
    dependency threshold, heuristic AND threshold, heuristic noise threshold,
    inductive noise threshold, inductive variant, inductive activity frequency.
    Every gene is a valid index into its value list.

    :param icls: Individual class (any callable that accepts a list of genes)
    :return: `icls` instance holding nine integer indices
    """
    gene_value_lists = (
        algos,
        alpha_remove_loops_values,
        alpha_ignore_noise_values,
        heuristic_dep_values,
        heuristic_and_values,
        heuristic_noise_values,
        inductive_noise_values,
        inductive_variant_values,
        inductive_activity_frequency_values,
    )
    # Deriving each bound from the list length keeps the genes valid when the
    # search space is edited (previously the algorithm bound was hard-coded and
    # only five genes were produced, which did not match the nine-gene layout).
    return icls([random.randint(0, len(values) - 1) for values in gene_value_lists])

def init_individual_multi(icls):
    """
    Build one random individual for the multi-objective scenario.

    The genome is generated exactly as in the single-objective case, so the work
    is delegated to `init_individual_single`.

    :param icls: Individual class used to wrap the generated genes
    :return: The newly created individual
    """
    individual = init_individual_single(icls)
    return individual

# Initial toolboxes built on the init_individual_* helpers.
# NOTE: `toolbox_single` and `toolbox_multi` are rebound to fresh Toolbox objects in
# the later "Genetic Algorithm Setup" / "Toolbox for multi-objective" cells, so when
# the notebook is run top to bottom those later registrations are the ones in effect.
toolbox_single = base.Toolbox()
# "individual" creates one IndividualSingle; "population" repeats it n times into a list.
toolbox_single.register("individual", init_individual_single, creator.IndividualSingle)
toolbox_single.register("population", tools.initRepeat, list, toolbox_single.individual)

toolbox_multi = base.Toolbox()
# Same structure for the multi-objective individuals.
toolbox_multi.register("individual", init_individual_multi, creator.IndividualMulti)
toolbox_multi.register("population", tools.initRepeat, list, toolbox_multi.individual)

In [None]:
# Define the individual encoding and decoding
def decode_individual(ind):
    """
    Decode an individual into the algorithm name and its parameter dict.

    The individual must hold nine integer genes, each an index into the matching
    search-space list, in the order used by the toolbox generators and the
    mutation bounds:

        0: algos
        1: alpha_remove_loops_values
        2: alpha_ignore_noise_values
        3: heuristic_dep_values
        4: heuristic_and_values
        5: heuristic_noise_values
        6: inductive_noise_values
        7: inductive_variant_values
        8: inductive_activity_frequency_values

    Only the genes belonging to the selected algorithm are decoded; the others
    are ignored.

    :param ind: Sequence of nine integer genes
    :return: Tuple (algorithm name, dict of decoded hyperparameters)
    :raises ValueError: If `ind` does not have nine genes or the algorithm is unknown
    :raises IndexError: If a gene is outside the range of its value list
    """
    if len(ind) != 9:
        raise ValueError(f"Expected an individual with 9 genes, got {len(ind)}.")

    # Name every gene once so each branch reads its own positions. Previously all
    # branches read genes 1-3, i.e. the heuristic and inductive miners were driven
    # by the alpha-miner genes and genes 4-8 never influenced the result.
    (
        alg_idx,
        alpha_loops_idx,
        alpha_noise_idx,
        heuristic_dep_idx,
        heuristic_and_idx,
        heuristic_noise_idx,
        inductive_noise_idx,
        inductive_variant_idx,
        inductive_freq_idx,
    ) = ind

    alg = algos[alg_idx]
    if alg == "alpha":
        params = {
            "remove_loops": alpha_remove_loops_values[alpha_loops_idx],
            "ignore_noise": alpha_ignore_noise_values[alpha_noise_idx],
        }
    elif alg == "heuristic":
        params = {
            "dependency_thresh": heuristic_dep_values[heuristic_dep_idx],
            "and_thresh": heuristic_and_values[heuristic_and_idx],
            "noise_thresh": heuristic_noise_values[heuristic_noise_idx],
        }
    elif alg == "inductive":
        params = {
            "noise_thresh": inductive_noise_values[inductive_noise_idx],
            "variant": inductive_variant_values[inductive_variant_idx],
            "activity_frequency_filter": inductive_activity_frequency_values[inductive_freq_idx],
        }
    else:
        # Reached only if `algos` gains an entry without a matching branch here.
        raise ValueError(f"No decoding rule for algorithm {alg!r}.")
    return alg, params

In [None]:
def evaluate_single_objective(individual):
    """
    Score an individual on replay fitness only.

    Any exception raised while decoding, discovering or evaluating propagates to
    the caller.

    :param individual: Encoded individual (list of gene indices)
    :return: One-element tuple (fitness,), the shape DEAP expects
    """
    algorithm_name, hyperparams = decode_individual(individual)

    # Mine the model for this algorithm/hyperparameter combination.
    petri_net, initial_marking, final_marking = discover_model(algorithm_name, hyperparams)

    # Generalization and simplicity are computed by evaluate_model but not used here.
    fitness_score, _generalization, _simplicity = evaluate_model(
        petri_net, initial_marking, final_marking, log
    )

    return (fitness_score,)

In [None]:
def evaluate_multi_objective(individual):
    """
    Score an individual on fitness, generalization and simplicity.

    If discovery or evaluation raises, the error is printed and the individual
    receives the worst score (0.0) on every objective. Decoding happens outside
    the guarded section, so decoding errors propagate.

    :param individual: Encoded individual (list of gene indices)
    :return: Tuple (fitness, generalization, simplicity)
    """
    algorithm_name, hyperparams = decode_individual(individual)

    try:
        petri_net, initial_marking, final_marking = discover_model(algorithm_name, hyperparams)
        fitness, generalization, simplicity = evaluate_model(
            petri_net, initial_marking, final_marking, log
        )
    except Exception as e:
        # A failing configuration must not abort the whole optimization run.
        print(f"Error evaluating individual: {e}")
        return 0.0, 0.0, 0.0

    return fitness, generalization, simplicity

In [None]:
# Genetic Algorithm Setup
from deap import creator, base, tools

# creator.create() warns and overwrites the class when the name already exists
# (e.g. created by an earlier cell or by re-running this one), so only create
# the classes that are still missing.

# Single objective fitness (e.g., fitness only)
if not hasattr(creator, "FitnessSingle"):
    creator.create("FitnessSingle", base.Fitness, weights=(1.0,))
if not hasattr(creator, "IndividualSingle"):
    creator.create("IndividualSingle", list, fitness=creator.FitnessSingle)

# Multi-objective fitness (e.g., fitness, generalization, simplicity)
if not hasattr(creator, "FitnessMulti"):
    creator.create("FitnessMulti", base.Fitness, weights=(1.0, 1.0, 1.0))
if not hasattr(creator, "IndividualMulti"):
    creator.create("IndividualMulti", list, fitness=creator.FitnessMulti)

# Toolbox for single objective
# tools.initCycle calls every generator once (n=1), in order, producing a nine-gene
# individual; each gene is a random valid index into its value list.
toolbox_single = base.Toolbox()
toolbox_single.register("individual", tools.initCycle, creator.IndividualSingle,
                        (lambda: random.randint(0, len(algos) - 1),  # Algorithm index
                         lambda: random.randint(0, len(alpha_remove_loops_values) - 1),  # Alpha: remove_loops
                         lambda: random.randint(0, len(alpha_ignore_noise_values) - 1),  # Alpha: ignore_noise
                         lambda: random.randint(0, len(heuristic_dep_values) - 1),  # Heuristic: dependency_thresh
                         lambda: random.randint(0, len(heuristic_and_values) - 1),  # Heuristic: and_thresh
                         lambda: random.randint(0, len(heuristic_noise_values) - 1),  # Heuristic: noise_thresh
                         lambda: random.randint(0, len(inductive_noise_values) - 1),  # Inductive: noise_thresh
                         lambda: random.randint(0, len(inductive_variant_values) - 1),  # Inductive: variant
                         lambda: random.randint(0, len(inductive_activity_frequency_values) - 1)),  # Inductive: freq
                        n=1)
toolbox_single.register("population", tools.initRepeat, list, toolbox_single.individual)


In [None]:
# Toolbox for multi-objective
# The individual is built by tools.initCycle, which calls each generator below once
# (n=1), in order. The result is a nine-gene IndividualMulti whose genes are random
# valid indices into the corresponding value lists. The lambdas look the lists up
# when they are called, not when they are registered.
toolbox_multi = base.Toolbox()
toolbox_multi.register("individual", tools.initCycle, creator.IndividualMulti,
                       (lambda: random.randint(0, len(algos) - 1),  # Gene 0 - algorithm index
                        lambda: random.randint(0, len(alpha_remove_loops_values) - 1),  # Gene 1 - alpha: remove_loops
                        lambda: random.randint(0, len(alpha_ignore_noise_values) - 1),  # Gene 2 - alpha: ignore_noise
                        lambda: random.randint(0, len(heuristic_dep_values) - 1),  # Gene 3 - heuristic: dependency_thresh
                        lambda: random.randint(0, len(heuristic_and_values) - 1),  # Gene 4 - heuristic: and_thresh
                        lambda: random.randint(0, len(heuristic_noise_values) - 1),  # Gene 5 - heuristic: noise_thresh
                        lambda: random.randint(0, len(inductive_noise_values) - 1),  # Gene 6 - inductive: noise_thresh
                        lambda: random.randint(0, len(inductive_variant_values) - 1),  # Gene 7 - inductive: variant
                        lambda: random.randint(0, len(inductive_activity_frequency_values) - 1)),  # Gene 8 - inductive: freq
                       n=1)
# A population is a plain list of n such individuals.
toolbox_multi.register("population", tools.initRepeat, list, toolbox_multi.individual)

In [None]:
# Register the genetic operators of the single-objective toolbox
toolbox_single.register("evaluate", evaluate_single_objective)
toolbox_single.register("mate", tools.cxTwoPoint)

# Inclusive upper bound of each of the nine genes: the last valid index of the
# value list that gene points into. The lower bound is 0 for every gene.
single_gene_upper_bounds = [
    len(values) - 1
    for values in (
        algos,
        alpha_remove_loops_values,
        alpha_ignore_noise_values,
        heuristic_dep_values,
        heuristic_and_values,
        heuristic_noise_values,
        inductive_noise_values,
        inductive_variant_values,
        inductive_activity_frequency_values,
    )
]
toolbox_single.register(
    "mutate",
    tools.mutUniformInt,
    low=[0] * 9,
    up=single_gene_upper_bounds,
    indpb=0.2,
)

# Tournament selection with three contestants per tournament.
toolbox_single.register("select", tools.selTournament, tournsize=3)

In [None]:
# Register the genetic operators of the multi-objective toolbox
toolbox_multi.register("evaluate", evaluate_multi_objective)
toolbox_multi.register("mate", tools.cxTwoPoint)

# Inclusive upper bound of each of the nine genes: the last valid index of the
# value list that gene points into. The lower bound is 0 for every gene.
multi_gene_upper_bounds = [
    len(values) - 1
    for values in (
        algos,
        alpha_remove_loops_values,
        alpha_ignore_noise_values,
        heuristic_dep_values,
        heuristic_and_values,
        heuristic_noise_values,
        inductive_noise_values,
        inductive_variant_values,
        inductive_activity_frequency_values,
    )
]
toolbox_multi.register(
    "mutate",
    tools.mutUniformInt,
    low=[0] * 9,
    up=multi_gene_upper_bounds,
    indpb=0.2,
)

# NSGA-II selection for the multi-objective case.
toolbox_multi.register("select", tools.selNSGA2)

## Section D: Single-Objective Optimization (Fitness)

In [None]:
import logging

# Send INFO-level (and higher) records, prefixed with a timestamp, to both a log
# file and the console.
single_objective_log_handlers = [
    logging.FileHandler("optimization.log"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
    handlers=single_objective_log_handlers,
)

In [None]:
def run_single_objective_optimization(time_budget_minutes=15):
    """
    Run the single-objective (fitness only) evolutionary search within a time budget.

    Each generation creates LAMBDA offspring from the current population with
    `algorithms.varOr`, evaluates them, and selects the next population of MU
    individuals from the offspring only. Progress and the best configuration
    found so far are written through `logging`.

    The budget is checked at the start of each generation, so the run can
    overshoot it by the duration of one generation.

    :param time_budget_minutes: Wall-clock budget for the evolutionary loop, in minutes
    :return: None; results are reported through the logging output
    """
    # Initialize population (it only seeds varOr; it is never evaluated itself)
    pop = toolbox_single.population(n=10)

    # Evolutionary parameters
    NGEN = 1000  # A large number, but we'll stop early based on time budget.
    MU = 20
    LAMBDA = 20
    CXPB = 0.9
    MUTPB = 0.1

    start_time = time.time()
    hof = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    stats.register("max", np.max)

    # EA loop with time budget
    for gen in range(NGEN):
        consumed_time = time.time() - start_time
        if consumed_time > time_budget_minutes * 60:
            break
        logging.info('Consumed Time Till Now: %.2f Minutes', consumed_time / 60)

        offspring = algorithms.varOr(pop, toolbox_single, lambda_=LAMBDA, cxpb=CXPB, mutpb=MUTPB)

        # Evaluation (model discovery + replay) is the expensive step, so only
        # individuals without a valid fitness are evaluated. varOr invalidates the
        # fitness of everything it crosses over or mutates.
        invalid_inds = [ind for ind in offspring if not ind.fitness.valid]
        fits = toolbox_single.map(toolbox_single.evaluate, invalid_inds)
        for fit, ind in zip(fits, invalid_inds):
            ind.fitness.values = fit

        logging.info('Finished Iteration...')

        pop = toolbox_single.select(offspring, k=MU)
        hof.update(pop)
        record = stats.compile(pop)
        logging.info("Gen: %d, Record: %s", gen, record)

        # Extract the best individual and its hyperparameters
        best_individual = hof[0]
        best_fitness = best_individual.fitness.values[0]  # Single-objective, so one fitness value
        best_hyperparams = decode_individual(best_individual)  # Decode the hyperparameters

        logging.info("Best Fitness in Generation %d: %f", gen, best_fitness)
        logging.info("Best Hyperparameters in Generation %d: %s", gen, best_hyperparams)
        logging.info('Finished Iteration...')

    # The hall of fame is only filled inside the loop; with a zero or negative
    # budget no generation runs and there is nothing to report.
    if len(hof) == 0:
        logging.warning("No generation completed within the time budget; no best individual available.")
        return

    # Log final best individual
    logging.info("Best individual single objective: %s, Fitness: %s", hof[0], hof[0].fitness.values)
    logging.info("Best hyperparameters: %s", decode_individual(hof[0]))

## Section E: Multi-Objective Optimization (Fitness + Generalization + Simplicity)

In [None]:
# Configure logging
# logging.basicConfig() does nothing when the root logger already has handlers
# (for example after the single-objective logging cell has run), in which case
# "multi_objective_optimization.log" would never be written. Detach and close the
# existing root handlers first so the configuration below always takes effect.
_root_logger = logging.getLogger()
for _existing_handler in list(_root_logger.handlers):
    _root_logger.removeHandler(_existing_handler)
    _existing_handler.close()

logging.basicConfig(
    level=logging.INFO,  # Set the log level
    format="%(asctime)s - %(message)s",  # Log format includes timestamp
    handlers=[
        logging.FileHandler("multi_objective_optimization.log"),
        logging.StreamHandler()  # Log to console
    ]
)

In [None]:
def run_multi_objective_optimization(time_budget_minutes=15):
    """
    Run the multi-objective (fitness, generalization, simplicity) search within a time budget.

    The loop follows the NSGA-II scheme: parents are drawn with
    `tools.selTournamentDCD`, varied by crossover and mutation, evaluated, and
    the next population is selected with `tools.selNSGA2` from parents plus
    offspring. Progress is written through `logging`, and the final result is
    also printed.

    The budget is checked at the start of each generation, and the evaluation
    of the initial population is not counted against it.

    :param time_budget_minutes: Wall-clock budget for the evolutionary loop, in minutes
    :return: None; results are reported through the logging output and stdout
    """
    # Evolutionary parameters
    NGEN = 1000  # Generation cap; the time budget normally ends the run first.
    MU = 20  # Population size; a multiple of 4 to suit tools.selTournamentDCD (see DEAP docs).
    CXPB = 0.9  # Probability that a selected pair is crossed over.

    # Initialize population
    pop = toolbox_multi.population(n=MU)

    # Evaluate the initial population
    fits = toolbox_multi.map(toolbox_multi.evaluate, pop)
    for ind, fit in zip(pop, fits):
        ind.fitness.values = fit

    # No individual is dropped here (k == len(pop)); the call prepares the
    # population for the crowding-distance tournament used in the loop.
    pop = tools.selNSGA2(pop, k=len(pop))

    start_time = time.time()

    # axis=0 keeps the objectives separate: each statistic is a vector with one
    # entry per objective instead of a single number mixing all three.
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean, axis=0)
    stats.register("min", np.min, axis=0)
    stats.register("max", np.max, axis=0)

    # NOTE(review): HallOfFame(1) keeps a single "best" individual by DEAP's
    # fitness ordering, not a Pareto front; tools.ParetoFront may be a better
    # fit if the trade-off set is wanted.
    hof = tools.HallOfFame(1, similar=lambda x, y: x == y)
    # Seed the hall of fame with the evaluated initial population so a best
    # individual exists even if the budget expires before the first generation.
    hof.update(pop)

    for gen in range(NGEN):
        consumed_time = time.time() - start_time
        if consumed_time > time_budget_minutes * 60:
            break
        logging.info("Consumed Time Till Now: %.2f Minutes", consumed_time / 60)

        offspring = tools.selTournamentDCD(pop, len(pop))
        offspring = [toolbox_multi.clone(ind) for ind in offspring]

        # Variation: crossover with probability CXPB; mutation is always applied
        # (the per-gene mutation probability is the `indpb` registered with the operator).
        for ind1, ind2 in zip(offspring[::2], offspring[1::2]):
            if random.random() < CXPB:
                toolbox_multi.mate(ind1, ind2)
            toolbox_multi.mutate(ind1)
            toolbox_multi.mutate(ind2)
            del ind1.fitness.values
            del ind2.fitness.values

        # Evaluate invalid individuals
        invalid_inds = [ind for ind in offspring if not ind.fitness.valid]
        fits = toolbox_multi.map(toolbox_multi.evaluate, invalid_inds)
        for fit, ind in zip(fits, invalid_inds):
            ind.fitness.values = fit

        logging.info("Finished an iteration...")

        # Combine and select
        pop = tools.selNSGA2(pop + offspring, k=len(pop))
        hof.update(pop)
        record = stats.compile(pop)

        # Log fitness, generalization, and simplicity
        logging.info("Gen: %d, Record: %s", gen, record)
        best_individual = hof[0]
        fitness, generalization, simplicity = best_individual.fitness.values
        logging.info("Best Fitness: %.6f, Best Generalization: %.6f, Best Simplicity: %.6f", fitness, generalization, simplicity)

    # Log final best individual
    best_fitness, best_generalization, best_simplicity = hof[0].fitness.values
    logging.info("Best individual multi objective: %s", hof[0])
    logging.info("Final Best Fitness: %.6f, Best Generalization: %.6f, Best Simplicity: %.6f",
                 best_fitness, best_generalization, best_simplicity)
    # print() does not apply %-formatting to its arguments, so format explicitly.
    print("Final Best Fitness: %.6f, Best Generalization: %.6f, Best Simplicity: %.6f"
          % (best_fitness, best_generalization, best_simplicity))


## Section F: Different Time Budgets

In [None]:
# Run single objective with a 120-minute budget (currently disabled)
#print("Running single objective optimization (Fitness Only) - 120 min")
#run_single_objective_optimization(time_budget_minutes=120)

In [None]:
# Run the multi-objective optimization with a 120-minute time budget
print("Running multi-objective optimization (Fitness+Generalization+Simplicity) - 120 min")
run_multi_objective_optimization(time_budget_minutes=120)