## Section A: Imports

In [None]:
!pip install pm4py deap

In [None]:
import random
import time
import numpy as np
from deap import base, creator, tools, algorithms
import pm4py
import os
from pm4py.algo.discovery.inductive import variants as im_variants
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.algo.discovery.inductive import variants
from pm4py.objects.conversion.process_tree import converter as pt_converter
from pm4py.objects.conversion.heuristics_net import converter as hn_converter
from pm4py.algo.filtering.log.variants import variants_filter
from pm4py.algo.filtering.log.attributes import attributes_filter

# For reproducibility: seed Python's `random` module (used by init_individual
# and the custom crossover/mutation operators in this file) and NumPy's global RNG.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Mount Google Drive so that the paths under /content/drive used later in this
# file (event log, log file) are reachable. This only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

## Section B: Load Data and Define Evaluation Functions

In [None]:
from pm4py.algo.filtering.log.start_activities import start_activities_filter
from pm4py.algo.filtering.log.end_activities import end_activities_filter

def filter_log_by_start_and_end(log, start_activity, end_activity):
    """
    Keep only the traces that begin with `start_activity` and finish with `end_activity`.

    :param log: The event log to filter (PM4Py log object)
    :param start_activity: Activity name every kept trace must start with (string)
    :param end_activity: Activity name every kept trace must end with (string)
    :return: The log after applying the start filter and then the end filter
    """
    allowed_starts = {start_activity}
    allowed_ends = {end_activity}

    # The end-activity filter runs on the output of the start-activity filter,
    # so a trace has to satisfy both conditions to be kept.
    return end_activities_filter.apply(
        start_activities_filter.apply(log, allowed_starts),
        allowed_ends,
    )

In [None]:
# Location of the XES event log. The commented-out assignments are alternative
# locations (a local sample and a sample on Google Drive) kept for quick switching.
# EVENT_LOG_PATH = "../data/PermitLog_Sample.xes"  # Update this with the actual path
# EVENT_LOG_PATH = "/content/drive/MyDrive/Masters & PhD/PhD/05_Courses/01_AutoML_Course/Project 2/PermitLog_Sample.xes"  # Update this with the actual path
EVENT_LOG_PATH = "/content/drive/MyDrive/Masters & PhD/PhD/05_Courses/01_AutoML_Course/Project 2/PermitLog.xes"  # Update this with the actual path

# Load the event log into the module-level `log`, which discover_model and the
# evaluate_* functions in this file read as a global.
# NOTE(review): depending on the installed pm4py version, read_xes may return a
# pandas DataFrame rather than an EventLog object; confirm that the log-based
# filters used by filter_log_by_start_and_end accept what is returned here.
log = pm4py.read_xes(EVENT_LOG_PATH)
# Keep only traces that start with the permit submission and end with the payment.
log = filter_log_by_start_and_end(log, "Permit SUBMITTED by EMPLOYEE", "Payment Handled")
# Print the size of the filtered log as a quick sanity check.
print(len(log))

def discover_model(algorithm, params):
    """
    Discover a Petri net from the module-level ``log`` with the chosen miner.

    :param algorithm: One of "alpha", "heuristic" or "inductive".
    :param params: Dict of miner hyper-parameters (may be empty or None).
        - "heuristic" reads "dependency_thresh" and "and_thresh" (default 0.5 each).
        - "inductive" reads "variant": either a short name ("im", "imf" or "imd",
          case-insensitive) or an already-resolved variant object, which is
          forwarded unchanged. When absent, the IMf variant is used.
        Every other key is currently ignored; in particular the alpha miner is
        always run without parameters.
    :return: Tuple ``(net, initial_marking, final_marking)``.
    :raises ValueError: If the algorithm or the inductive variant name is unknown.
    """
    params = params or {}

    if algorithm == "alpha":
        from pm4py.algo.discovery.alpha import algorithm as alpha_miner
        # The alpha miner runs with its defaults; nothing from `params` is forwarded.
        net, im, fm = alpha_miner.apply(log)
        return net, im, fm
    elif algorithm == "heuristic":
        from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
        dep_thresh = params.get("dependency_thresh", 0.5)
        and_thresh = params.get("and_thresh", 0.5)
        heu_net = heuristics_miner.apply_heu(
            log,
            parameters={
                heuristics_miner.Variants.CLASSIC.value.Parameters.DEPENDENCY_THRESH: dep_thresh,
                heuristics_miner.Variants.CLASSIC.value.Parameters.AND_MEASURE_THRESH: and_thresh
            }
        )
        # Convert the HeuristicsNet to Petri net
        net, im, fm = hn_converter.apply(heu_net)
        return net, im, fm
    elif algorithm == "inductive":
        from pm4py.algo.discovery.inductive import algorithm as inductive_miner
        from pm4py.objects.conversion.process_tree import converter as pt_converter

        # decode_individual hands over short string names, whereas the miner
        # expects a member of its own Variants enum, so names are translated
        # here. Non-string values are assumed to be resolved already.
        enum_member_by_name = {"im": "IM", "imf": "IMf", "imd": "IMd"}
        variant = params.get("variant", "imf")
        if isinstance(variant, str):
            member_name = enum_member_by_name.get(variant.lower())
            if member_name is None:
                raise ValueError("Unknown inductive miner variant: %r" % (variant,))
            variant = getattr(inductive_miner.Variants, member_name)

        process_tree = inductive_miner.apply(log, variant=variant)
        net, im, fm = pt_converter.apply(process_tree, variant=pt_converter.Variants.TO_PETRI_NET)
        return net, im, fm
    else:
        raise ValueError("Unknown algorithm specified.")


In [None]:
from pm4py.algo.conformance.tokenreplay import algorithm as token_replay
from pm4py.algo.evaluation.generalization import algorithm as generalization_algorithm
from pm4py.algo.evaluation.simplicity import algorithm as simplicity_algorithm

def evaluate_model(net, im, fm, log):
    """
    Score a discovered Petri net on fitness, generalization and simplicity.

    :param net: Petri net to evaluate
    :param im: Initial marking of the net
    :param fm: Final marking of the net
    :param log: Event log that is replayed on the net
    :return: Tuple (fitness, generalization, simplicity)
    """
    replay_results = token_replay.apply(log, net, im, fm)

    # Fitness is the sum of the available per-trace "trace_fitness" values
    # divided by the number of replayed traces. Anything other than a
    # non-empty list of replay results yields 0.0.
    fitness = 0.0
    if isinstance(replay_results, list) and len(replay_results) > 0:
        fitness = sum(
            entry["trace_fitness"] for entry in replay_results if "trace_fitness" in entry
        ) / len(replay_results)

    # Generalization and simplicity are both computed with the library defaults.
    generalization_value = generalization_algorithm.apply(log, net, im, fm)
    simplicity_value = simplicity_algorithm.apply(net)
    return fitness, generalization_value, simplicity_value

## Section C: Hyper-parameter Optimization Setup

In [None]:
# Algorithm index: 0-alpha, 1-heuristic, 2-inductive
# (gene 0 of an individual is an index into this list, see decode_individual)
algos = ["alpha", "heuristic", "inductive"]
# Inductive-miner variant names; gene 7 of an individual is an index into this list.
inductive_variants = ["im", "imd", "imf"]

# Three objectives, each with a positive weight.
# NOTE(review): evaluate_single_objective returns a 1-tuple while this fitness
# declares three weights; some DEAP versions may reject the length mismatch
# when the values are assigned. Confirm against the installed version.
creator.create("FitnessMulti", base.Fitness, weights=(1.0, 1.0, 1.0))  # multi objective
# Individuals are plain lists of genes that carry a FitnessMulti instance.
creator.create("Individual", list, fitness=creator.FitnessMulti)

def init_individual():
    """
    Build one random individual: a 9-gene list wrapped in ``creator.Individual``.

    Gene layout:
        0  algorithm index (0, 1 or 2)
        1  alpha: remove-loops flag
        2  alpha: ignore-noise flag
        3  heuristic: dependency threshold, drawn from [0.0, 0.9]
        4  heuristic: AND threshold, drawn from [0.0, 0.9]
        5  heuristic: noise threshold, drawn from [0.0, 0.5]
        6  inductive: noise threshold, drawn from [0.0, 0.5]
        7  inductive: index into ``inductive_variants``
        8  inductive: activity-frequency filter, drawn from [0.0, 1.0]
    """
    flags = [True, False]
    last_variant_index = len(inductive_variants) - 1

    # The list literal is evaluated left to right, so the genes are drawn in
    # index order.
    genes = [
        random.randint(0, 2),
        random.choice(flags),
        random.choice(flags),
        random.uniform(0.0, 0.9),
        random.uniform(0.0, 0.9),
        random.uniform(0.0, 0.5),
        random.uniform(0.0, 0.5),
        random.randint(0, last_variant_index),
        random.uniform(0.0, 1.0),
    ]
    return creator.Individual(genes)

# Toolbox for the single-objective run: "individual" builds one random
# individual and "population" repeats it into a list (the size is given at
# call time via n=...).
toolbox = base.Toolbox()
toolbox.register("individual", init_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Separate toolbox for the multi-objective run; it uses the same individual
# initializer but gets its own evaluate/mate/select registrations later on.
toolbox_multi = base.Toolbox()
toolbox_multi.register("individual", init_individual)
toolbox_multi.register("population", tools.initRepeat, list, toolbox_multi.individual)

In [None]:
def decode_individual(ind):
    """
    Translate an individual's genes into ``(algorithm_name, params)``.

    Gene 0 selects the algorithm from ``algos``; only the genes that belong to
    that algorithm are copied into ``params``. For the heuristic and inductive
    miners a missing gene (individual too short) falls back to a default, and
    an out-of-range variant index falls back to the first inductive variant.

    :param ind: Sequence of genes as produced by init_individual
    :return: Tuple (algorithm name, dict of hyper-parameters)
    """
    def gene(position, fallback):
        # Value stored at `position`, or `fallback` when the individual is too short.
        return ind[position] if len(ind) > position else fallback

    alg = algos[int(ind[0])]

    if alg == "alpha":
        params = {"remove_loops": ind[1], "ignore_noise": ind[2]}
    elif alg == "heuristic":
        params = {
            "dependency_thresh": gene(3, 0.5),
            "and_thresh": gene(4, 0.5),
            "noise_thresh": gene(5, 0.0),
        }
    elif alg == "inductive":
        variant_index_ok = len(ind) > 7 and 0 <= ind[7] < len(inductive_variants)
        params = {
            "noise_thresh": gene(6, 0.0),
            "variant": inductive_variants[ind[7]] if variant_index_ok else inductive_variants[0],
            "activity_frequency_filter": gene(8, 0.0),
        }
    else:
        params = {}

    return alg, params

### Evaluation functions

In [None]:
def evaluate_single_objective(individual):
    """
    Score an individual on replay fitness only.

    The individual is decoded, a model is discovered with the resulting
    settings and evaluated against the module-level ``log``.

    :return: One-element tuple ``(fitness,)``
    """
    petri_net, initial_marking, final_marking = discover_model(*decode_individual(individual))
    scores = evaluate_model(petri_net, initial_marking, final_marking, log)
    # Only the first score (fitness) is kept; generalization and simplicity are dropped.
    return (scores[0],)

In [None]:
def evaluate_multi_objective(individual):
    """
    Score an individual on fitness, generalization and simplicity.

    The individual is decoded, a model is discovered with the resulting
    settings and evaluated against the module-level ``log``; the three scores
    are printed before being returned.

    :return: Tuple (fitness, generalization, simplicity)
    """
    algorithm_name, hyperparams = decode_individual(individual)
    petri_net, initial_marking, final_marking = discover_model(algorithm_name, hyperparams)
    scores = evaluate_model(petri_net, initial_marking, final_marking, log)
    fitness, generalization, simplicity = scores
    print('Evaluations: Fitness: ', fitness, ' - Generalization: ', generalization, ' - Simplicity: ', simplicity)
    return fitness, generalization, simplicity

#### Crossover and Mutation Functions

In [None]:
# Attach the evaluation function of each run to its toolbox.
toolbox.register("evaluate", evaluate_single_objective)
toolbox_multi.register("evaluate", evaluate_multi_objective)
# NOTE: "mate" is registered again further down in this file with
# custom_crossover for both toolboxes, which is the registration in effect
# when the optimization functions run.
toolbox.register("mate", tools.cxBlend, alpha=0.5)
toolbox_multi.register("mate", tools.cxBlend, alpha=0.5)

def custom_crossover(parent1, parent2, indpb=0.5):
    """
    Recombine two individuals in place, gene by gene.

    Categorical genes (positions 0, 1, 2 and 7) are swapped between the two
    parents; continuous genes (positions 3, 4, 5, 6 and 8) are blended with a
    fixed weight of 0.5. Every gene is handled independently and is only
    touched with probability ``indpb``.

    :param parent1: First individual (modified in place)
    :param parent2: Second individual (modified in place)
    :param indpb: Per-gene probability of applying the operator
    :return: The same two parent objects, after modification
    """
    swap_positions = (0, 1, 2, 7)
    blend_positions = (3, 4, 5, 6, 8)
    weight = 0.5

    for pos in swap_positions:
        if random.random() < indpb:
            parent1[pos], parent2[pos] = parent2[pos], parent1[pos]

    for pos in blend_positions:
        if random.random() >= indpb:
            continue
        # Read both values before writing so each child uses the original genes.
        first, second = parent1[pos], parent2[pos]
        parent1[pos] = weight * first + (1 - weight) * second
        parent2[pos] = weight * second + (1 - weight) * first

    return parent1, parent2

In [None]:
def custom_mutation(ind, indpb=0.2):
    """
    Mutate an individual in place, gene by gene.

    Continuous genes receive Gaussian noise and are clipped back into their
    range; the algorithm index and the inductive variant index are redrawn;
    the two boolean genes are flipped. Every gene is handled independently and
    is only touched with probability ``indpb``.

    :param ind: Individual to mutate (modified in place)
    :param indpb: Per-gene mutation probability
    :return: One-element tuple containing the mutated individual
    """
    # position -> (upper bound, Gaussian sigma); the lower bound is always 0.0
    gaussian_spec = {
        3: (0.9, 0.1),   # heuristic_dep
        4: (0.9, 0.1),   # heuristic_and
        5: (0.5, 0.05),  # heuristic_noise
        6: (0.5, 0.05),  # inductive_noise
        8: (1.0, 0.1),   # inductive_activity_freq
    }
    for pos, (upper, sigma) in gaussian_spec.items():
        if random.random() < indpb:
            ind[pos] = min(upper, max(0.0, random.gauss(ind[pos], sigma)))

    # Algorithm index (0, 1, or 2)
    if random.random() < indpb:
        ind[0] = random.choice([0, 1, 2])

    # Boolean flags
    for flag_pos in (1, 2):
        if random.random() < indpb:
            ind[flag_pos] = not ind[flag_pos]

    # Inductive variant index
    if random.random() < indpb:
        ind[7] = random.randint(0, len(inductive_variants) - 1)

    return (ind,)

In [None]:
# Operators for the single-objective run. This "mate" registration replaces
# the tools.cxBlend one made earlier in this file.
toolbox.register("mate", custom_crossover, indpb=0.5)
toolbox.register("mutate", custom_mutation, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)

# Operators for the multi-objective run. No "mutate" operator is registered
# on this toolbox.
toolbox_multi.register("mate", custom_crossover, indpb=0.5)
toolbox_multi.register("select", tools.selTournament, tournsize=3)

#### Logger

In [None]:
import logging

# Configure the root logger once: every record at INFO level or above goes to
# a log file in the working directory, to a copy on the mounted Google Drive,
# and to the console stream.
logging.basicConfig(
    level=logging.INFO,  
    format="%(asctime)s - %(message)s", 
    handlers=[
        logging.FileHandler("multi_optimization_genetic.log"),
        logging.FileHandler("/content/drive/MyDrive/Masters & PhD/PhD/05_Courses/01_AutoML_Course/Project 2/multi_optimization_genetic.log"),
        logging.StreamHandler() 
    ]
)

## Section D: Single-Objective Optimization (Fitness)

In [None]:
def run_single_objective_optimization(time_budget_minutes=60):
    """
    Run the single-objective (fitness) genetic search until the time budget is used up.

    Each generation varies the population with ``algorithms.varAnd``, evaluates
    every offspring with ``toolbox.evaluate``, selects the next population with
    ``toolbox.select`` and reports statistics plus the best individual seen so far.

    NOTE(review): individuals carry the three-weight ``FitnessMulti`` while the
    evaluator returns a single value; confirm the installed DEAP version
    accepts that assignment.

    :param time_budget_minutes: Wall-clock budget in minutes. It is checked at
        the start of every generation, so a generation that has started always
        runs to completion.
    :return: None; progress and results are printed and logged.
    """
    def report(message, *args):
        # Send one %-style message to both the log and stdout. print() does not
        # interpolate extra arguments by itself, so format explicitly.
        logging.info(message, *args)
        print(message % args if args else message)

    # Evolutionary parameters
    NGEN = 1000  # Upper bound only; the loop normally stops on the time budget.
    CXPB = 0.9
    MUTPB = 0.1

    pop = toolbox.population(n=10)

    start_time = time.time()
    hof = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    stats.register("max", np.max)

    for gen in range(NGEN):
        consumed_time = time.time() - start_time
        if consumed_time > time_budget_minutes * 60:
            break
        report('Consumed Time Till Now: %.2f Minutes', consumed_time / 60)

        offspring = algorithms.varAnd(pop, toolbox, cxpb=CXPB, mutpb=MUTPB)
        fits = toolbox.map(toolbox.evaluate, offspring)
        for fit, ind in zip(fits, offspring):
            ind.fitness.values = fit

        report('Finished Iteration...')

        pop = toolbox.select(offspring, k=len(pop))
        hof.update(pop)
        record = stats.compile(pop)
        logging.info("Gen: %d, Record: %s", gen, record)

        # Best individual seen so far and its decoded hyper-parameters
        best_individual = hof[0]
        best_fitness = best_individual.fitness.values[0]  # Single objective: one value
        best_hyperparams = decode_individual(best_individual)

        report("Best Fitness in Generation %d: %f", gen, best_fitness)
        report("Best Hyperparameters in Generation %d: %s", gen, best_hyperparams)

    # The hall of fame stays empty when the budget ran out before the first
    # generation finished; indexing it would raise IndexError.
    if len(hof) == 0:
        report("No generation completed within the time budget; no best individual available.")
        return

    # Final best individual
    report("Best individual single objective: %s, Fitness: %s", hof[0], hof[0].fitness.values)
    report("Best hyperparameters: %s", decode_individual(hof[0]))

## Section E: Multi-Objective Optimization (Fitness + Generalization + Simplicity)

In [None]:
def run_multi_objective_optimization(time_budget_minutes=15):
    """
    Run the NSGA-II style multi-objective search (fitness, generalization, simplicity).

    The initial population is evaluated once. Each generation then picks
    parents with ``tools.selTournamentDCD``, recombines clones of them with
    ``toolbox_multi.mate``, evaluates the changed offspring and keeps the best
    ``len(pop)`` individuals of parents plus offspring via ``tools.selNSGA2``.
    No mutation operator is applied (none is registered on ``toolbox_multi``).

    :param time_budget_minutes: Wall-clock budget in minutes for the
        generational loop (the initial evaluation is not counted). It is
        checked at the start of every generation.
    :return: None; progress and results are printed and logged.
    """
    def report(message, *args):
        # Send one %-style message to both the log and stdout. print() does not
        # interpolate extra arguments by itself, so format explicitly.
        logging.info(message, *args)
        print(message % args if args else message)

    # Evolutionary parameters
    NGEN = 100000  # Upper bound only; the loop normally stops on the time budget.
    CXPB = 0.9

    # Initialize and evaluate the population.
    pop = toolbox_multi.population(n=20)
    fits = toolbox_multi.map(toolbox_multi.evaluate, pop)
    for ind, fit in zip(pop, fits):
        ind.fitness.values = fit
    # With k == len(pop) nobody is discarded; the call is made for the crowding
    # distance it assigns, which selTournamentDCD relies on.
    pop = tools.selNSGA2(pop, k=len(pop))

    start_time = time.time()
    # The statistics functions receive the list of fitness tuples (the key
    # below), not the individuals. axis=0 yields one value per objective, in
    # the order (fitness, generalization, simplicity).
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean, axis=0)
    stats.register("min", np.min, axis=0)
    stats.register("max", np.max, axis=0)
    # Mean simplicity / generalization are read from the already-computed
    # fitness tuples instead of re-mining a model per individual.
    stats.register("simplicity", lambda fitnesses: np.mean([f[2] for f in fitnesses]))
    stats.register("generalization", lambda fitnesses: np.mean([f[1] for f in fitnesses]))
    hof = tools.HallOfFame(1, similar=lambda x, y: x == y)
    # Seed the hall of fame so a best individual exists even if the budget
    # expires before the first generation.
    hof.update(pop)

    for gen in range(NGEN):
        consumed_time = time.time() - start_time
        if consumed_time > time_budget_minutes * 60:
            break
        report("Consumed Time Till Now: %.2f Minutes", consumed_time / 60)

        offspring = [toolbox_multi.clone(ind) for ind in tools.selTournamentDCD(pop, len(pop))]

        for ind1, ind2 in zip(offspring[::2], offspring[1::2]):
            if random.random() < CXPB:
                toolbox_multi.mate(ind1, ind2)
                # Deleting the values marks the fitness as invalid so the
                # children are re-evaluated below. Assigning zeros instead
                # would leave them "valid" and they would never be evaluated.
                del ind1.fitness.values
                del ind2.fitness.values

        invalid_inds = [ind for ind in offspring if not ind.fitness.valid]
        fits = toolbox_multi.map(toolbox_multi.evaluate, invalid_inds)
        for fit, ind in zip(fits, invalid_inds):
            ind.fitness.values = fit

        print("Finished an iteration...")
        pop = tools.selNSGA2(pop + offspring, k=len(pop))
        hof.update(pop)
        record = stats.compile(pop)
        report("Gen: %d, Record (fitness, generalization, simplicity): %s", gen, record)

    report("Best individual multi objective: %s, Fitness: %s", hof[0], hof[0].fitness.values)

## Section F: Different Time Budgets

In [None]:
# Run the single-objective (fitness-only) search with a 60-minute time budget.
run_single_objective_optimization(time_budget_minutes=60)

In [None]:
# Run the multi-objective (fitness, generalization, simplicity) search with a
# 60-minute time budget, overriding the function's 15-minute default.
run_multi_objective_optimization(time_budget_minutes=60)