## CLIP Process Discovery Integration

### Imports

In [78]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import imagehash
from IPython.display import display, clear_output
import ipywidgets as widgets
from ipywidgets import GridBox, Layout
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.decomposition import PCA
import torch
from transformers import CLIPProcessor, CLIPModel
import shutil
from datetime import datetime

# TensorFlow related imports
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
import numpy as np
import cv2
import os

# PM4Py related imports
import pm4py
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.objects.conversion.process_tree import converter as pt_converter
from pm4py.visualization.petri_net import visualizer as pn_visualizer
from pm4py.visualization.bpmn import visualizer as bpmn_visualizer
from pm4py.objects.bpmn.exporter import exporter as bpmn_exporter
from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness_evaluator
from pm4py.algo.evaluation.precision import algorithm as precision_evaluator
from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator
from pm4py.algo.evaluation.simplicity import algorithm as simplicity_evaluator
from transformers import CLIPModel, CLIPProcessor, CLIPTokenizer

import gc
import random


### Lectura fichero

In [3]:
def read_ui_log_as_dataframe(log_path):
    """Load the UI log stored at ``log_path`` into a pandas DataFrame.

    The file is parsed as CSV using ``;`` as the field separator; every other
    ``pandas.read_csv`` option keeps its default value.
    """
    ui_log = pd.read_csv(log_path, sep=";")
    return ui_log

### Extracción de características

In [5]:
def extract_features_from_images(df, image_col, text_col, image_weight, text_weight, img_dir):
    """Attach weighted CLIP image+text embeddings to every row of ``df``.

    For each row the screenshot ``img_dir/<row[image_col]>`` and the text in
    ``row[text_col]`` are passed together through the pretrained
    ``openai/clip-vit-base-patch32`` model. The flattened image embedding is
    scaled by ``image_weight``, the flattened text embedding by
    ``text_weight``, and both are concatenated into one vector.

    Args:
        df: DataFrame with one row per event; modified in place.
        image_col: Column holding the image file name, relative to ``img_dir``.
        text_col: Column holding the text paired with the image.
        image_weight: Multiplier applied to the image embedding.
        text_weight: Multiplier applied to the text embedding.
        img_dir: Directory that contains the images.

    Returns:
        The same ``df`` with a new ``combined_features`` column (one NumPy
        vector per row).

    Raises:
        ValueError: If an image referenced by a row does not exist.
    """
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    combined_features = []

    for _, row in df.iterrows():
        text = row[text_col]
        image_path = os.path.join(img_dir, row[image_col])

        # Fail early with an explicit message instead of an opaque PIL error.
        if not os.path.exists(image_path):
            raise ValueError(f"La imagen no existe en {image_path}")

        # The context manager closes the file handle once the pixels have been
        # read by the processor; otherwise one handle leaks per row.
        with Image.open(image_path) as image:
            # truncation=True keeps the text within the tokenizer's maximum
            # length so long headers do not crash the model's forward pass.
            inputs = processor(text=[text], images=image, return_tensors="pt", truncation=True)

        with torch.no_grad():
            outputs = model(**inputs)

        image_features = outputs.image_embeds.cpu().numpy().flatten() * image_weight
        text_features = outputs.text_embeds.cpu().numpy().flatten() * text_weight

        combined_features.append(np.hstack((image_features, text_features)))

    df['combined_features'] = combined_features

    return df

In [37]:
def extract_features_from_images_with_tokenizer(df, image_col, text_col, image_weight, text_weight, img_dir, header_txt=False, text_path_col="header_txt", ocr_dir=None):
    """Attach weighted CLIP image+text embeddings, truncating long text first.

    The text of each row is tokenized with the CLIP tokenizer, cut to at most
    77 tokens and decoded back to a string before being handed, together with
    the row's image, to the pretrained ``openai/clip-vit-base-patch32`` model.
    The flattened image and text embeddings are scaled by ``image_weight`` and
    ``text_weight`` and concatenated.

    Args:
        df: DataFrame with one row per event; modified in place.
        image_col: Column holding the image file name, relative to ``img_dir``.
        text_col: Column holding the text (used when ``header_txt`` is false).
        image_weight: Multiplier applied to the image embedding.
        text_weight: Multiplier applied to the text embedding.
        img_dir: Directory that contains the images.
        header_txt: When true, the text is read from the file named in
            ``row[text_path_col]`` inside ``ocr_dir`` instead of ``text_col``.
        text_path_col: Column holding the text file name.
        ocr_dir: Directory with the text files. Defaults to
            ``logs/invoice_def/ocr_results``, the location that was previously
            hard-coded.

    Returns:
        The same ``df`` with a new ``combined_features`` column.

    Raises:
        FileNotFoundError: If a referenced text file does not exist.
        ValueError: If a referenced image does not exist.
    """
    if ocr_dir is None:
        ocr_dir = os.path.join("logs/invoice_def", "ocr_results")

    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

    combined_features = []

    for _, row in df.iterrows():
        if header_txt:
            txt_path = os.path.join(ocr_dir, row[text_path_col])
            if not os.path.exists(txt_path):
                raise FileNotFoundError(f"El archivo de texto no existe: {txt_path}")
            with open(txt_path, 'r') as file:
                text = file.read()
        else:
            text = row[text_col]

        print(text)
        # Truncate at token level, then go back to a string for the processor.
        input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=77)
        text = tokenizer.decode(input_ids[0], skip_special_tokens=True)

        # Build the full image path and make sure the file is there.
        image_path = os.path.join(img_dir, row[image_col])
        if not os.path.exists(image_path):
            raise ValueError(f"La imagen no existe en {image_path}")

        # Close the image handle as soon as the processor has read the pixels.
        with Image.open(image_path) as image:
            # Re-encoding the decoded string is not guaranteed to yield the same
            # token count, so truncation is requested again as a safety net.
            inputs = processor(text=[text], images=[image], return_tensors="pt", truncation=True)

        with torch.no_grad():
            outputs = model(**inputs)

        # Weight each modality and concatenate into a single vector.
        image_features = outputs.image_embeds.cpu().numpy().flatten() * image_weight
        text_features = outputs.text_embeds.cpu().numpy().flatten() * text_weight
        combined_features.append(np.hstack((image_features, text_features)))

    df['combined_features'] = combined_features
    return df

### Clusterización

In [6]:
def cluster_images(df, n_clusters_range, use_pca, n_components):
    """Cluster the rows of ``df`` and label them with the best-scoring partition.

    Agglomerative clustering is run for every cluster count in
    ``range(*n_clusters_range)``; the count with the highest silhouette score
    wins and its labels are stored in ``df['activity_label']``.

    Args:
        df: DataFrame with a ``combined_features`` column of equal-length
            vectors; modified in place.
        n_clusters_range: Arguments unpacked into ``range`` (e.g. ``(2, 11)``).
        use_pca: Whether to reduce the features with PCA before clustering.
        n_components: ``n_components`` argument handed to ``PCA``.

    Returns:
        Tuple ``(df, clustering_scores, optimal_clusters, optimal_metrics)``
        where ``clustering_scores`` maps each metric name to the list of values
        per evaluated cluster count and ``optimal_metrics`` holds the three
        metric values of the winning count.

    Raises:
        ValueError: If ``n_clusters_range`` yields no cluster count.
    """
    candidate_ks = list(range(*n_clusters_range))
    if not candidate_ks:
        # Without this guard the failure surfaces later as an opaque argmax error.
        raise ValueError(
            f"n_clusters_range {n_clusters_range!r} does not contain any cluster count to evaluate"
        )

    features = np.array(df['combined_features'].tolist())

    if use_pca:
        pca = PCA(n_components=n_components)
        features = pca.fit_transform(features)
        print(f"PCA aplicado: {features.shape[1]} componentes retenidos")

    clustering_scores = {
        'n_clusters': [],
        'silhouette_score': [],
        'davies_bouldin_score': [],
        'calinski_harabasz_score': []
    }
    # Labels of every candidate are kept so the winner does not have to be
    # fitted a second time.
    labels_per_k = []

    for k in candidate_ks:
        labels = AgglomerativeClustering(n_clusters=k).fit(features).labels_
        labels_per_k.append(labels)

        clustering_scores['n_clusters'].append(k)
        clustering_scores['silhouette_score'].append(silhouette_score(features, labels))
        clustering_scores['davies_bouldin_score'].append(davies_bouldin_score(features, labels))
        clustering_scores['calinski_harabasz_score'].append(calinski_harabasz_score(features, labels))

    # Best cluster count = highest silhouette score (first one on ties).
    optimal_index = np.argmax(clustering_scores['silhouette_score'])
    optimal_clusters = clustering_scores['n_clusters'][optimal_index]

    df['activity_label'] = labels_per_k[optimal_index]

    optimal_metrics = {
        'silhouette_score': clustering_scores['silhouette_score'][optimal_index],
        'davies_bouldin_score': clustering_scores['davies_bouldin_score'][optimal_index],
        'calinski_harabasz_score': clustering_scores['calinski_harabasz_score'][optimal_index]
    }

    return df, clustering_scores, optimal_clusters, optimal_metrics


### Análisis 

In [7]:
def extraer_caminos(df):
    """Return, per ``process_id``, the tuple of its ``activity_label`` values.

    The result is a Series indexed by ``process_id`` whose values are tuples
    with the labels in row order.
    """
    labels_by_case = df.groupby('process_id')['activity_label']
    return labels_by_case.apply(tuple)

def calcular_metricas(caminos_logs, caminos_apriori, caminos_inicial, caminos_final):
    """Compare the discovered paths against the a-priori expected paths.

    Args:
        caminos_logs: Not used by the computation; kept only so existing
            callers keep working.
        caminos_apriori: Iterable of expected paths (hashable, e.g. tuples).
        caminos_inicial: Iterable of paths before post-processing.
        caminos_final: Iterable of paths after post-processing.

    Returns:
        Dict with the number of distinct a-priori, initial and final paths,
        ``percent_new`` (share of final paths absent from the a-priori set) and
        ``percent_non_discovered`` (share of a-priori paths absent from the
        final set), both as percentages. A percentage is 0 when its
        denominator is empty.
    """
    # caminos_logs used to be converted to a set that was never read, which
    # needlessly required it to be iterable and hashable.
    caminos_apriori_set = set(caminos_apriori)
    caminos_inicial_set = set(caminos_inicial)
    caminos_final_set = set(caminos_final)

    num_paths_apriori = len(caminos_apriori_set)
    num_paths_inicial = len(caminos_inicial_set)
    num_paths_final = len(caminos_final_set)

    # Paths that were found but not expected.
    new_paths = caminos_final_set - caminos_apriori_set
    percent_new = len(new_paths) / num_paths_final if num_paths_final else 0

    # Paths that were expected but not found.
    non_discovered_paths = caminos_apriori_set - caminos_final_set
    percent_non_discovered = len(non_discovered_paths) / num_paths_apriori if num_paths_apriori else 0

    return {
        'num_paths_apriori': num_paths_apriori,
        'num_paths_inicial': num_paths_inicial,
        'num_paths_final': num_paths_final,
        'percent_new': percent_new * 100,
        'percent_non_discovered': percent_non_discovered * 100
    }

### Case id allocation

In [8]:
def auto_process_id_assignment(df):
    """Assign a ``process_id`` to every row based on the first activity label.

    The label of the first row is taken as the start-of-case marker: the first
    row gets id 1 and the id is incremented every time a later row carries
    that same label.

    The assignment is positional, so it works for any DataFrame index (the
    previous implementation compared index *labels* with 0 and broke on
    non-default indexes). An empty DataFrame gets an empty ``process_id``
    column.

    Args:
        df: DataFrame with an ``activity_label`` column; modified in place.

    Returns:
        The same ``df`` with the new ``process_id`` column.
    """
    labels = df['activity_label']
    if labels.empty:
        df['process_id'] = np.array([], dtype=np.int64)
        return df

    starts_case = (labels == labels.iloc[0]).to_numpy(dtype=bool, copy=True)
    # The first row always opens case 1, even if its label compares unequal to
    # itself (e.g. NaN).
    starts_case[0] = True
    # Running count of case starts == case id of each row.
    df['process_id'] = np.cumsum(starts_case)
    return df

In [9]:
def eliminar_acciones_duplicadas(df, columna_label='activity_label'):
    """Drop rows whose label equals the label of the row right before them.

    Only consecutive repetitions are removed: the first row of every run of
    equal values in ``columna_label`` is kept. The returned frame keeps the
    original index labels of the surviving rows.
    """
    labels = df[columna_label]
    previous_labels = labels.shift()
    repeats_previous = labels.eq(previous_labels)
    return df[~repeats_previous]

### Bpmn / Petrinet

In [10]:
def petri_net_process(df, timestamp_col, output_dir='results'):
    """Discover a Petri net from ``df``, evaluate it and save it as ``pn.dot``.

    Args:
        df: Event data with ``process_id`` (case id) and ``activity_label``
            (activity) columns.
        timestamp_col: Name of the timestamp column.
        output_dir: Directory where ``pn.dot`` is written; created if missing.
            Defaults to ``'results'``, the location used so far.

    Returns:
        Tuple ``(fitness, precision, generalization, simplicity)`` as returned
        by the corresponding pm4py evaluators.
    """
    # DataFrame -> event log
    formatted_df = pm4py.format_dataframe(df, case_id='process_id', activity_key='activity_label', timestamp_key=timestamp_col)
    event_log = pm4py.convert_to_event_log(formatted_df)

    # Process tree discovery and conversion to a Petri net
    process_tree = inductive_miner.apply(event_log)
    net, initial_marking, final_marking = pm4py.convert_to_petri_net(process_tree)

    # Quality metrics
    fitness = replay_fitness_evaluator.apply(event_log, net, initial_marking, final_marking)
    precision = precision_evaluator.apply(event_log, net, initial_marking, final_marking)
    generalization = generalization_evaluator.apply(event_log, net, initial_marking, final_marking)
    simplicity = simplicity_evaluator.apply(net)

    # Save the rendering source; the directory may not exist on a fresh checkout.
    dot = pn_visualizer.apply(net, initial_marking, final_marking)
    os.makedirs(output_dir, exist_ok=True)
    dot_path = os.path.join(output_dir, 'pn.dot')
    with open(dot_path, 'w') as f:
        f.write(dot.source)

    return fitness, precision, generalization, simplicity

def bpmn_process(df, timestamp_col, output_dir='results'):
    """Discover a BPMN model from ``df`` and save it as ``bpmn.dot`` and ``bpmn.bpmn``.

    Args:
        df: Event data with ``process_id`` (case id) and ``activity_label``
            (activity) columns.
        timestamp_col: Name of the timestamp column.
        output_dir: Directory where both files are written; created if
            missing. Defaults to ``'results'``, the location used so far.
    """
    # DataFrame -> event log
    formatted_df = pm4py.format_dataframe(df, case_id='process_id', activity_key='activity_label', timestamp_key=timestamp_col)
    event_log = pm4py.convert_to_event_log(formatted_df)

    # BPMN model discovery
    bpmn_model = pm4py.discover_bpmn_inductive(event_log)

    # Save the rendering source and the BPMN export; the directory may not
    # exist on a fresh checkout.
    dot = bpmn_visualizer.apply(bpmn_model)
    os.makedirs(output_dir, exist_ok=True)
    dot_path = os.path.join(output_dir, 'bpmn.dot')
    with open(dot_path, 'w') as f:
        f.write(dot.source)
    bpmn_exporter.apply(bpmn_model, os.path.join(output_dir, 'bpmn.bpmn'))

### Case configuration

In [11]:
# Case study "invoice def" (+1 path 'customer path'): input locations and
# column names used by the rest of the notebook.
log_path = 'logs/invoice_def/log.csv'  # UI log, read as a ';'-separated CSV
image_col = 'screenshot'  # column with the image file name of each row
image_dir = 'resources/invoice_def'  # directory the image file names are joined to
text_col = 'header'  # column with the text paired with each image
timestamp_col = 'timestamp'  # column passed to pm4py as the timestamp key

### Lanzar / Guardar ejecución

In [60]:
# Invoice "customer path" case: expected (a-priori) paths, each one a tuple of
# activity labels.
# NOTE(review): these tuples are compared against sequences of `activity_label`
# values produced by the clustering step — confirm that the label ids are
# stable across executions.
caminos_apriori = ((7,4,2,3,1,0), (7,4,6,5), (7,4,6,8))
# Same paths as a Series so they can be turned into a set of tuples later on.
caminos_apriori_series = pd.Series(list(caminos_apriori))

In [82]:
# Initial configuration
model = 'clip'  # NOTE(review): not referenced by the code visible in this notebook
n_clusters_range = (2, 11)  # unpacked into range(): cluster counts 2..10 are evaluated
n_components = 0.95  # handed to PCA; only used when use_pca is True
use_pca = False
tokeniza = True  # use the extraction variant that truncates the text with the tokenizer?
header_txt = True  # read the full text from the per-row text file instead of text_col?

# Root directory for the case studies
case_study_name = "token_full"
root_dir = os.path.join("executions", case_study_name)
os.makedirs(root_dir, exist_ok=True)

# One dict per execution is appended here by the execution loop
results = []

# Executions to run: each one sets the image/text weighting of the features
executions = [
    {'exec': 1, 'image_weight': 1, 'text_weight': 0},
    {'exec': 2, 'image_weight': 0.8, 'text_weight': 0.2},
    {'exec': 3, 'image_weight': 0.6, 'text_weight': 0.4},
    {'exec': 4, 'image_weight': 0.5, 'text_weight': 0.5},
    {'exec': 5, 'image_weight': 0.4, 'text_weight': 0.6},
    {'exec': 6, 'image_weight': 0.2, 'text_weight': 0.8},
    {'exec': 7, 'image_weight': 0, 'text_weight': 1}
]



In [83]:
def overwrite_csv(df, file_path):
    """Write ``df`` to ``file_path`` as CSV (no index), replacing any existing file.

    The data is first written to ``<file_path>.tmp`` and then swapped in with
    ``os.replace``, so an existing file is only replaced once the new content
    has been written successfully. Errors are reported on stdout and not
    re-raised (deliberate best-effort behaviour).

    Args:
        df: DataFrame to serialise.
        file_path: Destination path of the CSV file.
    """
    tmp_path = f"{file_path}.tmp"
    try:
        df.to_csv(tmp_path, index=False)
        os.replace(tmp_path, file_path)
    except Exception as e:
        print(f"Error al escribir el archivo CSV: {e}")
        # Do not leave a half-written temporary file behind.
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError as cleanup_error:
            print(f"Error al escribir el archivo CSV: {cleanup_error}")

def move_and_overwrite(source, destination):
    """Move ``source`` to ``destination``, deleting ``destination`` first if it exists."""
    destination_taken = os.path.exists(destination)
    if destination_taken:
        # Clear the way so the move cannot collide with a stale file.
        os.remove(destination)

    shutil.move(source, destination)
    
def clear_caches():
    """Run the garbage collector and, if CUDA is available, empty torch's CUDA cache."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

def load_fresh_data():
    """Re-read the UI log from the module-level ``log_path`` and return it."""
    fresh_log = read_ui_log_as_dataframe(log_path)
    return fresh_log

In [84]:
# The mining helpers write their artefacts into 'results' before they are moved
# to the execution folder, so make sure that directory exists.
os.makedirs('results', exist_ok=True)

# Run the whole pipeline once per image/text weighting. The loop variable is
# deliberately not called `exec`, which would shadow the builtin for the rest
# of the session.
for execution in executions:

    # Start every execution from a freshly loaded log and a clean memory state.
    df = read_ui_log_as_dataframe(log_path)
    clear_caches()

    # Fixed seeds so that executions are repeatable.
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)

    # One output folder per weighting, under the case-study root.
    exec_dir = f"{case_study_name}_{execution['image_weight']}_{execution['text_weight']}"
    exec_path = os.path.join(root_dir, exec_dir)
    os.makedirs(exec_path, exist_ok=True)

    image_weight = execution['image_weight']
    text_weight = execution['text_weight']

    # Feature extraction (with or without tokenizer-based truncation).
    if tokeniza:
        df = extract_features_from_images_with_tokenizer(df, image_col, text_col, image_weight, text_weight, image_dir, header_txt, text_path_col='header_txt')
    else:
        df = extract_features_from_images(df, image_col, text_col, image_weight, text_weight, image_dir)

    df, clustering_scores, optimal_clusters, optimal_metrics = cluster_images(df, n_clusters_range, use_pca, n_components)

    # Case ids are assigned before consecutive duplicates are dropped; the
    # "initial" paths are the ones that still contain the duplicates.
    df = auto_process_id_assignment(df)
    caminos_inicial = extraer_caminos(df)
    df = eliminar_acciones_duplicadas(df, columna_label='activity_label')

    petri_net_process(df, timestamp_col)
    bpmn_process(df, timestamp_col)

    # Persist the labelled log and move the model files next to it.
    df.to_csv(os.path.join(exec_path, 'df.csv'), index=False)
    move_and_overwrite('results/pn.dot', os.path.join(exec_path, 'pn.dot'))
    move_and_overwrite('results/bpmn.dot', os.path.join(exec_path, 'bpmn.dot'))
    move_and_overwrite('results/bpmn.bpmn', os.path.join(exec_path, 'bpmn.bpmn'))

    # Compare the discovered paths with the a-priori ones.
    caminos_final = extraer_caminos(df)
    caminos_inicial_set = set(caminos_inicial)
    caminos_final_set = set(caminos_final.apply(tuple))
    caminos_apriori_set = set(caminos_apriori_series.apply(tuple))
    caminos_nuevos = caminos_final_set - caminos_apriori_set
    caminos_no_descubiertos = caminos_apriori_set - caminos_final_set
    porcentaje_nuevos = (len(caminos_nuevos) / len(caminos_final_set)) * 100 if caminos_final_set else 0
    porcentaje_no_descubiertos = (len(caminos_no_descubiertos) / len(caminos_apriori_set)) * 100 if caminos_apriori_set else 0

    with open(os.path.join(exec_path, 'caminos_stats.txt'), 'w') as file:
        file.write("Descubrimiento de caminos\n")
        file.write(f"Pesos utilizados - Peso de imagen: {image_weight}, Peso de texto: {text_weight}\n")
        file.write(f"Caminos a priori: {caminos_apriori}\n")
        file.write(f"Caminos iniciales: {caminos_inicial_set}\n")
        file.write(f"Caminos finales: {caminos_final_set}\n")
        file.write(f"Porcentaje de nuevos caminos: {porcentaje_nuevos:.2f}%\n")
        file.write(f"Caminos no descubiertos: {porcentaje_no_descubiertos:.2f}%\n")

    results.append({
        'exec': execution['exec'],
        'image_weight': image_weight,
        'text_weight': text_weight,
        'new%': porcentaje_nuevos,
        'pathNotDisc%': porcentaje_no_descubiertos,
        'Silhouette': optimal_metrics['silhouette_score'],
        'Davies-Bouldin': optimal_metrics['davies_bouldin_score'],
        'Calinski-Harabasz': optimal_metrics['calinski_harabasz_score'],
    })

# Summary table with one row per execution.
results_df = pd.DataFrame(results)
overwrite_csv(results_df, os.path.join(root_dir, 'resultados.csv'))


odoo discuss c locahostobnvebiaction selcidse ida76 d sn z elmenu discuss inbox mark all read start meeting inbox starred d history channels general direct messages congratulations inbox is your empty new here messag es appear
odoo invoices locahostobnvebiaction 2a2elmodela id4116 d sj accountmoveblvew istalcidse elmenu type invoicing customers vendors reporting configuration 19 new upload invoices searche number customer date due date activities tax excluded total total currency payment status invoice in invrozaooos azure interior 04112024 49 days s 8500 s 27025 s 27025 not paid posted in invnozaoooo 041111024 49 s 73500 s 8as25 s 84525 posted azure interior in days not paid 4801250 4801250 invrolaoooo deco addict qafozn024 call s4175000 s paid posted invinoqaioooo deco addict 0aj0612024 3 days followup s 1925000 s2215750 s 2218750 not paid posted ago payment on invroqaiooooo 04j011024 19 158000 181700 181700 deco addict in days s s s not paid posted invrolaoooot azure interior af0120

aligning log, completed variants ::   0%|          | 0/4 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/6 [00:00<?, ?it/s]

replaying log with TBR, completed traces ::   0%|          | 0/4 [00:00<?, ?it/s]

odoo discuss c locahostobnvebiaction selcidse ida76 d sn z elmenu discuss inbox mark all read start meeting inbox starred d history channels general direct messages congratulations inbox is your empty new here messag es appear
odoo invoices locahostobnvebiaction 2a2elmodela id4116 d sj accountmoveblvew istalcidse elmenu type invoicing customers vendors reporting configuration 19 new upload invoices searche number customer date due date activities tax excluded total total currency payment status invoice in invrozaooos azure interior 04112024 49 days s 8500 s 27025 s 27025 not paid posted in invnozaoooo 041111024 49 s 73500 s 8as25 s 84525 posted azure interior in days not paid 4801250 4801250 invrolaoooo deco addict qafozn024 call s4175000 s paid posted invinoqaioooo deco addict 0aj0612024 3 days followup s 1925000 s2215750 s 2218750 not paid posted ago payment on invroqaiooooo 04j011024 19 158000 181700 181700 deco addict in days s s s not paid posted invrolaoooot azure interior af0120

aligning log, completed variants ::   0%|          | 0/3 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/6 [00:00<?, ?it/s]

replaying log with TBR, completed traces ::   0%|          | 0/3 [00:00<?, ?it/s]

odoo discuss c locahostobnvebiaction selcidse ida76 d sn z elmenu discuss inbox mark all read start meeting inbox starred d history channels general direct messages congratulations inbox is your empty new here messag es appear
odoo invoices locahostobnvebiaction 2a2elmodela id4116 d sj accountmoveblvew istalcidse elmenu type invoicing customers vendors reporting configuration 19 new upload invoices searche number customer date due date activities tax excluded total total currency payment status invoice in invrozaooos azure interior 04112024 49 days s 8500 s 27025 s 27025 not paid posted in invnozaoooo 041111024 49 s 73500 s 8as25 s 84525 posted azure interior in days not paid 4801250 4801250 invrolaoooo deco addict qafozn024 call s4175000 s paid posted invinoqaioooo deco addict 0aj0612024 3 days followup s 1925000 s2215750 s 2218750 not paid posted ago payment on invroqaiooooo 04j011024 19 158000 181700 181700 deco addict in days s s s not paid posted invrolaoooot azure interior af0120

aligning log, completed variants ::   0%|          | 0/2 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/6 [00:00<?, ?it/s]

replaying log with TBR, completed traces ::   0%|          | 0/2 [00:00<?, ?it/s]

odoo discuss c locahostobnvebiaction selcidse ida76 d sn z elmenu discuss inbox mark all read start meeting inbox starred d history channels general direct messages congratulations inbox is your empty new here messag es appear
odoo invoices locahostobnvebiaction 2a2elmodela id4116 d sj accountmoveblvew istalcidse elmenu type invoicing customers vendors reporting configuration 19 new upload invoices searche number customer date due date activities tax excluded total total currency payment status invoice in invrozaooos azure interior 04112024 49 days s 8500 s 27025 s 27025 not paid posted in invnozaoooo 041111024 49 s 73500 s 8as25 s 84525 posted azure interior in days not paid 4801250 4801250 invrolaoooo deco addict qafozn024 call s4175000 s paid posted invinoqaioooo deco addict 0aj0612024 3 days followup s 1925000 s2215750 s 2218750 not paid posted ago payment on invroqaiooooo 04j011024 19 158000 181700 181700 deco addict in days s s s not paid posted invrolaoooot azure interior af0120

aligning log, completed variants ::   0%|          | 0/3 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/6 [00:00<?, ?it/s]

replaying log with TBR, completed traces ::   0%|          | 0/3 [00:00<?, ?it/s]

odoo discuss c locahostobnvebiaction selcidse ida76 d sn z elmenu discuss inbox mark all read start meeting inbox starred d history channels general direct messages congratulations inbox is your empty new here messag es appear
odoo invoices locahostobnvebiaction 2a2elmodela id4116 d sj accountmoveblvew istalcidse elmenu type invoicing customers vendors reporting configuration 19 new upload invoices searche number customer date due date activities tax excluded total total currency payment status invoice in invrozaooos azure interior 04112024 49 days s 8500 s 27025 s 27025 not paid posted in invnozaoooo 041111024 49 s 73500 s 8as25 s 84525 posted azure interior in days not paid 4801250 4801250 invrolaoooo deco addict qafozn024 call s4175000 s paid posted invinoqaioooo deco addict 0aj0612024 3 days followup s 1925000 s2215750 s 2218750 not paid posted ago payment on invroqaiooooo 04j011024 19 158000 181700 181700 deco addict in days s s s not paid posted invrolaoooot azure interior af0120

aligning log, completed variants ::   0%|          | 0/5 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/7 [00:00<?, ?it/s]

replaying log with TBR, completed traces ::   0%|          | 0/5 [00:00<?, ?it/s]

odoo discuss c locahostobnvebiaction selcidse ida76 d sn z elmenu discuss inbox mark all read start meeting inbox starred d history channels general direct messages congratulations inbox is your empty new here messag es appear
odoo invoices locahostobnvebiaction 2a2elmodela id4116 d sj accountmoveblvew istalcidse elmenu type invoicing customers vendors reporting configuration 19 new upload invoices searche number customer date due date activities tax excluded total total currency payment status invoice in invrozaooos azure interior 04112024 49 days s 8500 s 27025 s 27025 not paid posted in invnozaoooo 041111024 49 s 73500 s 8as25 s 84525 posted azure interior in days not paid 4801250 4801250 invrolaoooo deco addict qafozn024 call s4175000 s paid posted invinoqaioooo deco addict 0aj0612024 3 days followup s 1925000 s2215750 s 2218750 not paid posted ago payment on invroqaiooooo 04j011024 19 158000 181700 181700 deco addict in days s s s not paid posted invrolaoooot azure interior af0120