# Discovering Overlapping Rules

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from django.shortcuts import get_object_or_404


## Opción 1

## Create flattened log from UI log

In [2]:
# Load the data
log = pd.read_csv('prestamos_log_ejemplo.csv')
log.columns

case_colname = "Caso"
activity_colname = "Actividad"
decision_colname = "Decision"

flattened_log_columns = []
flattened_log_columns = ["Timestamp_start", "Timestamp_end", case_colname, decision_colname]
process_model_columns = []
process_model_columns = ['ID', 'decision_point', activity_colname, decision_colname, case_colname]

# while column case_colname maintains the same values, we will store all info in one row of flattened_log adding as columns variable+"value of the Caso"
for distinct_activity in log[activity_colname].unique():
    for col in log.columns:
        if col != case_colname and (not col in process_model_columns):
            # Create a new column in flattened_log
            flattened_log_columns.append(col + "_" + str(distinct_activity))

flattened_log = pd.DataFrame(columns=flattened_log_columns)

case = log.iloc[0][case_colname]
for i in range(len(log)):
    row = log.iloc[i]
    # new_row = {}
    for col in log.columns:
        if not col in process_model_columns:
            new_row[col + "_" + row[activity_colname]] = row[col]
    if case != row[case_colname]:
        new_row.update({decision_colname: log.iloc[i-1][decision_colname], case_colname: log.iloc[i-1][case_colname]})
        flattened_log = pd.concat([flattened_log, pd.DataFrame(new_row, index=[0])], ignore_index=True)
        new_row = {}
        case = row[case_colname]
        
flattened_log
    


NameError: name 'new_row' is not defined

## Preprocessing flattened log

In [6]:
log = pd.read_csv('../../../../screenrpa-resources/23 Inf. Sys. v2/IS_Rev1_Real_FE2+3/sc_0_size100_Balanced/log.csv')
flattened_log = pd.read_csv('../../../../screenrpa-resources/23 Inf. Sys. v2/IS_Rev1_Real_FE2+3/sc_0_size100_Balanced/decision-tree/flattened_dataset.csv', index_col=0)

case_colname = "Case"
activity_colname = "Activity"
decision_colname = "Variant"

flattened_log_columns = []
flattened_log_columns = ["Timestamp_start", "Timestamp_end", case_colname, decision_colname]
process_model_columns = []
process_model_columns = ['ID', 'decision_point', activity_colname, decision_colname, case_colname]

In [7]:

def def_preprocessor(X):
    # define type of columns
    # sta_columns = list(filter(lambda x:"sta_" in x, X.columns))
    
    # Identificar las columnas con todos los valores iguales
    columns_to_drop = X.columns[X.nunique() == 1]
    # Identificar las columnas con todos los valores nulos
    columns_to_drop = columns_to_drop.union(X.columns[X.isnull().all()])
    # Eliminar las columnas con todos los valores iguales o nulos
    X = X.drop(columns=columns_to_drop)
    
    mapping_dict = {"enabled": ['NaN', 'enabled'], "checked": ['unchecked', 'checked', '']}
    mapping_list = []
    sta_columns = []
    # Identificar las columnas que contienen "sta_" en su nombre
    for col in X.columns:
        if 'sta_' in col:
            sta_columns.append(col)
            if 'enabled' in col:
                mapping_list.append(list(mapping_dict['enabled']))
            elif 'checked' in col:
                mapping_list.append(list(mapping_dict['checked']))
            else:
                raise Exception("Not preprocessed column: " + str(col))
                
    one_hot_columns = list(X.select_dtypes(include=['object']).columns.drop(sta_columns))
    numeric_features = X.select_dtypes(include=['number']).columns

    # create each transformer
    status_transformer = Pipeline(steps=[
                                    ('imputer', SimpleImputer(strategy='constant', fill_value='NaN')),
                                    ('label_encoder', OrdinalEncoder(categories=list(mapping_list)))
                                    ])
    one_hot_transformer = Pipeline(steps=[
                                    ('imputer', SimpleImputer(strategy='constant', fill_value='NaN')),
                                    ('one_hot_encoder', OneHotEncoder())
                                    ])

    numeric_transformer = Pipeline(steps=[
                                    ('imputer', SimpleImputer(strategy='mean')),
                                    ])#('scaler',StandardScaler())

    # create preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('numeric', numeric_transformer, numeric_features),
            ('one_hot_categorical', one_hot_transformer, one_hot_columns),
            ('status_categorical', status_transformer, sta_columns)
        ]
    )
    return preprocessor

In [8]:

# Recorro las columnas del flattened_log y me quedo con aquellas cuyo nombre contiene algunos de la lista ls
ls = log.columns.tolist()
for c in ls:
    if c in process_model_columns:
        ls.remove(c)
print("features_colnames: ", ls)
features_colnames = [col for col in flattened_log.columns if any([l in col for l in ls])]
print(features_colnames)


# Elimina columnas que solo contienen valores nulos
cleaned_flattened_log = flattened_log.dropna(axis=1, how='all')

# Elimina columnas donde todos los valores son iguales
for col in cleaned_flattened_log.columns:
    if len(cleaned_flattened_log[col].unique()) == 1:
        cleaned_flattened_log.drop(col, axis=1, inplace=True)
cleaned_flattened_log

features_colnames:  ['Activity', 'Timestamp', 'MorKeyb', 'Coor_X', 'Coor_Y', 'Click', 'features.experiment.GUI_category.name.TextInput', 'NameApp', 'Screenshot']
['MorKeyb_1_A', 'Coor_X_1_A', 'Coor_Y_1_A', 'Click_1_A', 'features.experiment.GUI_category.name.TextInput_1_A', 'NameApp_1_A', 'MorKeyb_2_B', 'Coor_X_2_B', 'Coor_Y_2_B', 'Click_2_B', 'features.experiment.GUI_category.name.TextInput_2_B', 'NameApp_2_B', 'MorKeyb_3_C', 'Coor_X_3_C', 'Coor_Y_3_C', 'Click_3_C', 'features.experiment.GUI_category.name.TextInput_3_C', 'NameApp_3_C', 'features.experiment.GUI_category.name.TextInput_4_D', 'NameApp_4_D']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_flattened_log.drop(col, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_flattened_log.drop(col, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_flattened_log.drop(col, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleane

Unnamed: 0,Variant,Coor_X_1_A,Coor_Y_1_A,Coor_X_3_C,Coor_Y_3_C,sta_enabled_844.5-454.5_1_A,sta_enabled_851.0-572.0_1_A,sta_enabled_840.5-627.5_1_A,sta_enabled_840.5-692.0_1_A,sta_enabled_843.0-745.5_1_A,...,sta_enabled_1101.5-899.0_1_A,sta_enabled_1376.5-1204.0_1_A,sta_enabled_1108.5-1324.5_1_A,sta_enabled_1101.5-1440.5_1_A,sta_enabled_1117.0-1550.5_1_A,sta_enabled_509.5-434.5_2_B,sta_enabled_570.0-558.0_2_B,sta_enabled_304.0-643.0_4_D,sta_enabled_778.0-680.5_4_D,sta_enabled_1149.5-647.0_4_D
1,4,1105,463,653,436,enabled,enabled,enabled,enabled,enabled,...,,,,,,,,,,
2,2,1121,475,476,417,,,,,,...,,,,,,,,,,
3,4,1067,310,435,164,,,,enabled,,...,,,,,,,,,,
4,2,1778,341,479,476,,,,,,...,,,,,,,,,,
5,3,1567,321,752,343,,,,,,...,,,,,,,,,,
6,4,1346,489,366,301,,,,,,...,,,,,,,,,,
7,3,1317,279,642,342,,,,,enabled,...,,,,,,,,,,
8,3,773,328,716,197,,,,,,...,,,,,,,,,,
9,2,1177,344,708,407,,,,,,...,,,,,,,,,,
10,1,767,463,566,450,,,,,,...,,,,,,,,,,


## Train Basic Decision Tree

In [10]:
# Extract features and target variable
X = cleaned_flattened_log.drop(columns=['Variant'])
y = cleaned_flattened_log['Variant']

preprocessor = def_preprocessor(X)
X = preprocessor.fit_transform(X)
X_df = pd.DataFrame(X)
feature_names = list(preprocessor.get_feature_names_out())


# Separar características y etiqueta
features = flattened_log[features_colnames]
labels = flattened_log[decision_colname]

# Entrenar un árbol de decisión
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_df, y)

print(dt_classifier)

# Imprimir el árbol de decisión
decision_tree_rules = export_text(dt_classifier, feature_names=feature_names)
print(decision_tree_rules)

# Supongamos que esta parte analiza las instancias mal clasificadas para crear reglas superpuestas,
# En un caso real, este paso necesitaría más detalles según el enfoque específico del documento y los datos del log.

# Nota: El código real para identificar reglas superpuestas sería más complejo e involucraría la construcción y análisis de múltiples árboles de decisión.


DecisionTreeClassifier()
|--- numeric__qua2_Checkbox_unchecked_4_D <= 0.50
|   |--- numeric__qua2_Button_2_B <= 6.50
|   |   |--- class: 3
|   |--- numeric__qua2_Button_2_B >  6.50
|   |   |--- class: 1
|--- numeric__qua2_Checkbox_unchecked_4_D >  0.50
|   |--- numeric__qua2_TextView_2_B <= 25.00
|   |   |--- class: 4
|   |--- numeric__qua2_TextView_2_B >  25.00
|   |   |--- class: 2



In [None]:
def print_decision_tree(dt, feature_names):
    tree_rules = export_text(dt, feature_names=feature_names)
    print(tree_rules)

In [None]:


# Función para entrenar un árbol de decisión y devolver las instancias mal clasificadas
def train_tree_and_find_misclassified(features, labels):
    dt = DecisionTreeClassifier(random_state=0, max_depth=5)
    dt.fit(features, labels)
    print("Árbol de decisión inicial:")
    print_decision_tree(dt, features.columns.tolist())
    predictions = dt.predict(features)
    misclassified = features[labels != predictions]
    return dt, misclassified

# Función para crear reglas superpuestas a partir de instancias mal clasificadas
def create_overlapping_rules(features, labels, misclassified):
    # overlapping_rules = []
    # for index, row in misclassified.iterrows():
    # specific_features = features.loc[[index]]
    # specific_label = labels.loc[[index]]
    dt_subtree = DecisionTreeClassifier(random_state=0, max_depth=5)  # Limitar la profundidad para simplificar
    # dt_subtree.fit(specific_features, specific_label)
    dt_subtree.fit(features, labels)
    print("Árbol de decisión para la instancia mal clasificada:")
    print_decision_tree(dt_subtree, features.columns.tolist())
    rule = export_text(dt_subtree, feature_names=list(features.columns))
    # overlapping_rules.append(rule)
    # return overlapping_rules
    return rule

# Entrenar el árbol de decisión inicial y encontrar instancias mal clasificadas
initial_tree, misclassified_instances = train_tree_and_find_misclassified(features, labels)

print(misclassified_instances)

# Crear reglas superpuestas a partir de las instancias mal clasificadas
overlapping_rules = create_overlapping_rules(features, labels, misclassified_instances)

# Imprimir las reglas superpuestas
# for rule in overlapping_rules:
#     print("Regla superpuesta:", rule)


Árbol de decisión inicial:
|--- IngresoMensual_AnálisisCrediticio <= 5345.50
|   |--- IngresoMensual_DecisiónPréstamo <= 3166.50
|   |   |--- IngresoMensual_RevisiónDocumentación <= 5553.00
|   |   |   |--- IngresoMensual_DecisiónPréstamo <= 2080.00
|   |   |   |   |--- IngresoMensual_AnálisisCrediticio <= 2722.00
|   |   |   |   |   |--- class: Rechazado
|   |   |   |   |--- IngresoMensual_AnálisisCrediticio >  2722.00
|   |   |   |   |   |--- class: Aprobado
|   |   |   |--- IngresoMensual_DecisiónPréstamo >  2080.00
|   |   |   |   |--- class: Aprobado
|   |   |--- IngresoMensual_RevisiónDocumentación >  5553.00
|   |   |   |--- MontoSolicitado_DecisiónPréstamo <= 17500.00
|   |   |   |   |--- IngresoMensual_DecisiónPréstamo <= 3043.00
|   |   |   |   |   |--- class: Rechazado
|   |   |   |   |--- IngresoMensual_DecisiónPréstamo >  3043.00
|   |   |   |   |   |--- class: Aprobado
|   |   |   |--- MontoSolicitado_DecisiónPréstamo >  17500.00
|   |   |   |   |--- class: Aprobado
|   |

# Opción 2

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Supongamos que este es tu archivo CSV y 'decision_point' es una columna que indica el punto de decisión.
# data = pd.read_csv('log_overlapping.csv')
data = pd.read_csv('log_overlapping_v2.csv')

# Aquí dividimos los datos basados en puntos de decisión, cada uno puede ser un proceso diferente.
# En un caso real, ajusta 'decision_point' a tu caso específico.
for decision_point in data['decision_point'].unique():
    dp_data = data[data['decision_point'] == decision_point]
    # get dummies para las variables categóricas
    X = pd.get_dummies(dp_data.drop(['decision', 'decision_point'], axis=1))
    # X = pd.get_dummies(dp_data.drop(['decision', 'decision_point', 'ID', 'Actividad'], axis=1))
    feature_names = X.columns
    y = dp_data['decision']

    # Construir y entrenar el árbol de decisión inicial
    initial_tree = DecisionTreeClassifier(random_state=42)
    initial_tree.fit(X, y)
    
    # Identificar instancias mal clasificadas y construir árboles secundarios
    secondary_trees = {}
    for i, leaf in enumerate(initial_tree.apply(X)):
        if y.values[i] != initial_tree.predict(X.iloc[[i]]):
            # Esto identifica una instancia mal clasificada
            if leaf not in secondary_trees:
                # Creamos un nuevo árbol de decisión para este folio mal clasificado
                secondary_trees[leaf] = DecisionTreeClassifier(random_state=42)
            # Entrenamos el árbol secundario solo con instancias mal clasificadas
            secondary_trees[leaf].fit(X.iloc[[i]], y.iloc[[i]])

    # Ahora tienes un árbol inicial y múltiples árboles secundarios para las instancias mal clasificadas.
    # La combinación de reglas de estos árboles formaría tu lógica de decisión superpuesta.
    # Nota: La 'combinación' real de reglas dependerá de tu necesidad específica y puede ser compleja.
    # Esto es solo un esquema del proceso.
    print(f"Decision Point: {decision_point}")
    print(f"Initial Tree Accuracy: {accuracy_score(y, initial_tree.predict(X))}")
    # Imprimir el árbol de decisión
    print_decision_tree(initial_tree, feature_names.tolist())
    
    for leaf, tree in secondary_trees.items():
        print(f"Secondary Tree for leaf {leaf} Accuracy: {accuracy_score(y, tree.predict(X))}")
    
        # Imprimir el árbol de decisión
        print_decision_tree(secondary_trees[leaf], feature_names.tolist())
    
    
    # Asumiendo que quieres combinar las reglas, aquí es donde lo harías.
    # Ten en cuenta que esto es conceptual; necesitarías decidir cómo combinar estas reglas en la práctica.

    # Nota: Este código es muy básico y conceptual. La implementación completa y la combinación de reglas
    # son mucho más complejas y deben adaptarse a tus necesidades específicas.


Decision Point: 1
Initial Tree Accuracy: 1.0
|--- status_paid <= 0.50
|   |--- amount <= 175.00
|   |   |--- class: Payment
|   |--- amount >  175.00
|   |   |--- class: Send Fine
|--- status_paid >  0.50
|   |--- class: Close Fine

Decision Point: 2
Initial Tree Accuracy: 1.0
|--- status_approved <= 0.50
|   |--- class: Reject Loan
|--- status_approved >  0.50
|   |--- class: Approve Loan

