In [52]:
import logging , os
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
from ocpa.objects.log.importer.csv import factory as csv_import_factory
from ocpa.algo.predictive_monitoring import factory as predictive_monitoring

from preprocessing import *
from utils import *
from model import CustomPipeline, train_loop, evaluate
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from torch import optim
from torch_geometric.loader import DataLoader

In [53]:
# Use this function to transform the ocel into csv format (supported extensions "xml", "jsonocel", "sqlite")
# ocel_to_csv(file_path, new_file_path, file_type)

In [59]:
# Target functions
reg_funcs = [remaining_time]
class_funcs = [num_events, num_objects, num_unique_acts]
# Create result dataframe
task_names = [f.__name__ for f in reg_funcs] + [f.__name__ for f in class_funcs]
# Set the value of no_feats to:
# - True to use the graph structure only as input
# - False to use event-level features detailed in main_function
no_feat = False
# List of subgraph sizes to use
ks = range(3, 15)
# Graph embedding module to use
embedding_name_list = ["GAT", "GCN", "GraphTransformer", "Graph2Vec", "FGSD"]
embedding_name = "GAT"
# Predictors to use
regressor = LinearRegression
classifier = LogisticRegression
# Model hyperparameters
embedding_size = 8
batch_size = 128
learning_rate = 0.01
mlp_hidden_dim = 16
mlp_num_layers = 2
# OCEL name and object types to use
filename = "recruiting-ocel1"
object_types = ['applicants', 'applications', 'offers']
parameters = {"obj_names": object_types,
              "val_names": [],
              "act_name": "activity",
              "time_name": "timestamp",
              "sep": ","}

#### OCEL file import


In [60]:
print(f"Importing OCEL {filename}")
file_path = os.path.join('.', 'ocel', 'csv', filename)
ocel = csv_import_factory.apply(file_path=file_path + '.csv', parameters=parameters)
print("OCEL Imported successfully")

Importing OCEL recruiting-ocel1
OCEL Imported successfully


#### Feature and target definition

In [61]:
activities = list(set(ocel.log.log["event_activity"].tolist()))

# Event level targets
event_target_set = [(predictive_monitoring.EVENT_REMAINING_TIME, ())]

# Execution level targets (Outcome related)
execution_target_set = [(predictive_monitoring.EXECUTION_NUM_OBJECT, ()),
                        (predictive_monitoring.EXECUTION_NUM_OF_EVENTS, ()),
                        (predictive_monitoring.EXECUTION_UNIQUE_ACTIVITIES, ())]
if no_feat:
    execution_feature_set = []
    event_feature_set = []
else:
    # Event level features
    event_feature_set = [(predictive_monitoring.EVENT_ELAPSED_TIME, ()),
                         (predictive_monitoring.EVENT_NUM_OF_OBJECTS, ())] + \
                        [(predictive_monitoring.EVENT_ACTIVITY, (act,)) for act in activities]
    # [(predictive_monitoring.EVENT_PRECEDING_ACTIVITIES, (act,)) for act in activities] + \
    execution_feature_set = []

#### Process Execution Extraction

In [62]:
PE_nx = get_process_executions_nx(ocel, event_feature_set, event_target_set, execution_feature_set,execution_target_set)

Applying feature extraction to process executions


100%|██████████| 661/661 [00:00<00:00, 330327.05it/s]


#### Preprocessing process executions

In [49]:
# Choose k
k = ks[0]

In [50]:
# Script to build the generate the input data and 
print(f"Using subgraphs of length {k}")
subgraph_list = get_subgraphs_labeled(PE_nx, k=k, subg_funcs=reg_funcs,
                                      g_funcs=class_funcs)
pred_types = get_pred_types(subgraph_list, reg_funcs, class_funcs, task_names)

# Create result dataframes and corresponding dirs
create_res_dfs(filename, task_names, k)
# Scale numerical targets and encode categorical targets
scaler = scale_encode_targets(subgraph_list, pred_types)

# Prepare input data and scale temporal feature
idx_feats_to_scale = [0]
input_dataset = generate_matrix_dataset(subgraph_list, idx_feats_to_scale, k, no_feat)
num_features = input_dataset[0].x.shape[1]

# train val test split
temp_data, test_data = train_test_split(input_dataset, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(temp_data, test_size=0.2, random_state=42)
# data loaders
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

Using subgraphs of length 3
Using subgraphs of length 4
Using subgraphs of length 5
Using subgraphs of length 6
Using subgraphs of length 7
Using subgraphs of length 8
Using subgraphs of length 9
Using subgraphs of length 10
Using subgraphs of length 11
Using subgraphs of length 12
Using subgraphs of length 13
Using subgraphs of length 14


#### Model Construction

In [9]:
# Build Embedding component
embedding_layer = init_embedding_component(embedding_name, num_features, embedding_size, train_data)
# Build Predictor component
nn_preds, ml_preds = init_pred_component(regressor, classifier, embedding_size, mlp_hidden_dim, mlp_num_layers, pred_types)

# Init pipeline
model = CustomPipeline(embedding_layer, nn_preds, ml_preds)
# Init optimizer
optimizer = optim.Adam(model.parameters(), learning_rate)

#### Training

In [10]:
print("Training...")
train_loop(model, optimizer, train_loader, val_loader, pred_types, num_epochs=20)

Training...


### Evaluation

In [11]:
# Results storage creation
result_dfs = []
result_dir_paths = []
for task_name in task_names:
    result_df = pd.DataFrame(columns=['prediction_layer', 'score'])
    result_dfs.append(result_df)
    result_dir_path = os.path.join('.', 'results', filename, task_name, str(k))
    result_dir_paths.append(result_dir_path)
    os.makedirs(result_dir_path, exist_ok=True)

In [12]:
# Tests on linear ml models
_, test_ml_scores = evaluate(model, test_loader, pred_types, train_loader)
# Store test scores for nn and ml
for i, result_df in enumerate(result_dfs):
    if pred_types[i] is None:
        b = scaler.inverse_transform(test_ml_scores[i].reshape(-1, 1))[0][0]
        res_row = [f"{regressor.__name__} models", b / 86400]
        result_df.loc[len(result_df)] = res_row
    else:
        res_row = [f"{classifier.__name__} models", test_ml_scores[i]]
        result_df.loc[len(result_df)] = res_row
for i, result_df in enumerate(result_dfs):
    print(task_names[i])
    print(result_df)

remaining_time
          prediction_layer      score
0  LinearRegression models  17.763668
num_events
            prediction_layer     score
0  LogisticRegression models  0.604003
num_objects
            prediction_layer     score
0  LogisticRegression models  0.369887
num_unique_acts
            prediction_layer     score
0  LogisticRegression models  0.597911


#### Evaluate on other predictors

In [13]:
new_regressor = RandomForestRegressor
new_classifier = RandomForestClassifier
new_predictors = set_new_ml_preds(new_regressor, new_classifier, pred_types)

In [14]:
 _, test_ml_scores = evaluate(model, test_loader, pred_types, train_loader, new_ml_predictors=new_predictors)
for i, result_df in enumerate(result_dfs):
    if pred_types[i] is None:
        b = scaler.inverse_transform(test_ml_scores[i].reshape(-1, 1))[0][0]
        res_row = [f"{new_regressor.__name__} models", b / 86400]
        result_df.loc[len(result_df)] = res_row
    else:
        res_row = [f"{new_regressor.__name__} models", test_ml_scores[i]]
        result_df.loc[len(result_df)] = res_row
for i, result_df in enumerate(result_dfs):
    print(task_names[i])
    print(result_df)

remaining_time
               prediction_layer      score
0       LinearRegression models  17.763668
1  RandomForestRegressor models  18.433195
num_events
               prediction_layer     score
0     LogisticRegression models  0.604003
1  RandomForestRegressor models  0.561358
num_objects
               prediction_layer     score
0     LogisticRegression models  0.369887
1  RandomForestRegressor models  0.332463
num_unique_acts
               prediction_layer     score
0     LogisticRegression models  0.597911
1  RandomForestRegressor models  0.542211


### Scores storage

In [15]:
for i, result_df in enumerate(result_dfs):
    result_file_path = os.path.join(result_dir_paths[i], f"{embedding_name}_{embedding_size}_{no_feat}.csv")
    result_df.to_csv(result_file_path, mode='a')