# Towards best practices for predictive process monitoring
## By: Stijn Kas, Ivar Siccama, Peter van der Putten, Georg Krempl, Ruben Post, Sebastiaan Wiewel & Hajo Reijers

This notebook contains the scripts for reproducing the results from our paper. Performance may be better outside of a Jupyter notebook; this document serves as a reference for all experiments, each of which can be executed individually.

All results are contained in this notebook. The plots were recreated in RStudio for the final version of the paper; the script for this can be found in the main folder.

### Imports from river, base python and custom components

In [None]:
from river import compose, preprocessing, stream
from river import tree, ensemble, naive_bayes

import pandas as pd
from collections import defaultdict
from tqdm import tqdm

import numbers
import copy

from OPPM.components import *

### Set directory and (re)set dataset-specific objects

In [None]:
# Folder holding the event logs, and the log used throughout this notebook.
directory = "OPPM/datasets"
dataset = "BPIC_2017"

# Dataset-specific settings; later cells read its 'vartypes' and 'params' entries.
configs = tools.data_set(dataset, directory)

# Outcome labeler keyed on the activity column. Presumably events whose activity
# is "O_Accepted" get label 1 and the other listed outcomes get label 0 —
# confirm against OPPM.components.labeling.
label = labeling.labeler(
    feature=configs["vartypes"]["activity_col"],
    outcomes={"O_Accepted", "O_Cancelled", "O_Refused"},
    positive_outcomes={"O_Accepted"},
    positive_label=1,
    negative_label=0,
)

### Configure preprocessing and classifiers

In [None]:
# Preprocessing: one-hot encode string-typed features, standard-scale numeric
# ones, and combine both branches into a single transformer union.
cat = compose.SelectType(str) | preprocessing.OneHotEncoder()
num = compose.SelectType(numbers.Number) | preprocessing.StandardScaler()
model_prep = (cat + num)

# Classifier instances shared by the experiment cells below.
arf = ensemble.AdaptiveRandomForestClassifier(max_depth=15, memory_estimate_period=500, max_size=25)
hat = tree.HoeffdingAdaptiveTreeClassifier(max_depth=15, memory_estimate_period=500, max_size=25)
# Both `gnb` and `nb` name the same Gaussian Naive Bayes instance, so either
# spelling resolves in later cells.
gnb = naive_bayes.GaussianNB()
nb = gnb

# Experiments

| Experiment 	| Classifier 	| Encoding    	| Time features 	| Rollback 	|
|------------	|------------	|-------------	|---------------	|----------	|
| 1          	| ARF        	| No          	| False         	| False    	|
| 2          	| ARF        	| Subset      	| True          	| False    	|
| 3          	| ARF        	| Last state  	| False         	| False    	|
| 4          	| ARF        	| First state 	| False         	| False    	|
| 5          	| ARF        	| Last state  	| True          	| False    	|
| 6          	| ARF        	| First state 	| True          	| False    	|
| 7          	| ARF        	| Subset      	| True          	| True     	|
| 8          	| ARF        	| Last state  	| True          	| True     	|
| 9          	| HAT        	| Subset      	| True          	| False    	|
| 10         	| HAT        	| Subset      	| True          	| True     	|
| 11         	| GNB        	| Subset      	| True          	| False    	|
| 12         	| GNB        	| Subset      	| True          	| True     	|
| 13         	| GNB        	| Last state  	| True          	| False    	|
| 14         	| GNB        	| Last state  	| True          	| True     	|
| 15         	| GNB        	| First state 	| True          	| False    	|

#### Experiment 1:
#### Baseline: Adaptive Random Forest with no encoding, time features or rollback
This is also a quick way to export the `seen` mapping (case ID → true label) for analysis.


In [None]:
# Baseline run on the raw event stream.
# The classifier is deep-copied together with the preprocessing: `arf` is a
# shared instance, and piping it in directly would let the training done here
# carry over into every other cell that reuses it.
model = copy.deepcopy(model_prep) | copy.deepcopy(arf)

# preds: case id -> list of positive-class probabilities, one per scored event.
# seen:  case id -> label the model was trained on for that case.
preds, seen = defaultdict(list), defaultdict(dict)
data = stream.iter_csv(f"{directory}/{dataset}.csv", **configs['params'])
case_id_col = configs['vartypes']['case_id_col']  # loop-invariant lookup

for x, _ in tqdm(data):
    case_id = x[case_id_col]
    if case_id in seen:
        # A label was already recorded for this case; ignore its later events.
        continue
    if label.check(x):
        # Presumably an outcome event: train on it and remember its label.
        y = label.get(x)
        model.learn_one(x, y)
        seen[case_id] = y
    else:
        # No label for this event yet: store the positive-class probability.
        preds[case_id].append(model.predict_proba_one(x)[label.positive_label])

# Uncomment the next line(s) to write results to csv
# pd.DataFrame.from_dict(seen, orient='index').to_csv("results/00_seen.csv")
# pd.DataFrame.from_dict(preds, orient='index').to_csv("results/01_baseline.csv")

#### Experiment 2:
#### Adaptive Random Forest
- Encoding: Custom subset
- Time features: True
- Rollback: False

In [None]:
# Experiment 2: ARF, custom subset encoding, time features, no rollback.
# Deep-copy the classifier as well as the preprocessing: `arf` is a shared
# instance, and piping it in directly would let this run's training carry over
# into other cells that reuse it.
model = copy.deepcopy(model_prep) | copy.deepcopy(arf)
data, configs, timefeatures, encodings = tools.reset("BPIC_2017", directory, "BPIC_2017_subset")

preds, seen, model = pipeline.OPPM(data, configs, label, model, encodings, timefeatures=timefeatures, rollback=False)

# Uncomment the next line to write results to csv
# pd.DataFrame.from_dict(preds, orient='index').to_csv("results/02_ARF_subset.csv")

#### Experiment 3:
#### Adaptive Random Forest
- Encoding: Last state only 
- Time features: False
- Rollback: False

In [None]:
# Experiment 3: ARF, last-state encoding, no time features, no rollback.
# Deep-copy the classifier as well as the preprocessing: `arf` is a shared
# instance, and piping it in directly would let this run's training carry over
# into other cells that reuse it.
model = copy.deepcopy(model_prep) | copy.deepcopy(arf)
data, configs, timefeatures, encodings = tools.reset("BPIC_2017", directory, "BPIC_2017_last")

# Time features are switched off here even though `tools.reset` returns them.
preds, seen, model = pipeline.OPPM(data, configs, label, model, encodings, timefeatures=False, rollback=False)

# Uncomment the next line to write results to csv
# pd.DataFrame.from_dict(preds, orient='index').to_csv("results/03_ARF_last.csv")

#### Experiment 4:
#### Adaptive Random Forest
- Encoding: First state only
- Time features: False
- Rollback: False

In [None]:
# Experiment 4: ARF, first-state encoding, no time features, no rollback.
# Deep-copy the classifier as well as the preprocessing: `arf` is a shared
# instance, and piping it in directly would let this run's training carry over
# into other cells that reuse it.
model = copy.deepcopy(model_prep) | copy.deepcopy(arf)
data, configs, timefeatures, encodings = tools.reset("BPIC_2017", directory, "BPIC_2017_first")

# Time features are switched off here even though `tools.reset` returns them.
preds, seen, model = pipeline.OPPM(data, configs, label, model, encodings, timefeatures=False, rollback=False)

# Uncomment the next line to write results to csv
# pd.DataFrame.from_dict(preds, orient='index').to_csv("results/04_ARF_first.csv")

#### Experiment 5:
#### Adaptive Random Forest
- Encoding: Last state only
- Time features: True
- Rollback: False

In [None]:
# Experiment 5: ARF, last-state encoding, time features, no rollback.
# Deep-copy the classifier as well as the preprocessing: `arf` is a shared
# instance, and piping it in directly would let this run's training carry over
# into other cells that reuse it.
model = copy.deepcopy(model_prep) | copy.deepcopy(arf)
data, configs, timefeatures, encodings = tools.reset("BPIC_2017", directory, "BPIC_2017_last")

preds, seen, model = pipeline.OPPM(data, configs, label, model, encodings, timefeatures=timefeatures, rollback=False)

# Uncomment the next line to write results to csv
# pd.DataFrame.from_dict(preds, orient='index').to_csv("results/05_ARF_last_time.csv")

#### Experiment 6:
#### Adaptive Random Forest
- Encoding: First state only
- Time features: True
- Rollback: False

In [None]:
# Experiment 6: ARF, first-state encoding, time features, no rollback.
# Deep-copy the classifier as well as the preprocessing: `arf` is a shared
# instance, and piping it in directly would let this run's training carry over
# into other cells that reuse it.
model = copy.deepcopy(model_prep) | copy.deepcopy(arf)
data, configs, timefeatures, encodings = tools.reset("BPIC_2017", directory, "BPIC_2017_first")

preds, seen, model = pipeline.OPPM(data, configs, label, model, encodings, timefeatures=timefeatures, rollback=False)

# Uncomment the next line to write results to csv
# pd.DataFrame.from_dict(preds, orient='index').to_csv("results/06_ARF_first_time.csv")

In [None]:
# Ad-hoc export of whatever `preds` currently holds. The target folder is
# created first because `to_csv` does not create missing directories.
from pathlib import Path

Path("results").mkdir(parents=True, exist_ok=True)
pd.DataFrame.from_dict(preds, orient='index').to_csv("results/98_ARF_first_time_2.csv")

#### Experiment 7:
#### Adaptive Random Forest
- Encoding: Custom subset
- Time features: True
- Rollback: True

In [None]:
# Experiment 7: ARF, custom subset encoding, time features, with rollback.
# Deep-copy the classifier as well as the preprocessing: `arf` is a shared
# instance, and piping it in directly would let this run's training carry over
# into other cells that reuse it.
model = copy.deepcopy(model_prep) | copy.deepcopy(arf)
data, configs, timefeatures, encodings = tools.reset("BPIC_2017", directory, "BPIC_2017_subset")

preds, seen, model = pipeline.OPPM(data, configs, label, model, encodings, timefeatures=timefeatures, rollback=True)

# Uncomment the next line to write results to csv
# pd.DataFrame.from_dict(preds, orient='index').to_csv("results/07_ARF_subset_rollback.csv")

#### Experiment 8:
#### Adaptive Random Forest
- Encoding: Last state
- Time features: True
- Rollback: True

In [None]:
# Experiment 8: ARF, last-state encoding, time features, with rollback.
# Deep-copy the classifier as well as the preprocessing: `arf` is a shared
# instance, and piping it in directly would let this run's training carry over
# into other cells that reuse it.
model = copy.deepcopy(model_prep) | copy.deepcopy(arf)
data, configs, timefeatures, encodings = tools.reset("BPIC_2017", directory, "BPIC_2017_last")

preds, seen, model = pipeline.OPPM(data, configs, label, model, encodings, timefeatures=timefeatures, rollback=True)

# Uncomment the next line to write results to csv
# pd.DataFrame.from_dict(preds, orient='index').to_csv("results/08_ARF_last_time_rollback.csv")

#### Experiment 9:
#### Hoeffding Adaptive Tree
- Encoding: Custom subset
- Time features: True
- Rollback: False

In [None]:
# Experiment 9: HAT, custom subset encoding, time features, no rollback.
# Deep-copy the classifier as well as the preprocessing: `hat` is a shared
# instance, and piping it in directly would let this run's training carry over
# into other cells that reuse it.
model = copy.deepcopy(model_prep) | copy.deepcopy(hat)
data, configs, timefeatures, encodings = tools.reset("BPIC_2017", directory, "BPIC_2017_subset")

preds, seen, model = pipeline.OPPM(data, configs, label, model, encodings, timefeatures=timefeatures, rollback=False)

# Uncomment the next line to write results to csv
# pd.DataFrame.from_dict(preds, orient='index').to_csv("results/09_HAT_subset.csv")

#### Experiment 10:
#### Hoeffding Adaptive Tree
- Encoding: Custom subset
- Time features: True
- Rollback: True

In [None]:
# Experiment 10: HAT, custom subset encoding, time features, with rollback.
# Deep-copy the classifier as well as the preprocessing: `hat` is a shared
# instance, and piping it in directly would let this run's training carry over
# into other cells that reuse it.
model = copy.deepcopy(model_prep) | copy.deepcopy(hat)
data, configs, timefeatures, encodings = tools.reset("BPIC_2017", directory, "BPIC_2017_subset")

preds, seen, model = pipeline.OPPM(data, configs, label, model, encodings, timefeatures=timefeatures, rollback=True)

# Uncomment the next line to write results to csv
# pd.DataFrame.from_dict(preds, orient='index').to_csv("results/10_HAT_subset_rollback.csv")

#### Experiment 11:
#### Gaussian Naive Bayes
- Encoding: Custom subset
- Time features: True
- Rollback: False

In [None]:
# Experiment 11: GNB, custom subset encoding, time features, no rollback.
# The Gaussian Naive Bayes instance is bound to `nb` in the configuration cell.
# It is deep-copied like the preprocessing so this run's training does not
# carry over into other cells that reuse the shared instance.
model = copy.deepcopy(model_prep) | copy.deepcopy(nb)
data, configs, timefeatures, encodings = tools.reset("BPIC_2017", directory, "BPIC_2017_subset")

preds, seen, model = pipeline.OPPM(data, configs, label, model, encodings, timefeatures=timefeatures, rollback=False)

# Uncomment the next line to write results to csv
# pd.DataFrame.from_dict(preds, orient='index').to_csv("results/11_GNB_subset.csv")

#### Experiment 12:
#### Gaussian Naive Bayes
- Encoding: Custom subset
- Time features: True
- Rollback: True

In [None]:
# Experiment 12: GNB, custom subset encoding, time features, with rollback.
# The Gaussian Naive Bayes instance is bound to `nb` in the configuration cell.
# It is deep-copied like the preprocessing so this run's training does not
# carry over into other cells that reuse the shared instance.
model = copy.deepcopy(model_prep) | copy.deepcopy(nb)
data, configs, timefeatures, encodings = tools.reset("BPIC_2017", directory, "BPIC_2017_subset")

preds, seen, model = pipeline.OPPM(data, configs, label, model, encodings, timefeatures=timefeatures, rollback=True)

# Uncomment the next line to write results to csv
# pd.DataFrame.from_dict(preds, orient='index').to_csv("results/12_GNB_subset_rollback.csv")

#### Experiment 13:
#### Gaussian Naive Bayes
- Encoding: Last state
- Time features: True
- Rollback: False

In [None]:
# Experiment 13: GNB, last-state encoding, time features, no rollback.
# The Gaussian Naive Bayes instance is bound to `nb` in the configuration cell.
# It is deep-copied like the preprocessing so this run's training does not
# carry over into other cells that reuse the shared instance.
model = copy.deepcopy(model_prep) | copy.deepcopy(nb)
data, configs, timefeatures, encodings = tools.reset("BPIC_2017", directory, "BPIC_2017_last")

preds, seen, model = pipeline.OPPM(data, configs, label, model, encodings, timefeatures=timefeatures, rollback=False)

# Uncomment the next line to write results to csv
# pd.DataFrame.from_dict(preds, orient='index').to_csv("results/13_GNB_last_time.csv")

#### Experiment 14:
#### Gaussian Naive Bayes
- Encoding: Last state
- Time features: True
- Rollback: True

In [None]:
# Experiment 14: GNB, last-state encoding, time features, with rollback.
# The Gaussian Naive Bayes instance is bound to `nb` in the configuration cell.
# It is deep-copied like the preprocessing so this run's training does not
# carry over into other cells that reuse the shared instance.
model = copy.deepcopy(model_prep) | copy.deepcopy(nb)
data, configs, timefeatures, encodings = tools.reset("BPIC_2017", directory, "BPIC_2017_last")

preds, seen, model = pipeline.OPPM(data, configs, label, model, encodings, timefeatures=timefeatures, rollback=True)

# Uncomment the next line to write results to csv
# pd.DataFrame.from_dict(preds, orient='index').to_csv("results/14_GNB_last_rollback.csv")

#### Experiment 15:
#### Gaussian Naive Bayes
- Encoding: First state
- Time features: True
- Rollback: False

In [None]:
# Experiment 15: GNB, first-state encoding, time features, no rollback.
# The Gaussian Naive Bayes instance is bound to `nb` in the configuration cell.
# It is deep-copied like the preprocessing so this run's training does not
# carry over into other cells that reuse the shared instance.
model = copy.deepcopy(model_prep) | copy.deepcopy(nb)
data, configs, timefeatures, encodings = tools.reset("BPIC_2017", directory, "BPIC_2017_first")

preds, seen, model = pipeline.OPPM(data, configs, label, model, encodings, timefeatures=timefeatures, rollback=False)

# Uncomment the next line to write results to csv
# pd.DataFrame.from_dict(preds, orient='index').to_csv("results/15_GNB_first_time.csv")