First, we need to import all necessary components.

In [1]:
import pickle
from src.ats import *
import pandas as pd
from src.print_ats import *
from src.input_data import InputData
from src.custom_models import Average, Minimum, Maximum, SampleMean, Median, Mode
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    ElasticNetCV,
)
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from src.helper import print_progress_bar


Before we can build our model, we need to pre-process the data. This is done via the `InputData` class, which is dedicated to the given data set. Many well-known pre-processing functions are already implemented there.

In [None]:
# Timestamp columns; passed as ``date_cols`` below so that the standard
# pre-processing transforms them into unix time.
ALL_DATE_COLS = [
    "ActivityTimeStamp",
    "Open Time",
    "Reopen Time",
    "Resolved Time",
    "Close Time",
]

# Named ``input_data`` (not ``input``) so the builtin ``input()`` is not
# shadowed for the rest of the notebook session.
input_data = InputData("incidentProcess_custom.csv")

# NOTE(review): agg_col is "rem_time" here while the split and the ATS use
# y_col="RemainingTime" -- confirm that both refer to the same target column.
input_data.apply_standard_preprocessing(
    agg_col="rem_time",  # calculated y column
    # Only complete traces (i.e., traces that start and finish in the given
    # data set) are included.
    filter_incompletes=True,
    dropna=False,
    # List of cols that must be transformed into unix time. If empty / not
    # given, nothing will be transformed.
    date_cols=ALL_DATE_COLS,
)

# Alternative (disabled): label-encode columns that have ordinal values.
# input_data.use_cat_encoding_on("label", ["Category", "Activity"])

# Columns with too many categories are deleted.
input_data.use_cat_encoding_on(
    "none",
    [
        "Service Affected",
        "Asset SubType Affected",
        "Service Caused",
        "Assignment Group",
        "Priority",
        "Asset Type Affected",
        "Status",
        "Closure Code",
        "Asset Caused",
        "Asset Type Caused",
        "Asset SubType Caused",
    ],
)

# Split function that keeps traces together (80% of the data for training).
X_train, X_test, y_train, y_test = input_data.train_test_split_on_trace(
    y_col="RemainingTime", ratio=0.8
)

# The previous event attribute must be added in order to navigate through
# the ATS.
X_test = input_data.add_prev_events(X_test)

Let's build our model.

In [None]:
# Settings for the ATS: traces are identified by the incident id, events by
# their activity, and the target column is the remaining time. States use a
# multiset representation with a horizon of 1, and a histogram-based gradient
# boosting regressor is supplied as the model.
ats_settings = {
    "trace_id_col": "Incident ID",
    "act_col": "Activity",
    "y_col": "RemainingTime",
    "representation": "multiset",
    "horizon": 1,
    "model": HistGradientBoostingRegressor(),
}
ats = ATS(**ats_settings)

# Fit on the training split, then finalize.
# NOTE(review): finalize() presumably completes the model after fitting --
# confirm against src.ats.
ats.fit(X_train, y_train)
ats.finalize()

We can now test the accuracy of our model!

In [None]:
# Evaluate on the test split: predict every event individually, sum the
# absolute errors and average them into the mean absolute error (MAE).
SECONDS_PER_HOUR = 60 * 60
SECONDS_PER_DAY = 60 * 60 * 24

n_events = len(y_test)
# Keyword arguments shared by every progress-bar refresh.
bar_style = {"prefix": "Prediction:", "suffix": "Complete", "length": 50}

# Draw the empty bar before the first prediction is made.
print_progress_bar(0, n_events, **bar_style)

total_abs_error = 0.0
for i, event in enumerate(X_test.to_dict(orient="records")):
    # y_test is indexed by position so it lines up with the record order.
    total_abs_error += abs(ats.predict(event) - y_test.iloc[i])
    print_progress_bar(i + 1, n_events, **bar_style)

# Mean absolute error, in the unit of the target column.
diff = total_abs_error / n_events

# Report the difference in hours and days instead of seconds.
hours = round(diff / SECONDS_PER_HOUR)
days = round(diff / SECONDS_PER_DAY)
print(f"MAE: {hours} hours  = {days} days")
