First, we need to import all necessary components.

In [21]:
import pickle
from src.ats import *
import pandas as pd
from src.print_ats import *
from src.input_data import InputData
from src.custom_models import Average, Minimum, Maximum, SampleMean, Median, Mode
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    ElasticNetCV,
)
from sklearn.svm import SVR
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from src.helper import print_progress_bar
from src.globals import (
    PREPROCESSING_IN_FILE,
    INPUTDATA_OBJECT,
    ATS_OUT_FILE,
    RANDOM_SEED,
    TARGET_COLUMN,
    DATE_COLS,
)


Before we can make our model, we need to apply pre-pocessing first. This can be done via the inputData class that is dedicated for the given data. Many of the well-known pre-processing functionality is implemented. 

In [22]:


input = InputData(PREPROCESSING_IN_FILE)
input.apply_standard_preprocessing(
    agg_col="rem_time",  # calculated y column
    filter_incompletes=True, #Only complete traces (i.e., that start and finish in the given data set) are included
    dropna=False,
    date_cols=DATE_COLS,  # list of cols that must be transformed into unix time. If empty / not given, nothing will be transformed
)

# # columns that have have ordinal values are label encoded
# input.use_cat_encoding_on("label", ["Category", "Activity"])

# columns with too many categories are deleted
input.use_cat_encoding_on(
    "none",
    [
        "Service Affected",
        "Asset Affected",
        "Asset SubType Affected",
        "Service Caused",
        "Assignment Group",
        "Priority",
        "Asset Type Affected",
        "Category",
        "Status",
        "Closure Code",
        "Asset Caused",
        "Asset Type Caused",
        "Asset SubType Caused",
    ],
)


# split function that keeps traces together
X_train, X_test, y_train, y_test = input.train_test_split_on_trace(
    y_col=TARGET_COLUMN, ratio=0.8,seed=RANDOM_SEED
)

# The previous event attribute must be added in order to navigate through the ATS.
X_test = input.add_prev_events(X_test)


START PREPROCESSING
Filtering out incomplete processes.. [1632 rows have been deleted...]
Converting dates.. 
Adding remaining time attribute..
Encoding columns ['Service Affected', 'Asset Affected', 'Asset SubType Affected', 'Service Caused', 'Assignment Group', 'Priority', 'Asset Type Affected', 'Category', 'Status', 'Closure Code', 'Asset Caused', 'Asset Type Caused', 'Asset SubType Caused'] using none encoding..
Adding previous events attribute..


Let's build our model with the HistGradientBoostingRegressor.

In [23]:
ats_hgb = ATS(
    trace_id_col="Incident ID",
    act_col="Activity",
    y_col=TARGET_COLUMN,
    representation="multiset",
    horizon=1,
    model=HistGradientBoostingRegressor()
)

ats_hgb.fit(X_train, y_train)
ats_hgb.finalize()
ats_hgb.save(ATS_OUT_FILE+'_hgb')

START CREATING ATS
Create: |XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX| 100.0% Complete


Finalize: |XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX| 100.0% Complete




Let's build our model with the Median model.

In [24]:
ats_median = ATS(
    trace_id_col="Incident ID",
    act_col="Activity",
    y_col=TARGET_COLUMN,
    representation="multiset",
    horizon=1,
    model=Median()
)

ats_median.fit(X_train, y_train)
ats_median.finalize()
ats_median.save(ATS_OUT_FILE+'_median')

START CREATING ATS
Create: |XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX| 100.0% Complete


Finalize: |XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX| 100.0% Complete




Let's build our model with the Mean model.

In [25]:
ats_average = ATS(
    trace_id_col="Incident ID",
    act_col="Activity",
    y_col=TARGET_COLUMN,
    representation="multiset",
    horizon=1,
    model=Average()
)

ats_average.fit(X_train, y_train)
ats_average.finalize()
ats_average.save(ATS_OUT_FILE+'_average')

START CREATING ATS
Create: |XXXXXXXXXXXXXXXXXXXXXXXXXXXXX---------------------| 59.5% Complete

Let's build our model with the Mode model.

In [None]:
ats_mode = ATS(
    trace_id_col="Incident ID",
    act_col="Activity",
    y_col=TARGET_COLUMN,
    representation="multiset",
    horizon=1,
    model=Mode()
)

ats_mode.fit(X_train, y_train)
ats_mode.finalize()
ats_mode.save(ATS_OUT_FILE+'_mode')

Let's build our model with the SVR.

In [None]:
ats_svm = ATS(
    trace_id_col="Incident ID",
    act_col="Activity",
    y_col=TARGET_COLUMN,
    representation="multiset",
    horizon=1,
    model=SVR()
)

ats_svm.fit(X_train, y_train)
ats_svm.finalize()
ats_svm.save(ATS_OUT_FILE+'_svm')

Let's build our model with the Linear Regression model.

In [None]:
ats_linreg = ATS(
    trace_id_col="Incident ID",
    act_col="Activity",
    y_col=TARGET_COLUMN,
    representation="multiset",
    horizon=1,
    model=LinearRegression()
)

ats_linreg.fit(X_train, y_train)
ats_linreg.finalize()
ats_linreg.save(ATS_OUT_FILE+'_linreg')


We can now test the accuracy of our models!

In [None]:
print_progress_bar(
    0, len(y_test), prefix="Prediction:", suffix="Complete", length=50
)

y_preds_average = []
y_preds_median = []
y_preds_mode = []
y_preds_hgb = []
y_preds_linreg = []
y_preds_svm = []

for i, event in enumerate(X_test.to_dict(orient="records")):

    y_preds_average.append(ats_average.predict(event))
    y_preds_median.append(ats_median.predict(event))
    y_preds_mode.append(ats_mode.predict(event))
    y_preds_hgb.append(ats_hgb.predict(event))
    y_preds_linreg.append(ats_linreg.predict(event))
    y_preds_svm.append(ats_svm.predict(event))
    print_progress_bar(
        i+1, len(y_test), prefix="Prediction:", suffix="Complete", length=50
    )




Prediction: |XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX| 100.0% Complete


In [18]:
def get_mae_mse(y_test,y_preds)->float:
    mae = 0.0
    mse = 0.0

    for y_pred, y in y_preds, y_test:
        mae += abs(y_pred - y)
        mse += (y_pred - y)**2
    
    n = len(y_test)
    mae = mae / n
    mse = mse / n

    return mae, mse

mae_average, mse_average = get_mae_mse(y_test.tolist(),y_preds_average.tolist())

print(f"MAE: {round(mae_average/ (60*60))} hours  = {round(mae_average / (60*60*24))} days") #get difference in hours instead of seconds
print(f"MSE: {int(mse_average/ (60*60))} hours  = {int(mse_average / (60*60*24))} days") #get difference in hours instead of seconds


ValueError: not enough values to unpack (expected 2, got 0)