First, we need to import all necessary components.

In [2]:
# External imports
import pickle
import pandas as pd
from sklearn.linear_model import (
    LinearRegression,  # linear, no outliers in data, no correlation between features
    TheilSenRegressor,  # linear, no outliers in data, correlation between features
    Lars,  # linear, outliers in data, speed important, more features than samples, MAE: 325
    ARDRegression,  # linear, outliers in data, speed important, few important features,
    SGDRegressor,  # linear, outliers in data, speed important, large dataset
    BayesianRidge,  # linear, outliers in data, speed important, not especially large dataset
)
from sklearn.neighbors import (
    KNeighborsRegressor,  # MAE: 46
    RadiusNeighborsRegressor,
)  # nonlinear, many features, few important features, sample/feature ratio: high sample
from sklearn.svm import (
    SVR,  # nonlinear, many features, few important features, sample/feature ratio: high feature (TAKES VEEERY LONG)
)
from sklearn.ensemble import (
    HistGradientBoostingRegressor,  # nonlinear, <10 features, no noise/outliers, >10000 samples, robust against missing values
)

# Internal imports
from src.ats import *
from src.print_ats import *
from src.input_data import InputData
from src.custom_models import Mean, Minimum, Maximum, SampleMean, Median, Mode
from src.metrics import get_mae_rmse
from src.helper import print_progress_bar
from src.globals import (
    PREPROCESSING_IN_FILE,
    INPUTDATA_OBJECT,
    ATS_OUT_FILE,
    BASE_ATS_OUT_FILE,
    RANDOM_SEED,
    TARGET_COLUMN,
    DATE_COLS,
)

Before we can make our model, we need to apply pre-pocessing first. This can be done via the inputData class that is dedicated for the given data. Many of the well-known pre-processing functionality is implemented.

In [2]:
input = InputData(PREPROCESSING_IN_FILE)
input.apply_standard_preprocessing(
    agg_col="rem_time",  # calculated y column
    filter_incompletes=True,
    date_cols="auto"
    # date_cols=DATE_COLS,  # when list passed: those cols will be transformed, when empty: nothing will be transformed, when 'auto' passed: will automatically detect date cols and transform them
)

# columns that have <20 unique values are one-hot encoded
input.use_cat_encoding_on(
    "ohe", ["Asset Type Affected", "Status", "Closure Code"]
)

# columns that have have ordinal values are label encoded
input.use_cat_encoding_on("label", ["Category", "Priority"])

# columns with too many categories are deleted
input.use_cat_encoding_on(
    "none",
    [
        "Service Affected",
        "Asset Affected",
        "Asset SubType Affected",
        "Service Caused",
        "Assignment Group",
        "Asset Caused",
        "Asset Type Caused",
        "Asset SubType Caused",
    ],
)

# drop missing values, as most models don't accept this
# we drop per column, as then only 4 will be lost
input.dropna(axis=1)

input.save_df(
    n_rows=20
)  # save function with new "n_rows" feature that ensures opening in vscode

input.save(INPUTDATA_OBJECT)

# split function that keeps traces together
X_train, X_test, y_train, y_test = input.train_test_split_on_trace(
y_col=TARGET_COLUMN, ratio=0.8, seed=RANDOM_SEED
)
print(X_train.shape)
print(X_test.shape)

X_test = input.add_prev_events(X_test)



START PREPROCESSING
Filtering out incomplete processes.. [1632/303819 rows deleted]
Converting dates.. 
Adding remaining time attribute..
Encoding columns ['Asset Type Affected', 'Status', 'Closure Code'] using ohe encoding..
Encoding columns ['Category', 'Priority'] using label encoding..
Encoding columns ['Service Affected', 'Asset Affected', 'Asset SubType Affected', 'Service Caused', 'Assignment Group', 'Asset Caused', 'Asset Type Caused', 'Asset SubType Caused'] using none encoding..
Dropping missing values..  [4/45 columns deleted]
	 Deleted: ['Resolved Time', 'Handle Time (Hours)', 'Number of Reassignments', 'Reopen Time']
Saving df..
(242006, 40)
(60181, 40)
Adding previous events attribute..


Let's build our transition system

In [4]:
ats = ATS(
    trace_id_col="Incident ID",
    act_col="Activity",
    y_col="RemainingTime",
    representation="multiset",
    horizon=1,
    # model=SVR(),
    seed=RANDOM_SEED,
)

ats.fit(X_train, y_train)
ats.save("results/" + BASE_ATS_OUT_FILE)

START CREATING ATS
Fit: |XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX| 100.0% Complete




FileNotFoundError: [Errno 2] No such file or directory: 'data/results/base_ats.pkl'

Let's create several ATSs by finalizing them with simple models.

In [4]:
with open(f"data/results/{BASE_ATS_OUT_FILE}.pkl", "rb") as file:
    ats = pickle.load(file)

def generate_multiple_ats_models(ats:ATS,models:list) -> list[ATS]:
    from copy import deepcopy
    ats_list = []
    for model in models:
        ts = deepcopy(ats)
        ats_list.append(ts)
    for i, ats_item in enumerate(ats_list):
        ats_item.finalize(model=models[i])
        ats_item.save(f"results/{ATS_OUT_FILE}_{ats_item.model}")
    return ats_list

model_list = [Mean(), Median(), Mode()]
ats_list = generate_multiple_ats_models(ats,model_list)

KeyboardInterrupt: 

Let's create several ATSs by finalizing them with advanced models.

In [None]:
with open(f"data/results/{BASE_ATS_OUT_FILE}.pkl", "rb") as file:
    ats = pickle.load(file)

model_list = [HistGradientBoostingRegressor(), LinearRegression(), KNeighborsRegressor()]
ats_list = generate_multiple_ats_models(ats,model_list)

We can now test the accuracy of our models!

In [29]:
print_progress_bar(
    0, len(y_test), prefix="Prediction:", suffix="Complete", length=50
)

y_preds_average = []
y_preds_median = []
y_preds_mode = []
y_preds_hgb = []
y_preds_linreg = []
y_preds_svm = []
y_preds=[]

for i, event in enumerate(X_test.to_dict(orient="records")):

    for ats_model in ats_list:
        y_preds.append([ats_model.predict(event)])

    y_preds_average.append(ats_average.predict(event))
    y_preds_median.append(ats_median.predict(event))
    y_preds_mode.append(ats_mode.predict(event))
    y_preds_hgb.append(ats_hgb.predict(event))
    y_preds_linreg.append(ats_linreg.predict(event))
    y_preds_svm.append(ats_svm.predict(event))
    print_progress_bar(
        i+1, len(y_test), prefix="Prediction:", suffix="Complete", length=50
    )




Prediction: |--------------------------------------------------| 0.0% Complete

AttributeError: 'list' object has no attribute 'columns'

In [18]:
mae_average, rmse_average, r2 = get_mae_rmse(y_test.tolist(),y_preds_average.tolist())

print(f'R^2: {r2}')
print(f"MAE: {round(mae_average/ (60*60))} hours  = {round(mae_average / (60*60*24))} days") #get difference in hours instead of seconds
print(f"RMSE: {int(rmse_average/ (60*60))} hours  = {int(rmse_average / (60*60*24))} days") #get difference in hours instead of seconds


ValueError: not enough values to unpack (expected 2, got 0)