In [1]:
import warnings
# Suppress all warnings process-wide (no category or module filter is given),
# which also hides library deprecation warnings raised by later cells.
warnings.filterwarnings('ignore')

In [3]:
import os
import pandas as pd
import math
import numpy as np

Get case IDs sorted by completion timestamp

In [4]:
import wget
import os.path

# Fetch the raw event log once. `out=` pins the local file name so the file
# that gets written is the same one the existence check and read_csv look for,
# regardless of how the server or URL would otherwise name the download.
if not os.path.exists('finale.csv'):
    wget.download('https://data.4tu.nl/ndownloader/files/23993303', out='finale.csv')

# The first CSV column is read as the index and then turned back into a regular column.
df_finale = pd.read_csv('finale.csv', index_col=0).reset_index()

# Parse timestamps so the sort below is chronological rather than lexicographic.
df_finale['Complete Timestamp'] = pd.to_datetime(df_finale['Complete Timestamp'])

# Unique case IDs, ordered by the timestamp of each case's first row after sorting.
sorted_time_cases = df_finale.sort_values('Complete Timestamp')['Case ID'].drop_duplicates().values

In [5]:
# Load the event table that already carries the time-derived feature columns.
features_csv = 'finale_time_features.csv'
df_final = pd.read_csv(features_csv)

In [6]:
# Sanity check: (rows, columns) of the loaded feature table.
df_final.shape

(21348, 23)

In [7]:
# List the column labels available for feature and target selection.
df_final.columns

Index(['Unnamed: 0', 'Case ID', 'Activity', 'Resource', 'Complete Timestamp',
       'Variant', 'Variant index', 'Variant.1', 'seriousness', 'customer',
       'product', 'responsible_section', 'seriousness_2', 'service_level',
       'service_type', 'support_section', 'workgroup',
       'Complete Timestamp Shift', 'Duration', 'Duration Float',
       'Time to conclusion', 'Passed Time', 'Step'],
      dtype='object')

Activity categorization

In [8]:
# Encode each activity label as the integer code of its pandas category.
activity_as_category = df_final['Activity'].astype('category')
df_final['Activity Cat'] = activity_as_category.cat.codes

Example:

In [9]:
# Preview the encoded activity next to the step/time columns for the first 9 rows.
df_final[['Activity Cat', 'Step', 'Duration Float', 'Passed Time', 'Time to conclusion']].head(9)

Unnamed: 0,Activity Cat,Step,Duration Float,Passed Time,Time to conclusion
0,0,1,0.0,0.0,31.0087
1,11,2,16.0084,0.0,15.0003
2,9,3,0.0001,15.0002,15.0002
3,1,4,15.0002,15.0003,0.0
4,1,5,0.0,31.0087,0.0
5,0,1,0.0,0.0,30.9822
6,11,2,5.875,25.1053,25.1072
7,9,3,0.0019,25.1072,25.1053
8,1,4,25.1053,30.9822,0.0


One Hot Encoding

In [10]:
# One indicator column per activity code; the column labels are the integer codes.
# The dtype is pinned because the get_dummies default differs between pandas
# versions (uint8 before 2.0, bool from 2.0). Bool indicators mixed with the
# float features would turn a later `.values` call into an object array.
one_hot = pd.get_dummies(df_final['Activity Cat'], dtype=np.uint8)

# Join the encoded df. Indicator columns left over from a previous run of this
# cell are dropped first, because a plain join raises on overlapping columns.
df_final = df_final.drop(columns=one_hot.columns, errors='ignore').join(one_hot)

Splitting data chronologically by case (train, test, validation)

In [11]:
from sklearn.model_selection import train_test_split

# Both cuts are unshuffled, so the time ordering of sorted_time_cases is kept.
split_options = dict(shuffle=False, random_state=42)

# First cut: the earliest 60% of cases train the models, the remaining 40% are held out.
train_cases, holdout_cases = train_test_split(sorted_time_cases, test_size=0.4, **split_options)

# Second cut: the held-out cases are halved into test (earlier) and validation (later).
test_cases, val_cases = train_test_split(holdout_cases, test_size=0.5, **split_options)

In [22]:
# Number of cases in each split, in the order (train, test, validation).
len(train_cases), len(test_cases), len(val_cases)

(2748, 916, 916)

Dataframes creation

In [18]:
# Route every event row to the split that owns its case, so a case never
# straddles two splits.
case_ids = df_final['Case ID']
df_train = df_final.loc[case_ids.isin(train_cases)]
df_test = df_final.loc[case_ids.isin(test_cases)]
df_val = df_final.loc[case_ids.isin(val_cases)]

Feature selection

In [19]:
# Model inputs: three numeric columns plus the 14 one-hot activity columns,
# whose labels are the integers 0..13.
feature_columns = ['Step', 'Duration Float', 'Passed Time', *range(14)]
target_columns = ['Time to conclusion']


def _features_and_target(frame):
    """Return (X, y) NumPy arrays for one split.

    X holds the columns listed in `feature_columns`; y is the
    'Time to conclusion' column kept two-dimensional (one column).
    """
    return frame[feature_columns].values, frame[target_columns].values


X_train, y_train = _features_and_target(df_train)
X_test, y_test = _features_and_target(df_test)
X_val, y_val = _features_and_target(df_val)

Machine learning models definition

In [20]:
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Candidate models. Each entry pairs a display name ('nome') with an unfitted
# estimator ('regressor').
#
# Arguments that recent scikit-learn releases removed are omitted; each was a
# no-op or equal to the default here, so the fitted models are unchanged:
#   - LinearRegression(normalize=True): ignored when fit_intercept=False.
#   - RandomForestRegressor(criterion='mse'): the default squared-error
#     criterion (renamed 'squared_error').
#   - RandomForestRegressor(max_features='auto'): for regressors this meant
#     "all features", which is the default.
# warm_start=True is also dropped: with a single fit it changes nothing, but on
# a cell re-run it would reuse the old trees instead of refitting the forest.
regressors = [
    # Baseline that always predicts the mean of the training target.
    {'nome': 'DummyRegressor', 'regressor': DummyRegressor(strategy='mean')},
    {'nome': 'LinearRegression', 'regressor': LinearRegression(fit_intercept=False)},
    {
        'nome': 'RandomForestRegressor',
        'regressor': RandomForestRegressor(n_estimators=20, bootstrap=True,
                                           max_depth=19, random_state=0),
    },
]

Experiment running

In [21]:
import time
from sklearn.metrics import mean_absolute_error

# Result table columns: model name, MAE on the test split, MAE on the validation split.
cols = ['Nome do modelo', 'MAE', 'MAE (val)']

data_result = []

for regressor in regressors:
    # Fit on the training split only; test and validation are used for scoring.
    regr = regressor['regressor'].fit(X_train, y_train)

    y_pred = regr.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)

    y_val_pred = regr.predict(X_val)
    mae_val = mean_absolute_error(y_val, y_val_pred)

    data_result.append([regressor['nome'], mae, mae_val])

# Best test-split MAE first; left as the last expression so the table is displayed.
pd.DataFrame(data_result, columns=cols).sort_values(by='MAE', ascending=True)

Unnamed: 0,Nome do modelo,MAE,MAE (val)
2,RandomForestRegressor,3.009615,4.638135
1,LinearRegression,5.186106,8.097925
0,DummyRegressor,15.012374,14.489836
