In [1]:
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
import numpy as np
import pandas as pd
import re
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split

In [2]:
# Location of the EFT feature-encoding CSV, relative to this notebook.
eft_in_file = "../../../data/CS/feature_encodings/baselines/EFT/eft.csv"

# Event attribute names; the matching CSV columns carry an "event_" prefix.
event_attributes = [
    "ea2",
    "ea4",
    "resource_robot",
    "resource_person",
    "resource_multi",
]

In [5]:
# Column name -> int mapping for every event attribute column.
# It is not passed to `read_csv` below, so pandas infers the dtypes itself.
eft_dtypes = dict.fromkeys((f"event_{ea}" for ea in event_attributes), int)

# The EFT file is semicolon-separated.
events_df = pd.read_csv(eft_in_file, sep=";")

In [6]:
# Keep only the events whose `event_ea4` label is non-zero.
# `.copy()` turns the filtered result into an independent frame, so the column
# assignment below writes to it directly rather than to a slice of the loaded
# frame (this is what triggers pandas' SettingWithCopyWarning otherwise).
events_df = events_df[events_df["event_ea4"] != 0].copy()

# Shift the remaining labels down by one so that the smallest kept label
# becomes 0.
events_df["event_ea4"] = events_df["event_ea4"] - 1

# NOTE: this cell is not idempotent. Re-running it without reloading the CSV
# drops the freshly created class 0 and shifts the labels a second time.
events_df

Unnamed: 0,event_elapsed_time,event_remaining_time,event_synchronization_time,event_previous_type_countkrs,event_ea2,event_ea4,event_resource_robot,event_resource_person,event_resource_multi
3,1.275987,1.800639,-0.011494,0.523933,-0.061526,4.0,1.0,0.0,0.0
8,3.243965,-0.530392,-0.011494,2.889025,-0.061526,3.0,0.0,1.0,0.0
9,3.243965,-0.530392,-0.011494,2.889025,-0.061526,3.0,0.0,1.0,0.0
10,0.729412,2.448046,-0.011494,0.523933,-0.061526,4.0,1.0,0.0,0.0
17,1.911935,1.047372,-0.011494,2.889025,0.232648,4.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
2908709,-0.773140,-0.580119,-0.011494,-1.841159,-0.649876,5.0,1.0,0.0,0.0
2908710,-0.773140,-0.580119,-0.011494,-1.841159,-0.944050,2.0,1.0,0.0,0.0
2908711,-0.773140,-0.580119,-0.011494,-1.841159,1.115172,2.0,1.0,0.0,0.0
2911582,-0.773140,-0.580119,-0.011494,-1.841159,0.017845,5.0,0.0,0.0,0.0


In [7]:
# Target is the `event_ea4` column; every other column is used as a feature.
y = events_df["event_ea4"]
X = events_df.drop(columns=["event_ea4"])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

# Wrap both splits in LightGBM datasets for `lgb.train`.
train_data = lgb.Dataset(data=X_train, label=y_train)
valid_data = lgb.Dataset(data=X_valid, label=y_valid)

In [8]:
# Absolute number of rows per target class, most frequent class first.
events_df.loc[:, "event_ea4"].value_counts()

5.0    424552
4.0    333297
3.0    288909
0.0     82886
2.0     69132
1.0     52858
Name: event_ea4, dtype: int64

In [10]:
# Multiclass setup: six target classes, evaluated with multi-class log loss.
params = dict(
    objective="multiclass",
    num_class=6,
    metric=["multi_logloss"],
)

# Train for a fixed 5000 boosting rounds, reporting the metric on the
# validation set. Early stopping is currently disabled; to enable it, pass
# `callbacks=[lgb.early_stopping(50)]`.
bst = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=5000,
    valid_sets=[valid_data],
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 597
[LightGBM] [Info] Number of data points in the train set: 1001307, number of used features: 8
[LightGBM] [Info] Start training from score -2.714766
[LightGBM] [Info] Start training from score -3.162904
[LightGBM] [Info] Start training from score -2.895294
[LightGBM] [Info] Start training from score -1.467692
[LightGBM] [Info] Start training from score -1.322453
[LightGBM] [Info] Start training from score -1.080999
[1]	valid_0's multi_logloss: 1.49475
[2]	valid_0's multi_logloss: 1.46917
[3]	valid_0's multi_logloss: 1.45015
[4]	valid_0's multi_logloss: 1.43497
[5]	valid_0's multi_logloss: 1.42299
[6]	valid_0's multi_logloss: 1.41325
[7]	valid_0's multi_logloss: 1.40503
[8]	valid_0's multi_logloss: 1.39815
[9]	valid_0's multi_logloss: 1.39253
[10]	valid_0's multi_logloss: 1.3876
[11]	valid_0's multi_logloss: 1.38341
[12]	valid_0's mult

In [11]:
# Per-class scores from the trained booster, first for the training split and
# then for the validation split.
y_train_probs, y_valid_probs = [
    bst.predict(features) for features in (X_train, X_valid)
]

In [None]:
def get_predictions(lst):
    """Return the index of the largest value in ``lst``.

    Parameters
    ----------
    lst : sequence or 1-D array of numbers
        Scores for a single sample, one entry per class.

    Returns
    -------
    int
        Position of the maximum value. When several entries share the
        maximum, the first one wins.

    Raises
    ------
    ValueError
        If ``lst`` is empty.
    """
    # np.argmax is the library equivalent of the hand-rolled
    # `max(range(len(lst)), key=lst.__getitem__)`; `int(...)` keeps the
    # return type a plain Python int.
    return int(np.argmax(lst))


# For every row, pick the column index with the highest predicted score.
# `np.argmax(..., axis=1)` does this in a single vectorised call, instead of
# invoking a Python function once per row through `np.apply_along_axis`.
# Ties resolve to the first maximum, as before.
y_train_preds = np.argmax(y_train_probs, axis=1)
y_valid_preds = np.argmax(y_valid_probs, axis=1)

In [12]:
# NOTE: despite the `*_mse_loss` names, both variables hold classification
# accuracy (fraction of correctly predicted labels), not a mean squared error.
train_mse_loss = metrics.accuracy_score(y_true=y_train, y_pred=y_train_preds)
print(f"Training accuracy: {train_mse_loss}")

valid_mse_loss = metrics.accuracy_score(y_true=y_valid, y_pred=y_valid_preds)
print(f"Validation accuracy: {valid_mse_loss}")

# Recorded results:
# - including class '0': 61% accuracy (highest prior: 0.57, for class '0')
# - excluding class '0': 62% accuracy (highest prior: 0.34, for class '5',
#   which is actually 6)

Training accuracy: 0.6821464346099648
Validation accuracy: 0.6203086363037147


In [13]:
# Rows are the true validation labels, columns the predicted labels.
metrics.confusion_matrix(y_true=y_valid, y_pred=y_valid_preds)

array([[ 9669,   139,    96,  2504,  1749,  2422],
       [  273,  4787,    98,  1722,  1408,  2212],
       [  350,   175,  4841,  2897,  2239,  3275],
       [ 1792,   480,   351, 34212,  9635, 11681],
       [ 1574,   492,   395, 10532, 37795, 15680],
       [ 1316,   519,   424,  8392, 10225, 63976]])

In [14]:
# Relative frequency of each target class, ordered by class label. The largest
# value is the accuracy of always predicting the most frequent class.
class_shares = events_df["event_ea4"].value_counts(normalize=True)
class_shares.sort_index()

0.0    0.066222
1.0    0.042231
2.0    0.055233
3.0    0.230825
4.0    0.266290
5.0    0.339198
Name: event_ea4, dtype: float64