In [1]:
import os
# Move the working directory one level up, presumably so that the relative
# data path used below ("data/train.csv") resolves.
# NOTE: the target is relative to the *current* directory, so re-running this
# cell in the same kernel moves up another level each time.
os.chdir('..')

# Usage of `TimestampExtractor`

In [2]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from skpm.encoding.trace import TraceAggregator
from skpm.event_feature_extraction import TimestampExtractor


def read_log(path):
    """Load an event log from CSV and reduce it to four renamed columns.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file. It is expected to contain the columns
        ``case:concept:name``, ``concept:name``, ``time:timestamp`` and
        ``org:resource``.

    Returns
    -------
    pandas.DataFrame
        A frame with the columns ``case_id``, ``activity``, ``timestamp``
        and ``resource`` (in that order); ``case_id`` is categorical.

    Raises
    ------
    KeyError
        If one of the four expected columns is absent after renaming.
    """
    # Source column name -> short name used in the rest of the notebook.
    short_names = {
        "case:concept:name": "case_id",
        "time:timestamp": "timestamp",
        "concept:name": "activity",
        "org:resource": "resource",
    }
    kept_columns = ["case_id", "activity", "timestamp", "resource"]

    renamed = pd.read_csv(path).rename(columns=short_names)
    # Any other column in the file is dropped; only the case identifier is
    # converted to a categorical dtype.
    return renamed.loc[:, kept_columns].astype({"case_id": "category"})

# Load the training event log; the path is relative to the current working
# directory.
train = read_log("data/train.csv")

There are basically three ways of using the method:

- Default initialization
  - it will return all the implemented features
- Extracting one or a few features
  - specifying which features you need 
  - parameter `features`
- Setting output as pandas 
  - it makes it easier to keep track of the `case_id` column

In [3]:
# Default initialization: no `features` argument is given, so all implemented
# features are returned (as a NumPy array, see the output below).
TimestampExtractor().fit_transform(train)

array([[0.00000e+00, 0.00000e+00, 2.84444e+05, 3.42480e+04],
       [5.40000e+01, 5.40000e+01, 2.84390e+05, 3.43020e+04],
       [8.76750e+04, 8.76210e+04, 1.96769e+05, 3.55230e+04],
       ...,
       [3.10000e+01, 3.10000e+01, 6.68060e+04, 4.15800e+04],
       [6.68370e+04, 6.68060e+04, 0.00000e+00, 2.19860e+04],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, 3.12940e+04]])

In [4]:
# Select specific features via `features` (a str or a list[str]) and request a
# pandas DataFrame as output with `set_output(transform="pandas")`.
# TODO: with pandas output the column names are not correct -- the column is
# named `timestampextractor0` in the output below instead of `accumulated_time`.
TimestampExtractor(features="accumulated_time").set_output(transform="pandas").fit_transform(train)

Unnamed: 0,timestampextractor0
0,0.0
1,54.0
2,87675.0
3,284444.0
4,0.0
...,...
23290,76006.0
23291,0.0
23292,31.0
23293,66837.0


In [5]:
# Extract two features and append them to the original dataframe as new columns.
# NOTE: `set_output(transform="pandas")` is not called here, so the extractor's
# default (non-pandas) output is what gets assigned to the columns listed in
# `features`. This adds the columns to `train` in place.
features = ["accumulated_time", "execution_time"]
t = TimestampExtractor(features=features)
train[features] = t.fit_transform(train)
train.head()

Unnamed: 0,case_id,activity,timestamp,resource,accumulated_time,execution_time
0,request for payment 149290,Request For Payment SUBMITTED by EMPLOYEE,2017-01-09 09:30:48+00:00,STAFF MEMBER,0.0,0.0
1,request for payment 149290,Request For Payment FINAL_APPROVED by SUPERVISOR,2017-01-09 09:31:42+00:00,STAFF MEMBER,54.0,54.0
2,request for payment 149290,Request Payment,2017-01-10 09:52:03+00:00,SYSTEM,87675.0,87621.0
3,request for payment 149290,Payment Handled,2017-01-12 16:31:32+00:00,SYSTEM,284444.0,196769.0
4,request for payment 149216,Request For Payment SUBMITTED by EMPLOYEE,2017-01-09 10:41:36+00:00,STAFF MEMBER,0.0,0.0
