In [12]:
import time
import pandas as pd
import numpy as np
from os.path import exists
from datetime import datetime
from sklearn.metrics import classification_report
from sktime.datatypes._panel._convert import (
    from_multi_index_to_nested,
)
entry_directory = "Raw"
prepared_directory = "Prepared"
organised_directory = "Organised"
ml_directory = "MachineLearning"
seed = 42

def remap_labels(label):
    if label == "aggregation":
        return 0
    elif label == "filtration":
        return 1
    else:
        return 2

def grow_snapshots(snapshot, label, snapshots):
    if label == "aggregation":
        return snapshot
    elif label == "filtration":
        return snapshot + snapshots
    else:
        return snapshot + 2*snapshots

def read_dataset_TSLearn(udf_types, organised_directory, num_of_samples = 0, include_RAM = False, file_type = "joined"):
    from tslearn.utils import to_time_series_dataset, to_time_series

    full_df = pd.DataFrame()
    for udf_type in udf_types:
        full_df = pd.concat([full_df, pd.read_csv(f"{organised_directory}/{udf_type}/{file_type}_{udf_type}.csv")])
    if num_of_samples > 0:
        full_df = full_df[full_df.snapshot < num_of_samples]
    
    if include_RAM:
        full_df["timeseries"] = full_df.apply(lambda x: np.array([x.CPU, x.RAM]), axis=1)
        full_df = full_df[full_df.snapshot < 200].groupby(["label", "snapshot"]).timeseries.apply(list).reset_index()
        ts_series = full_df.timeseries.apply(to_time_series).to_numpy()
    else:
        full_df = full_df.groupby(["label", "snapshot"]).CPU.apply(np.array).reset_index()
        ts_series = full_df.CPU.to_numpy()
    ts_labels_str = full_df.label.to_numpy()
    ts_labels = full_df.label.apply(remap_labels).to_numpy()
    ts_series = to_time_series_dataset(ts_series)

    return ts_labels, ts_series, ts_labels_str


def read_dataset_sktime(udf_types, organised_directory, num_of_samples = 0, include_RAM = False, file_type = "joined"):

    full_df = pd.DataFrame()
    for udf_type in udf_types:
        full_df = pd.concat([full_df, pd.read_csv(f"{organised_directory}/{udf_type}/joined_{udf_type}.csv")])
    if num_of_samples > 0:
        full_df = full_df[full_df.snapshot < num_of_samples]

    ts_labels = full_df[full_df.epoch == 0.0].label.apply(remap_labels).to_numpy()
    full_df["snapshot"] = full_df.apply(lambda x: grow_snapshots(x.snapshot, x.label, num_of_samples), axis=1)
    df = full_df.set_index(["snapshot", full_df.groupby("snapshot").cumcount()])
    index = pd.MultiIndex.from_product(df.index.levels, names=df.index.names)
    output = df.reindex(index, fill_value=0).reset_index(level=1, drop=True).reset_index()
    output["row_number"] = output.groupby("snapshot").cumcount()
    if include_RAM:
        output = output[["snapshot", "row_number", "CPU", "RAM"]].set_index(["snapshot", "row_number"])
    else:
        output = output[["snapshot", "row_number", "CPU"]].set_index(["snapshot", "row_number"])

    return ts_labels, output

def read_dataset_sktime(udf_types, organised_directory, num_of_samples = 0, include_RAM = False, file_type = "joined"):

    full_df = pd.DataFrame()
    for udf_type in udf_types:
        full_df = pd.concat([full_df, pd.read_csv(f"{organised_directory}/{udf_type}/joined_{udf_type}.csv")])
    if num_of_samples > 0:
        full_df = full_df[full_df.snapshot < num_of_samples]

    ts_labels = full_df[full_df.epoch == 0.0].label.apply(remap_labels).to_numpy()
    full_df["snapshot"] = full_df.apply(lambda x: grow_snapshots(x.snapshot, x.label, num_of_samples), axis=1)
    full_df["row_number"] = full_df.groupby("snapshot").cumcount()
    if include_RAM:
        full_df = full_df[["snapshot", "row_number", "CPU", "RAM"]].set_index(["snapshot", "row_number"])
    else:
        full_df = full_df[["snapshot", "row_number", "CPU"]].set_index(["snapshot", "row_number"])
    return ts_labels, from_multi_index_to_nested(full_df)

def read_dataset_sktime_zeros(udf_types, organised_directory, num_of_samples = 0, include_RAM = False, file_type = "joined"):

    full_df = pd.DataFrame()
    for udf_type in udf_types:
        full_df = pd.concat([full_df, pd.read_csv(f"{organised_directory}/{udf_type}/joined_{udf_type}.csv")])
    if num_of_samples > 0:
        full_df = full_df[full_df.snapshot < num_of_samples]

    ts_y = full_df[full_df.epoch == 0.0].label.apply(remap_labels).to_numpy()
    full_df["snapshot"] = full_df.apply(lambda x: grow_snapshots(x.snapshot, x.label, num_of_samples), axis=1)
    df = full_df.set_index(["snapshot", full_df.groupby("snapshot").cumcount()])
    index = pd.MultiIndex.from_product(df.index.levels, names=df.index.names)
    output = df.reindex(index, fill_value=0).reset_index(level=1, drop=True).reset_index()
    output["row_number"] = output.groupby("snapshot").cumcount()
    if include_RAM:
        ts_x = output[["snapshot", "row_number", "CPU", "RAM"]].set_index(["snapshot", "row_number"])
    else:
        ts_x = output[["snapshot", "row_number", "CPU"]].set_index(["snapshot", "row_number"])

    return ts_y, from_multi_index_to_nested(ts_x)

def wrap_classification_timer(clf, X_train, y_train, x_test, y_test):
    start = time.time()
    print(f"start time: {datetime.fromtimestamp(start)}")
    clf.fit(X_train, y_train)
    print(classification_report(y_test, clf.predict(x_test)))
    end = time.time()
    print(f"end time: {datetime.fromtimestamp(end)}")
    time_elapsed = (end - start)/60
    print(f"time elapsed: {time_elapsed} minutes.")

    return clf


# TSLearn experiments

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
udf_types = ['aggregation', 'filtration', 'filtration-aggregation']
number_of_samples = 300

joined_ts_labels, joined_ts_series, joined_ts_labels_str = read_dataset_TSLearn(udf_types, organised_directory, number_of_samples, True, "joined")
normalized_ts_labels, normalized_ts_series, normalized_ts_labels_str = read_dataset_TSLearn(udf_types, organised_directory, number_of_samples, True, "normalized")
smooth_ts_labels, smooth_ts_series, smooth_ts_labels_str = read_dataset_TSLearn(udf_types, organised_directory, number_of_samples, True, "12_normalized_smooth")

joined_X_train, joined_X_test, joined_y_train, joined_y_test = train_test_split(joined_ts_series, joined_ts_labels, test_size=0.33, random_state=seed)
normalized_X_train, normalized_X_test, normalized_y_train, normalized_y_test = train_test_split(normalized_ts_series, normalized_ts_labels, test_size=0.33, random_state=seed)
smooth_X_train, smooth_X_test, smooth_y_train, smooth_y_test = train_test_split(smooth_ts_series, smooth_ts_labels, test_size=0.33, random_state=seed)


## KNN

In [34]:

from tslearn.neighbors import KNeighborsTimeSeriesClassifier

# joined dataset
path_to_model = f"./{ml_directory}/knn_trained_joined.hdf5"
if exists(path_to_model):
    knn_joined = KNeighborsTimeSeriesClassifier.from_hdf5(path_to_model)
    print(classification_report(joined_y_test, knn_joined.predict(joined_X_test)))
else:
    knn_joined = wrap_classification_timer(KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="dtw", n_jobs=4), joined_X_train, joined_y_train, joined_X_test, joined_y_test)
    knn_joined.to_hdf5(path_to_model)

start time: 2022-04-30 13:45:37.624882
              precision    recall  f1-score   support

           0       0.99      0.94      0.96        71
           1       0.98      1.00      0.99        61
           2       0.94      0.97      0.96        66

    accuracy                           0.97       198
   macro avg       0.97      0.97      0.97       198
weighted avg       0.97      0.97      0.97       198

end time: 2022-04-30 14:01:01.625050
time elapsed: 15.400002797444662 minutes.


In [35]:
# normalized dataset
path_to_model = f"./{ml_directory}/knn_trained_normalized.hdf5"
if exists(path_to_model):
    knn_normalized = KNeighborsTimeSeriesClassifier.from_hdf5(path_to_model)
    print(classification_report(normalized_y_test, knn_normalized.predict(normalized_X_test)))
else:
    knn_normalized = wrap_classification_timer(KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="dtw", n_jobs=4), normalized_X_train, normalized_y_train, normalized_X_test, normalized_y_test)
    knn_normalized.to_hdf5(path_to_model)


start time: 2022-04-30 14:01:01.741091
              precision    recall  f1-score   support

           0       0.94      0.96      0.95        71
           1       0.98      1.00      0.99        61
           2       0.95      0.92      0.94        66

    accuracy                           0.96       198
   macro avg       0.96      0.96      0.96       198
weighted avg       0.96      0.96      0.96       198

end time: 2022-04-30 14:16:18.656087
time elapsed: 15.281916602452595 minutes.


In [36]:
# smooth dataset
path_to_model = f"./{ml_directory}/knn_trained_smooth.hdf5"
if exists(path_to_model):
    knn_smooth = KNeighborsTimeSeriesClassifier.from_hdf5(path_to_model)
    print(classification_report(smooth_y_test, knn_smooth.predict(smooth_X_test)))
else:
    knn_smooth = wrap_classification_timer(KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="dtw", n_jobs=4), smooth_X_train, smooth_y_train, smooth_X_test, smooth_y_test)
    knn_smooth.to_hdf5(path_to_model)


start time: 2022-04-30 14:16:18.745089
              precision    recall  f1-score   support

           0       0.89      0.93      0.91        71
           1       0.98      1.00      0.99        61
           2       0.92      0.86      0.89        66

    accuracy                           0.93       198
   macro avg       0.93      0.93      0.93       198
weighted avg       0.93      0.93      0.93       198

end time: 2022-04-30 14:31:28.770088
time elapsed: 15.167083323001862 minutes.


## Shapelets

In [44]:
from tslearn.shapelets import LearningShapelets, grabocka_params_to_shapelet_size_dict
from tensorflow import optimizers as opt

path_to_model = f"./{ml_directory}/shp_trained_joined.hdf5"
if exists(path_to_model):
    shp_joined = LearningShapelets.from_hdf5(path_to_model)
    print(classification_report(joined_y_test, shp_joined.predict(joined_X_test)))
else:
    n_ts, ts_sz = joined_X_train.shape[:2]
    n_classes = len(set(joined_y_train))
    shapelet_sizes = grabocka_params_to_shapelet_size_dict(n_ts=n_ts,
                                                        ts_sz=ts_sz,
                                                        n_classes=n_classes,
                                                        l=0.06,
                                                        r=1)
    shp_joined = wrap_classification_timer(LearningShapelets(n_shapelets_per_size=shapelet_sizes,
                            optimizer=opt.Adam(.01),
                            batch_size=16,
                            weight_regularizer=.01,
                            max_iter=200,
                            random_state=seed,
                            verbose=0), joined_X_train, joined_y_train, joined_X_test, joined_y_test)





start time: 2022-04-30 14:41:35.713545
              precision    recall  f1-score   support

           0       0.36      1.00      0.53        71
           1       0.00      0.00      0.00        61
           2       0.00      0.00      0.00        66

    accuracy                           0.36       198
   macro avg       0.12      0.33      0.18       198
weighted avg       0.13      0.36      0.19       198

end time: 2022-04-30 14:47:08.344848
time elapsed: 5.543855047225952 minutes.




In [45]:
path_to_model = f"./{ml_directory}/shp_trained_normalized.hdf5"
if exists(path_to_model):
    shp_normalized = LearningShapelets.from_hdf5(path_to_model)
    print(classification_report(normalized_y_test, shp_normalized.predict(normalized_X_test)))
else:
    n_ts, ts_sz = normalized_X_train.shape[:2]
    n_classes = len(set(normalized_y_train))
    shapelet_sizes = grabocka_params_to_shapelet_size_dict(n_ts=n_ts,
                                                        ts_sz=ts_sz,
                                                        n_classes=n_classes,
                                                        l=0.06,
                                                        r=1)
    shp_normalized = wrap_classification_timer(LearningShapelets(n_shapelets_per_size=shapelet_sizes,
                            optimizer=opt.Adam(.01),
                            batch_size=16,
                            weight_regularizer=.01,
                            max_iter=200,
                            random_state=seed,
                            verbose=0), normalized_X_train, normalized_y_train, normalized_X_test, normalized_y_test)
    # shp_normalized.to_hdf5(path_to_model)




start time: 2022-04-30 14:47:08.467856
              precision    recall  f1-score   support

           0       0.36      1.00      0.53        71
           1       0.00      0.00      0.00        61
           2       0.00      0.00      0.00        66

    accuracy                           0.36       198
   macro avg       0.12      0.33      0.18       198
weighted avg       0.13      0.36      0.19       198

end time: 2022-04-30 14:53:23.806428
time elapsed: 6.2556428670883175 minutes.




In [46]:
path_to_model = f"./{ml_directory}/shp_trained_smooth.hdf5"
if exists(path_to_model):
    shp_smooth = LearningShapelets.from_hdf5(path_to_model)
    print(classification_report(smooth_y_test, shp_smooth.predict(smooth_X_test)))
else:
    n_ts, ts_sz = smooth_X_train.shape[:2]
    n_classes = len(set(smooth_y_train))
    shapelet_sizes = grabocka_params_to_shapelet_size_dict(n_ts=n_ts,
                                                        ts_sz=ts_sz,
                                                        n_classes=n_classes,
                                                        l=0.06,
                                                        r=1)
    shp_smooth = wrap_classification_timer(LearningShapelets(n_shapelets_per_size=shapelet_sizes,
                            optimizer=opt.Adam(.01),
                            batch_size=16,
                            weight_regularizer=.01,
                            max_iter=200,
                            random_state=seed,
                            verbose=0), smooth_X_train, smooth_y_train, smooth_X_test, smooth_y_test)
    # shp_smooth.to_hdf5(path_to_model)




start time: 2022-04-30 14:53:23.887428
              precision    recall  f1-score   support

           0       0.36      1.00      0.53        71
           1       0.00      0.00      0.00        61
           2       0.00      0.00      0.00        66

    accuracy                           0.36       198
   macro avg       0.12      0.33      0.18       198
weighted avg       0.13      0.36      0.19       198

end time: 2022-04-30 14:59:10.185378
time elapsed: 5.77163249651591 minutes.




## KMeans

In [47]:
from tslearn.clustering import TimeSeriesKMeans

# joined dataset
path_to_model = f"./{ml_directory}/km_trained_joined.hdf5"
if exists(path_to_model):
    km_joined = TimeSeriesKMeans.from_hdf5(path_to_model)
    print(classification_report(joined_y_test, km_joined.predict(joined_X_test)))
else:
    km_joined = wrap_classification_timer(TimeSeriesKMeans(n_clusters=3, metric="dtw", n_jobs=4, random_state = seed, max_iter_barycenter=5), joined_X_train, joined_y_train, joined_X_test, joined_y_test)
    km_joined.to_hdf5(path_to_model)


start time: 2022-04-30 14:59:10.283377
              precision    recall  f1-score   support

           0       1.00      0.01      0.03        71
           1       1.00      0.08      0.15        61
           2       0.34      1.00      0.51        66

    accuracy                           0.36       198
   macro avg       0.78      0.37      0.23       198
weighted avg       0.78      0.36      0.23       198

end time: 2022-04-30 15:21:10.238334
time elapsed: 21.99924928744634 minutes.


In [8]:
from tslearn.clustering import TimeSeriesKMeans
# normalized dataset
path_to_model = f"./{ml_directory}/km_trained_normalized.hdf5"
if exists(path_to_model):
    km_normalized = TimeSeriesKMeans.from_hdf5(path_to_model)
    print(classification_report(normalized_y_test, km_normalized.predict(normalized_X_test)))
else:
    km_normalized = wrap_classification_timer(TimeSeriesKMeans(n_clusters=3, metric="dtw", n_jobs=4, random_state = seed, max_iter_barycenter=5), normalized_X_train, normalized_y_train, normalized_X_test, normalized_y_test)
    km_normalized.to_hdf5(path_to_model)


              precision    recall  f1-score   support

           0       0.43      0.31      0.36        71
           1       0.53      0.13      0.21        61
           2       0.44      0.88      0.59        66

    accuracy                           0.44       198
   macro avg       0.47      0.44      0.39       198
weighted avg       0.47      0.44      0.39       198



In [None]:
# smooth dataset
path_to_model = f"./{ml_directory}/km_trained_smooth.hdf5"
if exists(path_to_model):
    km_smooth = TimeSeriesKMeans.from_hdf5(path_to_model)
    print(classification_report(smooth_y_test, km_smooth.predict(smooth_X_test)))
else:
    km_smooth = wrap_classification_timer(TimeSeriesKMeans(n_clusters=3, metric="dtw", n_jobs=4, random_state = seed, max_iter_barycenter=5), smooth_X_train, smooth_y_train, smooth_X_test, smooth_y_test)
    km_smooth.to_hdf5(path_to_model)

# SKLearn experiments

In [3]:
from sklearn.model_selection import train_test_split

udf_types = ['aggregation', 'filtration', 'filtration-aggregation']
number_of_samples = 300
joined_sk_labels, joined_sk_series = read_dataset_sktime_zeros(udf_types, organised_directory, number_of_samples, True, "joined")
normalized_sk_labels, normalized_sk_series = read_dataset_sktime_zeros(udf_types, organised_directory, number_of_samples, True, "normalized")
smooth_sk_labels, smooth_sk_series = read_dataset_sktime_zeros(udf_types, organised_directory, number_of_samples, True, "12_normalized_smooth")

joined_X_train, joined_X_test, joined_y_train, joined_y_test = train_test_split(joined_sk_series, joined_sk_labels, test_size=0.33, random_state=seed)
normalized_X_train, normalized_X_test, normalized_y_train, normalized_y_test = train_test_split(normalized_sk_series, normalized_sk_labels, test_size=0.33, random_state=seed)
smooth_X_train, smooth_X_test, smooth_y_train, smooth_y_test = train_test_split(smooth_sk_series, smooth_sk_labels, test_size=0.33, random_state=seed)

## Rocket

In [11]:
from sktime.classification.kernel_based import RocketClassifier

rocket = wrap_classification_timer(RocketClassifier(rocket_transform="multirocket"), joined_X_train, joined_y_train, joined_X_test, joined_y_test)

start time: 2022-05-03 12:42:42.650865
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        94
           1       1.00      1.00      1.00       117
           2       0.98      0.98      0.98        86

    accuracy                           0.99       297
   macro avg       0.99      0.99      0.99       297
weighted avg       0.99      0.99      0.99       297

end time: 2022-05-03 12:43:58.198869
time elapsed: 1.2591334104537963 minutes.


In [5]:
rocket = wrap_classification_timer(RocketClassifier(rocket_transform="multirocket"), normalized_X_train, normalized_y_train, normalized_X_test, normalized_y_test)

start time: 2022-05-03 12:13:14.146583
              precision    recall  f1-score   support

           0       0.98      0.99      0.98        94
           1       1.00      1.00      1.00       117
           2       0.99      0.98      0.98        86

    accuracy                           0.99       297
   macro avg       0.99      0.99      0.99       297
weighted avg       0.99      0.99      0.99       297

end time: 2022-05-03 12:14:51.254236
time elapsed: 1.6184608777364096 minutes.


In [6]:
rocket = wrap_classification_timer(RocketClassifier(rocket_transform="multirocket"), smooth_X_train, smooth_y_train, smooth_X_test, smooth_y_test)

start time: 2022-05-03 12:14:51.356232
              precision    recall  f1-score   support

           0       0.98      0.99      0.98        94
           1       1.00      1.00      1.00       117
           2       0.99      0.98      0.98        86

    accuracy                           0.99       297
   macro avg       0.99      0.99      0.99       297
weighted avg       0.99      0.99      0.99       297

end time: 2022-05-03 12:16:21.932329
time elapsed: 1.509601620833079 minutes.


## HIVECOTEV2

In [7]:
from sktime.classification.hybrid import HIVECOTEV2

hive = wrap_classification_timer(HIVECOTEV2(time_limit_in_minutes=2), joined_X_train, joined_y_train, joined_X_test, joined_y_test)

start time: 2022-05-03 12:16:22.258326
              precision    recall  f1-score   support

           0       0.97      0.94      0.95        94
           1       1.00      1.00      1.00       117
           2       0.93      0.97      0.95        86

    accuracy                           0.97       297
   macro avg       0.97      0.97      0.97       297
weighted avg       0.97      0.97      0.97       297

end time: 2022-05-03 12:24:47.263344
time elapsed: 8.416750299930573 minutes.


In [8]:
hive = wrap_classification_timer(HIVECOTEV2(time_limit_in_minutes=2), normalized_X_train, normalized_y_train, normalized_X_test, normalized_y_test)

start time: 2022-05-03 12:24:47.439320
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        94
           1       1.00      1.00      1.00       117
           2       0.94      0.98      0.96        86

    accuracy                           0.98       297
   macro avg       0.97      0.97      0.97       297
weighted avg       0.98      0.98      0.98       297

end time: 2022-05-03 12:32:58.088857
time elapsed: 8.177492296695709 minutes.


In [9]:
hive = wrap_classification_timer(HIVECOTEV2(time_limit_in_minutes=2), smooth_X_train, smooth_y_train, smooth_X_test, smooth_y_test)

start time: 2022-05-03 12:32:58.231866
              precision    recall  f1-score   support

           0       0.96      0.97      0.96        94
           1       1.00      1.00      1.00       117
           2       0.96      0.95      0.96        86

    accuracy                           0.98       297
   macro avg       0.97      0.97      0.97       297
weighted avg       0.98      0.98      0.98       297

end time: 2022-05-03 12:41:02.038855
time elapsed: 8.063449827829997 minutes.


## DrCIF

In [16]:
from sklearn.pipeline import Pipeline
from sktime.transformations.panel.compose import ColumnConcatenator
from sktime.classification.interval_based import DrCIF

steps = [
    ("concatenate", ColumnConcatenator()),
    ("classify", DrCIF(n_estimators=10)),
]

cif = wrap_classification_timer(Pipeline(steps), joined_X_train, joined_y_train, joined_X_test, joined_y_test)

start time: 2022-05-03 12:56:14.889016
              precision    recall  f1-score   support

           0       0.96      0.99      0.97        94
           1       1.00      1.00      1.00       117
           2       0.99      0.95      0.97        86

    accuracy                           0.98       297
   macro avg       0.98      0.98      0.98       297
weighted avg       0.98      0.98      0.98       297

end time: 2022-05-03 13:19:53.938897
time elapsed: 23.650831345717112 minutes.


In [17]:
cif = wrap_classification_timer(Pipeline(steps), normalized_X_train, normalized_y_train, normalized_X_test, normalized_y_test)

start time: 2022-05-03 13:21:49.250322
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        94
           1       1.00      1.00      1.00       117
           2       0.97      0.97      0.97        86

    accuracy                           0.98       297
   macro avg       0.98      0.98      0.98       297
weighted avg       0.98      0.98      0.98       297

end time: 2022-05-03 13:40:37.700886
time elapsed: 18.807509406407675 minutes.


In [18]:
cif = wrap_classification_timer(Pipeline(steps), smooth_X_train, smooth_y_train, smooth_X_test, smooth_y_test)

start time: 2022-05-03 13:40:37.800889
              precision    recall  f1-score   support

           0       0.96      0.97      0.96        94
           1       1.00      1.00      1.00       117
           2       0.96      0.95      0.96        86

    accuracy                           0.98       297
   macro avg       0.97      0.97      0.97       297
weighted avg       0.98      0.98      0.98       297

end time: 2022-05-03 13:59:39.859255
time elapsed: 19.03430610895157 minutes.
