In [1]:
# Uncomment below if running in Colab
# !pip install tsfresh
import os
import pandas as pd
import matplotlib.pylab as plt
%matplotlib inline

# using tsfresh to extract and filter features
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters

# for evaluation
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Preparing data 

In [4]:
# Comment out if running in Colab
# work_dir = os.getcwd()
data_dir ="../../for_students/data_v2"
train_dir = os.path.join(data_dir, 'training')
train_str = os.path.join(train_dir, 'training_{}.csv')

Unnamed: 0,timestamp,kpi_value,anomaly_label
0,2014-04-01 00:00:00,18.622185,0
1,2014-04-01 00:05:00,8.163417,0
2,2014-04-01 00:10:00,13.292383,0
3,2014-04-01 00:15:00,11.667046,0
4,2014-04-01 00:20:00,12.940358,0
...,...,...,...
4027,2014-04-14 23:35:00,10.689582,0
4028,2014-04-14 23:40:00,9.507657,0
4029,2014-04-14 23:45:00,17.339780,0
4030,2014-04-14 23:50:00,14.781273,0


In [None]:
# Uncomment below if running in Colab
# work_dir = os.getcwd()
# train_str = os.path.join(work_dir, 'training_{}.csv')

## Testing on a single dataset
Use `roll_time_series(...., max_timeshift=100, min_timeshift=10)`
https://tsfresh.readthedocs.io/en/v0.7.1/text/rolling.html

In [None]:
from tsfresh.utilities.dataframe_functions import roll_time_series, make_forecasting_frame
test_2 = pd.read_csv(train_str.format(2))

# tsfresh needs (id, time info (for sorting), and features) to extract features
test_2["id"] = 1 # This number does not matter in our case. Keep them constant across dataset

test_2_rolled = roll_time_series(test_2, column_id="id", column_sort="timestamp", max_timeshift=100, min_timeshift=10)

print(test_2_rolled.shape)
print(test_2_rolled.head(15))

## Testing on all datasets (extracting toooooooo slow)

In [21]:
# Extracting features from ALL raw dataset do not work! 

# train_df = pd.DataFrame()
# for i in [1,2,3,4,5,100]:
#     train_df_i = pd.read_csv(train_str.format(i))
#     # each training dataset is assigned an id
#     train_df_i["id"] = i
# #     print("dataset {}: {}".format(i, train_df_i.shape))
#     train_df = train_df.append(train_df_i)
# print(train_df.shape)

(54337, 5)


In [25]:
# tsfresh needs (id, time info (for sorting), and features) to extract features
# train_df = train_df.drop("request_count", axis=1)
# train_df

Unnamed: 0,timestamp,kpi_value,anomaly_label,id
0,2014-04-01 00:00:00,18.622185,0,1
1,2014-04-01 00:05:00,8.163417,0,1
2,2014-04-01 00:10:00,13.292383,0,1
3,2014-04-01 00:15:00,11.667046,0,1
4,2014-04-01 00:20:00,12.940358,0,1
...,...,...,...,...
20154,2020-08-28T01:56:00.000+02:00,0.998149,0,100
20155,2020-08-28T01:57:00.000+02:00,0.998340,0,100
20156,2020-08-28T01:58:00.000+02:00,0.998364,0,100
20157,2020-08-28T01:59:00.000+02:00,0.998428,0,100


In [26]:
# y = train_df.pop("anomaly_label")
# X_raw = train_df # preparing dataframe with (id, time, feature(s)) for tsfresh
# print(X_raw)
# print(y)

                           timestamp  kpi_value   id
0                2014-04-01 00:00:00  18.622185    1
1                2014-04-01 00:05:00   8.163417    1
2                2014-04-01 00:10:00  13.292383    1
3                2014-04-01 00:15:00  11.667046    1
4                2014-04-01 00:20:00  12.940358    1
...                              ...        ...  ...
20154  2020-08-28T01:56:00.000+02:00   0.998149  100
20155  2020-08-28T01:57:00.000+02:00   0.998340  100
20156  2020-08-28T01:58:00.000+02:00   0.998364  100
20157  2020-08-28T01:59:00.000+02:00   0.998428  100
20158  2020-08-28T02:00:00.000+02:00   0.997407  100

[54337 rows x 3 columns]
0        0
1        0
2        0
3        0
4        0
        ..
20154    0
20155    0
20156    0
20157    0
20158    0
Name: anomaly_label, Length: 54337, dtype: int64


# Feature extraction

In [None]:
# Using rolling time-series as input with windows ranging from 10-100, see `roll_time_series()`
test_2_x = test_2_rolled.drop("anomaly_label", axis=1)
print(test_2_x.shape)

In [None]:
# https://tsfresh.readthedocs.io/en/latest/text/feature_extraction_settings.html#for-the-advanced-how-do-i-set-the-parameters-for-all-kind-of-time-series

extraction_settings = ComprehensiveFCParameters()

X_2 = extract_features(test_2_x, column_id="id", column_sort="timestamp", 
                     default_fc_parameters=extraction_settings, impute_function=impute)
extraction_settings = ComprehensiveFCParameters()

# the return of X_1 has the dimension of raw feature (unrolled) - 10 (the smallest window)
print(X_2.shape)

Feature Extraction:  50%|████████████████████████████████                                | 3/6 [00:14<00:20,  6.80s/it]

In [11]:
X_1.head() # All features extracted

Unnamed: 0,anomaly_label__variance_larger_than_standard_deviation,anomaly_label__has_duplicate_max,anomaly_label__has_duplicate_min,anomaly_label__has_duplicate,anomaly_label__sum_values,anomaly_label__abs_energy,anomaly_label__mean_abs_change,anomaly_label__mean_change,anomaly_label__mean_second_derivative_central,anomaly_label__median,...,kpi_value__fourier_entropy__bins_2,kpi_value__fourier_entropy__bins_3,kpi_value__fourier_entropy__bins_5,kpi_value__fourier_entropy__bins_10,kpi_value__fourier_entropy__bins_100,kpi_value__permutation_entropy__dimension_3__tau_1,kpi_value__permutation_entropy__dimension_4__tau_1,kpi_value__permutation_entropy__dimension_5__tau_1,kpi_value__permutation_entropy__dimension_6__tau_1,kpi_value__permutation_entropy__dimension_7__tau_1
1,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.468798,0.772515,1.024933,1.701631,3.691585,1.791293,3.176102,4.77284,6.482125,7.796024


# Feature selection

In [None]:
# Using unrolled y, drop the first 10 rows (as the first 10 rows of feature collapsed as a feature previously)
test_2_y = test_2["anomaly_label"].loc[10:]
print(test_2_y.head(5))
print(test_2_y.shape)
print(type(test_2_y))

In [None]:
X_filtered = select_features(X_2, test_2_y) # features will be selected based on y
print(X_filtered.shape)

In [None]:
X_filtered.head()

# Alternatively "Extract & Filter" in One Step
(*I'm not sure if this works with rolling)

In [None]:
X_filtered_2 = extract_relevant_features(df, y, column_id='id', column_sort='time',
                                         default_fc_parameters=extraction_settings)

In [None]:
(X_filtered.columns == X_filtered_2.columns).all()

# Evaluation

In [None]:
# Evaluate how these features perform in classifiers such as random forest, DTree, KNN ect.
# https://github.com/blue-yonder/tsfresh/tree/main/notebooks/examples