In [14]:
import random
import numpy as np
import pandas as pd
from t2f.extraction.extractor import feature_extraction
from t2f.utils.importance_old import feature_selection
from t2f.model.clustering import ClusterWrapper
from t2f.data.dataset import read_ucr_datasets
from t2f.selection.selection import cleaning
from sklearn.metrics import davies_bouldin_score

In [2]:
def normalized_DBS(df_feat_all, y_pred, upper_limit=1000):
    """Rescale the Davies-Bouldin index of a clustering linearly via ``upper_limit``.

    The raw index is computed with ``davies_bouldin_score``, capped at
    ``upper_limit`` and then rescaled linearly, so that a raw index of 0
    maps to 1.0 and a raw index at or above ``upper_limit`` maps to 0.0.
    Both the raw index and the rescaled score are printed.

    Args:
        df_feat_all: Feature matrix, passed unchanged to
            ``davies_bouldin_score``.
        y_pred: Cluster labels, passed unchanged to
            ``davies_bouldin_score``.
        upper_limit: Raw index value treated as the worst case. Must be
            non-zero, otherwise the rescaling divides by zero.

    Returns:
        The rescaled score.
    """
    dbs = davies_bouldin_score(df_feat_all, y_pred)
    print('dbs:', dbs)
    # Cap the raw index so the rescaled score cannot fall below zero.
    dbs = min(dbs, upper_limit)
    # Equal to (dbs - upper_limit) / (0 - upper_limit), but written so that a
    # capped index gives 0.0 instead of the negative zero ("-0.0") that results
    # from dividing 0 by a negative number.
    score = (upper_limit - dbs) / upper_limit
    print('DBS Score: ' + str(score))
    return score

### BasicMotions dataset

In [7]:
# --- Experiment configuration ---
files = [
    'data/BasicMotions/BasicMotions_TRAIN.txt',
    'data/BasicMotions/BasicMotions_TEST.txt',
]
transform_type, model_type = 'minmax', 'Hierarchical'
batch_size, p = 500, 4  # forwarded to feature_extraction in a later cell

# --- Load both files through a single reader call ---
print('Read ucr datasets: ', files)
ts_list, y_true = read_ucr_datasets(paths=files)

# One cluster per distinct ground-truth label
n_clusters = len(set(y_true))

# --- Clustering model ---
model = ClusterWrapper(
    model_type=model_type,
    transform_type=transform_type,
    n_clusters=n_clusters,
)
print('Dataset shape: {}, Num of clusters: {}'.format(ts_list.shape, n_clusters))

Read ucr datasets:  ['data/BasicMotions/BasicMotions_TRAIN.txt', 'data/BasicMotions/BasicMotions_TEST.txt']
Dataset shape: (80, 100, 6), Num of clusters: 4


In [8]:
# Extract features from the loaded series, then pass the result through `cleaning`.
df_all_feats = cleaning(feature_extraction(ts_list, batch_size, p))

Feature Extraction: 100%|██████████| 480/480 [00:16<00:00, 28.51it/s]


In [11]:
# Display the ground-truth labels returned by read_ucr_datasets.
y_true

array(['standing', 'standing', 'standing', 'standing', 'standing',
       'standing', 'standing', 'standing', 'standing', 'standing',
       'running', 'running', 'running', 'running', 'running', 'running',
       'running', 'running', 'running', 'running', 'walking', 'walking',
       'walking', 'walking', 'walking', 'walking', 'walking', 'walking',
       'walking', 'walking', 'badminton', 'badminton', 'badminton',
       'badminton', 'badminton', 'badminton', 'badminton', 'badminton',
       'badminton', 'badminton', 'standing', 'standing', 'standing',
       'standing', 'standing', 'standing', 'standing', 'standing',
       'standing', 'standing', 'running', 'running', 'running', 'running',
       'running', 'running', 'running', 'running', 'running', 'running',
       'walking', 'walking', 'walking', 'walking', 'walking', 'walking',
       'walking', 'walking', 'walking', 'walking', 'badminton',
       'badminton', 'badminton', 'badminton', 'badminton', 'badminton',
       'badmin

In [18]:
def generate_random_clustering(y_true):
    """Build a random labeling with the same length and label set as ``y_true``.

    Each position receives a label drawn independently and uniformly (via
    ``random.choice``) from the distinct values of ``y_true``. Because the
    draws are independent, a label may end up unused in the result.

    Args:
        y_true: Iterable of hashable labels that also supports ``len()``.

    Returns:
        A list of ``len(y_true)`` labels; empty when ``y_true`` is empty.
    """
    # Build the candidate list once rather than on every draw. Sorting gives
    # it a stable order: set iteration order for strings can change between
    # interpreter runs (hash randomisation), which would make the output
    # differ even after random.seed(). Sorting by str() also keeps
    # mixed-type label sets from raising.
    candidates = sorted(set(y_true), key=str)
    return [random.choice(candidates) for _ in range(len(y_true))]

In [33]:
# Reference value: Davies-Bouldin score of the extracted features under the true labels.
davies_bouldin_score(df_all_feats, y_true)

1.5838634359414987

In [34]:
# Baseline: score ten independently generated random labelings of the same features.
for run in range(10):
    random_labels = generate_random_clustering(y_true)
    random_dbs = davies_bouldin_score(df_all_feats, random_labels)
    print(f'Run {run}, Score: {random_dbs}')

Run 0, Score: 12.237839780553248
Run 1, Score: 12.948745818312767
Run 2, Score: 15.00987677152331
Run 3, Score: 15.183736389390841
Run 4, Score: 11.041594190952416
Run 5, Score: 18.333238847791407
Run 6, Score: 9.851369947311088
Run 7, Score: 13.715977348430304
Run 8, Score: 9.993833338780002
Run 9, Score: 16.96738899613105


In [44]:
"""
Strange result!!!
['standing', 'running', 'badminton'] + ['walking', 'walking', ..., 'walking'] has better score than y_true
"""
# Degenerate labeling: the first three samples each form their own cluster and
# every remaining sample is labelled 'walking'.
# NOTE(review): presumably the score is low because single-sample clusters
# contribute no intra-cluster scatter - TODO confirm against the metric's definition.
degenerate_labels = ['standing', 'running', 'badminton'] + ['walking'] * (len(y_true) - 3)
davies_bouldin_score(df_all_feats, degenerate_labels)

1.0177786554000607

### Random dataset

In [45]:
# Synthetic input: 10 multivariate time series with 100 timestamps and 3 signals each
arr = np.random.randn(10, 100, 3)
arr[5:] *= 100  # scale the last five series by a factor of 100

# Clustering setup
n_clusters = 2  # Number of clusters
transform_type = 'std'  # preprocessing step
model_type = 'KMeans'  # clustering model
labels = {}  # unsupervised mode
# labels = {0: 'a', 1: 'a', 5: 'b', 6: 'b'}  # semi-supervised mode

# Feature extraction
df_feats = feature_extraction(arr, batch_size=100, p=1)

# Feature selection: keep only the columns returned by feature_selection
context = dict(model_type=model_type, transform_type=transform_type)
top_feats = feature_selection(df_feats, labels=labels, context=context)
df_feats = df_feats[top_feats]

# Clustering
model = ClusterWrapper(
    model_type=model_type,
    transform_type=transform_type,
    n_clusters=n_clusters,
)
y_pred = model.fit_predict(df_feats)
print(y_pred.shape)

Feature Extraction: 100%|██████████| 30/30 [00:01<00:00, 27.94it/s]


(10,)


In [48]:
# Display the cluster assignment returned by model.fit_predict.
y_pred

array([0, 0, 0, 0, 0, 1, 0, 1, 1, 0], dtype=int32)

In [47]:
# Davies-Bouldin score of the selected features under the predicted clustering.
davies_bouldin_score(df_feats, y_pred)

0.8327696039617392

In [50]:
# Baseline: score ten random two-label assignments of the selected features.
# NOTE(review): a draw in which every sample gets the same label leaves a single
# cluster - confirm how davies_bouldin_score behaves in that case.
for run in range(10):
    coin_flips = [random.choice([0, 1]) for _ in range(len(y_pred))]
    random_dbs = davies_bouldin_score(df_feats, coin_flips)
    print(f'Run {run}, Score: {random_dbs}')

Run 0, Score: 3.7585739114298526
Run 1, Score: 1.317943105557214
Run 2, Score: 1.764421594459349
Run 3, Score: 1.4301841744963895
Run 4, Score: 3.198784541249933
Run 5, Score: 3.387657952044123
Run 6, Score: 3.163525846535205
Run 7, Score: 4.44138652360758
Run 8, Score: 1.7006740806280842
Run 9, Score: 1.4749183171406246


In [53]:
# First sample alone under label 0, every remaining sample under label 1.
davies_bouldin_score(df_feats, [0] + [1] * (len(y_pred) - 1))

1.2962572569037427