# Functions sanity checks

Playground for sanity-checking the behavior of the main functions.

In [21]:
import numpy as np
from tqdm import tqdm
import random

from utils.quality_measures import *
from utils.shapelets_transform import *

## Shapelet transform

#### Distance between a shapelet and a time series

In [2]:
def test_calculate_distance(shapelet, time_series):
    """Print the distance between a shapelet and a time series.

    The distance is obtained from ``calculate_distance`` and is only
    printed, not checked, so the result has to be inspected by eye.
    """
    distance = calculate_distance(shapelet, time_series)
    message = f"Calculated distance between {shapelet} and {time_series} : {distance}"
    print(message)

# Run the test twice against the same series: once with a shapelet that
# occurs verbatim in it ([1, 2, 3]) and once with one that does not ([1, 2, 2]).
time_series = np.array([0, 1, 2, 3, 4, 5, 6], dtype=float)
for shapelet in (
    np.array([1, 2, 3], dtype=float),
    np.array([1, 2, 2], dtype=float),
):
    test_calculate_distance(shapelet, time_series)

Calculated distance between [1. 2. 3.] and [0. 1. 2. 3. 4. 5. 6.] : 0.0
Calculated distance between [1. 2. 2.] and [0. 1. 2. 3. 4. 5. 6.] : 0.13874338222577245


#### Set of distances

In [3]:
def test_calculate_distances_for_set():
    """Check ``calculate_distances_for_set`` against hand-computed distances.

    The shapelet [1, 2, 3] occurs verbatim in the first two series, so the
    expected distance for both is 0. For the third series the expected value
    is the Euclidean distance between the norm-scaled [1, 2, 3] and the
    norm-scaled [2, 3, 4].

    Both vectors are printed for visual inspection and then compared.

    Raises:
        AssertionError: if the calculated distances differ from the expected
            ones beyond floating-point tolerance.
    """
    # Define a shapelet and a set of time series
    shapelet = np.array([1, 2, 3], dtype=float)
    time_series_set = np.array([
        [0, 1, 2, 3, 4, 5, 6],
        [1, 2, 3, 4, 5, 6, 7],
        [2, 3, 4, 5, 6, 7, 8],
    ], dtype=float)

    # Expected distances: scale both vectors to unit norm before comparing them
    unit_shapelet = shapelet / np.linalg.norm(shapelet)
    window = np.array([2., 3., 4.])
    unit_window = window / np.linalg.norm(window)
    expected_distances = np.array(
        [0.0, 0.0, np.linalg.norm(unit_shapelet - unit_window)]
    )

    # Calculate the distances
    distances = calculate_distances_for_set(shapelet, time_series_set)

    # Show both vectors first so they stay visible even if the check fails
    print("Expected distances:", expected_distances)
    print("Calculated distances:", distances)

    # Actually enforce the comparison; atol covers the entries expected to be 0
    np.testing.assert_allclose(distances, expected_distances, rtol=1e-6, atol=1e-8)

# Run the sanity check; it prints the expected and the calculated distances.
test_calculate_distances_for_set()

Expected distances: [0.         0.         0.12179217]
Calculated distances: [0.         0.         0.12179217]


#### Generate candidates

In [4]:
# Generate all length-3 candidates from a short integer series.
# In the recorded output each candidate is a [subsequence, start_index] pair.
time_series_example = np.array([1, 2, 3, 4, 5, 6])
subsequence_length = 3

candidates = generate_candidates(time_series_example, subsequence_length)
print("Generated candidates:\n", candidates)

Generated candidates:
 [[array([1, 2, 3]), 0], [array([2, 3, 4]), 1], [array([3, 4, 5]), 2], [array([4, 5, 6]), 3]]


#### Remove self-similar

In [5]:
# Candidate shapelets, listed in decreasing quality order.
# NOTE(review): the overlap remarks assume `index` is the start position of the
# shapelet inside one common series — confirm against remove_self_similar.
shapelets = [
    # (shapelet, quality, index)
    ([7, 8, 9], 0.98, 6),     # Highest quality, no overlap
    ([1, 2, 3, 4], 0.95, 0),  # High quality, early in the series
    ([11, 12, 13], 0.94, 10), # High quality, no overlap
    ([2, 3, 4, 5], 0.90, 1),  # Overlaps with the second, lower quality
    ([3, 4, 5, 6], 0.85, 2),  # Overlaps with the second and fourth, lower quality
    ([5, 6], 0.84, 4),        # Overlaps only with the fourth and fifth, which the recorded output drops
    ([8, 9, 10], 0.83, 7),    # Overlaps with the first, lowest quality
]

non_similar_shapelets = remove_self_similar(shapelets)
print("Non-similar shapelets:", non_similar_shapelets)

Non-similar shapelets: [([7, 8, 9], 0.98, 6), ([1, 2, 3, 4], 0.95, 0), ([11, 12, 13], 0.94, 10), ([5, 6], 0.84, 4)]


#### Merge

In [6]:
# Merge two lists of (shapelet, quality) pairs with k = 2.
# NOTE(review): the recorded output keeps the two highest-quality entries
# across both inputs (0.95 and 0.93) — confirm that this is merge's contract.
k = 2
k_shapelets = np.array([([3, 2, 1], 0.95), ([4, 2, 7], 0.90)], dtype=object)
x_shapelet = np.array([([2, -6, -2], 0.93), ([8, -1, 3], 0.85)], dtype=object)

merged = merge(k, k_shapelets, x_shapelet)
print("Merged shapelets:", merged)

Merged shapelets: [array([list([3, 2, 1]), 0.95], dtype=object), array([list([2, -6, -2]), 0.93], dtype=object)]


#### Shapelet Cached Selection

In [7]:
# Synthetic dataset: five series of length 7, two labelled 0 and three labelled 1.
x_T = np.array([
    [-7.9, 1.9, 15, 0.1, 0, 0, 1],
    [0, -0.1, -7.1, 2.1, 15.1, -6, 1],
    [-0.1, 4, 5.1, 4, 0, 0, 3],
    [0, 0.2, 0, 4, 5, 4, -2],
    [3.9, 5, 4, 0, 0.3, -5, 5],
])
x_labels = np.array([0, 0, 1, 1, 1], dtype=float)

# Select k shapelets of length min_length..max_length, scored with the F-statistic
min_length = 3
max_length = 4
k = 5
selected_shapelets = shapelet_cached_selection(
    x_T, x_labels, min_length, max_length, k, compute_f_stat
)
print("Selected shapelets:", selected_shapelets)

# Keep only the first field of each result (the shapelet values) for the later cells
x_shapelets = [entry[0] for entry in selected_shapelets]

Selected shapelets: [(array([4., 5., 4.]), 30.770944907268657, 3), (array([4. , 5.1, 4. ]), 29.931165214435683, 1), (array([3.9, 5. , 4. ]), 29.89565264226702, 0), (array([-7.1,  2.1, 15.1, -6. ]), 18.496007683660658, 2), (array([-7.9,  1.9, 15. ,  0.1]), 6.686056844285656, 0)]


#### Estimate min/max

In [8]:
# Mock dataset for testing
# Mock dataset for testing: 50 random series of length 20 with random binary labels.
# No seed is set in this cell, so the estimated lengths can change between runs.
# NOTE: this rebinds x_labels, which previously held the synthetic-dataset labels.
T = np.random.rand(50, 20)
x_labels = np.random.randint(0, 2, size=50)

# Estimate the shapelet length range, scoring candidates with compute_ig
min_length, max_length = estimate_min_and_max(T, x_labels, compute_ig)
print("Estimated Min Length:", min_length)
print("Estimated Max Length:", max_length)

100%|██████████| 10/10 [00:04<00:00,  2.01it/s]

Estimated Min Length: 6
Estimated Max Length: 9





#### Distance between two shapelets

In [9]:
# Compare dS on a pair of near-identical shapelets and on a pair of clearly different ones
print(f"Similar shapelets ({x_shapelets[0]} and {x_shapelets[1]}):", dS(x_shapelets[0], x_shapelets[1]))
print(f"Dissimilar shapelets ({x_shapelets[0]} and {x_shapelets[3]}):", dS(x_shapelets[0], x_shapelets[3]))

Similar shapelets ([4. 5. 4.] and [4.  5.1 4. ] : 0.009837650343284144
Dissimilar shapelets ([4. 5. 4.] and [-7.1  2.1 15.1 -6. ]): 1.0156143485330422


#### Clustering

In [10]:
# Six 2-point shapelets: three multiples of [1, 2] and three multiples of [1, -1].
# The recorded output groups them into exactly these two families.
shapelets = [[1., 2.], [2., 4.], [3., 6.], [1., -1.], [2., -2.], [3., -3.]]
noClusters = 2
clusters = cluster_shapelets(shapelets, noClusters)
print("Clusters:", clusters)

Clusters: [{(2.0, 4.0), (1.0, 2.0), (3.0, 6.0)}, {(2.0, -2.0), (3.0, -3.0), (1.0, -1.0)}]


In [11]:
# Cluster the shapelets selected from the synthetic dataset (x_shapelets),
# reusing noClusters from the previous cell.
clusters = cluster_shapelets(x_shapelets, noClusters)
print("Clusters:", clusters)

Clusters: [{(4.0, 5.1, 4.0), (3.9, 5.0, 4.0), (4.0, 5.0, 4.0)}, {(-7.9, 1.9, 15.0, 0.1), (-7.1, 2.1, 15.1, -6.0)}]


#### HAC clustering

In [12]:
# Example usage: six 2-point shapelets (multiples of [1, 2] and of [1, -1]),
# interleaved and passed as a NumPy array.
shapelets = np.array([[1., 2.], [1., -1.], [2., 4.], [3., 6.], [2., -2.], [3., -3.]])
noClusters = 2
clusters = hac_cluster_shapelets(shapelets, noClusters)
print("Clusters:", clusters)

Clusters: [{(2.0, -2.0), (3.0, -3.0), (1.0, -1.0)}, {(2.0, 4.0), (1.0, 2.0), (3.0, 6.0)}]


In [13]:
# HAC clustering of the shapelets selected from the synthetic dataset (x_shapelets),
# reusing noClusters from the previous cell.
clusters = hac_cluster_shapelets(x_shapelets, noClusters)
print("Clusters:", clusters)

Clusters: [{(-7.9, 1.9, 15.0, 0.1), (-7.1, 2.1, 15.1, -6.0)}, {(4.0, 5.1, 4.0), (3.9, 5.0, 4.0), (4.0, 5.0, 4.0)}]


#### Shapelets transform -- without clusters

In [14]:
# Transform the synthetic series with the selected shapelets.
# In the recorded output, rows follow the series in x_T and columns follow x_shapelets
# (e.g. row 3 / column 0 is 0 because x_T[3] contains [4, 5, 4] verbatim).
print(shapelets_transform(x_shapelets, x_T))

[[0.73880678 0.72972586 0.73416926 0.34976991 0.        ]
 [1.01561435 1.00783884 1.01395734 0.         0.34976991]
 [0.00983765 0.         0.0100921  1.04236881 0.82902668]
 [0.         0.00983765 0.0113124  1.05419707 0.84329815]
 [0.0113124  0.0100921  0.         1.16380968 1.18110255]]


#### Shapelets transform -- with clusters

In [15]:
# Cluster the synthetic-dataset shapelets, then turn every cluster into a list
# before handing the clusters to the transform
clusters = cluster_shapelets(x_shapelets, noClusters)
list_cluster = [list(members) for members in clusters]

shapelets_cluster_transform(list_cluster, x_T)

array([[0.73423397, 0.17488495],
       [1.01247017, 0.17488495],
       [0.00664325, 0.93569774],
       [0.00705002, 0.94874761],
       [0.00713483, 1.17245612]])

## Quality measures

#### F-stat

In [16]:
def test_compute_f_stat():
    """Print the F-statistic for a well-separated and an interleaved labelling.

    Both datasets use the distances 1..10. In the first, labels 0 and 1 split
    the distances into a lower and an upper half; in the second, the labels
    alternate. The two statistics are printed for comparison, not asserted.
    """
    # First dataset: labels [0]*5 + [1]*5 over distances 1..10
    f_stat_1 = compute_f_stat(np.arange(1, 11), np.repeat([0, 1], 5))

    # Second dataset: same distances, labels alternating 0, 1, 0, 1, ...
    f_stat_2 = compute_f_stat(np.arange(1, 11), np.tile([0, 1], 5))

    # Report both values together with the expected ordering
    for line in (
        ("Computed F-statistic for the first dataset:", f_stat_1),
        ("Computed F-statistic for the second dataset:", f_stat_2),
        ("We expect the second one to be lower than the first one",),
    ):
        print(*line)
    print("We expect the second one to be lower than the first one")

# Run the sanity check; it prints both F-statistics and the expected ordering.
test_compute_f_stat()

Computed F-statistic for the first dataset: 5.0
Computed F-statistic for the second dataset: 0.05
We expect the second one to be lower than the first one


#### Entropy

In [17]:
# Three label vectors of increasing diversity: a single class, four classes
# with one of them repeated, and five distinct classes.
# The recorded value for the last one (1.609...) equals ln(5).
x_labels_1 = np.array([0, 0, 0, 0, 0])
x_labels_2 = np.array([0, 1, 3, 2, 0])
x_labels_3 = np.array([0, 1, 2, 3, 4])
print(f"Entropy of {x_labels_1} : {compute_entropy(x_labels_1)}")
print(f"Entropy of {x_labels_2} : {compute_entropy(x_labels_2)}")
print(f"Entropy of {x_labels_3} : {compute_entropy(x_labels_3)}")

Entropy of [0 0 0 0 0] : -0.0
Entropy of [0 1 3 2 0] : 1.3321790402101223
Entropy of [0 1 2 3 4] : 1.6094379124341005


#### Information gain


In [18]:
# Two toy (distance, label) datasets.
# First: every class-0 distance (0.1, 0.3, 0.2) lies below the class-1 ones (0.4, 0.7).
x_D_1 = np.array([0.1, 0.4, 0.3, 0.7, 0.2])
x_label_1 = np.array([0, 1, 0, 1, 0])

# Second: the class-0 distances (0.15, 0.05) surround the class-1 ones (0.1, 0.11, 0.12).
x_D_2 = np.array([0.1, 0.15, 0.05, 0.11, 0.12])
x_label_2 = np.array([1, 0, 0, 1, 1])

# The name information_gain is reused; each value is printed right after it is computed
information_gain = compute_ig(x_D_1, x_label_1)
print("Information Gain:", information_gain)
information_gain = compute_ig(x_D_2, x_label_2)
print("Information Gain:", information_gain)

Information Gain: 0.6730116670092565
Information Gain: 0.22314355131420988


#### Kruskal-Wallis

In [19]:
# Two toy (distance, label) datasets.
# First: every class-0 distance (0.1, 0.3, 0.2) lies below the class-1 ones (0.4, 0.7).
x_D_1 = np.array([0.1, 0.4, 0.3, 0.7, 0.2])
x_label_1 = np.array([0, 1, 0, 1, 0])

# Second: the class-0 distances (0.15, 0.05) surround the class-1 ones (0.1, 0.11, 0.12).
x_D_2 = np.array([0.1, 0.15, 0.05, 0.11, 0.12])
x_label_2 = np.array([1, 0, 0, 1, 1])

# Statistic for the first dataset
kw_statistic = compute_kruskal_wallis_test(x_D_1, x_label_1)
print("Kruskal-Wallis Statistic:", kw_statistic)

# Statistic for the second dataset (kw_statistic is rebound)
kw_statistic = compute_kruskal_wallis_test(x_D_2, x_label_2)
print("Kruskal-Wallis Statistic:", kw_statistic)

Kruskal-Wallis Statistic: 3.0
Kruskal-Wallis Statistic: 0.3333333333333357


#### Mood's median

In [20]:
# Two toy (distance, label) datasets.
# First: every class-0 distance (0.1, 0.3, 0.2) lies below the class-1 ones (0.4, 0.7).
x_D_1 = np.array([0.1, 0.4, 0.3, 0.7, 0.2])
x_label_1 = np.array([0, 1, 0, 1, 0])

# Second: the class-0 distances (0.15, 0.05) surround the class-1 ones (0.1, 0.11, 0.12).
x_D_2 = np.array([0.1, 0.15, 0.05, 0.11, 0.12])
x_label_2 = np.array([1, 0, 0, 1, 1])

# Named after the test it holds, so it cannot be mistaken for a Kruskal-Wallis result
mood_statistic = compute_mood_median_test(x_D_1, x_label_1)
print("Mood's median Statistic:", mood_statistic)

mood_statistic = compute_mood_median_test(x_D_2, x_label_2)
print("Mood's median Statistic:", mood_statistic)

Mood's median Statistic: 2.333333333333333
Mood's median Statistic: 0.3333333333333333
