# Experiment on Different Time-series Similarity Measures 1

### Loading

In [1]:
# load libraries and data
%matplotlib inline
from matplotlib import cm
import matplotlib.pyplot as plt
import numpy as np
from itertools import groupby
from scipy import signal
from sklearn import preprocessing
import pandas as pd

plt.rcParams['figure.figsize'] = [10, 5]
plt.rcParams['figure.dpi'] = 150 # 200 e.g. is really fine, but slower

root_path = './processed_datasets/'
datasets = ['has', 'sp', 'fp', 'rb', 'sd', 'sr', 'hasb', 'ihas']
# Upper bound on how many sketches are kept per dataset (was a bare 100 in the slice below).
max_sketches_per_dataset = 100

# Load, for every dataset name, the original series (X), its label (y) and the
# hand-drawn sketches. The three lists are index-aligned with `datasets`.
ori_data_X = []
ori_data_y = []
sketch_X = []
for dataset in datasets:
    file_name = root_path + 'original_' + dataset
    ori_data_X.append(np.load(file_name + '_X' + '.npy'))
    # presumably the [start, end] position of the target segment -- verify against the data files
    ori_data_y.append(np.load(file_name + '_y' + '.npy'))
    file_name = root_path + 'sketch_' + dataset + '.npy'
    # NOTE(security): allow_pickle=True makes np.load unpickle object arrays, which can
    # execute arbitrary code. Only load sketch files that come from a trusted source.
    sketch_X.append(np.load(file_name, allow_pickle=True)[:max_sketches_per_dataset])
print(f"number of loaded samples per class: {[len(x) for x in sketch_X]}")
print(f"Original data: {len(ori_data_X)} datasets")
print(f"Sketch data: {len(sketch_X)} datasets")


number of loaded samples per class: [100, 100, 100, 100, 100, 100, 100, 100]
Original data: 8 datasets
Sketch data: 8 datasets


### Sliding Window

In [2]:
def sliding_window(ori_series, clip_series, measure):
    """
    Slide the clip over the original series and score every alignment.

    input:
        ori_series:  array-like with a `.shape`; the series that is searched
        clip_series: array-like with a `.shape`; the pattern that is looked for
        measure:     callable(window, clip) -> score; the smallest score is
                     treated as the best match
    output:
        (dist, [start, end]) where `dist` holds one score per start offset and
        `[start, end]` are the inclusive bounds of the lowest-scoring window.
        Returns a bare None (not a pair) when the original is shorter than the clip.
    """
    window = clip_series.shape[0]
    n_offsets = ori_series.shape[0] - window + 1
    # No offset fits when the clip is longer than the series.
    if n_offsets < 1:
        return None

    # One score per possible start position of the clip.
    dist = [measure(ori_series[start:start + window], clip_series)
            for start in range(n_offsets)]

    # The best match is the window with the smallest score.
    best_start = np.argmin(dist)
    best_end = best_start + window - 1
    return dist, [best_start, best_end]


In [3]:
def pointwise_exp(ori_data_X, ori_data_y, sketch_X, measure):
    """
    Match every sketch against its dataset's original series with a sliding window.

    input:
        ori_data_X: sequence of original series, one per dataset
        ori_data_y: sequence of labels, one per dataset; label[0] and label[1] are
                    used as the bounds of the target segment, so each sketch is
                    resampled to label[1] - label[0] + 1 points
        sketch_X:   sequence of sketch collections, one per dataset
        measure:    callable(window, clip) passed through to sliding_window
    output:
        list (one entry per dataset) of lists (one entry per sketch) of
        [similarity_distribution, [start, end]] as returned by sliding_window
    raises:
        ValueError if a resampled sketch is longer than its original series

    The three inputs are walked in parallel, so the function depends only on its
    arguments and not on the module-level `datasets` list.
    """
    results = []
    for idx, (original, label, sketches) in enumerate(zip(ori_data_X, ori_data_y, sketch_X)):
        # The target length is the same for every sketch of this dataset.
        clip_len = label[1] - label[0] + 1
        records = []
        for sample in sketches:
            clip = signal.resample(sample, clip_len)
            match = sliding_window(original, clip, measure)
            # sliding_window returns None when the clip does not fit; fail with a
            # clear message instead of an opaque unpacking TypeError.
            if match is None:
                raise ValueError(
                    f"dataset #{idx}: clip of length {clip_len} is longer than the "
                    f"original series of length {original.shape[0]}"
                )
            sim_dist, pred_loc = match
            records.append([sim_dist, pred_loc])
        results.append(records)
    return results

### Define Metrics

#### Euclidean Distance

In [4]:
def euclidean_distance(x,y):
    """
    Euclidean distance between two series after each one has been rescaled
    with sklearn's `preprocessing.Normalizer` (default settings).
    """
    scaler = preprocessing.Normalizer()
    # Normalizer works row-wise on 2-D input, so each series becomes a single row.
    unit_x, unit_y = (scaler.transform(np.expand_dims(series, axis=0)) for series in (x, y))
    return np.linalg.norm(unit_x - unit_y)

#### DTW
Based on tslearn: https://github.com/tslearn-team/tslearn/

In [5]:
from tslearn.metrics import dtw, cdist_dtw
def dtw_distance(x,y):
    """
    DTW distance (tslearn `dtw`) between two series after each one has been
    rescaled with sklearn's `preprocessing.Normalizer` (default settings).
    """
    scaler = preprocessing.Normalizer()
    # Normalizer works row-wise on 2-D input: lift each series to one row,
    # normalise it, then take that row back out as a 1-D series for dtw.
    unit_x, unit_y = (scaler.transform(np.expand_dims(series, axis=0))[0] for series in (x, y))
    return dtw(unit_x, unit_y)

### MASS Algorithm (to be examined if time permits)
https://github.com/matrix-profile-foundation/mass-ts

### Qetch Algorithm

In [7]:
# Qetch Algorithm -- incomplete --

def width(series):
    """
    Placeholder: should return the width of `series`.

    TODO: not implemented yet -- currently always returns None, so any
    arithmetic on the result fails.
    """
    return None
    
def height(series):
    """
    Placeholder: should return the height of `series`.

    TODO: not implemented yet -- currently always returns None, so any
    arithmetic on the result fails.
    """
    return None

def get_LDE(sketch_split,Candidate_split,Gx,Gy):
    """
    Local distortion error of one candidate segment relative to one sketch segment.

    The candidate/sketch width ratio is divided by the global factor Gx and the
    height ratio by Gy; the result is the sum of the squared natural logs of
    these two corrected ratios.
    """
    log_rx = np.log(width(Candidate_split) / (Gx * width(sketch_split)))
    log_ry = np.log(height(Candidate_split) / (Gy * height(sketch_split)))
    return (log_rx ** 2) + (log_ry ** 2)

def get_ShapeError(Sketch,Candidate,Gx,Gy,k):
    """
    Placeholder for the shape-error term between a sketch and a candidate.

    TODO: not implemented yet -- currently always returns None regardless of
    its arguments.
    """
    return None
def calculatDistance(Sketch, Candidate,k):
    """
    Total distance between a sketch and a candidate:
    the shape error plus the local distortion errors summed over k segments.

    input:
        Sketch, Candidate: array-likes accepted by np.split; each length must be
                           divisible by k (np.split raises ValueError otherwise)
        k: number of equal segments both series are cut into
    output:
        LDE + SE

    NOTE: width, height and get_ShapeError are still stubs in this notebook, so
    this function cannot produce a value until they are implemented.
    """
    # Calculating global non-uniform scaling factors
    Gx = width(Candidate)/width(Sketch)
    Gy = height(Candidate)/height(Sketch)
    # Calculating shape error (get_ShapeError takes k as its fifth argument)
    SE = get_ShapeError(Sketch,Candidate,Gx,Gy,k)
    sketch_split = np.split(Sketch,k)
    Candidate_split = np.split(Candidate,k)
    # Calculating local distortion errors, segment by segment. The segments
    # (not single points of the unsplit series) are compared, and get_LDE takes
    # exactly four arguments.
    LDE = 0
    for sketch_part, candidate_part in zip(sketch_split, Candidate_split):
        LDE += get_LDE(sketch_part,candidate_part,Gx,Gy)

    # Calculating total error
    Dist = LDE + SE
    return Dist


### Experiment 1: Matching Performance

#### Analysis

In [8]:
def analyze_results(results, plot = True, ori_data_X=ori_data_X, ori_data_Y=ori_data_y, datasets=datasets):
    """
    Summarise matching results per dataset, display the tables and optionally plot.

    input:
        results:    list (per dataset) of lists (per sketch) of
                    [distance_distribution, [pred_start, pred_end]]
        plot:       when True, draw one subplot per dataset showing the first
                    sketch's distance curve with predicted and ground-truth bounds
        ori_data_X: original series per dataset (only their length is used)
        ori_data_Y: labels per dataset; element 0 is compared with the predicted start
        datasets:   dataset names, index-aligned with `results`
    output:
        DataFrame with one row per dataset (also shown via display, together with
        a describe() of the two numeric summary columns)
    """
    d = {'Dataset':[], 'Number of samples':[], 'Average distance':[], 'Average location error (%)':[]}

    for i, dataset_results in enumerate(results):
        series_len = ori_data_X[i].shape[0]
        # Use the labels passed in as argument rather than the module-level global.
        true_start = ori_data_Y[i][0]
        # Start-point error of every sketch, as a percentage of the series length.
        loc_errors = [np.abs(x[1][0] - true_start)/ series_len*100 for x in dataset_results]
        d['Dataset'].append(datasets[i])
        d['Number of samples'].append(len(dataset_results))
        # NOTE(review): this averages the *largest* distance of each distribution,
        # while the match itself is picked with argmin -- confirm np.max is intended.
        d['Average distance'].append(np.mean([np.max(x[0]) for x in dataset_results]))
        d['Average location error (%)'].append(np.mean(loc_errors))
    df = pd.DataFrame(d)
    display(df)
    display(df.iloc[:,[2,3]].describe())
    overall_loc_error = np.mean(d['Average location error (%)'])
    print(f'Average location error w.r.t. total length: {overall_loc_error}%')
    print('--------------------------------------------------------')
    if plot:
        plt.figure()
        n_cols = 3
        # Keep the 3x3 grid for up to nine datasets; add rows beyond that.
        n_rows = max(3, -(-len(results) // n_cols))
        for i in range(len(results)):
            plt.subplot(n_rows, n_cols, i+1)
            plt.title(f"Dataset: {datasets[i]}")
            plt.xlabel("Time")
            plt.ylabel("Distance")
            # Only the first sketch of each dataset is drawn.
            plt.plot(results[i][0][0])
            plt.plot(results[i][0][1], [0.5, 0.5], marker='*', ls='none')
            plt.plot(ori_data_Y[i], [0.5, 0.5], marker='o', color='r', ls='none')
        # Lay out once, after all titles and labels exist.
        plt.tight_layout()
        plt.legend(['Distance', 'Predicted', 'Ground Truth'],bbox_to_anchor=(2,0), loc='lower right')
        plt.show()
    return df

#### Get results

In [9]:
measures = [euclidean_distance, dtw_distance]
# Keep every measure's raw results and summary table, keyed by function name,
# so measures can be compared after the loop. `results` and `df` still hold the
# output of the last measure, as before.
results_by_measure = {}
summary_by_measure = {}
for measure in measures:
    print(f"Measure: {measure.__name__}")
    results = pointwise_exp(ori_data_X, ori_data_y, sketch_X, measure)
    df = analyze_results(results, plot = False)
    results_by_measure[measure.__name__] = results
    summary_by_measure[measure.__name__] = df

Measure: euclidean_distance


Unnamed: 0,Dataset,Number of samples,Average distance,Average location error (%)
0,has,100,0.86882,7.251208
1,sp,100,0.894325,17.347826
2,fp,100,1.026507,12.874396
3,rb,100,1.139434,1.932367
4,sd,100,0.586013,3.045894
5,sr,100,1.239033,23.241546
6,hasb,100,0.669684,8.983092
7,ihas,100,0.752768,10.975904


Unnamed: 0,Average distance,Average location error (%)
count,8.0,8.0
mean,0.897073,10.706529
std,0.227572,7.137291
min,0.586013,1.932367
25%,0.731997,6.199879
50%,0.881572,9.979498
75%,1.054739,13.992754
max,1.239033,23.241546


Average location error w.r.t. total length: 10.706529014609162%
--------------------------------------------------------
Measure: dtw_distance


Unnamed: 0,Dataset,Number of samples,Average distance,Average location error (%)
0,has,100,0.678415,7.176329
1,sp,100,0.621693,12.927536
2,fp,100,0.707616,11.31401
3,rb,100,0.858138,1.55314
4,sd,100,0.442453,2.927536
5,sr,100,1.058595,16.120773
6,hasb,100,0.565764,19.599034
7,ihas,100,0.593285,2.886747


Unnamed: 0,Average distance,Average location error (%)
count,8.0,8.0
mean,0.690745,9.313138
std,0.190977,6.719575
min,0.442453,1.55314
25%,0.586405,2.917339
50%,0.650054,9.245169
75%,0.745247,13.725845
max,1.058595,19.599034


Average location error w.r.t. total length: 9.313138059484315%
--------------------------------------------------------
