# Exploring Time Series Classification Performance on Various Datasets

This Jupyter notebook evaluates the performance of different classification models on a variety of time series datasets. It includes experiments with classifiers such as Naive Bayes, Random Forest, Support Vector Machine (SVM), k-Nearest Neighbors (k-NN), and Decision Tree.

In [1]:
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import json
import time

from utils.shapelets_transform import *
from utils.quality_measures import *

In [2]:
processed_datasets_folder = "datasets/preprocessed_datasets"

## Classification models

In [3]:
# Initialize models
models = {
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "SVM (Linear)": SVC(kernel='linear'),
    "1-NN": KNeighborsClassifier(n_neighbors=1),
    "Decision Tree": DecisionTreeClassifier(),
}

In [4]:
def train_models(x_train, y_train, x_test, y_test):
    # Train and evaluate models
    results = {}
    for name, model in models.items():
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        accuracy = accuracy_score(y_test, predictions)
        results[name] = accuracy
        
    return results

## Datasets

In [14]:
dataset_to_filenames = {"GunPointAgeSpan" : processed_datasets_folder + '/gun_nogun.npz',
                        "Synthetic" : processed_datasets_folder + '/synthetic_dataset.npz',}
                        "ECG200" : processed_datasets_folder + '/ecgs.npz',}
                        "ECG Five Days" : processed_datasets_folder + '/ecg5days.npz',
                        "Two Lead ECG" : processed_datasets_folder + '/twoleadecg.npz',
                        "Mote Strain" : processed_datasets_folder + '/motestrain.npz',
                        "Sony Robot" : processed_datasets_folder + '/sonyrobot.npz',
                        "Beef" : processed_datasets_folder + '/beef.npz',}

"""
min_shapelet: The minimum length of shapelets considered.
max_shapelet: The maximum length of shapelets considered.
x_num_shapelets: A list of different numbers of shapelets to be used in the experiments.
x_num_clusters: A list of different numbers of clusters for shapelet clustering experiments.
"""
dataset_to_parameters = {"GunPointAgeSpan" : 
                              {'min_shapelet' : 30,
                               'max_shapelet' : 50,
                               'x_num_shapelets' : [60, 30, 10, 5],
                               'x_num_clusters' : [10, 5]},
                         "Synthetic" : 
                              {'min_shapelet' : 13,
                               'max_shapelet' : 30,
                               'x_num_shapelets' : [150, 75, 30, 10, 5],
                               'x_num_clusters' : [10, 5]},
                         "ECG200" : 
                              {'min_shapelet' : 20,
                               'max_shapelet' : 40,
                               'x_num_shapelets' : [50, 25, 10, 5],
                               'x_num_clusters' : [10, 5]},
                         "Beef" : 
                              {'min_shapelet' : 40,
                               'max_shapelet' : 60,
                               'x_num_shapelets' : [15, 10, 5],
                               'x_num_clusters' : [8, 5]},
                         "ECG Five Days" : 
                              {'min_shapelet' : 10,
                               'max_shapelet' : 40,
                               'x_num_shapelets' : [10, 5],
                               'x_num_clusters' : [7, 3]},
                         "Two Lead ECG" : 
                              {'min_shapelet' : 10,
                               'max_shapelet' : 30,
                               'x_num_shapelets' : [10, 5],
                               'x_num_clusters' : [7, 3]},
                         "Mote Strain" : 
                              {'min_shapelet' : 10,
                               'max_shapelet' : 30,
                               'x_num_shapelets' : [10, 5],
                               'x_num_clusters' : [7, 3]},
                         "Sony Robot" : 
                              {'min_shapelet' : 7,
                               'max_shapelet' : 25,
                               'x_num_shapelets' : [13, 10, 5],
                               'x_num_clusters' : [8, 5]},}

## Quality measures

In [15]:
quality_measures = {"F_stat" : compute_f_stat,
                    "Information Gain" : compute_ig,
                    "Kruskal-Wallis test" : compute_kruskal_wallis_test,
                    "Mood's median test" : compute_mood_median_test}

## Experiments

In [16]:
def run_experiment(filename, parameters, quality_measure, dataset_name, quality_measure_name):
    """
    Runs an experiment to evaluate shapelet-based time series classification.

    Args:
    filename (str): The path to a preprocessed dataset file in NumPy format.
    parameters (dict): A dictionary containing experiment parameters.
    quality_measure (function): A function to evaluate the quality of a shapelet.
    dataset_name (str): The name of the dataset being used for the experiment.
    quality_measure_name (str): The name of the quality measure being used.

    Returns:
    dict: A dictionary containing experiment results, including shapelet selection, transformation,
          and classification performance under various conditions.
    """
    data = np.load(filename)
    x_train = data['array1']
    y_train = data['array2']
    x_test = data['array3']
    y_test = data['array4']
    result = {}
    
    Measure time for estimating min and max shapelet lengths
    start_time = time.time()
    result['estimated_min_max'] = estimate_min_and_max(x_train, y_train, quality_measure)
    result['estimated_min_max_duration'] = time.time() - start_time
    
    # Select shapelets using quality measure
    start_time = time.time()
    x_shapelet = shapelet_cached_selection(x_train, y_train, parameters['min_shapelet'], parameters['max_shapelet'], len(x_train)//2, compute_f_stat, verbose=1)
    result['shapelet_selection_duration'] = time.time() - start_time
    
    # Store selected shapelets in a JSON file
    x_shapelet_list = []
    x_shapelet_list_list = []
    for shap in x_shapelet:
        x_shapelet_list.append(shap[0])
        x_shapelet_list_list.append(list(shap[0]))
    with open(f'results/shapelets/{dataset_name}_{quality_measure_name}.json', 'w') as json_file:
        json.dump(x_shapelet_list_list, json_file)
        
    # Evaluate models with different numbers of shapelets
    for num_shapelets in parameters['x_num_shapelets']:
        x_train_transformed = shapelets_transform(x_shapelet_list[:num_shapelets], x_train)
        x_test_transformed = shapelets_transform(x_shapelet_list[:num_shapelets], x_test)
        result[f'With {num_shapelets} shapelets'] = train_models(x_train_transformed, y_train, x_test_transformed, y_test)
        
    # Cluster shapelets and evaluate models with different numbers of clusters
    for num_clusters in parameters['x_num_clusters']:
        clusters = cluster_shapelets(x_shapelet_list, num_clusters)
        list_cluster = []
        for c in clusters:
            list_cluster.append(list(c))
        x_train_transformed = shapelets_cluster_transform(list_cluster, x_train)
        x_test_transformed = shapelets_cluster_transform(list_cluster, x_test)
        result[f'With {num_clusters} clusters'] = train_models(x_train_transformed, y_train, x_test_transformed, y_test)
    
    # Evaluate models using raw data
    result['On raw data'] = train_models(x_train, y_train, x_test, y_test)
    
    return result

## Main loop

In [17]:
result = {}

for dataset_name in dataset_to_filenames.keys():
    print(f"{dataset_name} ...")
    filename = dataset_to_filenames[dataset_name]
    parameters = dataset_to_parameters[dataset_name]
    result[dataset_name] = {}
    
    for quality_measure_name in quality_measures.keys():
        print(quality_measure_name)
        result[dataset_name][quality_measure_name] = run_experiment(filename, parameters, quality_measures[quality_measure_name], dataset_name, quality_measure_name)
        with open(f'results/classification/{dataset_name}_{quality_measure_name}.json', 'w') as json_file:
            json.dump(result, json_file)

Synthetic ...
F_stat


100%|██████████| 100/100 [00:49<00:00,  2.00it/s]


Information Gain


100%|██████████| 100/100 [00:51<00:00,  1.94it/s]


Kruskal-Wallis test


100%|██████████| 100/100 [00:51<00:00,  1.93it/s]


Mood's median test


100%|██████████| 100/100 [00:49<00:00,  2.02it/s]
