In [1]:
import shutil
import os
import pandas as pd
from os.path import dirname
import numpy as np
import pickle
from sklearn import preprocessing
from keras.optimizers import Adagrad
import matplotlib.pyplot as plt
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.shapelets import ShapeletModel, grabocka_params_to_shapelet_size_dict
from sklearn.metrics import accuracy_score




def covert_list_to_dic(XX_list):
    """Map every element of ``XX_list`` to its position in the list.

    Elements become dictionary keys, so they must be hashable. When an
    element occurs more than once, the index of its last occurrence wins.
    """
    return {element: position for position, element in enumerate(XX_list)}

def load_st_result(name, ratio_number_list, ind_number_list=None):
    """Load the logged accuracies of dataset ``name`` into a 2-D array.

    The log ``./ST_acc_log/<name>/<name>_log.txt`` is read line by line.
    Every line is split on tabs: field 3 is the training ratio, field 5 the
    run index and the last field the accuracy.

    Args:
        name: Dataset name; selects the log directory and file.
        ratio_number_list: Training ratios, one per output row.
        ind_number_list: Run indices, one per output column. Defaults to
            ``[0, 1, ..., 9]``.

    Returns:
        Array of shape ``(len(ratio_number_list), len(ind_number_list))``.
        Cells with no log entry stay 0. A line whose ratio equals 1 fills the
        whole last row with its accuracy.

    Reading is best-effort: if the file cannot be opened or a line cannot be
    parsed, a message is printed and whatever was loaded so far is returned.
    """
    if ind_number_list is None:
        ind_number_list = list(range(10))

    path = './ST_acc_log/' + name + '/' + name + '_log.txt'
    # Position lookups: ratio -> row, run index -> column.
    ratio_dic = {ratio: row for row, ratio in enumerate(ratio_number_list)}
    ind_dic = {ind: col for col, ind in enumerate(ind_number_list)}
    proto_result = np.zeros([len(ratio_number_list), len(ind_number_list)])

    try:
        # The context manager closes the log even when parsing fails.
        with open(path, "r") as log_file:
            for line in log_file:
                fields = line.split('\t')
                ratio = float(fields[3])
                accuracy = float(fields[-1])
                if ratio == 1:
                    # Ratio 1 is logged once; copy it to every run column
                    # of the last row.
                    proto_result[-1, :] = accuracy
                    continue
                proto_result[ratio_dic[ratio]][ind_dic[float(fields[5])]] = accuracy
    except (OSError, ValueError, KeyError, IndexError):
        # Same message as before for missing files and malformed lines;
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print(name, 'cannot find')

    return proto_result


def save_hyper_parameter_to_log(sentence, dataset_name):
    """Replace the hyper-parameter log of ``dataset_name`` with ``sentence``.

    The directory ``./ST_hyper_parameter_log/<dataset_name>`` is removed if it
    exists and then created again, so after the call the log file holds
    exactly one line: ``sentence`` followed by a newline.
    """
    log_dir = './ST_hyper_parameter_log/' + dataset_name
    log_file = '/'.join([log_dir, dataset_name + '_log.txt'])

    # Wipe any previous log directory so old entries never linger.
    if os.path.exists(log_dir):
        shutil.rmtree(log_dir)
    os.makedirs(log_dir)

    with open(log_file, "a") as handle:
        handle.write(sentence + '\n')

def load_hyper_parameter_log(dataset_name):
    """Return the first line of the hyper-parameter log of ``dataset_name``.

    The log is ``./ST_hyper_parameter_log/<dataset_name>/<dataset_name>_log.txt``.

    Returns:
        The first line as a string, including its trailing newline if the
        file has one. ``None`` if the file is empty or cannot be opened; in
        the latter case a message naming the dataset is printed.
    """
    path = ('./ST_hyper_parameter_log/' + dataset_name
            + '/' + dataset_name + '_log.txt')
    try:
        # The context manager closes the file on every exit path.
        with open(path, "r") as log_file:
            # Only the first line is wanted; None signals an empty file.
            return next(log_file, None)
    except OSError:
        # Report the dataset that was asked for (the old code printed an
        # unrelated/undefined global called `name`).
        print(dataset_name, 'cannot find')
    return None



def save_to_log(sentence, dataset_name):
    """Append ``sentence`` as one line to the accuracy log of ``dataset_name``.

    The log lives at ``./ST_acc_log/<dataset_name>/<dataset_name>_log.txt``;
    the directory is created on first use and existing lines are kept.
    """
    log_dir = './ST_acc_log/' + dataset_name
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    target = '/'.join((log_dir, dataset_name + '_log.txt'))
    with open(target, "a") as handle:
        handle.write(sentence + '\n')

def TSC_data_loader(dataset_name):
    """Load the TRAIN and TEST splits of a UCR archive dataset.

    Both files are read with ``np.loadtxt`` from
    ``<parent of cwd>/datasets/UCRArchive_2018/<dataset_name>/`` and cast to
    ``float32``. Column 0 of each file is taken as the class label, the
    remaining columns as the series values. Labels are re-encoded with a
    ``LabelEncoder`` fitted on the training labels only.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` where the ``X`` arrays hold the
        series values and the ``y`` arrays the encoded labels. NaN values are
        passed through unchanged (no ``nan_to_num`` is applied).
    """
    prefix = (dirname(os.getcwd()) + '/datasets/UCRArchive_2018/'
              + dataset_name + '/' + dataset_name)
    train_table = np.loadtxt(prefix + '_TRAIN.tsv')
    test_table = np.loadtxt(prefix + '_TEST.tsv')
    train_table = train_table.astype(np.float32)
    test_table = test_table.astype(np.float32)

    # First column: label; everything after it: the time series.
    X_train, raw_train_labels = train_table[:, 1:], train_table[:, 0]
    X_test, raw_test_labels = test_table[:, 1:], test_table[:, 0]

    # The encoder only ever sees training labels when it is fitted.
    encoder = preprocessing.LabelEncoder()
    encoder.fit(raw_train_labels)
    y_train = encoder.transform(raw_train_labels)
    y_test = encoder.transform(raw_test_labels)

    return X_train, y_train, X_test, y_test


def _fit_shapelet_model(X_train, y_train, r_number):
    """Build a ShapeletModel whose shapelet sizes use ``r=r_number`` and fit it."""
    shapelet_sizes = grabocka_params_to_shapelet_size_dict(
        n_ts=X_train.shape[0],
        ts_sz=X_train.shape[1],
        n_classes=len(set(y_train)),
        l=0.1,
        r=r_number)
    shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                            optimizer=Adagrad(lr=.1),
                            weight_regularizer=.01,
                            max_iter=50,
                            verbose_level=0)
    shp_clf.fit(X_train, y_train)
    return shp_clf


def shapelet_best_predict(X_train, y_train, X_test, y_test):
    """Pick the best ``r`` among fixed candidates and score it on the test set.

    Train and test series are min-max scaled. For every candidate ``r`` in
    ``[2, 4, 6, 8, 12]`` a ShapeletModel is fitted and scored by its accuracy
    on the *training* data. A fresh model is then fitted with the winning
    ``r`` and evaluated on the test data.

    Args:
        X_train, y_train: Training series and labels.
        X_test, y_test: Test series and labels.

    Returns:
        ``(test_accuracy, best_r)``.

    Raises:
        Whatever the final fit raises. In particular, if every candidate
        failed during the search, the first candidate is retried outside the
        ``try`` and its error propagates.
    """
    X_train = TimeSeriesScalerMinMax().fit_transform(X_train)
    X_test = TimeSeriesScalerMinMax().fit_transform(X_test)

    r_list = [2, 4, 6, 8, 12]
    # -1 marks a failed candidate so it can never tie with a genuine
    # accuracy of 0.
    result_list = [-1.0] * len(r_list)
    for r_number_index, r_number in enumerate(r_list):
        try:
            candidate = _fit_shapelet_model(X_train, y_train, r_number)
            y_predict = candidate.predict(X_train)
            result_list[r_number_index] = accuracy_score(y_predict, y_train)
        except Exception:
            # Best-effort search: report the failing candidate and move on.
            print("error in ", r_number)

    best_r = r_list[int(np.argmax(result_list))]
    # Fit a model built from the winning r. Previously the sizes were
    # recomputed but never used, so the last model from the loop was
    # refitted instead.
    shp_clf = _fit_shapelet_model(X_train, y_train, best_r)
    y_predict = shp_clf.predict(X_test)
    result = accuracy_score(y_predict, y_test)
    return result, best_r
    

def shapelet_predict(X_train, y_train, X_test, y_test, r_number):
    """Fit a ShapeletModel for a fixed ``r_number`` and return test accuracy.

    Train and test series are min-max scaled, the shapelet sizes are derived
    from the scaled training data with ``l=0.1`` and ``r=r_number``, and the
    fitted model's predictions on the test series are compared with
    ``y_test``.
    """
    scaled_train = TimeSeriesScalerMinMax().fit_transform(X_train)
    scaled_test = TimeSeriesScalerMinMax().fit_transform(X_test)

    n_series, series_length = scaled_train.shape[0], scaled_train.shape[1]
    sizes = grabocka_params_to_shapelet_size_dict(
        n_ts=n_series,
        ts_sz=series_length,
        n_classes=len(set(y_train)),
        l=0.1,
        r=r_number,
    )

    model_options = dict(
        n_shapelets_per_size=sizes,
        optimizer=Adagrad(lr=.1),
        weight_regularizer=.01,
        max_iter=50,
        verbose_level=0,
    )
    classifier = ShapeletModel(**model_options)
    classifier.fit(scaled_train, y_train)

    return accuracy_score(classifier.predict(scaled_test), y_test)



# Seed NumPy's global random number generator.
# NOTE(review): only NumPy is seeded here; whether the Keras/TensorFlow
# backend picks this up is not visible in this cell — confirm.
np.random.seed(0) 
# Dataset names to process; each one is passed to TSC_data_loader below.
name_list = [
    'ECG200',
]    

# Training-set fractions. Not referenced anywhere else in the visible part
# of this cell.
train_ratio_list = [0.1,0.2,0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
# For every dataset: load it, let shapelet_best_predict choose a value,
# store that value in the hyper-parameter log and print it.
for name in name_list:
    print(name)
    X_train_ori, y_train_ori, X_test, y_test= TSC_data_loader(name)
    # `result` (the first value returned by shapelet_best_predict) is not
    # used further in this cell.
    result , shapelet_number = shapelet_best_predict(X_train_ori, y_train_ori, X_test, y_test)
    save_hyper_parameter_to_log(str(shapelet_number), name)
    print('best shapelet_length for dataset',name, 'is', shapelet_number)
    
    
    
    

Using TensorFlow backend.


ECG200









Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where










  n_shapelets = int(numpy.log10(n_ts * (ts_sz - shp_sz + 1) * (n_classes - 1)))


error in  12
best shapelet_length for dataset ECG200 is 8
