In [1]:

import numpy as np
import pandas as pd
from tqdm import tqdm
import time
import iisignature as sig
import sigkernel
import torch
from scipy.stats import pearsonr


from time import sleep


from sklearn.preprocessing import LabelEncoder,StandardScaler,Normalizer,MinMaxScaler

from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score,r2_score,mean_squared_error

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,RandomForestRegressor,GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression,Lasso
from sklearn.svm import SVC,SVR
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier


from tslearn.svm import TimeSeriesSVC,TimeSeriesSVR
from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.neural_network import TimeSeriesMLPRegressor

from Neural_Decoding.preprocessing_funcs import get_spikes_with_history






In [5]:
def ml_method_setup(method, X_train, y_train,reduced = False):
    """
    Build a cross-validated grid search for the requested model and fit it.

    Parameters:
            method: name of the model to tune. Supported names are
                    classifiers: 'ts_knn', 'ts_svc', 'logisticregression', 'svc',
                                 'knnclassifier', 'adaboostclassifier', 'randomforestclassifier'
                    regressors:  'r_ts_svr', 'r_ts_neuralnetwork', 'r_lassoregression', 'r_svr',
                                 'r_randomforestregression', 'r_gradientboostingregression',
                                 'r_neuralnetwork', 'r_GaussianNB'
                    (despite its name, 'r_GaussianNB' fits a GaussianProcessRegressor)

            X_train: training data, passed unchanged to GridSearchCV.fit
            y_train: training labels/targets, passed unchanged to GridSearchCV.fit
            reduced: if True, a smaller hyperparameter grid is searched to save time
                     ('r_GaussianNB' has a single grid, so the flag has no effect there)

    Returns:
            clf: the fitted GridSearchCV object (cv=5, n_jobs=-1, verbose=10),
                 or None when `method` is not one of the supported names
    """
    # None is GridSearchCV's default and means "use the estimator's own score method".
    scoring = None

    if method == 'ts_knn':
        estimator = KNeighborsTimeSeriesClassifier()
        small = {'n_neighbors': [1,5]}
        full = {'n_neighbors': [1, 3, 5, 7],'metric': ['euclidean', 'dtw']}

    elif method == 'ts_svc':
        estimator = TimeSeriesSVC(random_state=0, probability=True)
        small = {'C': [0.1, 1, 10]}
        full = {'C': [0.1, 1, 10],'kernel': ['linear', 'rbf'],'gamma': ['scale', 'auto', 0.1]}

    elif method == 'r_ts_svr':
        estimator = TimeSeriesSVR()
        small = {'C': [0.1, 1, 10]}
        full = {'C': [0.1, 1, 10],'kernel': ['linear', 'rbf'],'gamma': ['scale', 'auto', 0.1]}

    elif method == 'r_ts_neuralnetwork':
        estimator = TimeSeriesMLPRegressor()
        small = {'hidden_layer_sizes': [(50,), (100,), (50, 50)]}
        full = {'hidden_layer_sizes': [(50,), (100,), (50, 50)],
                'activation': ['relu', 'tanh'],
                'learning_rate': ['constant', 'adaptive'],
                'alpha': [0.0001, 0.001, 0.01]}

    elif method == 'logisticregression':
        estimator = LogisticRegression(random_state=0)
        small = {'C': [0.1, 0.5, 1,  5]}
        full = {'C': [0.1, 0.2, 0.5, 1, 2, 5, 10],
                'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}

    elif method == 'svc':
        estimator = SVC(random_state=0, probability=True)
        small = {'C': [0.1, 1, 10]}
        full = {'kernel': ['rbf', 'poly'], 'shrinking': [True, False],
                'C': [0.1, 1, 10]}

    elif method == 'knnclassifier':
        estimator = KNeighborsClassifier()
        small = {'n_neighbors': range(3, 30, 2)}
        full = {'n_neighbors': range(3, 30, 2), 'weights': ['uniform', 'distance']}

    elif method == 'adaboostclassifier':
        estimator = AdaBoostClassifier(random_state=0)
        small = {'n_estimators': [50, 100]}
        full = {'n_estimators': [50, 100], 'learning_rate': [0.5, 1, 2]}

    elif method == 'randomforestclassifier':
        estimator = RandomForestClassifier(random_state=0)
        small = {'n_estimators': (100, 200)}
        full = {'min_weight_fraction_leaf': [0.1, 0.5],
                'bootstrap': [True, False],
                'max_depth': (2, 5),
                'max_leaf_nodes': (2, 5),
                'n_estimators': (100, 200)}

    elif method == 'r_lassoregression':
        estimator = Lasso()
        small = {'alpha': [0.1, 0.5, 1, 5]}
        full = {'alpha': [0.1, 0.2, 0.5, 1, 2, 5, 10]}

    elif method == 'r_svr':
        estimator = SVR()
        small = {'C': [0.1, 1.0, 10.0]}
        full = {'C': [0.1, 1.0, 10.0], 'kernel': ['linear', 'rbf'],
                'gamma': ['scale', 'auto']}

    elif method == 'r_randomforestregression':
        estimator = RandomForestRegressor()
        small = {'n_estimators': [50, 100]}
        full = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]}

    elif method == 'r_gradientboostingregression':
        estimator = GradientBoostingRegressor()
        small = {'n_estimators': [50, 100]}
        full = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]}

    elif method == 'r_neuralnetwork':
        estimator = MLPRegressor()
        small = {'hidden_layer_sizes': [(50,), (100,), (50, 50)]}
        full = {'hidden_layer_sizes': [(50,), (100,), (50, 50)], 'activation': ['relu', 'tanh'], 'alpha': [0.0001, 0.001, 0.01]}

    elif method == 'r_GaussianNB':
        kernel = 1.0 * RBF(length_scale=1.0)
        estimator = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=42)
        # `1.0 * RBF(...)` is a Product kernel (k1 = constant, k2 = RBF), so the
        # RBF length scale must be addressed through `k2`; `kernel__length_scale`
        # is not a valid parameter of the product and makes the search fail.
        small = full = {'kernel__k2__length_scale': [0.1, 1.0, 10.0]}
        scoring = 'neg_mean_squared_error'

    else:
        # Unknown method name: nothing to fit.
        return None

    parameters = small if reduced else full
    clf = GridSearchCV(estimator, parameters, cv=5, n_jobs=-1, verbose=10, scoring=scoring)
    clf.fit(X_train, y_train)
    return clf

In [5]:
def data_bins(X_train,y_train,X_test,y_test,bins_before,bins_after,bins_current):

  """
  This function is used to create bins in order to decode the data.
  See more details in https://www.eneuro.org/content/eneuro/7/4/ENEURO.0506-19.2020.full.pdf

  Each data matrix is expanded with get_spikes_with_history and then the first
  `bins_before` and last `bins_after` rows are dropped from both the data and
  the labels (presumably because those rows lack a full window of surrounding
  bins -- confirm against Neural_Decoding).

  Parameters:
          X_train: is the training data with shape (num_timepoints, num_features)
          y_train: is the training label with shape (num_timepoints,)
          X_test: is the testing data with shape (num_timepoints, num_features)
          y_test: is the testing label with shape (num_timepoints,)

          bins_before: is the number of bins/timepoints before the current time point
          bins_after: is the number of bins/timepoints after the current time point
          bins_current: whether to include the current time point

  Returns:
          X_train: is the training data with shape (num_samples, num_bins(time), num_features)
          y_train: is the training label with shape (num_samples,)
          X_test: is the testing data with shape (num_samples, num_bins(time), num_features)
          y_test: is the testing label with shape (num_samples,)

  """

  def _trim(values):
    # Use an explicit stop index: `values[:-bins_after]` would be `values[:0]`
    # (empty) when bins_after == 0, which silently discarded all the data when
    # both bins_before and bins_after were 0.
    stop = max(len(values) - bins_after, 0)
    return values[bins_before:stop]

  X_train = _trim(get_spikes_with_history(X_train,bins_before,bins_after,bins_current))
  X_test = _trim(get_spikes_with_history(X_test,bins_before,bins_after,bins_current))
  y_train = _trim(y_train)
  y_test = _trim(y_test)

  return X_train,y_train,X_test,y_test

In [6]:
def data_flatten(X_train, X_test):
    """
    Collapse every sample into a single feature vector.

    Parameters:
            X_train: is the training data with shape (num_samples, num_bins(time), num_features)
            X_test: is the testing data with shape (num_samples, num_bins(time), num_features)

    Returns:
            X_train: is the training data with shape (num_samples, num_bins(time)*num_features)
            X_test: is the testing data with shape (num_samples, num_bins(time)*num_features)
    """
    # Keep the leading (sample) axis and merge all remaining axes into one.
    train_2d, test_2d = (
        batch.reshape(batch.shape[0], -1) for batch in (X_train, X_test)
    )
    return train_2d, test_2d

In [4]:
def standard_scale_process(X_train,X_test):

    """
    This function is used to standardize the data.

    The mean and standard deviation are computed over the first axis of X_train
    (ignoring NaNs) and the same statistics are applied to X_test, so the input
    may have any number of trailing axes.

    Parameters:
            X_train: is the training data, samples along the first axis
            X_test: is the testing data, with the same trailing shape as X_train

    Returns:
            X_train: is the training data after standardization (same shape as the input)
            X_test: is the testing data after standardization (same shape as the input)

    """

    data_mean = np.nanmean(X_train,axis=0)
    data_std = np.nanstd(X_train,axis=0)

    zero_std = data_std == 0
    if np.any(zero_std):
      # A constant feature would be divided by 0 and turn into NaN/inf, which
      # breaks every downstream model. Report it and scale such features by 1
      # instead, so they are only centred (they become 0 on the training data).
      print('There is a feature with std = 0, please check the data')
      data_std = np.where(zero_std, 1.0, data_std)

    X_train = (X_train-data_mean)/data_std
    X_test = (X_test-data_mean)/data_std
    return X_train,X_test

In [6]:
def data_process(X_train, X_test, at, ll, scale = 'minmax'):

    """
    Scale the paths and then apply the sigkernel path transform.

    Parameters:
            X_train: is the training data with shape (num_samples, num_bins(time), num_features)
            X_test: is the testing data with shape (num_samples, num_bins(time), num_features)
            at: whether to add time reparameterization
            ll: whether to add lead lag
            scale: which scaling to apply first: 'minmax' (TimeSeriesScalerMinMax),
                   'standard' (standard_scale_process); any other value skips scaling

    Returns:
            X_train: is the training data returned by sigkernel.transform
            X_test: is the testing data returned by sigkernel.transform
            (presumably still 3-D paths, with extra channels/steps when at/ll are set -- confirm against sigkernel)
    """

    minmax = TimeSeriesScalerMinMax()

    # ---- optional scaling step ----
    if scale == 'minmax':
        fitted = minmax.fit(X_train)
        X_train, X_test = fitted.transform(X_train), fitted.transform(X_test)
    elif scale == 'standard':
        print(X_train.shape)
        X_train, X_test = standard_scale_process(X_train, X_test)
    # anything else (e.g. None): leave the data unscaled

    # ---- time reparameterization / lead-lag, with a fixed 0.1 rescaling ----
    transformed = [
        sigkernel.transform(paths, at=at, ll=ll, scale=.1)
        for paths in (X_train, X_test)
    ]
    return transformed[0], transformed[1]

In [8]:
def signature(X_train,X_test,sig_level=2):
    """
    Compute truncated path signatures of the training and testing data with iisignature.

    Parameters:
            X_train: is the training data with shape (num_samples, num_bins(time), num_features)
            X_test: is the testing data with shape (num_samples, num_bins(time), num_features)
            sig_level: is the truncation level of the signature

    Returns:
            X_train: is the signature of the training data
            X_test: is the signature of the testing data
            (presumably one row per sample, with a length that depends on
            num_features and sig_level -- confirm against iisignature)
    """
    # Same call for both splits; training data is processed first.
    train_sig, test_sig = (sig.sig(paths, sig_level) for paths in (X_train, X_test))
    return train_sig, test_sig


In [None]:
def subsample(X_train,X_test,max_length=149):

    """
    This function is used to subsample the data along the time axis.

    Every `stride`-th time step is kept, where stride = floor(num_bins / max_length)
    (at least 1), so series no longer than max_length are returned unchanged. The
    stride is derived from the training data and applied to both splits.

    Parameters:
            X_train: is the training data with shape (num_samples, num_bins(time), num_features)
            X_test: is the testing data with shape (num_samples, num_bins(time), num_features)
            max_length: positive integer; series longer than this are thinned out (default 149)

    Returns:
            X_train: is the training data after subsampling with shape (num_samples, num_new_bins(time),num_features)
            X_test: is the testing data after subsampling with shape (num_samples, num_new_bins(time),num_features)
    """

    # The stride must come from the time axis (axis 1), which is the axis being
    # sliced below; using the number of samples (axis 0) made the stride depend
    # on the dataset size instead of the series length.
    stride = max(X_train.shape[1] // max_length, 1)
    X_train = X_train[:,::stride,:]
    X_test = X_test[:,::stride,:]

    return X_train, X_test

In [8]:
def grid_search_atll(X_train,X_test,y_train,y_test,reduced = True):
    """
    This function is used to find the best at, ll and scaler for the data.

    Every combination is preprocessed with data_process and subsample and then
    scored by the cross-validated best_score_ of the 'r_ts_svr' grid search on
    the training data (this model is used for the selection whatever the final
    method is). y_test is not used.

    Parameters:
            X_train: is the training data with shape (num_samples, num_bins(time), num_features)
            X_test: is the testing data with shape (num_samples, num_bins(time), num_features)
            y_train: is the training label with shape (num_samples,)
            y_test: is the testing label with shape (num_samples,)
            reduced: is a boolean variable, if True, the hyperparameters will be reduced to save time

    Returns:
            best_at: is the best situation for whether to add time reparameterization
            best_ll: is the best situation for whether to add lead lag
            best_scaler: is the best scaler ('minmax', 'standard' or None for no scaling)

    """

    if X_train.shape[1] <= 200 and X_train.shape[2] <= 8:
        candidates = [(True,True), (False,True), (True,False), (False,False)]
    else: # do not try lead-lag as dimension is already high
        candidates = [(True,False), (False,False)]
    transforms = tqdm(candidates, position=1, leave=False)

    # Start from -inf rather than 0: the CV score (R^2 for a regressor) can be
    # negative for every candidate, and the best of them must still be selected
    # instead of silently returning (None, None, None).
    best_score = float('-inf')
    best_at = None
    best_ll = None
    best_scaler = None

    for at, ll in transforms:
        transforms.set_description(f'at={at}, ll={ll}')
        # A fresh bar per pass: a tqdm object is closed once it has been
        # iterated to the end, so reusing one shows no progress afterwards.
        scalers = tqdm(['minmax', 'standard', None], leave=False)
        for scaler in scalers:
            scalers.set_description(f'scaler={scaler}')
            X_train_new, X_test_new = data_process(X_train, X_test, at, ll,scaler)
            X_train_new, X_test_new = subsample(X_train_new,X_test_new)
            clf = ml_method_setup('r_ts_svr', X_train_new,y_train,reduced)
            if clf.best_score_ > best_score:
                best_score = clf.best_score_
                best_at = at
                best_ll = ll
                best_scaler = scaler

    return best_at, best_ll, best_scaler


In [9]:
def grid_search_bins(X_train,y_train,X_test,y_test,method,bin_reduce=True,reduced=True):

    """
    This function is used to find the best bins_before for the data.

    For every candidate the data is windowed with data_bins (bins_after=0,
    current bin included), subsampled, and scored by the cross-validated
    best_score_ of the 'r_ts_svr' grid search on the training data.

    Parameters:
            X_train: is the training data with shape (num_timepoints, num_features)
            X_test: is the testing data with shape (num_timepoints, num_features)
            y_train: is the training label with shape (num_timepoints,)
            y_test: is the testing label with shape (num_timepoints,)
            method: is the name of the final method; currently unused, the search always scores with 'r_ts_svr'
            bin_reduce: is a boolean variable, if True, the bins_before will be reduced to save time
            reduced: is a boolean variable, if True, the hyperparameters will be reduced to save time

    Returns:
            bin_b_best: is the best bins_before (0 only if no candidate produced a comparable score)

    """
    if bin_reduce:
        candidates = [10,50,100,200]
    else:
        candidates = [5,10,25,50,75,100,250,500]
    bins_before = tqdm(candidates)

    # Start from -inf rather than 0: R^2 can be negative for every candidate,
    # and the least bad window must still win instead of falling back to 0.
    r2_best = float('-inf')
    bin_b_best = 0

    for bin_before in bins_before:
        bins_before.set_description(f'bin_before={bin_before}')
        X_train_b,y_train_b,X_test_b,y_test_b = data_bins(X_train,y_train,X_test,y_test,bin_before,0,1)
        X_train_b,X_test_b = subsample(X_train_b,X_test_b)
        clf = ml_method_setup('r_ts_svr', X_train_b,y_train_b,reduced)
        if clf.best_score_ > r2_best:
            bin_b_best = bin_before
            r2_best = clf.best_score_

    return bin_b_best


In [None]:
def grid_search_signature(X_train,X_test,y_train,y_test,method,reduced):
    """
    This function is used to find the best signature level and input scale for the data.

    Each (depth, scale) pair is scored by the cross-validated best_score_ of
    ml_method_setup(method, ...) fitted on the truncated signatures of the
    scaled training data. X_test and y_test are not used.

    Parameters:
            X_train: is the training data with shape (num_samples, num_bins(time), num_features)
            X_test: is the testing data with shape (num_samples, num_bins(time), num_features)
            y_train: is the training label with shape (num_samples,)
            y_test: is the testing label with shape (num_samples,)
            method: is the name of the method to be used (see ml_method_setup)
            reduced: is a boolean variable, if True, the hyperparameters will be reduced to save time

    Returns:
            best_depth: is the best signature level
            best_scale: is the best scale to multiply the data by

    """
    dim  = X_train.shape[-1]

    # The signature size grows quickly with the path dimension, so the maximum
    # truncation level is lowered as the number of features increases.
    if dim <= 4:
        max_depth = 6
    elif dim <= 6:
        max_depth = 5
    elif dim <= 8:
        max_depth = 4
    else:
        max_depth = 3

    _scales = [5e-2, 1e-1, 5e-1, 1e0]

    # grid search on truncation levels
    depths = tqdm(range(2,max_depth+1), position=2, leave=False)

    # Start from -inf rather than 0: the CV score can be negative for every
    # candidate, and returning (None, None) makes the caller fail when it
    # multiplies the data by the scale.
    best_score = float('-inf')
    best_depth = None
    best_scale = None

    for depth in depths:
        depths.set_description(f'depth={depth}')
        # A fresh bar per depth: a tqdm object is closed after one full pass.
        scales = tqdm(_scales, position=3, leave=False)
        for scale in scales:
            scales.set_description(f'scale={scale}')
            # truncated signatures
            sig_train = sig.sig(scale*X_train, depth)
            clf = ml_method_setup(method, sig_train,y_train,reduced)
            if clf.best_score_ > best_score:
                best_score = clf.best_score_
                best_depth = depth
                best_scale = scale

    return best_depth, best_scale
                

In [None]:
def signature_score(X_train,X_test,y_train,y_test,method,file_path,file_name,reduced = False):

    """
    This function is used to calculate the score of the signature method and will save the score in a csv file.

    The signature level and input scale are chosen with grid_search_signature,
    the model is tuned with ml_method_setup on the signatures of the training
    data, and R^2, the Pearson r and the elapsed time (grid search included)
    are written to "<file_name>_sig.csv".

    Parameters:
            X_train: is the training data with shape (num_samples, num_bins(time), num_features)
            X_test: is the testing data with shape (num_samples, num_bins(time), num_features)
            y_train: is the training label with shape (num_samples,)
            y_test: is the testing label with shape (num_samples,)
            method: is the name of the method to be used (see ml_method_setup)
            file_path: is the directory (or a path prefix) where the score is saved
            file_name: is the name of the file to save the score
            reduced: is a boolean variable, if True, the hyperparameters will be reduced to save time

    Returns:
            None
    """
    import os

    start_time = time.time()

    best_depth, best_scale = grid_search_signature(X_train,X_test,y_train,y_test,method,reduced)
    X_train_sig, X_test_sig = signature(X_train*best_scale, X_test*best_scale, best_depth)

    clf_sig = ml_method_setup(method, X_train_sig, y_train, reduced)
    y_pred_sig = clf_sig.predict(X_test_sig)
    r2_sig = r2_score(y_test, y_pred_sig)
    r_value_sig, _ = pearsonr(y_test, y_pred_sig)

    time_sig = time.time() - start_time

    score_sig = pd.DataFrame({'r2':r2_sig, 'rvalue':r_value_sig, 'time':time_sig}, index=[0])

    # Plain concatenation turned file_path='.' into a hidden ".<name>" file.
    # Join with a separator when file_path is an existing directory; otherwise
    # keep treating it as a raw prefix (e.g. "results/") as before.
    out_name = f"{file_name}_sig.csv"
    if os.path.isdir(file_path):
        out_path = os.path.join(file_path, out_name)
    else:
        out_path = f"{file_path}{out_name}"
    score_sig.to_csv(out_path, index=False)

In [None]:
def flatten_score(X_train,X_test,y_train,y_test,method,file_path,file_name,reduced = False):
    """
    This function is used to calculate the score of the flatten method and will save the score in a csv file.

    The data is flattened with data_flatten, the model is tuned with
    ml_method_setup, and R^2, the Pearson r and the elapsed time are written
    to "<file_name>_flat.csv".

    Parameters:
            X_train: is the training data with shape (num_samples, num_bins(time), num_features)
            X_test: is the testing data with shape (num_samples, num_bins(time), num_features)
            y_train: is the training label with shape (num_samples,)
            y_test: is the testing label with shape (num_samples,)
            method: is the name of the method to be used (see ml_method_setup)
            file_path: is the directory (or a path prefix) where the score is saved
            file_name: is the name of the file to save the score
            reduced: is a boolean variable, if True, the hyperparameters will be reduced to save time

    Returns:
            None

    """
    import os

    start_time = time.time()

    X_train_flat, X_test_flat = data_flatten(X_train, X_test)
    clf_flat = ml_method_setup(method, X_train_flat, y_train, reduced)
    y_pred_flat = clf_flat.predict(X_test_flat)
    r2_flat = r2_score(y_test, y_pred_flat)
    r_value_flat, _ = pearsonr(y_test, y_pred_flat)

    time_flat = time.time() - start_time

    score_flat = pd.DataFrame({'r2':r2_flat, 'rvalue':r_value_flat, 'time':time_flat}, index=[0])

    # Plain concatenation turned file_path='.' into a hidden ".<name>" file.
    # Join with a separator when file_path is an existing directory; otherwise
    # keep treating it as a raw prefix (e.g. "results/") as before.
    out_name = f"{file_name}_flat.csv"
    if os.path.isdir(file_path):
        out_path = os.path.join(file_path, out_name)
    else:
        out_path = f"{file_path}{out_name}"
    score_flat.to_csv(out_path, index=False)

In [None]:
def ts_score(X_train,X_test,y_train,y_test,method,file_path,file_name,reduced = False):
    """
    This function is used to calculate the score of the ts method and will save the score in a csv file.

    The model is tuned with ml_method_setup directly on the time series, and
    R^2, the Pearson r and the elapsed time are written to "<file_name>_ts.csv".

    Parameters:
            X_train: is the training data with shape (num_samples, num_bins(time), num_features)
            X_test: is the testing data with shape (num_samples, num_bins(time), num_features)
            y_train: is the training label with shape (num_samples,)
            y_test: is the testing label with shape (num_samples,)
            method: is the name of the method to be used (see ml_method_setup)
            file_path: is the directory (or a path prefix) where the score is saved
            file_name: is the name of the file to save the score
            reduced: is a boolean variable, if True, the hyperparameters will be reduced to save time

    Returns:
            None

    """
    import os

    start_time = time.time()

    clf = ml_method_setup(method, X_train, y_train, reduced)
    y_pred = clf.predict(X_test)
    r2_ts = r2_score(y_test, y_pred)
    r_value_ts, _ = pearsonr(y_test, y_pred)

    time_ts = time.time() - start_time

    score_ts = pd.DataFrame({'r2':r2_ts, 'rvalue':r_value_ts, 'time':time_ts}, index=[0])

    # Plain concatenation turned file_path='.' into a hidden ".<name>" file.
    # Join with a separator when file_path is an existing directory; otherwise
    # keep treating it as a raw prefix (e.g. "results/") as before.
    out_name = f"{file_name}_ts.csv"
    if os.path.isdir(file_path):
        out_path = os.path.join(file_path, out_name)
    else:
        out_path = f"{file_path}{out_name}"
    score_ts.to_csv(out_path, index=False)

In [None]:
def sigkernel_score(X_train,X_test,y_train,y_test,method,file_path,dataset):

    """
    This function is used to calculate the score of the sigkernel method and will save the score in a csv file.

    For every RBF sigma in a fixed grid the signature-kernel Gram matrix of the
    training data is computed and its rows are passed as features to the
    'r_svr' (regression) or 'svc' (classification) grid search of
    ml_method_setup. The sigma with the best cross-validated score is kept,
    together with the model fitted for it, and that model is evaluated on the
    test-vs-train Gram matrix. R^2, the Pearson r and the elapsed time are
    written to "<dataset>_sigkernel.csv" (the file name does not include the
    method name).

    Parameters:
            X_train: is the training data with shape (num_samples, num_bins(time), num_features)
            X_test: is the testing data with shape (num_samples, num_bins(time), num_features)
            y_train: is the training label with shape (num_samples,)
            y_test: is the testing label with shape (num_samples,)
            method: is the name of the method; only its 'r_' prefix is used, to pick regression or classification
            file_path: is the directory (or a path prefix) where the score is saved
            dataset: is the name of the dataset

    Returns:
            None

    Raises:
            ValueError: if no sigma produced a comparable cross-validation score
    """
    import os

    file_name = f"{dataset}"
    _sigmas = [1e-2, 2.5e-2, 5e-2, 7.5e-2, 1e-1, 1.5e-1,
               2e-1, 2.5e-1, 3e-1, 3.5e-1, 4e-1, 4.5e-1,
               5e-1, 5.5e-1, 6e-1, 6.5e-1, 7e-1, 7.5e-1,
               8e-1, 8.5e-1, 9e-1, 9.5e-1, 1.]
    start_time = time.time()

    # Small problems may run on the GPU in single precision.
    if X_train.shape[0] <= 150 and X_train.shape[1] <=150 and X_train.shape[2] <= 8:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        dtype = torch.float32
    else: # otherwise do computations in cython
        device = 'cpu'
        dtype = torch.float64

    X_train = torch.tensor(X_train, dtype=dtype, device=device)
    X_test = torch.tensor(X_test, dtype=dtype, device=device)

    inner_method = 'r_svr' if method.startswith('r_') else 'svc'

    # Start from -inf rather than 0: the CV score can be negative for every
    # sigma, and the best one must still be selected.
    best_score = float('-inf')
    best_sigma = None
    best_clf = None

    for sigma in tqdm(_sigmas):

        # define static kernel
        static_kernel = sigkernel.RBFKernel(sigma=sigma)

        # initialize corresponding signature PDE kernel
        signature_kernel = sigkernel.SigKernel(static_kernel, dyadic_order=0)

        # compute Gram matrix on train data
        G_train = signature_kernel.compute_Gram(X_train, X_train, sym=True).cpu().numpy()

        clf = ml_method_setup(inner_method, G_train, y_train, False)

        # free the Gram matrix before the next one is allocated
        del G_train

        if clf.best_score_ > best_score:
            best_score = clf.best_score_
            best_sigma = sigma
            # Keep the model fitted for this sigma: predicting with the model
            # of the last sigma on a Gram matrix built with the best sigma
            # would mix two different kernels.
            best_clf = clf

    if best_clf is None:
        raise ValueError('sigkernel_score: no sigma produced a valid cross-validation score')

    # define static kernel
    static_kernel = sigkernel.RBFKernel(sigma = best_sigma)
    # initialize corresponding signature PDE kernel
    signature_kernel = sigkernel.SigKernel(static_kernel, dyadic_order=0)
    # compute Gram matrix on test data
    G_test = signature_kernel.compute_Gram(X_test, X_train, sym=False).cpu().numpy()

    y_pred = best_clf.predict(G_test)
    r2_sigkernel = r2_score(y_test, y_pred)
    r_value_sigkernel, _ = pearsonr(y_test, y_pred)
    time_sigkernel = time.time() - start_time

    score_sigkernel = pd.DataFrame({'r2':r2_sigkernel, 'rvalue':r_value_sigkernel, 'time':time_sigkernel}, index=[0])

    # Plain concatenation turned file_path='.' into a hidden ".<name>" file.
    # Join with a separator when file_path is an existing directory; otherwise
    # keep treating it as a raw prefix (e.g. "results/") as before.
    out_name = f"{file_name}_sigkernel.csv"
    if os.path.isdir(file_path):
        out_path = os.path.join(file_path, out_name)
    else:
        out_path = f"{file_path}{out_name}"
    score_sigkernel.to_csv(out_path, index=False)
    


In [7]:
def auto(X_train, X_test, y_train, y_test, 
             dataset, method, 
             reduced, bin_reduce = True,
             file_path = '.'):

    """
    Run the whole pipeline: select the preprocessing, then score the requested method.

    Regression methods (names starting with 'r_') first get a grid search over
    the number of history bins and are windowed with data_bins. Both kinds of
    task then select at / ll / scaler with grid_search_atll, preprocess and
    subsample the data, and finally write their scores to csv: time-series
    methods ('r_ts...' for regression, 'ts...' for classification) through
    ts_score, every other method through signature_score, flatten_score and
    sigkernel_score.

    Parameters:
            X_train: is the training data.
                     For classification, it will have the shape (num_samples, num_timepoints, num_features).
                     For regression, it will have the shape (num_timepoints, num_features)

            X_test: is the testing data.
                     For classification, it will have the shape (num_samples, num_timepoints, num_features).
                     For regression, it will have the shape (num_timepoints, num_features)

            y_train: is the training label.
                     For classification, it will have the shape (num_samples,).
                     For regression, it will have the shape (num_timepoints,)

            y_test: is the testing label.
                     For classification, it will have the shape (num_samples,).
                     For regression, it will have the shape (num_timepoints,)

            dataset: is the name of the dataset
            method: is the name of the method to be used (see ml_method_setup)
            reduced: is a boolean variable, if True, the hyperparameters of the
                     selection grid searches will be reduced to save time
                     (the final scoring calls are always made with reduced = False)
            bin_reduce: is a boolean variable, if True, the bins_before hyperparameters will be reduced to save time
            file_path: is the path to save the score

    Returns:
            None
    """

    is_regression = method.startswith('r_')
    file_name = f"{dataset}_{method}"

    if is_regression:
        # ---- regression only: pick the history length and window the data ----
        bin_before = grid_search_bins(X_train,y_train,X_test,y_test,method,bin_reduce,reduced)
        X_train,y_train,X_test,y_test = data_bins(X_train,y_train,X_test,y_test,bin_before,0,1)

    # ---- pick lead-lag / time augmentation / scaler, then preprocess ----
    best_at, best_ll, best_scaler = grid_search_atll(X_train,X_test,y_train,y_test,reduced)
    X_train, X_test = data_process(X_train, X_test, best_at, best_ll, best_scaler)
    X_train, X_test = subsample(X_train,X_test)

    # ---- encode the selected preprocessing in the output file name ----
    file_name += {'minmax': "_minmax", 'standard': "_standard"}.get(best_scaler, "_none")
    if best_at:
        file_name += "_at"
    if best_ll:
        file_name += "_ll"

    # ---- scoring ----
    ts_prefix = 'r_ts' if is_regression else 'ts'
    if method.startswith(ts_prefix):
        ts_score(X_train,X_test,y_train,y_test,method,file_path,file_name,reduced = False)
        return

    signature_score(X_train,X_test,y_train,y_test,method,file_path,file_name,reduced = False)
    flatten_score(X_train,X_test,y_train,y_test,method,file_path,file_name,reduced = False)
    sigkernel_score(X_train,X_test,y_train,y_test,method,file_path,dataset)

    
    