# Eksperymenty badające podobieństwo między typami UDF

Każdy eksperyment obejmuje 10 powtórzeń dla każdego algorytmu (poza KNN, na którego wynik parametr `random_state` nie ma wpływu). Wyniki powtórzeń zostaną zsumowane, a następnie dla każdego eksperymentu zostanie obliczona średnia.
### Deklaracja zmiennych i funkcji użytkowych

In [None]:
import time
import pandas as pd
import os
from sktime.datasets import load_UCR_UEA_dataset
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sktime.classification.kernel_based import RocketClassifier
from sktime.classification.hybrid import HIVECOTEV2
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier

# Absolute path of the directory handed to the dataset loader as
# ``extract_path``.
DATA_PATH = os.path.abspath("./../MachineLearning/ts_datasets")

# Root directory under which every experiment loop creates its result folders.
RESULTS_PATH = "./../MachineLearning/results"

# Names of the dataset variants that each experiment loop iterates over.
datasets = [
    "Default",
    "Default_smooth",
    "Normalized",
    "Normalized_smooth",
]

# First random seed; the seeded experiments use the 10 consecutive values
# starting here.
init_seed = 42

def run_experiment(clf, X_train, y_train, X_test, y_test, dataset, seed):
    """Run a single experiment: fit, predict, score and time a classifier.

    The function exists so that processing time is measured the same way
    regardless of the algorithm: the timer covers ``fit`` on the training
    split plus ``predict`` on the test split, and nothing else.

    Args:
        clf: Classifier exposing ``fit(X, y)`` and ``predict(X)``.
        X_train (pandas.DataFrame): Series used for training, without labels.
        y_train (numpy.array): Labels of the training series.
        X_test (pandas.DataFrame): Series used for testing, without labels.
        y_test (numpy.array): Labels of the test series.
        dataset (str): Name of the dataset under test; copied into the result.
        seed (int): Value stored in the ``seed`` column of the result. It is
            only recorded here; the classifier must already be configured
            with it by the caller.

    Returns:
        pandas.DataFrame: A single-row frame with the columns ``dataset``,
        ``seed``, ``accuracy_score``, ``f1_measure`` (weighted F1),
        ``execution_time`` (seconds), ``y_predict`` and ``y_true``.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    start = time.perf_counter()
    clf.fit(X_train, y_train)
    predict_result = clf.predict(X_test)
    time_elapsed = time.perf_counter() - start

    acc_score = accuracy_score(y_test, predict_result)
    f1_measure = f1_score(y_test, predict_result, average='weighted')

    return pd.DataFrame(
        [[dataset, seed, acc_score, f1_measure, time_elapsed, predict_result, y_test]],
        columns=["dataset", "seed", "accuracy_score", "f1_measure",
                 "execution_time", "y_predict", "y_true"],
    )

## ROCKET experiments

In [None]:

# ROCKET: run the MultiRocket classifier once per seed on every dataset and
# store each run in its own CSV file, so an interrupted session can resume.
n_runs = 10  # number of seeded repetitions per dataset

for dataset in datasets:
    base_results_path = f"{RESULTS_PATH}/type/{dataset}/Rocket"
    # NOTE(review): only runs executed in this session are collected here;
    # runs skipped because their CSV already exists are not loaded back.
    result_df = pd.DataFrame()
    print(f"Started {dataset}")
    X_train, y_train = load_UCR_UEA_dataset(name=dataset, split="train", extract_path=DATA_PATH)
    X_test, y_test = load_UCR_UEA_dataset(name=dataset, split="test", extract_path=DATA_PATH)

    # exist_ok=True replaces the separate "exists" check.
    os.makedirs(base_results_path, exist_ok=True)

    for seed in range(init_seed, init_seed + n_runs):
        # 1-based index of the current run. The bar shows one '#' per run up
        # to and including this one, padded with '-' to the number of runs.
        # (The previous rjust/ljust form printed the same bar for the first
        # two runs and was one character short.)
        run_number = seed - init_seed + 1
        progress_string = ("#" * run_number).ljust(n_runs, "-")
        result_file = f"{base_results_path}/ROCKET_results_{seed}.csv"

        # A result file for this seed means the run was done earlier; skip it.
        if os.path.exists(result_file):
            print(f"{progress_string} ({seed}/{init_seed + n_runs - 1})")
            continue

        current_result = run_experiment(
            RocketClassifier(rocket_transform="multirocket", random_state=seed, n_jobs=8),
            X_train, y_train, X_test, y_test, dataset, seed)

        # Persist immediately so a crash in a later run does not lose this one.
        current_result.to_csv(result_file)

        result_df = pd.concat([result_df, current_result], ignore_index=True)
        print(f"{progress_string} ({seed}/{init_seed + n_runs - 1})")

## HIVECOTEV2 experiments

In [None]:
# HIVE-COTE v2: run the classifier once per seed on every dataset and store
# each run in its own CSV file, so an interrupted session can resume.
n_runs = 10  # number of seeded repetitions per dataset

for dataset in datasets:
    base_results_path = f"{RESULTS_PATH}/type/{dataset}/HIVE"
    # NOTE(review): only runs executed in this session are collected here;
    # runs skipped because their CSV already exists are not loaded back.
    result_df = pd.DataFrame()
    print(f"Started {dataset}")
    X_train, y_train = load_UCR_UEA_dataset(name=dataset, split="train", extract_path=DATA_PATH)
    X_test, y_test = load_UCR_UEA_dataset(name=dataset, split="test", extract_path=DATA_PATH)

    # exist_ok=True replaces the separate "exists" check.
    os.makedirs(base_results_path, exist_ok=True)

    for seed in range(init_seed, init_seed + n_runs):
        # 1-based index of the current run. The bar shows one '#' per run up
        # to and including this one, padded with '-' to the number of runs.
        # (The previous rjust/ljust form printed the same bar for the first
        # two runs and was one character short.)
        run_number = seed - init_seed + 1
        progress_string = ("#" * run_number).ljust(n_runs, "-")
        result_file = f"{base_results_path}/HIVE_results_{seed}.csv"

        # A result file for this seed means the run was done earlier; skip it.
        if os.path.exists(result_file):
            print(f"{progress_string} ({seed}/{init_seed + n_runs - 1})")
            continue

        current_result = run_experiment(
            HIVECOTEV2(random_state=seed, time_limit_in_minutes=5),
            X_train, y_train, X_test, y_test, dataset, seed)

        # Persist immediately so a crash in a later run does not lose this one.
        current_result.to_csv(result_file)

        result_df = pd.concat([result_df, current_result], ignore_index=True)
        print(f"{progress_string} ({seed}/{init_seed + n_runs - 1})")

## KNN-DTW experiments

In [None]:
# KNN-DTW: run the classifier for k = 1, 3, 5 on every dataset and store each
# run in its own CSV file, so an interrupted session can resume. No seeds are
# iterated here; the value of k is recorded in the result's "seed" column.
k_values = range(1, 6, 2)

for dataset in datasets:
    base_results_path = f"{RESULTS_PATH}/type/{dataset}/KNN"
    # NOTE(review): only runs executed in this session are collected here;
    # runs skipped because their CSV already exists are not loaded back.
    result_df = pd.DataFrame()
    print(f"Started {dataset}")
    X_train, y_train = load_UCR_UEA_dataset(name=dataset, split="train", extract_path=DATA_PATH)
    X_test, y_test = load_UCR_UEA_dataset(name=dataset, split="test", extract_path=DATA_PATH)

    # exist_ok=True replaces the separate "exists" check.
    os.makedirs(base_results_path, exist_ok=True)

    # enumerate supplies the 1-based run counter that was previously
    # incremented by hand in two separate places.
    for progress, k in enumerate(k_values, start=1):
        progress_string = ("#" * progress).ljust(len(k_values), "-")
        result_file = f"{base_results_path}/KNN_results_{k}.csv"

        # A result file for this k means the run was done earlier; skip it.
        if os.path.exists(result_file):
            print(f"{progress_string} ({progress}/{len(k_values)})")
            continue

        current_result = run_experiment(
            KNeighborsTimeSeriesClassifier(n_neighbors=k, distance="dtw", n_jobs=16),
            X_train, y_train, X_test, y_test, dataset, k)

        # Persist immediately so a crash in a later run does not lose this one.
        current_result.to_csv(result_file)

        result_df = pd.concat([result_df, current_result], ignore_index=True)
        # Print the bar itself; the old code printed the bare counter here,
        # unlike the skip branch above.
        print(f"{progress_string} ({progress}/{len(k_values)})")