# Analiza i wizualizacja wyników eksperymentów badających podobieństwo między typami UDF

Każdy z algorytmów został poddany takiej samej analizie. Najpierw liczone są średnie wartości miar (trafność, F-miara, czas wykonywania) dla każdego zbioru danych. Następnie tworzony jest wykres pudełkowy (boxplot) pokazujący wpływ wartości losowej na wyniki. Na końcu wyznaczana jest macierz pomyłek dla tego wyniku, którego trafność jest najbardziej zbliżona do wartości średniej.

Dodatkowo na końcu wykonywana jest analiza porównawcza wszystkich badanych algorytmów.
### Deklaracja zmiennych użytkowych

In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from plotnine import *

# Root directory of the result CSVs (laid out as <dataset>/<algorithm>/<file>)
# and the directory that receives every generated plot.
RESULTS_PATH = "./../MachineLearning/results/type"
PLOTS = "./../Plots/MachineLearning"

# Dataset variants; each name is used as a sub-directory of RESULTS_PATH.
datasets = [
    "Default",
    "Default_smooth",
    "Normalized",
    "Normalized_smooth",
]

# Tick labels handed to the confusion-matrix displays.
class_names = [
    "aggregation",
    "filtering",
    "filtering-aggregation",
    "filtering-aggregation-join",
    "filtering-join",
]

# ROCKET

In [None]:
# Load every ROCKET result file of every dataset variant into one DataFrame
# (rocket_df), one row per CSV row.
algorythm = "Rocket"
# exist_ok=True replaces the separate existence check and is safe to re-run.
os.makedirs(f"{PLOTS}/{algorythm}", exist_ok=True)
dataframes = []
for dataset in datasets:
    path_to_files = f"{RESULTS_PATH}/{dataset}/{algorythm}"
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and any later "first match" tie-breaking) reproducible.
    file_names = sorted(os.listdir(path_to_files))
    for file_name in file_names:
        df = pd.read_csv(f"{path_to_files}/{file_name}", index_col=None, header=0)
        # y_predict / y_true are stored as text: a bracketed, whitespace-
        # separated list (presumably the printed form of a NumPy array --
        # confirm against the writer). Line breaks and quotes are removed, the
        # surrounding brackets are cut off and the rest is split into labels.
        # split() without an argument collapses runs of whitespace, so padded
        # or wrapped values never produce empty-string labels.
        for column in ("y_predict", "y_true"):
            df[column] = df[column].apply(
                lambda raw: np.array(
                    raw.replace("\n", "").replace("\r", "").replace("'", "")[1:-1].split()
                )
            )
        # Polish column names are the ones used by the plots below.
        df["F-miara"] = df["f1_measure"]
        df["trafność"] = df["accuracy_score"]
        df["czas_wykonywania"] = df["execution_time"]
        dataframes.append(df[["dataset", "seed", "trafność", "F-miara", "czas_wykonywania", "y_predict", "y_true"]])
rocket_df = pd.concat(dataframes, axis=0, ignore_index=True)

## Rocket średnie miar  

In [None]:
# Mean accuracy, F-measure and execution time per dataset, rounded to 3 decimals.
rocket_means_df = rocket_df.groupby("dataset")[["trafność", "F-miara", "czas_wykonywania"]].mean().apply(lambda x: np.round(x, decimals=3))
rocket_means_df.head()

# Boxplots of accuracy and F-measure per dataset.
# Tick labels are left to the scale: factor(dataset) orders its levels by
# sorted category, whereas an explicit list taken from dataset.unique() is in
# order of appearance and is applied positionally, so the two could disagree
# and put the wrong dataset name under a box.
rocket_accuracy_plot = (
    ggplot(rocket_df)
    + geom_boxplot(aes(x='factor(dataset)', y='trafność'))
    + scale_x_discrete(name='dataset')
    + labs(title='Wykres wartości trafności ROCKET.\n Klasyfikacja typu funkcji.')
    + theme_classic()
    + theme(panel_grid=element_line(color="lightgrey"),
        panel_grid_major=element_line(size=1.4, alpha=1),
        panel_grid_major_y=element_line(linetype='dashed'),
        panel_grid_minor=element_line(alpha=.25),
        panel_grid_minor_y=element_line(linetype='dashed'))
)

rocket_fmeasure_plot = (
    ggplot(rocket_df)
    + geom_boxplot(aes(x='factor(dataset)', y='F-miara'))
    + scale_x_discrete(name='dataset')
    + labs(title='Wykres wartości F-miary ROCKET.\n Klasyfikacja typu funkcji.')
    + theme_classic()
    + theme(panel_grid=element_line(color="lightgrey"),
        panel_grid_major=element_line(size=1.4, alpha=1),
        panel_grid_major_y=element_line(linetype='dashed'),
        panel_grid_minor=element_line(alpha=.25),
        panel_grid_minor_y=element_line(linetype='dashed'))
)
# Show each plot and store it as a PDF in the algorithm's plot directory.
print(rocket_accuracy_plot)
rocket_accuracy_plot.save(f"{PLOTS}/{algorythm}/rocket_accuracy_boxplot.pdf", dpi=600, verbose=False)
print(rocket_fmeasure_plot)
rocket_fmeasure_plot.save(f"{PLOTS}/{algorythm}/rocket_fmeasure_boxplot.pdf", dpi=600, verbose=False)
rocket_means_df.head()


## Rocket macierz pomyłek

In [None]:
# For every dataset, draw and save the confusion matrix of the ROCKET run
# whose accuracy is closest to that dataset's (rounded) mean accuracy.
testing_df = rocket_df.copy()
result = {}
# Make sure the target directory exists even when this cell is run on its own.
os.makedirs(f"{PLOTS}/{algorythm}", exist_ok=True)
for dataset in datasets:
    in_dataset = testing_df.dataset == dataset
    # Label-based lookup. The previous `...trafność[0]` indexed a string-labelled
    # Series with an integer, which only works through pandas' deprecated
    # positional fallback.
    mean_accuracy = rocket_means_df.loc[dataset, "trafność"]
    # Rows of the other datasets receive NaN through index alignment, so the
    # equality filter below can only match rows of the current dataset.
    testing_df["closest_acc"] = (testing_df.loc[in_dataset, "trafność"] - mean_accuracy).abs()
    local_min = testing_df.loc[in_dataset, "closest_acc"].min()
    # head(1): on ties the first matching row wins.
    results_df = testing_df.loc[testing_df.closest_acc == local_min].head(1).copy()
    result[dataset] = results_df
    # (title, normalize) pairs; only the non-normalised matrix is drawn.
    titles_options = [
        ("Macierz pomyłek: algorytm ROCKET,\n" + f"zbiór danych {dataset}", None)
    ]
    for title, normalize in titles_options:
        # NOTE(review): display_labels only sets the tick texts; this assumes the
        # labels found in y_true / y_predict line up with class_names -- confirm.
        disp = ConfusionMatrixDisplay.from_predictions(
            results_df.y_true.iloc[0],
            results_df.y_predict.iloc[0],
            display_labels=class_names,
            cmap=plt.cm.Blues,
            normalize=normalize,
            xticks_rotation="vertical",
        )
        disp.ax_.set_title(title)
        disp.ax_.set_ylabel("Faktyczna klasa")
        # NOTE(review): "Przewidzana" looks like a typo of "Przewidziana".
        disp.ax_.set_xlabel("Przewidzana klasa")
        normalized = "_normalized" if normalize else ""
        disp.figure_.savefig(f"{PLOTS}/{algorythm}/{dataset}{normalized}_confusion_matrix.pdf", dpi=600, bbox_inches="tight")

# HIVECOTEV2

In [None]:
# Load every HIVE-COTE v2 result file of every dataset variant into one
# DataFrame (hive_df), one row per CSV row.
algorythm = "HIVE"
# exist_ok=True replaces the separate existence check and is safe to re-run.
os.makedirs(f"{PLOTS}/{algorythm}", exist_ok=True)
dataframes = []
for dataset in datasets:
    path_to_files = f"{RESULTS_PATH}/{dataset}/{algorythm}"
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and any later "first match" tie-breaking) reproducible.
    file_names = sorted(os.listdir(path_to_files))
    for file_name in file_names:
        df = pd.read_csv(f"{path_to_files}/{file_name}", index_col=None, header=0)
        # y_predict / y_true are stored as text: a bracketed, whitespace-
        # separated list (presumably the printed form of a NumPy array --
        # confirm against the writer). Line breaks and quotes are removed, the
        # surrounding brackets are cut off and the rest is split into labels.
        # split() without an argument collapses runs of whitespace, so padded
        # or wrapped values never produce empty-string labels.
        for column in ("y_predict", "y_true"):
            df[column] = df[column].apply(
                lambda raw: np.array(
                    raw.replace("\n", "").replace("\r", "").replace("'", "")[1:-1].split()
                )
            )
        # Polish column names are the ones used by the plots below.
        df["F-miara"] = df["f1_measure"]
        df["trafność"] = df["accuracy_score"]
        df["czas_wykonywania"] = df["execution_time"]
        dataframes.append(df[["dataset", "seed", "trafność", "F-miara", "czas_wykonywania", "y_predict", "y_true"]])
hive_df = pd.concat(dataframes, axis=0, ignore_index=True)

## HIVECOTEV2 średnie miar  

In [None]:
# Mean accuracy, F-measure and execution time per dataset, rounded to 3 decimals.
hive_means_df = hive_df.groupby("dataset")[["trafność", "F-miara", "czas_wykonywania"]].mean().apply(lambda x: np.round(x, decimals=3))
hive_means_df.head()

# Boxplots of accuracy and F-measure per dataset.
# Tick labels are left to the scale: factor(dataset) orders its levels by
# sorted category, whereas an explicit list taken from dataset.unique() is in
# order of appearance and is applied positionally, so the two could disagree
# and put the wrong dataset name under a box.
hive_accuracy_plot = (
    ggplot(hive_df)
    + geom_boxplot(aes(x='factor(dataset)', y='trafność'))
    + scale_x_discrete(name='dataset')
    + labs(title='Wykres wartości trafności HIVECOTEV2.\n Klasyfikacja typu funkcji.')
    + theme_classic()
    + theme(panel_grid=element_line(color="lightgrey"),
        panel_grid_major=element_line(size=1.4, alpha=1),
        panel_grid_major_y=element_line(linetype='dashed'),
        panel_grid_minor=element_line(alpha=.25),
        panel_grid_minor_y=element_line(linetype='dashed'))
)

hive_fmeasure_plot = (
    ggplot(hive_df)
    + geom_boxplot(aes(x='factor(dataset)', y='F-miara'))
    + scale_x_discrete(name='dataset')
    + labs(title='Wykres wartości F-miary HIVECOTEV2.\n Klasyfikacja typu funkcji.')
    + theme_classic()
    + theme(panel_grid=element_line(color="lightgrey"),
        panel_grid_major=element_line(size=1.4, alpha=1),
        panel_grid_major_y=element_line(linetype='dashed'),
        panel_grid_minor=element_line(alpha=.25),
        panel_grid_minor_y=element_line(linetype='dashed'))
)
# Show each plot and store it as a PDF in the algorithm's plot directory.
print(hive_accuracy_plot)
hive_accuracy_plot.save(f"{PLOTS}/{algorythm}/hive_accuracy_boxplot.pdf", dpi=600, verbose=False)
print(hive_fmeasure_plot)
hive_fmeasure_plot.save(f"{PLOTS}/{algorythm}/hive_fmeasure_boxplot.pdf", dpi=600, verbose=False)
hive_means_df.head()


## HIVECOTEV2 macierz pomyłek

In [None]:
# For every dataset, draw and save the confusion matrix of the HIVE-COTE v2 run
# whose accuracy is closest to that dataset's (rounded) mean accuracy.
testing_df = hive_df.copy()
result = {}
# Created once up front instead of being re-checked for every figure.
os.makedirs(f"{PLOTS}/{algorythm}", exist_ok=True)
for dataset in datasets:
    in_dataset = testing_df.dataset == dataset
    # Label-based lookup. The previous `...trafność[0]` indexed a string-labelled
    # Series with an integer, which only works through pandas' deprecated
    # positional fallback.
    mean_accuracy = hive_means_df.loc[dataset, "trafność"]
    # Rows of the other datasets receive NaN through index alignment, so the
    # equality filter below can only match rows of the current dataset.
    testing_df["closest_acc"] = (testing_df.loc[in_dataset, "trafność"] - mean_accuracy).abs()
    local_min = testing_df.loc[in_dataset, "closest_acc"].min()
    # head(1): on ties the first matching row wins.
    results_df = testing_df.loc[testing_df.closest_acc == local_min].head(1).copy()
    result[dataset] = results_df
    # (title, normalize) pairs; only the non-normalised matrix is drawn.
    titles_options = [
        ("Macierz pomyłek: algorytm HIVECOTEV2,\n" + f"zbiór danych {dataset}", None)
    ]
    for title, normalize in titles_options:
        # NOTE(review): display_labels only sets the tick texts; this assumes the
        # labels found in y_true / y_predict line up with class_names -- confirm.
        disp = ConfusionMatrixDisplay.from_predictions(
            results_df.y_true.iloc[0],
            results_df.y_predict.iloc[0],
            display_labels=class_names,
            cmap=plt.cm.Blues,
            normalize=normalize,
            xticks_rotation="vertical",
        )
        disp.ax_.set_title(title)
        disp.ax_.set_ylabel("Faktyczna klasa")
        # NOTE(review): "Przewidzana" looks like a typo of "Przewidziana".
        disp.ax_.set_xlabel("Przewidzana klasa")
        normalized = "_normalized" if normalize else ""
        disp.figure_.savefig(f"{PLOTS}/{algorythm}/{dataset}{normalized}_confusion_matrix.pdf", dpi=600, bbox_inches="tight")


# KNN-DTW

In [None]:
# Load every KNN-DTW result file of every dataset variant into one DataFrame
# (knn_df), one row per CSV row.
algorythm = "KNN"
# Creates both the algorithm directory and its "size" sub-directory;
# exist_ok=True replaces the separate existence check and is safe to re-run.
os.makedirs(f"{PLOTS}/{algorythm}/size", exist_ok=True)
dataframes = []
for dataset in datasets:
    path_to_files = f"{RESULTS_PATH}/{dataset}/{algorythm}"
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and any later "first match" tie-breaking) reproducible.
    file_names = sorted(os.listdir(path_to_files))
    for file_name in file_names:
        df = pd.read_csv(f"{path_to_files}/{file_name}", index_col=None, header=0)
        # y_predict / y_true are stored as text: a bracketed, whitespace-
        # separated list (presumably the printed form of a NumPy array --
        # confirm against the writer). Line breaks and quotes are removed, the
        # surrounding brackets are cut off and the rest is split into labels.
        # split() without an argument collapses runs of whitespace, so padded
        # or wrapped values never produce empty-string labels.
        for column in ("y_predict", "y_true"):
            df[column] = df[column].apply(
                lambda raw: np.array(
                    raw.replace("\n", "").replace("\r", "").replace("'", "")[1:-1].split()
                )
            )
        # Polish column names are the ones used by the plots below.
        df["F-miara"] = df["f1_measure"]
        df["trafność"] = df["accuracy_score"]
        df["czas_wykonywania"] = df["execution_time"]
        # The "seed" column is exposed as "k" for this algorithm
        # (presumably it stores the number of neighbours -- confirm).
        df["k"] = df["seed"]
        dataframes.append(df[["dataset", "k", "trafność", "F-miara", "czas_wykonywania", "y_predict", "y_true"]])
knn_df = pd.concat(dataframes, axis=0, ignore_index=True)
knn_df

## KNN-DTW średnie miar  

In [None]:
# Mean accuracy, F-measure and execution time per dataset, rounded to 3 decimals.
knn_means_df = knn_df.groupby("dataset")[["trafność", "F-miara", "czas_wykonywania"]].mean().apply(lambda x: np.round(x, decimals=3))
knn_means_df.head()

# Boxplots of accuracy and F-measure per dataset, with the individual runs
# overlaid as points coloured by k.
# Tick labels are left to the scale: factor(dataset) orders its levels by
# sorted category, whereas an explicit list taken from dataset.unique() is in
# order of appearance and is applied positionally, so the two could disagree
# and put the wrong dataset name under a box.
# NOTE(review): the x axis shows datasets but is titled 'typ funkcji'
# ("function type"); the other algorithms use 'dataset' -- confirm intent.
knn_accuracy_plot = (
    ggplot(knn_df)
    + geom_boxplot(aes(x='factor(dataset)', y='trafność'))
    + geom_point(aes(x='factor(dataset)', y='trafność', color="factor(k)"))
    + guides(color=guide_legend(title='k'))
    + scale_x_discrete(name='typ funkcji')
    + labs(title='Wykres wartości trafności KNN-DTW.\n Klasyfikacja typu funkcji.')
    + theme_classic()
    + theme(panel_grid=element_line(color="lightgrey"),
        panel_grid_major=element_line(size=1.4, alpha=1),
        panel_grid_major_y=element_line(linetype='dashed'),
        panel_grid_minor=element_line(alpha=.25),
        panel_grid_minor_y=element_line(linetype='dashed'))
)

knn_fmeasure_plot = (
    ggplot(knn_df)
    + geom_boxplot(aes(x='factor(dataset)', y='F-miara'))
    + geom_point(aes(x='factor(dataset)', y='F-miara', color="factor(k)"))
    + guides(color=guide_legend(title='k'))
    + scale_x_discrete(name='typ funkcji')
    + labs(title='Wykres wartości F-miary KNN-DTW.\n Klasyfikacja typu funkcji.')
    + theme_classic()
    + theme(panel_grid=element_line(color="lightgrey"),
        panel_grid_major=element_line(size=1.4, alpha=1),
        panel_grid_major_y=element_line(linetype='dashed'),
        panel_grid_minor=element_line(alpha=.25),
        panel_grid_minor_y=element_line(linetype='dashed'))
)
# Show each plot and store it as a PDF in the algorithm's plot directory.
print(knn_accuracy_plot)
knn_accuracy_plot.save(f"{PLOTS}/{algorythm}/knn_accuracy_boxplot.pdf", dpi=600, verbose=False)
print(knn_fmeasure_plot)
knn_fmeasure_plot.save(f"{PLOTS}/{algorythm}/knn_fmeasure_boxplot.pdf", dpi=600, verbose=False)
knn_means_df.head()

## KNN-DTW macierz pomyłek

In [None]:
# For every dataset, draw and save the confusion matrix of the KNN-DTW run
# whose accuracy is closest to that dataset's (rounded) mean accuracy.
testing_df = knn_df.copy()
result = {}
# Created once up front instead of being re-checked for every figure.
os.makedirs(f"{PLOTS}/{algorythm}", exist_ok=True)
for dataset in datasets:
    in_dataset = testing_df.dataset == dataset
    # Label-based lookup. The previous `...trafność[0]` indexed a string-labelled
    # Series with an integer, which only works through pandas' deprecated
    # positional fallback.
    mean_accuracy = knn_means_df.loc[dataset, "trafność"]
    # Rows of the other datasets receive NaN through index alignment, so the
    # equality filter below can only match rows of the current dataset.
    testing_df["closest_acc"] = (testing_df.loc[in_dataset, "trafność"] - mean_accuracy).abs()
    local_min = testing_df.loc[in_dataset, "closest_acc"].min()
    # head(1): on ties the first matching row wins.
    results_df = testing_df.loc[testing_df.closest_acc == local_min].head(1).copy()
    result[dataset] = results_df
    # (title, normalize) pairs; only the non-normalised matrix is drawn.
    titles_options = [
        ("Macierz pomyłek: algorytm KNN-DTW,\n" + f"zbiór danych {dataset}", None)
    ]
    for title, normalize in titles_options:
        # NOTE(review): display_labels only sets the tick texts; this assumes the
        # labels found in y_true / y_predict line up with class_names -- confirm.
        disp = ConfusionMatrixDisplay.from_predictions(
            results_df.y_true.iloc[0],
            results_df.y_predict.iloc[0],
            display_labels=class_names,
            cmap=plt.cm.Blues,
            normalize=normalize,
            xticks_rotation="vertical",
        )
        disp.ax_.set_title(title)
        disp.ax_.set_ylabel("Faktyczna klasa")
        # NOTE(review): "Przewidzana" looks like a typo of "Przewidziana".
        disp.ax_.set_xlabel("Przewidzana klasa")
        normalized = "_normalized" if normalize else ""
        disp.figure_.savefig(f"{PLOTS}/{algorythm}/{dataset}{normalized}_confusion_matrix.pdf", dpi=600, bbox_inches="tight")

## Analiza porównawcza

In [None]:
import plotnine as plotnine


def _comparison_theme():
    """Return the legend placement and dashed-grid overrides shared by the comparison charts."""
    return theme(
        legend_position=(.5, -0.06),
        legend_direction='horizontal',
        legend_title_align="center",
        panel_grid=element_line(color="lightgrey"),
        panel_grid_major=element_line(size=1.4, alpha=1),
        panel_grid_major_y=element_line(linetype='dashed'),
        panel_grid_minor=element_line(alpha=.25),
        panel_grid_minor_y=element_line(linetype='dashed'),
    )


# Tag every per-dataset means table with the algorithm it belongs to.
knn_means_df["algorytm"] = "KNN"
rocket_means_df["algorytm"] = "ROCKET"
hive_means_df["algorytm"] = "HIVECOTEV2"

# NOTE(review): dodge_text is not used by any statement in this cell.
dodge_text = position_dodge(width=0.4, preserve="single")

# One row per (dataset, algorithm); "dataset" returns from the index as a column.
summary_df = pd.concat(
    [knn_means_df, rocket_means_df, hive_means_df]
).reset_index()

# Grouped bars of the mean accuracy: one group per algorithm, one bar per dataset.
# NOTE(review): label_position is handed to guides(), not to guide_legend() --
# confirm that this is intended for the installed plotnine version.
acc_comp_graph = (
    ggplot(summary_df, aes(x='algorytm', y='trafność', fill='factor(dataset)'))
    + guides(fill=guide_legend(title='dataset'), label_position="top")
    + geom_col(stat='identity', position=position_dodge(0.94, "total"), size=1)
    + labs(title='Porównanie trafności między różnymi algorytmami')
    + coord_cartesian(ylim=(0.9, 1))
    + theme_classic()
    + _comparison_theme()
)

# The same chart for the mean F-measure.
f1_comp_graph = (
    ggplot(summary_df, aes(x='algorytm', y='F-miara', fill='factor(dataset)'))
    + guides(fill=guide_legend(title='dataset'), label_position="top")
    + geom_col(stat='identity', position=position_dodge(0.94, "total"), size=1)
    + labs(title='Porównanie wartości F-miary między różnymi algorytmami')
    + coord_cartesian(ylim=(0.9, 1))
    + theme_classic()
    + _comparison_theme()
)

# Mean execution time per algorithm, divided by 60 for the "minuty" axis.
time_df = summary_df.groupby("algorytm")["czas_wykonywania"].mean().reset_index()
time_df["czas_wykonywania"] = time_df["czas_wykonywania"] / 60
time_comp_graph = (
    ggplot(time_df, aes(x='algorytm', y='czas_wykonywania'))
    + geom_col(stat='identity', position=position_dodge(0.94, "total"), size=1)
    + labs(title='Porównanie wartości czasu przetwarzania między różnymi algorytmami')
    + ylab("czas przetwarzania(minuty)")
    + theme_classic()
    + _comparison_theme()
)

# Show every chart and store it as a PDF directly under PLOTS.
for comparison_graph, target_path in (
    (acc_comp_graph, f"{PLOTS}/acc_comp_graph.pdf"),
    (f1_comp_graph, f"{PLOTS}/f1_comp_graph.pdf"),
    (time_comp_graph, f"{PLOTS}/time_comp_graph.pdf"),
):
    print(comparison_graph)
    comparison_graph.save(target_path, dpi=600, verbose=False)

In [None]:
# Mean accuracy as a table: one row per dataset, one column per algorithm.
summary_df.pivot_table(
    values='trafność',
    index='dataset',
    columns='algorytm',
)

In [None]:
# Mean F-measure as a table: one row per dataset, one column per algorithm.
summary_df.pivot_table(
    values='F-miara',
    index='dataset',
    columns='algorytm',
)

In [None]:
# Mean execution time per algorithm, divided by 60.
(summary_df.groupby("algorytm")["czas_wykonywania"].mean() / 60).reset_index()

In [None]:
# Mean accuracy for every (algorithm, dataset) pair.
(
    summary_df
    .groupby(["algorytm", "dataset"])["trafność"]
    .mean()
)