# Zanalizowanie i wizualizacja wyników eksperymentów badających podobieństwo między wolumenami danych w ramach tych samych typów UDF

Każdy z algorytmów został poddany takiej samej analizie. Wpierw liczona jest średnia dla każdego z typów algorytmów. Następnie tworzony jest boxplot sprawdzający wpływ wartości losowej na wyniki. Na końcu liczona jest macierz pomyłek dla wyniku, który jest najbardziej zbliżony do wyniku średniego.

Dodatkowo, na końcu wykonana jest analiza porównawcza każdego z badanych algorytmów.
### Deklaracja zmiennych użytkowych

In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from plotnine import *

# Root directory of the classification results and root directory for the
# generated plots.
RESULTS_PATH = "./../MachineLearning/results/size"
PLOTS = "./../Plots/MachineLearning"

# Maps the raw dataset name (also used as the results sub-directory name)
# to the name used in the analysis and on the plots.
name_dict = {
    "aggregation": "aggregation",
    "filtration": "filtering",
    "filtration-aggregation": "filtering-aggregation",
    "filtration-aggregation-join": "filtering-aggregation-join",
    "filtration-join": "filtering-join",
}

# Raw names and their renamed counterparts, kept in matching order.
datasets_path = list(name_dict)
datasets = list(name_dict.values())

# Display labels handed to the confusion-matrix plots.
class_names = ["1GB", "2GB"]

# ROCKET

In [None]:
algorythm = "Rocket"
# Create the plot output directory; exist_ok avoids the check-then-create race.
os.makedirs(f"{PLOTS}/{algorythm}/size", exist_ok=True)


def _parse_label_array(text):
    """Convert the printed form of a label array (e.g. "['a' 'b']") to an ndarray of strings."""
    cleaned = text.replace("\n", " ").replace("'", "").strip()
    # [1:-1] drops the surrounding brackets. split() without an argument
    # collapses repeated spaces and line wraps, so no empty labels appear.
    return np.array(cleaned[1:-1].split())


# Collect every result file of this algorithm, one frame per file.
dataframes = []
for dataset in datasets_path:
    path_to_files = f"{RESULTS_PATH}/{dataset}/{algorythm}"
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and any later first-match tie-breaking) reproducible.
    for file_name in sorted(os.listdir(path_to_files)):
        df = pd.read_csv(f"{path_to_files}/{file_name}", index_col=None, header=0)
        df["y_predict"] = df["y_predict"].apply(_parse_label_array)
        df["y_true"] = df["y_true"].apply(_parse_label_array)
        # Raises KeyError on an unknown dataset name instead of hiding it.
        df["dataset"] = df["dataset"].apply(lambda x: name_dict[x])
        dataframes.append(df[["dataset", "seed", "accuracy_score", "f1_measure", "execution_time", "y_predict", "y_true"]])
rocket_df = pd.concat(dataframes, axis=0, ignore_index=True)

## Rocket średnie miar  

In [None]:
# Mean of each measure per dataset, rounded to three decimals.
rocket_means_df = (
    rocket_df.groupby("dataset")[["accuracy_score", "f1_measure", "execution_time"]]
    .mean()
    .round(3)
)

# Tick labels are left to the scale default (the dataset names themselves).
# Passing dataset.unique() supplies them in order of appearance, while the
# factor levels are sorted, so the labels could end up on the wrong box.
rocket_accuracy_plot = (
    ggplot(rocket_df)
    + geom_boxplot(aes(x='factor(dataset)', y='accuracy_score'))
    + scale_x_discrete(name='typ funkcji')
    + theme(axis_text_x=element_text(rotation=90, hjust=0.4))
    + labs(title='Wykreś wartości trafności ROCKET.\n Klasyfikacja rozmiaru wolumenu przetwarzanych danych.')
)

rocket_fmeasure_plot = (
    ggplot(rocket_df)
    + geom_boxplot(aes(x='factor(dataset)', y='f1_measure'))
    + scale_x_discrete(name='typ funkcji')
    + theme(axis_text_x=element_text(rotation=90, hjust=0.4))
    + labs(title='Wykreś wartości F-miary ROCKET.\n Klasyfikacja rozmiaru wolumenu przetwarzanych danych.')
)

# Show each plot and store it as a PDF next to the other Rocket plots.
print(rocket_accuracy_plot)
rocket_accuracy_plot.save(f"{PLOTS}/{algorythm}/size/rocket_accuracy_boxplot.pdf", dpi=600, verbose=False)
print(rocket_fmeasure_plot)
rocket_fmeasure_plot.save(f"{PLOTS}/{algorythm}/size/rocket_fmeasure_boxplot.pdf", dpi=600, verbose=False)

# Last expression of the cell: renders the table of means.
rocket_means_df.head()


## Rocket macierz pomyłek

In [None]:
# For every dataset, plot the confusion matrix of the single run whose
# accuracy is closest to that dataset's mean accuracy.
# NOTE: relies on `algorythm` still holding the value set in the Rocket
# loading cell; it is not reassigned here.
testing_df = rocket_df.copy()
result = {}
os.makedirs(f"{PLOTS}/{algorythm}/size", exist_ok=True)

normalize = None  # forwarded to from_predictions; None keeps raw counts
normalized = "_normalized" if normalize else ""

for dataset in datasets:
    # Label-based lookup; positional `series[0]` on a label-indexed Series
    # is deprecated in pandas.
    mean_accuracy = rocket_means_df.loc[dataset, "accuracy_score"]
    in_dataset = testing_df.dataset == dataset
    # Rows of other datasets receive NaN and are ignored below.
    testing_df["closest_acc"] = (testing_df.loc[in_dataset, "accuracy_score"] - mean_accuracy).abs()
    # idxmin returns the first row with the smallest distance.
    closest_idx = testing_df.loc[in_dataset, "closest_acc"].idxmin()
    results_df = testing_df.loc[[closest_idx]].copy()
    result[dataset] = results_df

    disp = ConfusionMatrixDisplay.from_predictions(
        results_df.y_true.iloc[0],
        results_df.y_predict.iloc[0],
        display_labels=class_names,
        cmap=plt.cm.Blues,
        normalize=normalize,
        xticks_rotation="vertical",
    )
    disp.ax_.set_title(f"Macierz pomyłek:,\nalgorytm Rocket,\nzbiór danych {dataset}")
    disp.ax_.set_ylabel("Faktyczna klasa")
    disp.ax_.set_xlabel("Przewidzana klasa")
    disp.figure_.savefig(f"{PLOTS}/{algorythm}/size/{dataset}{normalized}_confusion_matrix.pdf", dpi=600, bbox_inches="tight")


# HIVECOTEV2

In [None]:
algorythm = "HIVE"
# Create the plot output directory; exist_ok avoids the check-then-create race.
os.makedirs(f"{PLOTS}/{algorythm}/size", exist_ok=True)


def _parse_label_array(text):
    """Convert the printed form of a label array (e.g. "['a' 'b']") to an ndarray of strings."""
    cleaned = text.replace("\n", " ").replace("'", "").strip()
    # [1:-1] drops the surrounding brackets. split() without an argument
    # collapses repeated spaces and line wraps, so no empty labels appear.
    return np.array(cleaned[1:-1].split())


# Collect every result file of this algorithm, one frame per file.
dataframes = []
for dataset in datasets_path:
    path_to_files = f"{RESULTS_PATH}/{dataset}/{algorythm}"
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and any later first-match tie-breaking) reproducible.
    for file_name in sorted(os.listdir(path_to_files)):
        df = pd.read_csv(f"{path_to_files}/{file_name}", index_col=None, header=0)
        df["y_predict"] = df["y_predict"].apply(_parse_label_array)
        df["y_true"] = df["y_true"].apply(_parse_label_array)
        # Raises KeyError on an unknown dataset name instead of hiding it.
        df["dataset"] = df["dataset"].apply(lambda x: name_dict[x])
        dataframes.append(df[["dataset", "seed", "accuracy_score", "f1_measure", "execution_time", "y_predict", "y_true"]])
hive_df = pd.concat(dataframes, axis=0, ignore_index=True)

## HIVECOTEV2 średnie miar  

In [None]:
# Mean of each measure per dataset, rounded to three decimals.
hive_means_df = (
    hive_df.groupby("dataset")[["accuracy_score", "f1_measure", "execution_time"]]
    .mean()
    .round(3)
)

# Tick labels are left to the scale default (the dataset names themselves).
# Passing dataset.unique() supplies them in order of appearance, while the
# factor levels are sorted, so the labels could end up on the wrong box.
hive_accuracy_plot = (
    ggplot(hive_df)
    + geom_boxplot(aes(x='factor(dataset)', y='accuracy_score'))
    + scale_x_discrete(name='typ funkcji')
    + theme(axis_text_x=element_text(rotation=90, hjust=0.4))
    + labs(title='Wykreś wartości trafności HIVECOTEV2.\n Klasyfikacja rozmiaru wolumenu przetwarzanych danych.')
)

hive_fmeasure_plot = (
    ggplot(hive_df)
    + geom_boxplot(aes(x='factor(dataset)', y='f1_measure'))
    + scale_x_discrete(name='typ funkcji')
    + theme(axis_text_x=element_text(rotation=90, hjust=0.4))
    + labs(title='Wykreś wartości F-miary HIVECOTEV2.\n Klasyfikacja rozmiaru wolumenu przetwarzanych danych.')
)

# Show each plot and store it as a PDF next to the other HIVE plots.
print(hive_accuracy_plot)
hive_accuracy_plot.save(f"{PLOTS}/{algorythm}/size/hive_accuracy_boxplot.pdf", dpi=600, verbose=False)
print(hive_fmeasure_plot)
hive_fmeasure_plot.save(f"{PLOTS}/{algorythm}/size/hive_fmeasure_boxplot.pdf", dpi=600, verbose=False)

# Last expression of the cell: renders the table of means.
hive_means_df.head()


## HIVECOTEV2 macierz pomyłek

In [None]:
# For every dataset, plot the confusion matrix of the single run whose
# accuracy is closest to that dataset's mean accuracy.
# NOTE: relies on `algorythm` still holding the value set in the HIVE
# loading cell; it is not reassigned here.
testing_df = hive_df.copy()
result = {}
# Ensure the directory the figures are saved into exists. The previous
# in-loop check only created the parent directory, without "/size".
os.makedirs(f"{PLOTS}/{algorythm}/size", exist_ok=True)

normalize = None  # forwarded to from_predictions; None keeps raw counts
normalized = "_normalized" if normalize else ""

for dataset in datasets:
    # Label-based lookup; positional `series[0]` on a label-indexed Series
    # is deprecated in pandas.
    mean_accuracy = hive_means_df.loc[dataset, "accuracy_score"]
    in_dataset = testing_df.dataset == dataset
    # Rows of other datasets receive NaN and are ignored below.
    testing_df["closest_acc"] = (testing_df.loc[in_dataset, "accuracy_score"] - mean_accuracy).abs()
    # idxmin returns the first row with the smallest distance.
    closest_idx = testing_df.loc[in_dataset, "closest_acc"].idxmin()
    results_df = testing_df.loc[[closest_idx]].copy()
    result[dataset] = results_df

    disp = ConfusionMatrixDisplay.from_predictions(
        results_df.y_true.iloc[0],
        results_df.y_predict.iloc[0],
        display_labels=class_names,
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(f"Macierz pomyłek:,\nalgorytm HIVECOTEV2,\nfunkcja {dataset}")
    disp.ax_.set_ylabel("Faktyczna klasa")
    disp.ax_.set_xlabel("Przewidzana klasa")
    disp.figure_.savefig(f"{PLOTS}/{algorythm}/size/{dataset}{normalized}_confusion_matrix.pdf", dpi=600, bbox_inches="tight")


# KNN-DTW

In [None]:
algorythm = "KNN"
# Create the plot output directory; exist_ok avoids the check-then-create race.
os.makedirs(f"{PLOTS}/{algorythm}/size", exist_ok=True)


def _parse_label_array(text):
    """Convert the printed form of a label array (e.g. "['a' 'b']") to an ndarray of strings."""
    cleaned = text.replace("\n", " ").replace("'", "").strip()
    # [1:-1] drops the surrounding brackets. split() without an argument
    # collapses repeated spaces and line wraps, so no empty labels appear.
    return np.array(cleaned[1:-1].split())


# Collect every result file of this algorithm, one frame per file.
dataframes = []
for dataset in datasets_path:
    path_to_files = f"{RESULTS_PATH}/{dataset}/{algorythm}"
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and any later first-match tie-breaking) reproducible.
    for file_name in sorted(os.listdir(path_to_files)):
        df = pd.read_csv(f"{path_to_files}/{file_name}", index_col=None, header=0)
        df["y_predict"] = df["y_predict"].apply(_parse_label_array)
        df["y_true"] = df["y_true"].apply(_parse_label_array)
        # The "seed" column is exposed under the name "k" for this algorithm.
        df["k"] = df["seed"]
        # Raises KeyError on an unknown dataset name instead of hiding it.
        df["dataset"] = df["dataset"].apply(lambda x: name_dict[x])
        dataframes.append(df[["dataset", "k", "accuracy_score", "f1_measure", "execution_time", "y_predict", "y_true"]])
knn_df = pd.concat(dataframes, axis=0, ignore_index=True)


## KNN-DTW średnie miar  

In [None]:
# Mean of each measure per dataset, rounded to three decimals.
knn_means_df = (
    knn_df.groupby("dataset")[["accuracy_score", "f1_measure", "execution_time"]]
    .mean()
    .round(3)
)

# Tick labels are left to the scale default (the dataset names themselves).
# Passing dataset.unique() supplies them in order of appearance, while the
# factor levels are sorted, so the labels could end up on the wrong box.
knn_accuracy_plot = (
    ggplot(knn_df)
    + geom_boxplot(aes(x='factor(dataset)', y='accuracy_score'))
    # The points must show the same measure as the boxes. They previously
    # plotted f1_measure on top of the accuracy boxplot.
    + geom_point(aes(x='factor(dataset)', y='accuracy_score', color="factor(k)"))
    + guides(color=guide_legend(title='k'))
    + scale_x_discrete(name='typ funkcji')
    + theme(axis_text_x=element_text(rotation=90, hjust=0.4))
    + labs(title='Wykreś wartości trafności KNN-DTW.\n Klasyfikacja rozmiaru wolumenu przetwarzanych danych.')
)

knn_fmeasure_plot = (
    ggplot(knn_df)
    + geom_boxplot(aes(x='factor(dataset)', y='f1_measure'))
    + geom_point(aes(x='factor(dataset)', y='f1_measure', color="factor(k)"))
    + guides(color=guide_legend(title='k'))
    + scale_x_discrete(name='typ funkcji')
    + theme(axis_text_x=element_text(rotation=90, hjust=0.4))
    + labs(title='Wykreś wartości F-miary KNN-DTW.\n Klasyfikacja rozmiaru wolumenu przetwarzanych danych.')
)

# Show each plot and store it as a PDF next to the other KNN plots.
print(knn_accuracy_plot)
knn_accuracy_plot.save(f"{PLOTS}/{algorythm}/size/knn_accuracy_boxplot.pdf", dpi=600, verbose=False)
print(knn_fmeasure_plot)
knn_fmeasure_plot.save(f"{PLOTS}/{algorythm}/size/knn_fmeasure_boxplot.pdf", dpi=600, verbose=False)

# Last expression of the cell: renders the table of means.
knn_means_df.head()

## KNN-DTW macierz pomyłek

In [None]:
# For every dataset, plot the confusion matrix of the single run whose
# accuracy is closest to that dataset's mean accuracy.
# NOTE: relies on `algorythm` still holding the value set in the KNN
# loading cell; it is not reassigned here.
testing_df = knn_df.copy()
result = {}
# Ensure the directory the figures are saved into exists. The previous
# in-loop check only created the parent directory, without "/size".
os.makedirs(f"{PLOTS}/{algorythm}/size", exist_ok=True)

normalize = None  # forwarded to from_predictions; None keeps raw counts
normalized = "_normalized" if normalize else ""

for dataset in datasets:
    # Label-based lookup; positional `series[0]` on a label-indexed Series
    # is deprecated in pandas.
    mean_accuracy = knn_means_df.loc[dataset, "accuracy_score"]
    in_dataset = testing_df.dataset == dataset
    # Rows of other datasets receive NaN and are ignored below.
    testing_df["closest_acc"] = (testing_df.loc[in_dataset, "accuracy_score"] - mean_accuracy).abs()
    # idxmin returns the first row with the smallest distance.
    closest_idx = testing_df.loc[in_dataset, "closest_acc"].idxmin()
    results_df = testing_df.loc[[closest_idx]].copy()
    result[dataset] = results_df

    disp = ConfusionMatrixDisplay.from_predictions(
        results_df.y_true.iloc[0],
        results_df.y_predict.iloc[0],
        display_labels=class_names,
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(f"Macierz pomyłek:,\nalgorytm KNN-DTW,\nfunkcja {dataset}")
    disp.ax_.set_ylabel("Faktyczna klasa")
    disp.ax_.set_xlabel("Przewidzana klasa")
    disp.figure_.savefig(f"{PLOTS}/{algorythm}/size/{dataset}{normalized}_confusion_matrix.pdf", dpi=600, bbox_inches="tight")

## Analiza porównawcza

In [None]:
# Tag each per-dataset table of means with its algorithm so the three
# tables can be stacked into one summary frame.
knn_means_df["algorytm"] = "KNN"
rocket_means_df["algorytm"] = "ROCKET"
hive_means_df["algorytm"] = "HIVECOTEV2"

# The comparison plots are written directly into PLOTS.
os.makedirs(PLOTS, exist_ok=True)

# Spread the points of one algorithm horizontally so datasets do not overlap.
dodge_text = position_dodge(width=0.4, preserve="single")
summary_df = pd.concat([knn_means_df, rocket_means_df, hive_means_df]).reset_index()

# Only the colour aesthetic is mapped, so only its legend is configured.
# The former fill='factor(dataset)' entry was an aesthetic expression, not a
# guide type, and no fill aesthetic is mapped in these plots.
acc_comp_graph = (
    ggplot(summary_df, aes(x='algorytm', y='accuracy_score', color='factor(dataset)'))
    + guides(color=guide_legend(title='dataset'))
    + geom_point(size=3, position=dodge_text)
    + lims(y=(0.25, 1))
    + labs(title='Porównanie trafności między róznymi algorytmami')
)

f1_comp_graph = (
    ggplot(summary_df, aes(x='algorytm', y='f1_measure', color='factor(dataset)'))
    + guides(color=guide_legend(title='dataset'))
    + geom_point(size=3, position=dodge_text)
    + lims(y=(0.25, 1))
    + labs(title='Porównanie wartości F-miary między róznymi algorytmami')
)

# Average execution time per algorithm over the per-dataset means.
time_df = summary_df.groupby("algorytm")["execution_time"].mean().reset_index()
# Divided by 60 to match the axis label in minutes (presumably the raw
# values are seconds; confirm against the experiment code).
time_df["execution_time"] = time_df["execution_time"] / 60
time_comp_graph = (
    ggplot(time_df, aes(x='algorytm', y='execution_time'))
    + geom_point()
    + labs(title='Porównanie wartości czasu przetwarzania między róznymi algorytmami')
    + ylab("czas przetwarzania(minuty)")
)

# Show each comparison plot and store it as a PDF.
print(acc_comp_graph)
acc_comp_graph.save(f"{PLOTS}/size_acc_comp_graph.pdf", dpi=600, verbose=False)
print(f1_comp_graph)
f1_comp_graph.save(f"{PLOTS}/size_f1_comp_graph.pdf", dpi=600, verbose=False)
print(time_comp_graph)
time_comp_graph.save(f"{PLOTS}/size_time_comp_graph.pdf", dpi=600, verbose=False)

In [None]:
# Mean accuracy as a dataset x algorithm table.
summary_df.pivot_table(values='accuracy_score', index='dataset', columns='algorytm')

In [None]:
# Mean F-measure as a dataset x algorithm table.
summary_df.pivot_table(values='f1_measure', index='dataset', columns='algorytm')

In [None]:
# Average execution time per algorithm, divided by 60.
summary_df.groupby("algorytm")["execution_time"].mean().div(60).reset_index()