## Preprocessing


Vengono letti tutti i file csv contenenti le statistiche delle run e viene creato un unico dataframe con i csv concatenati, aggiungendo una colonna 'lambda' con i valori del tasso di arrivo. Dalla colonna 'statistic' vengono estratte tre colonne: node, contenente il nodo di riferimento della statistica oppure system se la statistica è globale; metric, indicante la metrica misurata; e infine stat, che specifica se il valore misurato è media, minimo, massimo o deviazione standard.

Infine viene creata una lista di dataframe uno per ogni metrica e nodo prendendo come statistica solo la media.

In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import scipy

# Directory holding the per-run statistics CSV files. It is built with
# os.path.join so the separators are right on every platform and no backslash
# inside a string literal can be misread as an escape sequence.
path = os.path.join('..', '..', 'src', 'caballo', 'domestico', 'wwsimulator', 'statistics')

# Load only CSV files (stray entries in the directory would otherwise break the
# file-name parsing below); sorting makes the concatenation order deterministic.
filenames = sorted(name for name in os.listdir(path) if name.lower().endswith('.csv'))
stats_df_list = []
for filename in filenames:
    run_df = pd.read_csv(os.path.join(path, filename))
    # The arrival rate is the text between '=' and the next '_' in the file
    # name. It is stored as a float so that sorting and grouping by 'lambda'
    # are numeric rather than lexicographic.
    run_df['lambda'] = float(filename.split('=')[1].split('_')[0])
    stats_df_list.append(run_df)
# ignore_index avoids repeated row labels coming from the individual files
stats_df = pd.concat(stats_df_list, ignore_index=True)

# 'statistic' is '-'-separated: its first three fields become the 'node',
# 'metric' and 'stat' columns. Split once and reuse the pieces.
statistic_parts = stats_df['statistic'].str.split('-', expand=True)
stats_df['node'] = statistic_parts[0]
stats_df['metric'] = statistic_parts[1]
stats_df['stat'] = statistic_parts[2]
stats_df = stats_df.drop(columns=['statistic'])

# One dataframe per metric, keeping only the rows whose statistic is the average
metrics = stats_df['metric'].unique()
avg_stats_df = stats_df[stats_df['stat'] == 'avg']
metrics_avg_df_list = [
    avg_stats_df[avg_stats_df['metric'] == metric_name] for metric_name in metrics
]

# For every metric, one dataframe per node: the result is a list (metrics) of
# lists (nodes) of dataframes.
metrics_nodes_avg_df_list = []
for metric_avg_df in metrics_avg_df_list:
    metrics_nodes_avg_df_list.append([
        metric_avg_df[metric_avg_df['node'] == node_name]
        for node_name in metric_avg_df['node'].unique()
    ])

Boxplots of the average population, throughput and response time for arrival rates in the range 0.5–1.2 jobs/sec

In [None]:
# Draw one boxplot per (metric, node) pair, with one box for each arrival rate.
for node_dfs in metrics_nodes_avg_df_list:
    for node_df in node_dfs:
        # astype returns a new Series instead of converting in place, so the
        # converted columns have to be stored. assign builds a new dataframe,
        # which also avoids writing into a slice of the original data.
        # ('lambda' is a Python keyword, hence the dict unpacking.)
        boxplot_df = node_df.assign(**{
            'value': node_df['value'].astype(float),
            'lambda': node_df['lambda'].astype(float),
        }).sort_values(by='lambda')

        # Each dataframe holds a single metric and a single node
        curr_metric = boxplot_df['metric'].unique()[0]
        curr_node = boxplot_df['node'].unique()[0]

        boxplot_df.boxplot(column='value', by='lambda')
        # Remove the automatic "grouped by" super-title added by pandas
        plt.suptitle('')
        plt.title(f'Boxplot of {curr_metric} of the {curr_node} node')
        plt.show()

Confidence interval

In [74]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.stats` is loaded on older SciPy versions.
from scipy.stats import norm

# Two-sided 95% confidence level -> standard normal quantile at 1 - alpha/2.
# NOTE(review): this is the normal approximation; with only a few runs per
# arrival rate a Student's t quantile would give wider intervals - confirm
# which one is intended.
alpha = .05
critical_value = norm.ppf(1 - alpha / 2)

# The per-node CSV files are written here; create the directory if missing so
# that to_csv does not fail.
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

# One summary dataframe (count, mean, std and CI bounds per arrival rate) for
# every (metric, node) pair.
cv_df = []
for metric in metrics_nodes_avg_df_list:
    for node_df in metric:
        # Each dataframe holds a single metric and a single node
        curr_metric = node_df['metric'].unique()[0]
        curr_node = node_df['node'].unique()[0]

        df = node_df.groupby(["lambda"])['value'].describe()[["count", "mean", "std"]].reset_index()
        # Half-width of the interval: critical value times the standard error
        half_width = critical_value*(df["std"]/np.sqrt(df["count"]))
        df["lower_ci"] = df["mean"] - half_width
        df["upper_ci"] = df["mean"] + half_width

        # The CSV is written before the 'metric' and 'node' columns are added,
        # so the file holds only the statistics; os.path.join keeps the path
        # portable and free of backslash escapes.
        df.to_csv(os.path.join(output_dir, f"conf_int_{curr_node}_{curr_metric}.csv"), index=False)
        df['metric'] = curr_metric
        df['node'] = curr_node
        cv_df.append(df)

In [None]:
# One figure per (metric, node) summary: mean value against arrival rate, with
# the 95% confidence interval drawn as error bars.
for ci_df in cv_df:
    metric_name = ci_df['metric'].unique()[0]
    node_name = ci_df['node'].unique()[0]

    rates = ci_df['lambda']
    means = ci_df['mean']
    # Asymmetric error-bar lengths: distance below and above the mean
    below = means - ci_df['lower_ci']
    above = ci_df['upper_ci'] - means

    fig, ax = plt.subplots(figsize=(10, 6))

    # Means with their error bars
    ax.errorbar(rates, means, yerr=[below, above],
                fmt='o', capsize=5, label='Confidence interval 95%')

    # Line connecting the means
    ax.plot(rates, means, linestyle='-', marker='o', color='b', label='Avg')

    ax.set_xlabel('Lambda')
    ax.set_ylabel(f"Avg {metric_name} in {node_name}")
    ax.set_title(f"Confidence interval for {node_name} average {metric_name}")
    ax.legend()

    plt.show()
