In [36]:
import os.path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy

In [37]:
save_folder = os.path.join(os.path.abspath(os.path.curdir), 'results')

In [70]:
def _do_test(test_name: str, first_series, second_series):
    result = scipy.stats.ttest_rel(first_series, second_series)
    print(f'{test_name}: {result.statistic} {result.pvalue}')
    
def _calculate_mean(name: str, data):
    print(f'{name} mean: {np.mean(data)}')
    
def _get_compression_coeffs(df):
    bxes_line = df['OriginalSize'].to_numpy() / df['BxesSize'].to_numpy()
    bxes_preprocessing = df['OriginalSize'].to_numpy() / df['BxesPreprocessing'].to_numpy()
    exi = df['OriginalSize'].to_numpy() / df['ExiSize'].to_numpy()
    
    return bxes_line, bxes_preprocessing, exi

def plot_compression_graphics(df, test_name, experiment_folder_path):
    x = [x for x in range(len(df['BxesSize'].to_numpy()))]
    
    bxes_line, bxes_preprocessing, exi = _get_compression_coeffs(df)
    fig, ax = plt.subplots()
    ax.plot(x, bxes_preprocessing, label='BxesPreprocessing')
    ax.plot(x, bxes_line, label='Bxes', linestyle='--')
    ax.plot(x, exi, label='Exi', linestyle=':')
    ax.legend()
    
    compression_coef_path = os.path.join(experiment_folder_path, f'{test_name}.png')
    fig.savefig(compression_coef_path)
    plt.close(fig)
    
    fig, ax = plt.subplots()
    ax.plot(x, df['OriginalSize'], label='OriginalSize')
    ax.plot(x, df['BxesToXesSize'], label='BxesToXesSize')
    ax.legend()
    
    file_size_path = os.path.join(experiment_folder_path, f'FileSize{test_name}.png')
    fig.savefig(file_size_path)
    plt.close(fig)

def print_test_results(df, test_name):
    bxes_line, bxes_preprocessing, exi = _get_compression_coeffs(df)

    print(f'TEST NAME: {test_name}')
    print('======================================')
    
    _do_test('Exi-Bxes', exi, bxes_line)
    _do_test('Exi-BxesPreprocessing', exi, bxes_preprocessing)
    _do_test('Bxes-BxesPreprocessing', bxes_line, bxes_preprocessing)
    
    _calculate_mean('Bxes', bxes_line)
    _calculate_mean('BxesPreprocessing', bxes_preprocessing)
    _calculate_mean('Exi', exi)
    
    print('======================================')
    print()

def create_sort_number(row):
    if not all_procfiler_logs_predicate(row):
        return 0
    
    if one_repeat_procfiler_logs_predicate(row):
        return 1
    
    if twenty_five_repeat_procfiler_logs_predicate(row):
        return 2
    
    if fifty_repeat_procfiler_logs_predicate(row):
        return 3
    
    if seventy_five_repeat_procfiler_logs_predicate(row):
        return 4

def plot_variants_coeff(df, test_name, experiment_folder_path):
    if not ('ValuesVariantsCoef' in df and 'AttributesVariantsCoef' in df):
        return

    x = [x for x in range(len(df['BxesSize'].to_numpy()))]
    
    sort_column = 'SortColumn'
    sort_column_values = [create_sort_number(row) for _, row in df.iterrows()]
    df.loc[:, sort_column] = sort_column_values
    df = df.sort_values(by=[sort_column])
    
    _, bxes_preprocessing, _ = _get_compression_coeffs(df)
    def scale(data):
        return (data - min(data)) / (max(data) - min(data))

    fig, ax = plt.subplots()
    ax.plot(x, scale(bxes_preprocessing), label='BxesPreprocessing')
    ax.plot(x, df['AttributesVariantsCoef'], label='AttributesVariantsCoeffs', linestyle='--')
    ax.legend(loc='upper right')
    variants_coef_path = os.path.join(experiment_folder_path, f'{test_name}_Variants.png')
    fig.savefig(variants_coef_path)
    plt.close(fig)

def test_hypothesis(experiment_name: str, test_name: str, df: pd.DataFrame):
    experiment_folder_path = os.path.join(save_folder, experiment_name)
    if not os.path.exists(experiment_folder_path):
        os.makedirs(experiment_folder_path, exist_ok=True)
        
    plot_compression_graphics(df, test_name, experiment_folder_path)
    print_test_results(df, test_name)
    plot_variants_coeff(df, test_name, experiment_folder_path)

def test_hypothesis_filtered_df(experiment_name, test_name: str, df, predicate):
    filter_result = df.apply(predicate, axis=1)
    df = df[filter_result]
    print(df['Name'].unique())
    test_hypothesis(experiment_name, test_name, df)

def all_procfiler_logs_predicate(row):
    return ('_1' in row['Name'] or '_25' in row['Name'] or '_50' in row['Name'] or '_75' in row['Name']) and 'BPI' not in row['Name']

def not_procfiler_log(row):
    return not all_procfiler_logs_predicate(row)

def one_repeat_procfiler_logs_predicate(row):
    return row['Name'].endswith('_1') and 'BPI' not in row['Name']

def twenty_five_repeat_procfiler_logs_predicate(row):
    return ('_25' in row['Name']) and 'BPI' not in row['Name']

def fifty_repeat_procfiler_logs_predicate(row):
    return ('_50' in row['Name']) and 'BPI' not in row['Name']

def seventy_five_repeat_procfiler_logs_predicate(row):
    return ('_75' in row['Name']) and 'BPI' not in row['Name']

def analyze_results_file(experiment_name: str, path: str):
    df = pd.read_csv(path, sep=';')
    df.head()
    
    test_hypothesis(experiment_name, 'AllLogs', df)
    test_hypothesis_filtered_df(experiment_name, 'NotProcfilerLogs', df, not_procfiler_log)
    test_hypothesis_filtered_df(experiment_name, 'ProcfilerLogs', df, all_procfiler_logs_predicate)
    test_hypothesis_filtered_df(experiment_name, 'ProcfilerLogs1', df, one_repeat_procfiler_logs_predicate)
    test_hypothesis_filtered_df(experiment_name, 'ProcfilerLogs25', df, twenty_five_repeat_procfiler_logs_predicate)
    test_hypothesis_filtered_df(experiment_name, 'ProcfilerLogs50', df, fifty_repeat_procfiler_logs_predicate)
    test_hypothesis_filtered_df(experiment_name, 'ProcfilerLogs75', df, seventy_five_repeat_procfiler_logs_predicate)

In [71]:
analyze_results_file('OldResults', 'results.csv')

TEST NAME: AllLogs
Exi-Bxes: -2.596364163440163 0.01082618740163833
Exi-BxesPreprocessing: -2.7926132251423224 0.006254805652805677
Bxes-BxesPreprocessing: -3.6934244567760683 0.00035925895222195905
Bxes mean: 53.7781774467135
BxesPreprocessing mean: 54.46259362049429
Exi mean: 47.36671226539508

['CCC19 - Log XES' 'JUnit 4.12 Software Event Log'
 'Sepsis Cases - Event Log' 'BPI Challenge 2018' 'Hospital_log'
 'BPI Challenge 2017' 'Hospital Billing - Event Log'
 'Road_Traffic_Fine_Management_Process' 'log2' 'log1'
 'Statechart Workbench and Alignments Software Event Log'
 'BPI_Challenge_2012' 'BPIC15_3' 'BPIC15_1' 'BPIC15_4' 'BPIC15_5'
 'activitylog_uci_detailed_labour' 'edited_hh104_labour'
 'edited_hh110_labour' 'edited_hh110_weekends' 'edited_hh102_weekends'
 'edited_hh104_weekends' 'edited_hh102_labour'
 'activitylog_uci_detailed_weekends' 'BPI_Challenge_2013_closed_problems'
 'BPI_Challenge_2013_incidents' 'BPI_Challenge_2013_open_problems'
 'nasa-cev-complete-splitted' 'nasa-cev-

In [72]:
analyze_results_file('NewResultsNoLifecycle', 'results_no_lifecycle.csv')

TEST NAME: AllLogs
Exi-Bxes: -1.5412546634866897 0.1263821844584904
Exi-BxesPreprocessing: -1.7450665372609482 0.08401480209523475
Bxes-BxesPreprocessing: -4.559008207996465 1.445169948523599e-05
Bxes mean: 51.179215192218805
BxesPreprocessing mean: 51.774749039354084
Exi mean: 47.36671226539508

['CCC19 - Log XES' 'JUnit 4.12 Software Event Log'
 'Sepsis Cases - Event Log' 'BPI Challenge 2018' 'Hospital_log'
 'BPI Challenge 2017' 'Hospital Billing - Event Log'
 'Road_Traffic_Fine_Management_Process' 'log2' 'log1'
 'Statechart Workbench and Alignments Software Event Log'
 'BPI_Challenge_2012' 'BPIC15_3' 'BPIC15_1' 'BPIC15_4' 'BPIC15_5'
 'activitylog_uci_detailed_labour' 'edited_hh104_labour'
 'edited_hh110_labour' 'edited_hh110_weekends' 'edited_hh102_weekends'
 'edited_hh104_weekends' 'edited_hh102_labour'
 'activitylog_uci_detailed_weekends' 'BPI_Challenge_2013_closed_problems'
 'BPI_Challenge_2013_incidents' 'BPI_Challenge_2013_open_problems'
 'nasa-cev-complete-splitted' 'nasa-cev-

In [73]:
analyze_results_file('NewResultsNoLifecycleVariantsCoeffs', 'results_with_coeffs.csv')

TEST NAME: AllLogs
Exi-Bxes: -1.5049051027477156 0.1355005369329433
Exi-BxesPreprocessing: -1.709993939644089 0.09036811502672942
Bxes-BxesPreprocessing: -4.577676192185515 1.3549260033325119e-05
Bxes mean: 51.31961881628168
BxesPreprocessing mean: 51.92256760138383
Exi mean: 47.56087795563652
['CCC19 - Log XES' 'JUnit 4.12 Software Event Log'
 'Sepsis Cases - Event Log' 'BPI Challenge 2018' 'Hospital_log'
 'BPI Challenge 2017' 'Hospital Billing - Event Log'
 'Road_Traffic_Fine_Management_Process' 'log2' 'log1'
 'Statechart Workbench and Alignments Software Event Log' 'BPIC15_3'
 'BPIC15_1' 'BPIC15_4' 'BPIC15_5' 'activitylog_uci_detailed_labour'
 'edited_hh104_labour' 'edited_hh110_labour' 'edited_hh110_weekends'
 'edited_hh102_weekends' 'edited_hh104_weekends' 'edited_hh102_labour'
 'activitylog_uci_detailed_weekends' 'BPI_Challenge_2013_closed_problems'
 'BPI_Challenge_2013_incidents' 'BPI_Challenge_2013_open_problems'
 'nasa-cev-complete-splitted' 'nasa-cev-1-10-single-trace'
 'nasa

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, sort_column] = sort_column_values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, sort_column] = sort_column_values


TEST NAME: ProcfilerLogs1
Exi-Bxes: 17.295522833035484 3.1691468519977533e-12
Exi-BxesPreprocessing: 17.655674434975317 2.271885001363354e-12
Bxes-BxesPreprocessing: 1.5361558832880564 0.14290248319136722
Bxes mean: 27.230153401389572
BxesPreprocessing mean: 27.008007886977495
Exi mean: 33.14973328327912

['intensive_thread_pool_25' 'exception_try_catch_finally_when_25'
 'task_test_project_25' 'unsafe_fixed_25'
 'not_existing_assembly_loading_25' 'dynamic_assembly_loading_25'
 'not_simple_async_await_25' 'console_app_1_25' 'array_pooling_25'
 'finalizable_object_25' 'exception_try_catch_finally_async_25'
 'simple_async_await_25' 'exception_try_catch_finally_25'
 'dynamic_assembly_creation_25' 'file_write_project_25'
 'loh_allocations_25' 'file_async_operations_25' 'yield_enumerator_25']
TEST NAME: ProcfilerLogs25
Exi-Bxes: -1.334304756818112 0.19970294886510417
Exi-BxesPreprocessing: -1.4347828523438726 0.16949059027308838
Bxes-BxesPreprocessing: -2.3947190423145606 0.02842843842800746

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, sort_column] = sort_column_values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, sort_column] = sort_column_values


TEST NAME: ProcfilerLogs50
Exi-Bxes: -1.663273486540309 0.11458201855894744
Exi-BxesPreprocessing: -1.7824326260031913 0.0925463356063612
Bxes-BxesPreprocessing: -2.7489466561876164 0.013698210122337722
Bxes mean: 59.76491602271623
BxesPreprocessing mean: 60.947020928064426
Exi mean: 51.70467570788716

['array_pooling_75' 'finalizable_object_75' 'not_simple_async_await_75'
 'console_app_1_75' 'yield_enumerator_75' 'simple_async_await_75'
 'exception_try_catch_finally_async_75' 'exception_try_catch_finally_75'
 'file_write_project_75' 'dynamic_assembly_creation_75'
 'file_async_operations_75' 'loh_allocations_75'
 'exception_try_catch_finally_when_75' 'intensive_thread_pool_75'
 'dynamic_assembly_loading_75' 'task_test_project_75' 'unsafe_fixed_75'
 'not_existing_assembly_loading_75']
TEST NAME: ProcfilerLogs75
Exi-Bxes: -2.95541716057572 0.008856808897246528
Exi-BxesPreprocessing: -3.456598925424298 0.003014695121962886
Bxes-BxesPreprocessing: -3.201617536745868 0.005229324268928407
Bx

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, sort_column] = sort_column_values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, sort_column] = sort_column_values
