## Notebook to evaluate model performances on AML test and CYBER datasets

In [1]:
import pandas as pd
import os
import json

def load_data(directory_path):
    data = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.json'):
            file_path = os.path.join(directory_path, filename)
            with open(file_path, 'r') as file:
                json_data = json.load(file)
            model_name = str(os.path.splitext(filename)[0]).split("_")[0]
            if model_name != "perfectblocks": # do not include perfectblocks (values for what a perfect blocks model would provide as text segments)
                json_data['model'] = model_name
                data.append(json_data)
    return pd.DataFrame(data)

In [2]:

directory_path = 'AML'

data = []
aml_df  = load_data('AML')
display(aml_df)

# Set 'file_name' column as index of the DataFrame
aml_df.set_index('model', inplace=True)

# Sort values by 'rouge-1-f' in ascending order
aml_df.sort_values("rouge-1-f", inplace=True)

aml_df ['time/page'] = aml_df ['inference_time'] / aml_df ['pages'] # calculate time per page

# Define desired order of columns
desired_order = ['rouge-1-f', 'rouge-2-f', 'rouge-l-f'
                 , 'jaccard'
                 #, 'bleu'
                 , 'f1', 'precision', 'recall'
                 #, 'f1_region_lvl', 'precision_region_lvl', 'recall_region_lvl'
                 , 'time/page'
                 # , 'batch_size'
]

# Add other columns not mentioned in desired order
#remaining_columns = [col for col in df.columns if col not in desired_order]
#final_order = desired_order + remaining_columns

# Reorder columns based on specified order
aml_df  = aml_df[desired_order]

# Reset index for display purposes
aml_df.reset_index(inplace=True)

# convert time/page to milliseconds
aml_df['time/page'] = aml_df['time/page'] * 1000

# round all numbers apart from batch_size to 2 decimal places
aml_df = aml_df.round({'jaccard': 2
                       #, 'bleu': 2
                       , 'rouge-1-f': 2, 'rouge-2-f': 2, 'rouge-l-f': 2
                       , 'f1': 2, 'precision': 2, 'recall': 2
                       , 'f1_region_lvl': 2, 'precision_region_lvl': 2, 'recall_region_lvl': 2
                       , 'time/page': 2})

#rename columns
aml_df.columns = ['Model', '\\makecell[c]{ROUGE-\\\\1-F} $\\uparrow$', '\\makecell[c]{ROUGE-\\\\2-F}', '\\makecell[c]{ROUGE-\\\\L-F}'
                , 'Jaccard'
                #, 'BLEU'
              , 'F1', 'P', 'R'
              #, '\\makecell[c]{F1\\\\Avg}', '\\makecell[c]{P\\\\Avg}', '\\makecell[c]{R\\\\Avg}'
              , '\\makecell[c]{Time/Page\\\\(ms)**}'
              # , '\\makecell[c]{Batch\\\\Size}'
]

# rename models
aml_df['Model'] = aml_df['Model'].replace({
    'sentence': 'Sentence*',
    'ensemble': 'Ensemble*',
    'graphseg': 'GraphSeg (baseline)',
    'snippets': 'Snippeting alg. (baseline)',
    # 'token-roberta-large': '\\makecell[l]{Token-large\\\\main}',
    'token-roberta-large': 'Token-large-main',
    'blocks': 'Blocks',
    # 'token-roberta-base-main+auxiliary': '\\makecell[l]{Token-base\\\\main+\\\\auxiliary}',
    'token-roberta-base-main+auxiliary': 'Token-base-main+auxil.',
    # 'token-distilbert-distilroberta-base': '\\makecell[l]{Token-distil\\\\roberta-main}',
    'token-distilbert-distilroberta-base': 'Token-distilroberta-main',
    # 'token-roberta-base-main': '\\makecell[l]{Token-base\\\\main}',
    'token-roberta-base-main': 'Token-base-main',
    # 'token-roberta-base-pk': '\\makecell[l]{Token-base\\\\pk-main}'
    'token-roberta-base-pk': '\\textbf{Token-base-pk-main}'
})

# Display the DataFrame
aml_df


Unnamed: 0,precision,recall,f1,iou,bleu,jaccard,edit_distance,precision_region_lvl,recall_region_lvl,f1_region_lvl,rouge-1-f,rouge-2-f,rouge-l-f,inference_time,pages,batch_size,GPU,model
0,0.642006,0.810165,0.716349,0.743642,0.567952,0.743642,156.905502,0.654888,0.868526,0.679395,0.838327,0.81045,0.836951,57.294454,6918,2048,NVIDIA A100-SXM4-80GB,token-distilbert-distilroberta-base
1,0.615411,0.752864,0.677234,0.604931,0.515144,0.605104,138.739806,0.701286,0.795111,0.660876,0.721523,0.667386,0.717581,129.264382,6908,All sentences of a page,NVIDIA A100-SXM4-80GB,sentence
2,0.738713,0.827858,0.780749,0.689435,0.629151,0.689435,112.391626,0.766011,0.805166,0.740977,0.746527,0.696065,0.737098,146.039171,6930,128,NVIDIA A100-SXM4-80GB,blocks
3,0.481963,0.835195,0.611215,0.532669,0.438624,0.532669,269.543693,0.528272,0.917114,0.581565,0.657213,0.60877,0.656479,172.698841,6930,,,snippets
4,0.639346,0.818844,0.718048,0.74379,0.571028,0.74379,157.835387,0.654033,0.875511,0.681228,0.839077,0.812058,0.83784,117.747825,6918,512,NVIDIA A100-SXM4-80GB,token-roberta-base-main
5,0.653098,0.842896,0.735957,0.768594,0.593938,0.768594,148.652516,0.663128,0.89226,0.698077,0.858808,0.833437,0.857426,120.929541,6918,2048,NVIDIA A100-SXM4-80GB,token-roberta-base-pk
6,0.580463,0.839481,0.686348,0.674499,0.533663,0.674499,191.484292,0.60425,0.891717,0.64826,0.778105,0.741852,0.776819,389.187058,6918,512,NVIDIA A100-SXM4-80GB,token-roberta-large
7,0.631278,0.79145,0.702348,0.720181,0.551352,0.720181,163.914251,0.649292,0.844965,0.664969,0.751412,0.711771,0.750158,141.653276,6918,2048,NVIDIA A100-SXM4-80GB,token-roberta-base-main+auxiliary
8,0.641085,0.898946,0.748428,0.695376,0.639214,0.695548,115.984209,0.712512,0.910908,0.739607,0.788015,0.749091,0.786068,604.189537,6908,All sentences of a page,NVIDIA A100-SXM4-80GB,ensemble
9,0.515147,0.793165,0.624616,0.511998,0.4268,0.511998,241.353265,0.547615,0.840762,0.58562,0.631811,0.569733,0.625917,945.365678,6930,128,NVIDIA A100-SXM4-80GB,graphseg


Unnamed: 0,Model,\makecell[c]{ROUGE-\\1-F} $\uparrow$,\makecell[c]{ROUGE-\\2-F},\makecell[c]{ROUGE-\\L-F},Jaccard,F1,P,R,\makecell[c]{Time/Page\\(ms)**}
0,GraphSeg (baseline),0.63,0.57,0.63,0.51,0.62,0.52,0.79,136.42
1,Snippeting alg. (baseline),0.66,0.61,0.66,0.53,0.61,0.48,0.84,24.92
2,Sentence*,0.72,0.67,0.72,0.61,0.68,0.62,0.75,18.71
3,Blocks,0.75,0.7,0.74,0.69,0.78,0.74,0.83,21.07
4,Token-base-main+auxil.,0.75,0.71,0.75,0.72,0.7,0.63,0.79,20.48
5,Token-large-main,0.78,0.74,0.78,0.67,0.69,0.58,0.84,56.26
6,Ensemble*,0.79,0.75,0.79,0.7,0.75,0.64,0.9,87.46
7,Token-distilroberta-main,0.84,0.81,0.84,0.74,0.72,0.64,0.81,8.28
8,Token-base-main,0.84,0.81,0.84,0.74,0.72,0.64,0.82,17.02
9,\textbf{Token-base-pk-main},0.86,0.83,0.86,0.77,0.74,0.65,0.84,17.48


In [3]:
import matplotlib.colors as mcolors
import matplotlib.cm as cm
import pandas as pd

def apply_heatmap(df, colormap='RdYlGn', alpha=0.6):
    '''
    We use this fucntion to turn our latex table into a heatmap
    '''
    def colorize(value, norm, scalar_map, alpha, bold_best):
        '''
        Helper function to colourise cells of table
        '''
        rgba = scalar_map.to_rgba(value)
        rgb = [alpha * c + (1 - alpha) * 1 for c in rgba[:3]]  # Mix with white color
        color = mcolors.to_hex(rgb)
        cell_format = f"\\cellcolor[HTML]{{{color[1:]}}}"
        if bold_best and value == best_value[col]:
            return f"{cell_format}\\textbf{{{value:0.2f}}}"
        else:
            return f"{cell_format}{value:0.2f}"
    
    df_colored = df.copy()
    best_value = {}
    # identify best value for each column
    for col in df_colored.columns:
        if pd.api.types.is_numeric_dtype(df_colored[col]):
            if col == '\\makecell[c]{Time/Page\\\\(ms)**}':
                best_value[col] = df_colored[col].min()
            else:
                best_value[col] = df_colored[col].max()
    
    # Apply heatmap to each cell
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            norm = mcolors.Normalize(vmin=df[col].min(), vmax=df[col].max())
            if col == '\\makecell[c]{Time/Page\\\\(ms)**}':  # Inverse colormap for this specific column
                cmap = cm.get_cmap(colormap).reversed()
            else:
                cmap = cm.get_cmap(colormap)
            scalar_map = cm.ScalarMappable(norm=norm, cmap=cmap)
            bold_best = (col in best_value)
            df_colored[col] = df[col].apply(lambda x: colorize(x, norm, scalar_map, alpha, bold_best))
    
    return df_colored

# Aplly heatmap to the dataframe
aml_df_colored = apply_heatmap(aml_df.iloc[:, 1:], alpha=0.7)

# Concatenate 'Model' column back
aml_df_colored = pd.concat([aml_df['Model'], aml_df_colored], axis=1)
column_format = 'l' + 'c' * (aml_df_colored.shape[1] - 1)

latex_table = aml_df_colored.to_latex(index=False,
                                      caption="Comparison of the different snippet identifier systems' performance on the \\gls{aml} test data.",
                                      label='tab:aml_test_results',
                                      column_format=column_format,
                                      longtable=False,
                                      escape=False,
                                      multicolumn=True,
                                      multicolumn_format='c',
                                      float_format="{:0.2f}".format,
                                      bold_rows=True)

# Integrate \footnotesize and reduce column separation
latex_table = latex_table.replace('\\begin{table}', '\\begin{table}[htbp!]\n\\footnotesize\n\\setlength{\\tabcolsep}{4pt}')
latex_table = latex_table.replace('\\bottomrule'
                                  , f'\\bottomrule\n\\multicolumn{{{aml_df_colored.shape[1]}}}{{l}}{{\\footnotesize *evaluated on unformatted text}} \\\\ \n \\multicolumn{{{aml_df_colored.shape[1]}}}{{l}}{{\\makecell[l]{{\\footnotesize **Batch sizes used during inference: 128 (Blocks, GraphSeg), 512 (Token-large-main),\\\\2048 (other token-level models), all sentences of a page (Sentence, Ensemble)}}}}')
print(latex_table)


  cmap = cm.get_cmap(colormap)
  cmap = cm.get_cmap(colormap).reversed()


\begin{table}[htbp!]
\footnotesize
\setlength{\tabcolsep}{4pt}
\caption{Comparison of the different snippet identifier systems' performance on the \gls{aml} test data.}
\label{tab:aml_test_results}
\begin{tabular}{lcccccccc}
\toprule
Model & \makecell[c]{ROUGE-\\1-F} $\uparrow$ & \makecell[c]{ROUGE-\\2-F} & \makecell[c]{ROUGE-\\L-F} & Jaccard & F1 & P & R & \makecell[c]{Time/Page\\(ms)**} \\
\midrule
GraphSeg (baseline) & \cellcolor[HTML]{c04d67}0.63 & \cellcolor[HTML]{c04d67}0.57 & \cellcolor[HTML]{c04d67}0.63 & \cellcolor[HTML]{c04d67}0.51 & \cellcolor[HTML]{d56068}0.62 & \cellcolor[HTML]{ee8572}0.52 & \cellcolor[HTML]{fcb789}0.79 & \cellcolor[HTML]{c04d67}136.42 \\
Snippeting alg. (baseline) & \cellcolor[HTML]{e97b6e}0.66 & \cellcolor[HTML]{ee8572}0.61 & \cellcolor[HTML]{e97b6e}0.66 & \cellcolor[HTML]{da6668}0.53 & \cellcolor[HTML]{c04d67}0.61 & \cellcolor[HTML]{c04d67}0.48 & \cellcolor[HTML]{e4f4ae}0.84 & \cellcolor[HTML]{6ebf88}24.92 \\
Sentence* & \cellcolor[HTML]{fee7ab}0.72 & \

### Evaluate CYBER I AND II

In [4]:
# Load AML data
aml_directory_path = 'AML'
aml_df = None
aml_df = load_data(aml_directory_path)

# Load CYBER_I data
cyber_df = None
cyber_directory_path = 'CYBER_I' # 'CYBER_I' or 'CYBER_II 
cyber_df = load_data(cyber_directory_path)

# Merge 'rouge-l-f' and 'rouge' columns into 'rouge-l-f' for cyber_df
if 'rouge' in cyber_df.columns and 'rouge-l-f' in cyber_df.columns:
    cyber_df['rouge-l-f'] = cyber_df['rouge-l-f'].combine_first(cyber_df['rouge'])
if 'rouge' in cyber_df.columns:
    cyber_df.drop(columns=['rouge'], inplace=True)

# Set 'model' column as index of both DataFrames
aml_df.set_index('model', inplace=True)
cyber_df.set_index('model', inplace=True)

common_models = aml_df.index.intersection(cyber_df.index)
aml_df = aml_df.loc[common_models]

# Define desired order of columns for CYBER_I
desired_order = ['rouge-l-f', 'jaccard', 'f1', 'precision', 'recall']

# Reorder columns based on the specified order
aml_df = aml_df[desired_order]
cyber_df = cyber_df[desired_order]

# Calculate percentage differences and combine with highlighted values
comparison_df = pd.DataFrame(index=cyber_df.index)
for column in desired_order:
    def combine_values(aml, cyber):
        aml_value = float(aml)
        percentage_diff = (cyber - aml_value) / aml_value * 100
        if percentage_diff >= 0:
            percentage_diff_str = f"\\textcolor{{mildgreen}}{{{percentage_diff:+.0f}\\%}}"
        else:
            percentage_diff_str = f"\\textcolor{{mildred}}{{{percentage_diff:+.0f}\\%}}"
        return f"{cyber:.2f} ({percentage_diff_str})"

    comparison_df[column] = aml_df[column].combine(cyber_df[column], combine_values)


comparison_df.sort_values("rouge-l-f", inplace=True)

# Reset index for display purposes
comparison_df.reset_index(inplace=True)

# Rename columns for LaTeX table
comparison_df.columns = ['Model', 'Rouge-L-F $\\uparrow$', 'Jaccard', 'F1', 'Precision', 'Recall']

# Rename models
comparison_df['Model'] = comparison_df['Model'].replace({
    'sentence': 'Sentence',
    'ensemble': 'Ensemble',
    'graphseg': 'GraphSeg',
    'snippets': 'Snippets',
    'token-roberta-large': 'Token-large-main',
    'blocks': 'Blocks',
    'token-roberta-base-main+auxiliary': 'Token-base-main+auxiliary',
    'token-distilbert-distilroberta-base': 'Token-distilroberta-main',
    'token-roberta-base-main': 'Token-base-main',
    'token-roberta-base-pk': '\\textbf{Token-base-pk-main}'
})

def highlight_best_values(df):
    '''
    Helper function to highlight the best values in each column of the DataFrame.
    '''
    for col in df.columns[1:]:  # Skipping 'Model' column
        # Extract numerical part for comparison
        numerical_values = df[col].str.extract(r'([-+]?[0-9]*\.?[0-9]+)').astype(float)
        # Find maximum value in the column
        max_value = numerical_values.max().values[0]
        # Apply LaTeX formatting for best value
        df[col] = df[col].apply(
            lambda x: x.replace(f'{max_value:.2f}', f'\\textbf{{{max_value:.2f}}}')
        )
    return df

highlighted_comparison_df = highlight_best_values(comparison_df.copy())
highlighted_comparison_df

Unnamed: 0,Model,Rouge-L-F $\uparrow$,Jaccard,F1,Precision,Recall
0,GraphSeg,0.61 (\textcolor{mildred}{-3\%}),0.50 (\textcolor{mildred}{-2\%}),0.63 (\textcolor{mildgreen}{+0\%}),0.54 (\textcolor{mildgreen}{+5\%}),0.74 (\textcolor{mildred}{-7\%})
1,Snippets,0.64 (\textcolor{mildred}{-2\%}),0.52 (\textcolor{mildred}{-3\%}),0.60 (\textcolor{mildred}{-3\%}),0.46 (\textcolor{mildred}{-4\%}),0.83 (\textcolor{mildred}{-1\%})
2,Blocks,0.77 (\textcolor{mildgreen}{+5\%}),0.73 (\textcolor{mildgreen}{+5\%}),\textbf{0.80} (\textcolor{mildgreen}{+3\%}),\textbf{0.75} (\textcolor{mildgreen}{+2\%}),\textbf{0.86} (\textcolor{mildgreen}{+4\%})
3,\textbf{Token-base-pk-main},\textbf{0.83} (\textcolor{mildred}{-3\%}),\textbf{0.74} (\textcolor{mildred}{-3\%}),0.73 (\textcolor{mildred}{-0\%}),0.67 (\textcolor{mildgreen}{+3\%}),0.81 (\textcolor{mildred}{-4\%})


In [5]:
# Create LaTeX table
column_format = 'l' + 'c' * (highlighted_comparison_df.shape[1] - 1)

latex_table = highlighted_comparison_df.to_latex(index=False, 
                                     caption=f'Comparison of Model Performance Metrics {cyber_directory_path.replace("_", " ")}', 
                                     label='tab:model_performance', 
                                     column_format=column_format, 
                                     longtable=False, 
                                     escape=False, 
                                     multicolumn=True, 
                                     multicolumn_format='c')

# Integrate \footnotesize and reduce column separation
latex_table = latex_table.replace('\\begin{table}', '\\begin{table}[htbp!]\n\\footnotesize\n\\setlength{\\tabcolsep}{4pt}')
# latex_table = latex_table.replace('\\textbf', '\\cellcolor[gray]{0.9}\\textbf') # to make every best value cell also grey
# colour last row in grey
latex_table = latex_table.replace('\\textbf{Token-base-pk-main}', '\\rowcolor[gray]{0.9}\\textbf{Token-base-pk-main}')

print(latex_table)

\begin{table}[htbp!]
\footnotesize
\setlength{\tabcolsep}{4pt}
\caption{Comparison of Model Performance Metrics CYBER I}
\label{tab:model_performance}
\begin{tabular}{lccccc}
\toprule
Model & Rouge-L-F $\uparrow$ & Jaccard & F1 & Precision & Recall \\
\midrule
GraphSeg & 0.61 (\textcolor{mildred}{-3\%}) & 0.50 (\textcolor{mildred}{-2\%}) & 0.63 (\textcolor{mildgreen}{+0\%}) & 0.54 (\textcolor{mildgreen}{+5\%}) & 0.74 (\textcolor{mildred}{-7\%}) \\
Snippets & 0.64 (\textcolor{mildred}{-2\%}) & 0.52 (\textcolor{mildred}{-3\%}) & 0.60 (\textcolor{mildred}{-3\%}) & 0.46 (\textcolor{mildred}{-4\%}) & 0.83 (\textcolor{mildred}{-1\%}) \\
Blocks & 0.77 (\textcolor{mildgreen}{+5\%}) & 0.73 (\textcolor{mildgreen}{+5\%}) & \textbf{0.80} (\textcolor{mildgreen}{+3\%}) & \textbf{0.75} (\textcolor{mildgreen}{+2\%}) & \textbf{0.86} (\textcolor{mildgreen}{+4\%}) \\
\rowcolor[gray]{0.9}\textbf{Token-base-pk-main} & \textbf{0.83} (\textcolor{mildred}{-3\%}) & \textbf{0.74} (\textcolor{mildred}{-3\%}) & 0