In [None]:
import ast
import os
import json
import pandas as pd
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import tiktoken
import matplotlib as mpl
import seaborn.objects as so
from enum import Enum
import Datasets
from seaborn import axes_style
from scipy.stats import linregress
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

from adjustText import adjust_text

# WebNLG plots

In [None]:
# Load the WebNLG results table and leave it as the cell's last expression
# so the notebook renders it.
scores_webnlg = pd.read_csv("results_llm_testing/results_llms.csv")
scores_webnlg

In [None]:
def plot_webnlg_results(metric, metric_name, metric_name_axis=None,
                        results_path='results_llm_testing/results_llms.csv',
                        num_test_samples=7253):
    """Draw and save a grouped bar chart of one WebNLG metric per model.

    The results CSV is re-read on every call, so the function does not depend
    on (or modify) any notebook-level dataframe.

    Parameters
    ----------
    metric : str
        Name of the CSV column plotted on the y axis.
    metric_name : str
        Human-readable metric name; used in the plot title and, lowercased
        with spaces replaced by underscores, in the output file name.
    metric_name_axis : str, optional
        y-axis label. Falls back to ``metric_name`` when omitted.
    results_path : str, optional
        Path of the results CSV. It must contain the columns ``LLM``,
        ``Examples provided``, ``Invalid outputs`` and ``metric``.
    num_test_samples : int, optional
        Divisor that turns the ``Invalid outputs`` count into a ratio.
        Defaults to 7253, the number of test samples in WebNLG.

    Returns
    -------
    matplotlib.figure.Figure
        The rendered figure, which is also written to
        ``webnlg_<metric_name>.png`` in the working directory.
    """
    if metric_name_axis is None:
        metric_name_axis = metric_name

    scores = pd.read_csv(results_path)

    # Cast to str before using the column as the colour variable
    # (presumably so the example counts are treated as categories).
    scores["Examples provided"] = scores["Examples provided"].astype(str)
    scores["In-Context examples"] = scores["Examples provided"]
    scores["Invalid outputs"] = scores["Invalid outputs"] / num_test_samples

    # Break the model names over two lines so the x tick labels stay narrow.
    scores["LLM"] = scores["LLM"].replace({
        "Gemma-2 (2B)": "Gemma-2\n(2B)",
        "Llama-3.2 (3B)": "Llama-3.2\n(3B)",
        "Phi-3.5 (3.8B)": "Phi-3.5\n(3.8B)",
        "Llama-3.1 (8B)": "Llama-3.1\n(8B)",
        "Gemma-2 (9B)": "Gemma-2\n(9B)",
        "Gemini": "Gemini\n2.0 Flash",
    })

    plot = (
        so.Plot(scores, x="LLM", y=metric, color="In-Context examples")
            .add(so.Bar(), so.Dodge(empty="drop"))
            .scale(color="Paired", y=so.Continuous().tick(every=0.1))
            .layout(size=(14, 12), engine="tight")
            .label(x="Model", y=metric_name_axis, title=f"{metric_name} for the WebNLG Evaluation")
    )

    plot = plot.theme({
        "xtick.labelsize": 32,
        "ytick.labelsize": 32,
        "axes.labelsize": 32,
        "axes.titlesize": 34,
        "legend.title_fontsize": 26,
        "legend.fontsize": 26,
        "axes.titlepad": 25,
        "axes.labelpad": 10,
        **axes_style("ticks"), # White background, ticks on axes, no grid
        "axes.grid": True, # Add the grid (must come after the style, which is unpacked above)
    })

    # Hack to be able to move the legend, seaborn objects doesn't allow this properly yet:
    # render onto our own Figure and reposition the legend it created.
    f = mpl.figure.Figure(figsize=(14, 12))
    plot.on(f).plot()
    f.legends[0].set_bbox_to_anchor((0.65, 0.8))
    f.axes[0].set_ylim(0, 1)
    f.axes[0].tick_params(axis='x', rotation=45)

    f.canvas.flush_events()
    f.savefig(f"webnlg_{metric_name.lower().replace(' ', '_')}.png",  bbox_inches='tight')
    return f

In [None]:
# One figure per WebNLG metric. Every call saves its own PNG; the final call
# is left as the cell's last expression so its figure is the cell output.
plot_webnlg_results(metric="Precision (strict)", metric_name="Precision (Strict) results")
plot_webnlg_results(metric="Precision (relaxed)", metric_name="Precision (Relaxed) results")
plot_webnlg_results(metric="Avg. Rouge-2 (space separators)", metric_name="ROUGE-2 results")
plot_webnlg_results(metric="Invalid outputs", metric_name="Error Rate", metric_name_axis="Error Rate")

# DocRED and Biomedical Papers

In [None]:
class PromptStrategy(Enum):
    """Prompting strategies available for triple generation.

    PREVIOUS_SENTENCES
        Split the text into chunks of n sentences with an overlap of m
        sentences that act as context. n and m can be any value (e.g. n=1,
        m=0 traverses the text sentence by sentence with no context). The
        iterator adjusts the overlap for the first sentence(s), where there
        may not be enough preceding ones.

    SECTION_CONTENTS
        Generate triples for the whole section at once.
    """

    PREVIOUS_SENTENCES = 1
    SECTION_CONTENTS = 2

## General barplot

In [None]:
def plot_amount_of_triples_generated_tests_model_comparison(csv_path_model_1,
                                                            csv_path_model_2,
                                                            csv_path_model_3,
                                                            model_name_1,
                                                            model_name_2,
                                                            model_name_3,
                                                            type_of_text):
    """Compare three models' generated-triple counts for both prompt strategies.

    For each model the results CSV is read and split by ``prompt_strategy``:

    * SECTION_CONTENTS rows with ``sentences_per_prompt == 0`` are drawn as
      solid bars (legend entry "Paragraph").
    * PREVIOUS_SENTENCES rows are summarised per model: a translucent bar and
      a dot at the mean of ``total_triples`` (legend entry "Sliding Window"),
      plus a range drawn with ``so.Est(errorbar="sd")``.

    The figure is saved as
    ``amount_of_triples_test_models_comparison_<type_of_text>.png`` (lowercase,
    spaces replaced by underscores) and returned.
    """
    section_strategy = str(PromptStrategy.SECTION_CONTENTS)
    window_strategy = str(PromptStrategy.PREVIOUS_SENTENCES)

    section_frames = []   # whole-section rows, one frame per model
    window_frames = []    # sliding-window rows, one frame per model
    window_means = []     # per-model mean of total_triples over the sliding-window rows

    model_inputs = (
        (csv_path_model_1, model_name_1),
        (csv_path_model_2, model_name_2),
        (csv_path_model_3, model_name_3),
    )
    for results_path, model_label in model_inputs:
        runs = pd.read_csv(results_path).sort_values(['sentences_per_prompt', 'overlap'])
        runs["Model"] = model_label

        is_section_run = runs["prompt_strategy"] == section_strategy
        section_frames.append(runs[is_section_run & (runs["sentences_per_prompt"] == 0.0)])

        window_runs = runs[runs["prompt_strategy"] == window_strategy]
        window_frames.append(window_runs)
        window_means.append(
            window_runs.groupby(['Model'])['total_triples']
                .mean()
                .reset_index()
                .rename(columns={"total_triples": "avg"})
        )

    combined_df_contents = pd.concat(section_frames, ignore_index=True)

    # Layer order matters for drawing and for the legend, so layers are added
    # in the sequence: section bars, mean bars, mean dots, spread ranges.
    plot = so.Plot(combined_df_contents, x="Model", y="total_triples").add(
        so.Bar(alpha=0.9, width=1.6), so.Dodge(empty="drop"),
        color="Model", legend=True, label="Paragraph",
    )

    for position, mean_frame in enumerate(window_means):
        # Only the first sliding-window layer carries the legend label.
        label_kwargs = {"label": "Sliding Window"} if position == 0 else {}
        plot = plot.add(
            so.Bar(alpha=0.25, edgestyle='--', width=0.5300), so.Dodge(empty="drop"),
            data=mean_frame, x="Model", y="avg", color="Model", legend=True, **label_kwargs,
        )

    for mean_frame in window_means:
        plot = plot.add(
            so.Dot(pointsize=10, alpha=1),
            data=mean_frame, x="Model", y="avg", color="Model", legend=False,
        )

    for window_runs in window_frames:
        plot = plot.add(
            so.Range(linewidth=8, alpha=0.45), so.Est(errorbar="sd"),
            x="Model", y="total_triples", data=window_runs[["Model", "total_triples"]],
            color="Model", legend=False,
        )

    # NOTE(review): layout requests 16x12 while the Figure created below is 14x12 - confirm which size is intended.
    plot = (
        plot
            .scale(color="Paired", x=so.Nominal(), y=so.Continuous().tick(every=5))
            .layout(size=(16, 12), engine="tight")
            .limit()
            .label(x="Model", y="Avg. Generated Triples", legend="Strategy", title=f"Effect Of the Choice of Model on the Avg. Generated Triples\n({type_of_text})")
    )

    theme_overrides = {
        "xtick.labelsize": 32,
        "ytick.labelsize": 32,
        "axes.labelsize": 32,
        "axes.titlesize": 34,
        "legend.title_fontsize": 26,
        "legend.fontsize": 26,
        "axes.titlepad": 25,
        "axes.labelpad": 10,
        **axes_style("ticks"), # White background, ticks on axes, no grid
        "axes.grid": True, # Add the grid
    }
    plot = plot.theme(theme_overrides)

    # Hack to be able to move the legend, seaborn objects doesn't allow this properly yet
    f = mpl.figure.Figure(figsize=(14, 12))
    plot.on(f).plot()
    f.legends[0].set_bbox_to_anchor((0.1, 0.71))

    f.canvas.flush_events()
    output_name = f"amount_of_triples_test_models_comparison_{type_of_text.lower().replace(' ', '_')}.png"
    f.savefig(output_name,  bbox_inches='tight')
    return f

In [None]:
# Model comparison on the biomedical papers.
plot_amount_of_triples_generated_tests_model_comparison(
    csv_path_model_1="results_paper_triples/biomedical_papers/phi_3b/paper_triples_results_clean.csv",
    csv_path_model_2="results_paper_triples/biomedical_papers/llama_8b/paper_triples_results_clean.csv",
    csv_path_model_3="results_paper_triples/biomedical_papers/gemma_9b/paper_triples_results_clean.csv",
    model_name_1="Phi-3.5 (3.8B)",
    model_name_2="Llama-3.1 (8B)",
    model_name_3="Gemma-2 (9B)",
    type_of_text="Biomedical Texts",
)

# Same comparison on DocRED; kept as the cell's last expression so this figure is the cell output.
plot_amount_of_triples_generated_tests_model_comparison(
    csv_path_model_1="results_paper_triples/docred/phi_3b/paper_triples_results_clean.csv",
    csv_path_model_2="results_paper_triples/docred/llama_8b/paper_triples_results_clean.csv",
    csv_path_model_3="results_paper_triples/docred/gemma_9b/paper_triples_results_clean.csv",
    model_name_1="Phi-3.5 (3.8B)",
    model_name_2="Llama-3.1 (8B)",
    model_name_3="Gemma-2 (9B)",
    type_of_text="DocRED",
)

## Per-sentences, number of triples barplot

In [None]:
def get_avg_triples_fit(path):
    """Fit a straight line to the mean number of triples per window size.

    Parameters
    ----------
    path : str
        CSV file with at least the columns ``sentences_per_prompt`` and
        ``total_triples``.

    Returns
    -------
    slope : float
        Slope of the least-squares line through the per-group means.
    p_value : float
        p-value reported by ``scipy.stats.linregress`` for that slope.
    fitted : pandas.Series
        Fitted value for each distinct ``sentences_per_prompt``, in ascending
        order of ``sentences_per_prompt`` and indexed 0..n-1.
    """
    runs = pd.read_csv(path)

    # Keep only rows with a positive window size; rows with
    # sentences_per_prompt == 0 are presumably the whole-section
    # (SECTION_CONTENTS) runs, which are not part of this fit.
    window_runs = runs[runs["sentences_per_prompt"] > 0]

    # groupby sorts its keys, so the means come out ordered by window size.
    avg = (
        window_runs.groupby("sentences_per_prompt")["total_triples"]
            .mean()
            .reset_index()
    )

    x = avg["sentences_per_prompt"]
    slope, intercept, _r_value, p_value, _std_err = linregress(x, avg["total_triples"])

    return slope, p_value, slope * x + intercept

In [None]:
def plot_amount_of_triples_generated_tests_bars_stddev(csv_path, model_name, type_of_text):
    """Plot generated triples against window size for one model and text type.

    Layers, in drawing order: a dashed regression line through the per-window
    means (from ``get_avg_triples_fit``), one bar per overlap setting, a dot
    at each per-window mean and a range drawn with ``so.Est(errorbar="sd")``.

    Parameters
    ----------
    csv_path : str
        Results CSV with the columns ``sentences_per_prompt``, ``overlap``
        and ``total_triples``.
    model_name : str
        Model label used in the title and the output file name.
    type_of_text : str
        Dataset label used in the title and the output file name.

    Returns
    -------
    matplotlib.figure.Figure
        The rendered figure, also saved as
        ``amount_of_triples_test_<model_name>_<type_of_text>_bars.png``
        (lowercase, spaces replaced by underscores).
    """
    df = pd.read_csv(csv_path).sort_values(by=['sentences_per_prompt', 'overlap'])
    # Keep only the sliding-window rows; sentences_per_prompt == 0 presumably marks the
    # whole-section (SECTION_CONTENTS) runs. Copy so the column assignments below
    # do not write into a slice of the original frame.
    df = df[df["sentences_per_prompt"] > 0].copy()
    df['overlap'] = df['overlap'].astype(int)

    # Calculate per-group average
    avg = (
        df.groupby(['sentences_per_prompt'])['total_triples']
            .mean()
            .reset_index()
            .rename(columns={"total_triples": "avg"})
    )

    # Legend label for each overlap setting
    df["Overlap ($m$)"] = df['overlap'].apply(lambda x: f"{x} Ctx. Sentences" if x != 0 else "No Context")

    slope, p_value, regression_fit = get_avg_triples_fit(csv_path)

    # The fit is computed on the same per-window grouping, so it lines up row by row with `avg`.
    fit_df = pd.DataFrame({
        "sentences_per_prompt": avg["sentences_per_prompt"],
        "fitted": regression_fit
    })

    # NOTE(review): the title reports the slope as $m$, the same symbol the legend uses for the overlap - confirm the intended notation.
    # NOTE(review): layout requests 16x18 while the Figure created below is 14x12 - confirm which size is intended.
    plot = (
        so.Plot(df, x="sentences_per_prompt", y="total_triples")

            .add(so.Line(linestyle='--', color='dimgrey'), data=fit_df, x="sentences_per_prompt", y="fitted")  # Regression fit

            .add(so.Bar(), so.Dodge(empty="drop"), color="Overlap ($m$)", legend=True)

            .add(so.Dot(pointsize=5, color="black"), data=avg, x="sentences_per_prompt", y="avg")
            .add(so.Range(linewidth=2, color="black"), so.Est(errorbar="sd"), data=df[["sentences_per_prompt", "total_triples"]])

            .scale(color="Paired", x=so.Continuous().tick(every=1), y=so.Continuous().tick(every=10))
            .layout(size=(16, 18), engine="tight")
            .label(x="Sentences per Prompt ($k$)", y="Avg. Generated Triples", legend="Overlap ($m$)", 
                   title=f"Sliding Window Strategy Effect on the Avg. Generated Triples\n({model_name}, {type_of_text}) ($m$={slope:.3f}, $p$={p_value:.1e})")
    )

    plot = plot.theme({
        "xtick.labelsize": 32,
        "ytick.labelsize": 32,
        "axes.labelsize": 32,
        "axes.titlesize": 34,
        "legend.title_fontsize": 26,
        "legend.fontsize": 26,
        "axes.titlepad": 25,
        "axes.labelpad": 10,
        **axes_style("ticks"), # White background, ticks on axes, no grid
        "axes.grid": True, # Add the grid
    })

    # Hack to be able to move the legend, seaborn objects doesn't allow this properly yet
    f = mpl.figure.Figure(figsize=(14, 12))
    plot.on(f).plot()
    f.legends[0].set_bbox_to_anchor((0.7, 0.67))
    f.axes[0].set_ylim(0, 100)

    f.canvas.flush_events()
    f.savefig(f"amount_of_triples_test_{model_name.lower().replace(' ', '_')}_{type_of_text.lower().replace(' ', '_')}_bars.png",  bbox_inches='tight')
    return f

# Gemma 9B
plot_amount_of_triples_generated_tests_bars_stddev(
    csv_path="results_paper_triples/biomedical_papers/gemma_9b/paper_triples_results_clean.csv",
    model_name="Gemma 9B",
    type_of_text="Biomed. Papers",
)
plot_amount_of_triples_generated_tests_bars_stddev(
    csv_path="results_paper_triples/docred/gemma_9b/paper_triples_results_clean.csv",
    model_name="Gemma 9B",
    type_of_text="DocRED",
)

# Llama 8B
plot_amount_of_triples_generated_tests_bars_stddev(
    csv_path="results_paper_triples/docred/llama_8b/paper_triples_results_clean.csv",
    model_name="Llama 8B",
    type_of_text="DocRED",
)
plot_amount_of_triples_generated_tests_bars_stddev(
    csv_path="results_paper_triples/biomedical_papers/llama_8b/paper_triples_results_clean.csv",
    model_name="Llama 8B",
    type_of_text="Biomed. Papers",
)

# Phi 3B (the final call stays the cell's last expression, so its figure is the cell output)
plot_amount_of_triples_generated_tests_bars_stddev(
    csv_path="results_paper_triples/biomedical_papers/phi_3b/paper_triples_results_clean.csv",
    model_name="Phi 3B",
    type_of_text="Biomed. Papers",
)
plot_amount_of_triples_generated_tests_bars_stddev(
    csv_path="results_paper_triples/docred/phi_3b/paper_triples_results_clean.csv",
    model_name="Phi 3B",
    type_of_text="DocRED",
)

## Per-sentences, error rate barplot

In [None]:
def get_error_rate_fit(path):
    """Fit a straight line to the mean error ratio per window size.

    The error ratio of a row is ``bad_triples / total_triples``.

    Parameters
    ----------
    path : str
        CSV file with at least the columns ``sentences_per_prompt``,
        ``bad_triples`` and ``total_triples``.

    Returns
    -------
    slope : float
        Slope of the least-squares line through the per-group mean ratios.
    p_value : float
        p-value reported by ``scipy.stats.linregress`` for that slope.
    fitted : pandas.Series
        Fitted value for each distinct ``sentences_per_prompt``, in ascending
        order of ``sentences_per_prompt`` and indexed 0..n-1.
    """
    runs = pd.read_csv(path)

    # Keep only rows with a positive window size; rows with
    # sentences_per_prompt == 0 are presumably the whole-section
    # (SECTION_CONTENTS) runs, which are not part of this fit.
    # `assign` builds a new frame, so nothing is written into a filtered slice.
    window_runs = runs[runs["sentences_per_prompt"] > 0].assign(
        bad_triples=lambda frame: frame["bad_triples"] / frame["total_triples"]  # ratio over the total
    )

    # groupby sorts its keys, so the means come out ordered by window size.
    avg = (
        window_runs.groupby("sentences_per_prompt")["bad_triples"]
            .mean()
            .reset_index()
    )

    x = avg["sentences_per_prompt"]
    slope, intercept, _r_value, p_value, _std_err = linregress(x, avg["bad_triples"])

    return slope, p_value, slope * x + intercept

In [None]:
def plot_error_rates_bars_stddev(csv_path, model_name, type_of_text):
    """Plot the error ratio against window size for one model and text type.

    The error ratio of a row is ``bad_triples / total_triples``. Layers, in
    drawing order: a dashed regression line through the per-window mean
    ratios (from ``get_error_rate_fit``), one bar per overlap setting, a dot
    at each per-window mean and a range drawn with ``so.Est(errorbar="sd")``.

    Parameters
    ----------
    csv_path : str
        Results CSV with the columns ``sentences_per_prompt``, ``overlap``,
        ``bad_triples`` and ``total_triples``.
    model_name : str
        Model label used in the title and the output file name.
    type_of_text : str
        Dataset label used in the title and the output file name.

    Returns
    -------
    matplotlib.figure.Figure
        The rendered figure, also saved as
        ``error_rates_test_<model_name>_<type_of_text>_bars.png``
        (lowercase, spaces replaced by underscores).
    """
    df = pd.read_csv(csv_path).sort_values(by=['sentences_per_prompt', 'overlap'])
    # Keep only the sliding-window rows; sentences_per_prompt == 0 presumably marks the
    # whole-section (SECTION_CONTENTS) runs. Copy so the column assignments below
    # do not write into a slice of the original frame.
    df = df[df["sentences_per_prompt"] > 0].copy()
    df['overlap'] = df['overlap'].astype(int)

    # Show the ratio over the total
    df['bad_triples'] = df['bad_triples'] / df['total_triples']

    # Calculate per-group average
    avg = (
        df.groupby(['sentences_per_prompt'])['bad_triples']
            .mean()
            .reset_index()
            .rename(columns={"bad_triples": "avg"})
    )

    # Legend label for each overlap setting
    df["Overlap ($m$)"] = df['overlap'].apply(lambda x: f"{x} Ctx. Sentences" if x != 0 else "No Context")

    slope, p_value, regression_fit = get_error_rate_fit(csv_path)

    # The fit is computed on the same per-window grouping, so it lines up row by row with `avg`.
    fit_df = pd.DataFrame({
        "sentences_per_prompt": avg["sentences_per_prompt"],
        "fitted": regression_fit
    })

    # NOTE(review): the title reports the slope as $m$, the same symbol the legend uses for the overlap - confirm the intended notation.
    # NOTE(review): layout requests 16x18 while the Figure created below is 14x12 - confirm which size is intended.
    plot = (
        so.Plot(df, x="sentences_per_prompt", y="bad_triples")
            .add(so.Line(linestyle='--', color='dimgrey'), data=fit_df, x="sentences_per_prompt", y="fitted")  # Regression fit

            .add(so.Bar(), so.Dodge(empty="drop"), color="Overlap ($m$)", legend=True)

            .add(so.Dot(pointsize=5, color="black"), data=avg, x="sentences_per_prompt", y="avg")
            .add(so.Range(linewidth=2, color="black"), so.Est(errorbar="sd"), data=df[["sentences_per_prompt", "bad_triples"]])

            .scale(color="Paired", x=so.Continuous().tick(every=1), y=so.Continuous().tick(every=0.002))
            .layout(size=(16, 18), engine="tight")
            .label(x="Sentences per Prompt ($k$)", y="Avg. Incorrect Triples (Ratio Over Total)", legend="Overlap ($m$)", 
                   title=f"Sliding Window Strategy Effect on the Error Rate\n({model_name}, {type_of_text}) ($m$={slope:.3f}, $p$={p_value:.1e})")
    )

    plot = plot.theme({
        "xtick.labelsize": 32,
        "ytick.labelsize": 32,
        "axes.labelsize": 32,
        "axes.titlesize": 34,
        "legend.title_fontsize": 26,
        "legend.fontsize": 26,
        "axes.titlepad": 25,
        "axes.labelpad": 10,
        **axes_style("ticks"), # White background, ticks on axes, no grid
        "axes.grid": True, # Add the grid
    })

    # Hack to be able to move the legend, seaborn objects doesn't allow this properly yet
    f = mpl.figure.Figure(figsize=(14, 12))
    plot.on(f).plot()
    f.legends[0].set_bbox_to_anchor((0.14, 0.66))
    f.axes[0].set_ylim(0, 0.03)

    f.canvas.flush_events()
    f.savefig(f"error_rates_test_{model_name.lower().replace(' ', '_')}_{type_of_text.lower().replace(' ', '_')}_bars.png",  bbox_inches='tight')
    return f


# Gemma 9B
plot_error_rates_bars_stddev(
    csv_path="results_paper_triples/biomedical_papers/gemma_9b/paper_triples_results_clean.csv",
    model_name="Gemma 9B",
    type_of_text="Biomed. Papers",
)
plot_error_rates_bars_stddev(
    csv_path="results_paper_triples/docred/gemma_9b/paper_triples_results_clean.csv",
    model_name="Gemma 9B",
    type_of_text="DocRED",
)

# Llama 8B
plot_error_rates_bars_stddev(
    csv_path="results_paper_triples/docred/llama_8b/paper_triples_results_clean.csv",
    model_name="Llama 8B",
    type_of_text="DocRED",
)
plot_error_rates_bars_stddev(
    csv_path="results_paper_triples/biomedical_papers/llama_8b/paper_triples_results_clean.csv",
    model_name="Llama 8B",
    type_of_text="Biomed. Papers",
)

# Phi 3B (the final call stays the cell's last expression, so its figure is the cell output)
plot_error_rates_bars_stddev(
    csv_path="results_paper_triples/biomedical_papers/phi_3b/paper_triples_results_clean.csv",
    model_name="Phi 3B",
    type_of_text="Biomed. Papers",
)
plot_error_rates_bars_stddev(
    csv_path="results_paper_triples/docred/phi_3b/paper_triples_results_clean.csv",
    model_name="Phi 3B",
    type_of_text="DocRED",
)