In [None]:
import matplotlib as mpl
import seaborn as sns

# Seaborn theme for paper-sized figures, with a colorblind-friendly palette.
sns.set_theme(style="whitegrid", context="paper", font_scale=1.2)
sns.set_palette("colorblind")

# Matplotlib overrides, applied after the seaborn theme so they take precedence.
mpl.rcParams.update({
    # Axes text
    'axes.labelweight': 'bold',
    'axes.titlesize': 'x-large',
    'axes.labelsize': 'x-large',
    # Tick and legend text
    'xtick.labelsize': 'large',
    'ytick.labelsize': 'large',
    'legend.fontsize': 'large',
    # Resolution: 100 dpi for displayed figures, 300 dpi for saved files
    'figure.dpi': 100,
    'savefig.dpi': 300,
    # Font selection; entries are tried in the listed order
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial', 'DejaVu Sans', 'Liberation Sans', 'sans-serif'],
})


In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Context combinations compared in this notebook. Each entry is
# (combo key, human-readable description, color name).
# The combo keys match the column prefixes used by the result tables
# (e.g. "combo1_exact_match_rate").
# NOTE(review): the plotting functions in this notebook hard-code their own
# combo keys, short labels and colors, so this table does not appear to be
# read by any visible cell — confirm whether it is still needed.
combinations = [
    ("combo1", "Current prompt only", "blue"),
    ("combo2", "Current + preceding prompt", "green"),
    ("combo3", "Current + preceding + previous memories (same conversation)", "red"),
    ("combo4", "Current + preceding + previous memories + cross-conversation", "purple"),

]

In [None]:
# Load the exact-match results table and preview its first rows.
string_level_results = pd.read_csv(
    filepath_or_buffer="../dummy_data/provenance_of_memories/exact_match_results.csv"
)
string_level_results.head()

In [None]:
# Mean of the combo1 exact-match-rate column (pandas skips NaNs by default).
avg_exact_match_combo1 = string_level_results.loc[:, "combo1_exact_match_rate"].mean()
print(f"Average exact match for combo1: {avg_exact_match_combo1:.4f}")


In [None]:
def create_exact_match_violin_plots(df, plots_dir):
    """Create and save a violin plot for each exact-match metric.

    For every ``(metric, display name)`` pair in the local ``metrics`` list,
    the columns ``combo1_<metric>`` .. ``combo4_<metric>`` of ``df`` are drawn
    as side-by-side violins (NaNs dropped), annotated with their means, and
    saved as ``<plots_dir>/<metric>_violin_comparison.pdf``.

    Args:
        df: DataFrame with one ``<combo>_<metric>`` column per combination.
        plots_dir: Output directory for the PDF; created if it does not exist.
            An empty value is treated as the current working directory.

    Raises:
        KeyError: If an expected ``<combo>_<metric>`` column is absent.
        ValueError: If a column has no non-NaN values to plot.
    """
    combo_keys = ['combo1', 'combo2', 'combo3', 'combo4']
    colors = ['lightblue', 'lightgreen', 'salmon', 'plum']
    labels = ['CM', 'CC', 'CLM', 'FUH']
    positions = range(1, len(combo_keys) + 1)

    # (column suffix, axis label) for every metric to plot
    metrics = [
        ('exact_match_rate', 'Exact Match Rate'),
    ]

    # Make sure the target directory exists before any figure is written.
    out_dir = plots_dir or "."
    os.makedirs(out_dir, exist_ok=True)

    for metric, metric_name in metrics:
        print(f"Creating violin plot for {metric_name}")

        columns = [f'{combo}_{metric}' for combo in combo_keys]
        data_to_plot = [df[col].dropna() for col in columns]

        # violinplot cannot draw an empty dataset; fail early and name the column.
        empty_cols = [col for col, values in zip(columns, data_to_plot) if values.empty]
        if empty_cols:
            raise ValueError(f"No non-NaN values to plot in column(s): {empty_cols}")

        fig, ax = plt.subplots(1, 1, figsize=(6, 4))
        parts = ax.violinplot(data_to_plot, positions=positions,
                              showmeans=True, showmedians=True, showextrema=True)

        # One fill color per violin body.
        for body, color in zip(parts['bodies'], colors):
            body.set_facecolor(color)
            body.set_alpha(0.7)

        ax.set_ylabel(metric_name)
        ax.set_xticks(positions)
        ax.set_xticklabels(labels, fontsize=12)
        ax.grid(True, alpha=0.3)

        # Exact-match rate gets a tight y-range; any other metric added to
        # `metrics` falls back to the looser range.
        if metric == 'exact_match_rate':
            ax.set_ylim(-0.01, 1.01)
        else:
            ax.set_ylim(0, 1.1)

        # Label each violin with its mean, placed slightly above the mean value.
        for pos, values in zip(positions, data_to_plot):
            mean_val = values.mean()
            ax.text(pos, mean_val + 0.05, f'μ={mean_val:.3f}',
                    ha='center', va='bottom', fontweight='bold', fontsize=11)

        # Operate on this figure explicitly instead of pyplot's "current figure".
        fig.tight_layout()
        fig.savefig(os.path.join(out_dir, f"{metric}_violin_comparison.pdf"))
        
    

In [None]:
# Draw the exact-match violin plot and save the PDF next to the notebook's working directory.
create_exact_match_violin_plots(df=string_level_results, plots_dir="./")

### BLEU Score

In [None]:
# Load the BLEU results table and preview its first rows.
bleu_results = pd.read_csv(
    filepath_or_buffer="../dummy_data/provenance_of_memories/bleu_results.csv"
)
bleu_results.head()


In [None]:
def create_bleu_comparison_plots(df, plots_dir):
    """Create and save a violin plot of BLEU unigram precision per combination.

    The columns ``combo1_bleu_unigram_precision`` ..
    ``combo4_bleu_unigram_precision`` of ``df`` are drawn as side-by-side
    violins (NaNs dropped), annotated with their means, and saved as
    ``<plots_dir>/bleu_unigram_precision_violin_comparison.pdf``.

    Args:
        df: DataFrame with one ``<combo>_bleu_unigram_precision`` column per
            combination.
        plots_dir: Output directory for the PDF; created if it does not exist.
            An empty value is treated as the current working directory.

    Raises:
        KeyError: If an expected column is absent.
        ValueError: If a column has no non-NaN values to plot.
    """
    combo_keys = ['combo1', 'combo2', 'combo3', 'combo4']
    colors = ['lightblue', 'lightgreen', 'salmon', 'plum']
    labels = ['CM', 'CC', 'CLM', 'FUH']
    positions = range(1, len(combo_keys) + 1)

    metric = 'bleu_unigram_precision'
    metric_name = 'BLEU Unigram Precision'

    print(f"Creating violin plot for {metric_name}")

    columns = [f'{combo}_{metric}' for combo in combo_keys]
    data_to_plot = [df[col].dropna() for col in columns]

    # violinplot cannot draw an empty dataset; fail early and name the column.
    empty_cols = [col for col, values in zip(columns, data_to_plot) if values.empty]
    if empty_cols:
        raise ValueError(f"No non-NaN values to plot in column(s): {empty_cols}")

    fig, ax = plt.subplots(1, 1, figsize=(6, 4))
    parts = ax.violinplot(data_to_plot, positions=positions,
                          showmeans=True, showmedians=True, showextrema=True)

    # One fill color per violin body.
    for body, color in zip(parts['bodies'], colors):
        body.set_facecolor(color)
        body.set_alpha(0.7)

    ax.set_ylabel(metric_name)
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, fontsize=12)
    ax.grid(True, alpha=0.3)
    ax.set_ylim(0, 1.1)

    # Label each violin with its mean, placed slightly above the mean value.
    for pos, values in zip(positions, data_to_plot):
        mean_val = values.mean()
        ax.text(pos, mean_val + 0.05, f'μ={mean_val:.3f}',
                ha='center', va='bottom', fontweight='bold', fontsize=11)

    # Create the output directory on demand so saving cannot fail on a missing path.
    out_dir = plots_dir or "."
    os.makedirs(out_dir, exist_ok=True)

    # Operate on this figure explicitly instead of pyplot's "current figure".
    fig.tight_layout()
    fig.savefig(os.path.join(out_dir, f"{metric}_violin_comparison.pdf"))

In [None]:
# Draw the BLEU unigram-precision violin plot and save the PDF under "./".
create_bleu_comparison_plots(df=bleu_results, plots_dir="./")

### Model-based analysis

In [None]:
# Load the model-based agreement results table and preview its first rows.
model_results = pd.read_csv(
    filepath_or_buffer="../dummy_data/provenance_of_memories/model_based_agreement_results.csv"
)
model_results.head()

In [None]:
import numpy as np
import seaborn as sns

def create_agreement_score_box_plot(df, plots_dir):
    """
    Create and save a box plot of the model-based agreement_score (1-5 scale)
    for combo1..combo3.

    The columns ``combo1_agreement_score`` .. ``combo3_agreement_score`` of
    ``df`` are reshaped into a long-form frame (NaNs dropped), drawn as one
    box per combination, annotated with their means, and saved as
    ``<plots_dir>/agreement_score_summary_boxplot.pdf``.

    Args:
        df: DataFrame with one ``<combo>_agreement_score`` column per
            combination.
        plots_dir: Output directory for the PDF; created if it does not exist.
            An empty value is treated as the current working directory.

    Raises:
        KeyError: If an expected ``<combo>_agreement_score`` column is absent.
    """
    combo_keys = ['combo1', 'combo2', 'combo3']
    labels = ['CM', 'CC', 'CLM']
    # Exactly one color per box: recent seaborn versions warn when the palette
    # list is longer than the number of categories.
    colors = ['lightblue', 'lightgreen', 'salmon']
    metric = 'agreement_score'
    metric_name = 'LLM Agreement'

    scores = [df[f'{combo}_{metric}'].dropna() for combo in combo_keys]

    print(f"Creating summary boxplot for {metric_name}")
    fig, ax = plt.subplots(figsize=(6, 4))

    # Long-form frame: one row per score, tagged with its combination label.
    plot_df = pd.DataFrame({
        'Agreement Score': np.concatenate(scores),
        'Combination': np.repeat(labels, [len(values) for values in scores]),
    })

    # NOTE(review): seaborn >= 0.13 prefers `hue='Combination', legend=False`
    # over a bare `palette`; kept as-is so older seaborn versions still work.
    sns.boxplot(
        x='Combination',
        y='Agreement Score',
        data=plot_df,
        # Pin the category order so box i always belongs to labels[i], even if
        # one combination has no rows; the mean labels below rely on this.
        order=labels,
        palette=colors,
        ax=ax,
        width=0.4,
        boxprops=dict(alpha=0.7),
        showfliers=True,
        showcaps=True,
        medianprops=dict(color='black')
    )

    # The tick labels already name the combinations, so drop the x-axis title.
    ax.set_xlabel("")
    ax.set_ylabel(metric_name)
    ax.set_yticks([1, 2, 3, 4, 5])
    ax.set_ylim(0.75, 5.25)
    ax.grid(True, alpha=0.3, axis='y')

    # Label each box with its mean; seaborn places categories at x = 0, 1, 2, ...
    for pos, values in enumerate(scores):
        if values.empty:
            continue  # no data for this combination, so there is no mean to show
        mean_val = values.mean()
        ax.text(pos, mean_val + 0.15, f'μ={mean_val:.2f}', ha='center', va='bottom',
                fontweight='bold', fontsize=11, color='black')

    # Create the output directory on demand so saving cannot fail on a missing path.
    out_dir = plots_dir or "."
    os.makedirs(out_dir, exist_ok=True)

    # Operate on this figure explicitly instead of pyplot's "current figure".
    fig.tight_layout()
    fig.savefig(os.path.join(out_dir, f"{metric}_summary_boxplot.pdf"))

In [None]:
# Draw the agreement-score box plot and save the PDF under "./".
create_agreement_score_box_plot(df=model_results, plots_dir="./")

### Semantic Similarity

In [None]:
# Load the semantic-similarity results table and preview its first rows.
sem_results = pd.read_csv(
    filepath_or_buffer="../dummy_data/provenance_of_memories/semantic_similarity_results.csv"
)
sem_results.head()


In [None]:
def create_semantic_comparison_plots(df, plots_dir):
    """Create and save a violin plot of semantic similarity per combination.

    The columns ``combo1_semantic_similarity`` .. ``combo4_semantic_similarity``
    of ``df`` are drawn as side-by-side violins (NaNs dropped), annotated with
    their means, and saved as
    ``<plots_dir>/semantic_similarity_violin_comparison.pdf``.

    Args:
        df: DataFrame with one ``<combo>_semantic_similarity`` column per
            combination.
        plots_dir: Output directory for the PDF; created if it does not exist.
            An empty value is treated as the current working directory.

    Raises:
        KeyError: If an expected column is absent.
        ValueError: If a column has no non-NaN values to plot.
    """
    combo_keys = ['combo1', 'combo2', 'combo3', 'combo4']
    colors = ['lightblue', 'lightgreen', 'salmon', 'plum']
    labels = ['CM', 'CC', 'CLM', 'FUH']
    positions = range(1, len(combo_keys) + 1)

    metric = 'semantic_similarity'
    metric_name = 'Semantic Similarity'

    print(f"Creating violin plot for {metric_name}")

    # Collect the non-NaN values of each combination's column, keyed by column name.
    series_by_column = {
        f'{combo}_{metric}': df[f'{combo}_{metric}'].dropna() for combo in combo_keys
    }

    # violinplot cannot draw an empty dataset; fail early and name the column.
    empty_cols = [col for col, values in series_by_column.items() if values.empty]
    if empty_cols:
        raise ValueError(f"No non-NaN values to plot in column(s): {empty_cols}")

    data_to_plot = list(series_by_column.values())

    fig, ax = plt.subplots(1, 1, figsize=(6, 4))
    parts = ax.violinplot(data_to_plot, positions=positions,
                          showmeans=True, showmedians=True, showextrema=True)

    # One fill color per violin body.
    for body, color in zip(parts['bodies'], colors):
        body.set_facecolor(color)
        body.set_alpha(0.7)

    ax.set_ylabel(metric_name)
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, fontsize=12)
    ax.grid(True, alpha=0.3)
    ax.set_ylim(0, 1.1)

    # Label each violin with its mean, placed slightly above the mean value.
    for pos, values in zip(positions, data_to_plot):
        mean_val = values.mean()
        ax.text(pos, mean_val + 0.05, f'μ={mean_val:.3f}',
                ha='center', va='bottom', fontweight='bold', fontsize=11)

    # Create the output directory on demand so saving cannot fail on a missing path.
    out_dir = plots_dir or "."
    os.makedirs(out_dir, exist_ok=True)

    # Operate on this figure explicitly instead of pyplot's "current figure".
    fig.tight_layout()
    fig.savefig(os.path.join(out_dir, f"{metric}_violin_comparison.pdf"))


In [None]:
    
# Draw the semantic-similarity violin plot and save the PDF under "./".
create_semantic_comparison_plots(df=sem_results, plots_dir="./")