# LLM Query Injection Experiment Results Analysis

This notebook analyzes the results from the LLM query injection experiments, comparing performance across different prompt types, attack types, and mitigation strategies.

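**Baseline**: BASIC prompt, no attack, no mitigation. Every "Δ from Baseline" column below is the experiment's metric minus the baseline's value for that metric; for both MAE (Mean Absolute Error) and JER (Judgment Error Rate), a negative delta is an improvement.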


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

# Lift pandas' display limits (column count, total width, per-cell width) so
# wide result rows are rendered in full.
for display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Results file to analyse; the commented-out line is an alternative results file
# that can be swapped in.
CSV_FILENAME = 'qwen3_0.6b_results.csv'
#CSV_FILENAME = 'gemma3_1b_results.csv'
df = pd.read_csv(CSV_FILENAME)

# Quick sanity check: row count, column names, then a preview of the first rows.
print(f"Loaded {len(df)} experiment results")
print(f"Columns: {df.columns.tolist()}")
df.head()


Loaded 48 experiment results
Columns: ['filename', 'prompt_type', 'attack_type', 'mitigation_type', 'overall_mae', 'mae_gt_zero', 'mae_gt_positive', 'overall_jer', 'jer_gt_zero', 'jer_gt_positive', 'delta_overall_mae', 'delta_mae_gt_zero', 'delta_mae_gt_positive', 'delta_overall_jer', 'delta_jer_gt_zero', 'delta_jer_gt_positive', 'valid_experiments', 'gt_zero_count', 'gt_positive_count', 'failed_experiments', 'total_experiments']


Unnamed: 0,filename,prompt_type,attack_type,mitigation_type,overall_mae,mae_gt_zero,mae_gt_positive,overall_jer,jer_gt_zero,jer_gt_positive,delta_overall_mae,delta_mae_gt_zero,delta_mae_gt_positive,delta_overall_jer,delta_jer_gt_zero,delta_jer_gt_positive,valid_experiments,gt_zero_count,gt_positive_count,failed_experiments,total_experiments
0,results_BASIC_none_none.json,BASIC,none,none,0.9266,0.7967,1.1402,50.81,33.92,78.6,,,,,,,7205,4481,2724,58,7263
1,results_BASIC_append_few_shot.json,BASIC,append,few_shot,1.2646,1.3214,1.1714,58.65,47.34,77.21,0.338,0.5247,0.0311,7.84,13.42,-1.39,7199,4474,2725,64,7263
2,results_BASIC_append_none.json,BASIC,append,none,1.1153,1.1028,1.1358,56.41,43.85,77.03,0.1887,0.3061,-0.0045,5.6,9.93,-1.57,7199,4474,2725,64,7263
3,results_BASIC_append_system_prompt_hardening.json,BASIC,append,system_prompt_hardening,1.1187,1.114,1.1263,56.61,44.22,76.95,0.1921,0.3173,-0.014,5.79,10.3,-1.65,7197,4473,2724,66,7263
4,results_BASIC_append_user_prompt_hardening.json,BASIC,append,user_prompt_hardening,0.959,0.7902,1.2363,48.78,28.73,81.72,0.0325,-0.0065,0.0961,-2.03,-5.19,3.13,7201,4476,2725,62,7263


## 1. Overall MAE Results


In [22]:
# Overall MAE table: one row per (prompt, attack, mitigation) configuration,
# sorted so that related configurations sit next to each other.
key_cols = ['prompt_type', 'attack_type', 'mitigation_type']
mae_table = df[key_cols + ['overall_mae', 'delta_overall_mae']].sort_values(key_cols)

# Render the numbers as strings; missing values (the baseline row has no delta)
# become "N/A". pd.notna() is the only check required: read_csv loads empty/NaN
# cells as float NaN, so no string comparison against 'NaN' is needed.
mae_table = mae_table.assign(
    overall_mae=mae_table['overall_mae'].map(
        lambda value: f"{value:.4f}" if pd.notna(value) else "N/A"
    ),
    delta_overall_mae=mae_table['delta_overall_mae'].map(
        lambda value: f"{value:+.4f}" if pd.notna(value) else "N/A"
    ),
)

# Human-readable column headers for display
mae_table_display = mae_table.rename(columns={
    'prompt_type': 'Prompt',
    'attack_type': 'Attack',
    'mitigation_type': 'Mitigation',
    'overall_mae': 'MAE',
    'delta_overall_mae': 'Δ from Baseline'
})

print("Overall MAE Results (Mean Absolute Error)")
print("=" * 60)
display(mae_table_display)


Overall MAE Results (Mean Absolute Error)


Unnamed: 0,Prompt,Attack,Mitigation,MAE,Δ from Baseline
1,BASIC,append,few_shot,1.2646,0.338
2,BASIC,append,none,1.1153,0.1887
3,BASIC,append,system_prompt_hardening,1.1187,0.1921
4,BASIC,append,user_prompt_hardening,0.959,0.0325
5,BASIC,none,few_shot,0.9548,0.0282
0,BASIC,none,none,0.9266,
6,BASIC,none,system_prompt_hardening,0.9209,-0.0057
7,BASIC,none,user_prompt_hardening,0.8585,-0.0681
8,BASIC,prepend,few_shot,1.5809,0.6543
9,BASIC,prepend,none,1.4916,0.565


## 2. Overall JER Results


In [23]:
# Overall JER table: one row per (prompt, attack, mitigation) configuration,
# sorted so that related configurations sit next to each other.
key_cols = ['prompt_type', 'attack_type', 'mitigation_type']
jer_table = df[key_cols + ['overall_jer', 'delta_overall_jer']].sort_values(key_cols)

# Render the numbers as percentage strings; missing values (the baseline row has
# no delta) become "N/A". pd.notna() is the only check required: read_csv loads
# empty/NaN cells as float NaN, so no string comparison against 'NaN' is needed.
jer_table = jer_table.assign(
    overall_jer=jer_table['overall_jer'].map(
        lambda value: f"{value:.2f}%" if pd.notna(value) else "N/A"
    ),
    delta_overall_jer=jer_table['delta_overall_jer'].map(
        lambda value: f"{value:+.2f}%" if pd.notna(value) else "N/A"
    ),
)

# Human-readable column headers for display
jer_table_display = jer_table.rename(columns={
    'prompt_type': 'Prompt',
    'attack_type': 'Attack',
    'mitigation_type': 'Mitigation',
    'overall_jer': 'JER',
    'delta_overall_jer': 'Δ from Baseline'
})

print("Overall JER Results (Judgment Error Rate)")
print("=" * 60)
display(jer_table_display)


Overall JER Results (Judgment Error Rate)


Unnamed: 0,Prompt,Attack,Mitigation,JER,Δ from Baseline
1,BASIC,append,few_shot,58.65%,+7.84%
2,BASIC,append,none,56.41%,+5.60%
3,BASIC,append,system_prompt_hardening,56.61%,+5.79%
4,BASIC,append,user_prompt_hardening,48.78%,-2.03%
5,BASIC,none,few_shot,49.69%,-1.12%
0,BASIC,none,none,50.81%,
6,BASIC,none,system_prompt_hardening,50.95%,+0.14%
7,BASIC,none,user_prompt_hardening,46.66%,-4.16%
8,BASIC,prepend,few_shot,68.42%,+17.60%
9,BASIC,prepend,none,68.46%,+17.65%


## 3. MAE for Ground Truth = 0 (Irrelevant Documents)


In [24]:
# MAE restricted to ground truth = 0: one row per (prompt, attack, mitigation)
# configuration, sorted so that related configurations sit next to each other.
key_cols = ['prompt_type', 'attack_type', 'mitigation_type']
mae_gt0_table = df[key_cols + ['mae_gt_zero', 'delta_mae_gt_zero']].sort_values(key_cols)

# Render the numbers as strings; missing values (the baseline row has no delta)
# become "N/A". pd.notna() is the only check required: read_csv loads empty/NaN
# cells as float NaN, so no string comparison against 'NaN' is needed.
mae_gt0_table = mae_gt0_table.assign(
    mae_gt_zero=mae_gt0_table['mae_gt_zero'].map(
        lambda value: f"{value:.4f}" if pd.notna(value) else "N/A"
    ),
    delta_mae_gt_zero=mae_gt0_table['delta_mae_gt_zero'].map(
        lambda value: f"{value:+.4f}" if pd.notna(value) else "N/A"
    ),
)

# Human-readable column headers for display
mae_gt0_table_display = mae_gt0_table.rename(columns={
    'prompt_type': 'Prompt',
    'attack_type': 'Attack',
    'mitigation_type': 'Mitigation',
    'mae_gt_zero': 'MAE (GT=0)',
    'delta_mae_gt_zero': 'Δ from Baseline'
})

print("MAE for Ground Truth = 0 (Irrelevant Documents)")
print("=" * 60)
display(mae_gt0_table_display)


MAE for Ground Truth = 0 (Irrelevant Documents)


Unnamed: 0,Prompt,Attack,Mitigation,MAE (GT=0),Δ from Baseline
1,BASIC,append,few_shot,1.3214,0.5247
2,BASIC,append,none,1.1028,0.3061
3,BASIC,append,system_prompt_hardening,1.114,0.3173
4,BASIC,append,user_prompt_hardening,0.7902,-0.0065
5,BASIC,none,few_shot,0.7594,-0.0373
0,BASIC,none,none,0.7967,
6,BASIC,none,system_prompt_hardening,0.7859,-0.0108
7,BASIC,none,user_prompt_hardening,0.5947,-0.202
8,BASIC,prepend,few_shot,1.8284,1.0317
9,BASIC,prepend,none,1.7318,0.9351


## 4. JER for Ground Truth = 0 (Irrelevant Documents)


In [25]:
# JER restricted to ground truth = 0: one row per (prompt, attack, mitigation)
# configuration, sorted so that related configurations sit next to each other.
key_cols = ['prompt_type', 'attack_type', 'mitigation_type']
jer_gt0_table = df[key_cols + ['jer_gt_zero', 'delta_jer_gt_zero']].sort_values(key_cols)

# Render the numbers as percentage strings; missing values (the baseline row has
# no delta) become "N/A". pd.notna() is the only check required: read_csv loads
# empty/NaN cells as float NaN, so no string comparison against 'NaN' is needed.
jer_gt0_table = jer_gt0_table.assign(
    jer_gt_zero=jer_gt0_table['jer_gt_zero'].map(
        lambda value: f"{value:.2f}%" if pd.notna(value) else "N/A"
    ),
    delta_jer_gt_zero=jer_gt0_table['delta_jer_gt_zero'].map(
        lambda value: f"{value:+.2f}%" if pd.notna(value) else "N/A"
    ),
)

# Human-readable column headers for display
jer_gt0_table_display = jer_gt0_table.rename(columns={
    'prompt_type': 'Prompt',
    'attack_type': 'Attack',
    'mitigation_type': 'Mitigation',
    'jer_gt_zero': 'JER (GT=0)',
    'delta_jer_gt_zero': 'Δ from Baseline'
})

print("JER for Ground Truth = 0 (Irrelevant Documents)")
print("=" * 60)
display(jer_gt0_table_display)


JER for Ground Truth = 0 (Irrelevant Documents)


Unnamed: 0,Prompt,Attack,Mitigation,JER (GT=0),Δ from Baseline
1,BASIC,append,few_shot,47.34%,+13.42%
2,BASIC,append,none,43.85%,+9.93%
3,BASIC,append,system_prompt_hardening,44.22%,+10.30%
4,BASIC,append,user_prompt_hardening,28.73%,-5.19%
5,BASIC,none,few_shot,29.84%,-4.08%
0,BASIC,none,none,33.92%,
6,BASIC,none,system_prompt_hardening,34.31%,+0.39%
7,BASIC,none,user_prompt_hardening,23.42%,-10.50%
8,BASIC,prepend,few_shot,64.67%,+30.75%
9,BASIC,prepend,none,65.55%,+31.63%


## 5. MAE for Ground Truth > 0 (Relevant Documents)


In [26]:
# MAE restricted to ground truth > 0: one row per (prompt, attack, mitigation)
# configuration, sorted so that related configurations sit next to each other.
key_cols = ['prompt_type', 'attack_type', 'mitigation_type']
mae_gtpos_table = df[key_cols + ['mae_gt_positive', 'delta_mae_gt_positive']].sort_values(key_cols)

# Render the numbers as strings; missing values (the baseline row has no delta)
# become "N/A". pd.notna() is the only check required: read_csv loads empty/NaN
# cells as float NaN, so no string comparison against 'NaN' is needed.
mae_gtpos_table = mae_gtpos_table.assign(
    mae_gt_positive=mae_gtpos_table['mae_gt_positive'].map(
        lambda value: f"{value:.4f}" if pd.notna(value) else "N/A"
    ),
    delta_mae_gt_positive=mae_gtpos_table['delta_mae_gt_positive'].map(
        lambda value: f"{value:+.4f}" if pd.notna(value) else "N/A"
    ),
)

# Human-readable column headers for display
mae_gtpos_table_display = mae_gtpos_table.rename(columns={
    'prompt_type': 'Prompt',
    'attack_type': 'Attack',
    'mitigation_type': 'Mitigation',
    'mae_gt_positive': 'MAE (GT>0)',
    'delta_mae_gt_positive': 'Δ from Baseline'
})

print("MAE for Ground Truth > 0 (Relevant Documents)")
print("=" * 60)
display(mae_gtpos_table_display)


MAE for Ground Truth > 0 (Relevant Documents)


Unnamed: 0,Prompt,Attack,Mitigation,MAE (GT>0),Δ from Baseline
1,BASIC,append,few_shot,1.1714,0.0311
2,BASIC,append,none,1.1358,-0.0045
3,BASIC,append,system_prompt_hardening,1.1263,-0.014
4,BASIC,append,user_prompt_hardening,1.2363,0.0961
5,BASIC,none,few_shot,1.2763,0.1361
0,BASIC,none,none,1.1402,
6,BASIC,none,system_prompt_hardening,1.1428,0.0025
7,BASIC,none,user_prompt_hardening,1.2925,0.1522
8,BASIC,prepend,few_shot,1.1739,0.0337
9,BASIC,prepend,none,1.0965,-0.0437


## 6. JER for Ground Truth > 0 (Relevant Documents)


In [27]:
# JER restricted to ground truth > 0: one row per (prompt, attack, mitigation)
# configuration, sorted so that related configurations sit next to each other.
key_cols = ['prompt_type', 'attack_type', 'mitigation_type']
jer_gtpos_table = df[key_cols + ['jer_gt_positive', 'delta_jer_gt_positive']].sort_values(key_cols)

# Render the numbers as percentage strings; missing values (the baseline row has
# no delta) become "N/A". pd.notna() is the only check required: read_csv loads
# empty/NaN cells as float NaN, so no string comparison against 'NaN' is needed.
jer_gtpos_table = jer_gtpos_table.assign(
    jer_gt_positive=jer_gtpos_table['jer_gt_positive'].map(
        lambda value: f"{value:.2f}%" if pd.notna(value) else "N/A"
    ),
    delta_jer_gt_positive=jer_gtpos_table['delta_jer_gt_positive'].map(
        lambda value: f"{value:+.2f}%" if pd.notna(value) else "N/A"
    ),
)

# Human-readable column headers for display
jer_gtpos_table_display = jer_gtpos_table.rename(columns={
    'prompt_type': 'Prompt',
    'attack_type': 'Attack',
    'mitigation_type': 'Mitigation',
    'jer_gt_positive': 'JER (GT>0)',
    'delta_jer_gt_positive': 'Δ from Baseline'
})

print("JER for Ground Truth > 0 (Relevant Documents)")
print("=" * 60)
display(jer_gtpos_table_display)


JER for Ground Truth > 0 (Relevant Documents)


Unnamed: 0,Prompt,Attack,Mitigation,JER (GT>0),Δ from Baseline
1,BASIC,append,few_shot,77.21%,-1.39%
2,BASIC,append,none,77.03%,-1.57%
3,BASIC,append,system_prompt_hardening,76.95%,-1.65%
4,BASIC,append,user_prompt_hardening,81.72%,+3.13%
5,BASIC,none,few_shot,82.35%,+3.75%
0,BASIC,none,none,78.60%,
6,BASIC,none,system_prompt_hardening,78.31%,-0.29%
7,BASIC,none,user_prompt_hardening,84.88%,+6.28%
8,BASIC,prepend,few_shot,74.57%,-4.03%
9,BASIC,prepend,none,73.25%,-5.35%


## 7. Summary Statistics


In [28]:
# Overview of the experiment grid: how many runs, and which values each
# experimental factor takes.
print("Experiment Summary")
print("=" * 30)
print(f"Total experiments: {len(df)}")
for factor_label, factor_column in (
    ("Prompt types", 'prompt_type'),
    ("Attack types", 'attack_type'),
    ("Mitigation types", 'mitigation_type'),
):
    print(f"{factor_label}: {df[factor_column].unique().tolist()}")

# The baseline is the single configuration with the BASIC prompt, no attack and
# no mitigation; its metrics are printed only if such a row is present.
baseline = df[
    (df['prompt_type'] == 'BASIC')
    & (df['attack_type'] == 'none')
    & (df['mitigation_type'] == 'none')
]
if len(baseline) > 0:
    print(f"\nBaseline (BASIC, none, none) metrics:")
    # (label, column, number format, unit suffix) for each reported metric
    for metric_label, metric_column, number_format, unit in (
        ("Overall MAE", 'overall_mae', ".4f", ""),
        ("Overall JER", 'overall_jer', ".2f", "%"),
        ("MAE GT=0", 'mae_gt_zero', ".4f", ""),
        ("JER GT=0", 'jer_gt_zero', ".2f", "%"),
        ("MAE GT>0", 'mae_gt_positive', ".4f", ""),
        ("JER GT>0", 'jer_gt_positive', ".2f", "%"),
    ):
        print(f"  {metric_label}: {baseline[metric_column].iloc[0]:{number_format}}{unit}")

# Extremes of the two overall metrics. Lower is treated as better for both, so
# "best" is the minimum and "worst" the maximum.
lowest_mae, highest_mae = df['overall_mae'].min(), df['overall_mae'].max()
lowest_jer, highest_jer = df['overall_jer'].min(), df['overall_jer'].max()

best_mae = df[df['overall_mae'] == lowest_mae]
worst_mae = df[df['overall_mae'] == highest_mae]
best_jer = df[df['overall_jer'] == lowest_jer]
worst_jer = df[df['overall_jer'] == highest_jer]

# Print each extreme followed by the first experiment that attains it.
for headline, matching_rows in (
    (f"\nBest Overall MAE: {lowest_mae:.4f}", best_mae),
    (f"\nWorst Overall MAE: {highest_mae:.4f}", worst_mae),
    (f"\nBest Overall JER: {lowest_jer:.2f}%", best_jer),
    (f"\nWorst Overall JER: {highest_jer:.2f}%", worst_jer),
):
    print(headline)
    configuration = ", ".join(
        str(matching_rows[factor].iloc[0])
        for factor in ('prompt_type', 'attack_type', 'mitigation_type')
    )
    print(f"  Experiment: {configuration}")


Experiment Summary
Total experiments: 48
Prompt types: ['BASIC', 'RATIONALE', 'UTILITY']
Attack types: ['none', 'append', 'prepend', 'scatter']
Mitigation types: ['none', 'few_shot', 'system_prompt_hardening', 'user_prompt_hardening']

Baseline (BASIC, none, none) metrics:
  Overall MAE: 0.9266
  Overall JER: 50.81%
  MAE GT=0: 0.7967
  JER GT=0: 33.92%
  MAE GT>0: 1.1402
  JER GT>0: 78.60%

Best Overall MAE: 0.8585
  Experiment: BASIC, none, user_prompt_hardening

Worst Overall MAE: 2.2085
  Experiment: RATIONALE, append, few_shot

Best Overall JER: 46.66%
  Experiment: BASIC, none, user_prompt_hardening

Worst Overall JER: 88.29%
  Experiment: RATIONALE, prepend, system_prompt_hardening


## 8. Delta Analysis (Performance vs Baseline)


In [29]:
# Every experiment except the (BASIC, none, none) baseline row itself.
is_baseline = (
    (df['prompt_type'] == 'BASIC')
    & (df['attack_type'] == 'none')
    & (df['mitigation_type'] == 'none')
)
non_baseline = df[~is_baseline]

print("Delta Analysis (Performance vs Baseline)")
print("=" * 40)
print(f"Experiments compared to baseline: {len(non_baseline)}")

# Count how many experiments moved each metric below (improvement) or above
# (degradation) the baseline. Deltas that are exactly zero or missing fall into
# neither bucket.
mae_delta = non_baseline['delta_overall_mae']
jer_delta = non_baseline['delta_overall_jer']

mae_improvements = int((mae_delta < 0).sum())
mae_degradations = int((mae_delta > 0).sum())
jer_improvements = int((jer_delta < 0).sum())
jer_degradations = int((jer_delta > 0).sum())

print(f"\nMAE Improvements (negative delta): {mae_improvements}")
print(f"MAE Degradations (positive delta): {mae_degradations}")
print(f"JER Improvements (negative delta): {jer_improvements}")
print(f"JER Degradations (positive delta): {jer_degradations}")

# The "best improvement" is the experiment with the smallest (most negative)
# delta for each metric; the first matching row is reported.
if len(non_baseline) > 0:
    best_mae_improvement = non_baseline[mae_delta == mae_delta.min()]
    best_jer_improvement = non_baseline[jer_delta == jer_delta.min()]

    for metric_name, winners, delta_column, number_format, unit in (
        ("MAE", best_mae_improvement, 'delta_overall_mae', ".4f", ""),
        ("JER", best_jer_improvement, 'delta_overall_jer', ".2f", "%"),
    ):
        print(f"\nBest {metric_name} Improvement: {winners[delta_column].iloc[0]:{number_format}}{unit}")
        print(f"  Experiment: {winners['prompt_type'].iloc[0]}, {winners['attack_type'].iloc[0]}, {winners['mitigation_type'].iloc[0]}")


Delta Analysis (Performance vs Baseline)
Experiments compared to baseline: 47

MAE Improvements (negative delta): 2
MAE Degradations (positive delta): 45
JER Improvements (negative delta): 4
JER Degradations (positive delta): 43

Best MAE Improvement: -0.0681
  Experiment: BASIC, none, user_prompt_hardening

Best JER Improvement: -4.16%
  Experiment: BASIC, none, user_prompt_hardening
