In [None]:
import pandas as pd
import ast
from scipy.special import softmax
import matplotlib.pyplot as plt

# Models whose columns are analysed throughout this notebook.
model_names = [
    "LLaMA3.2-1B",
    "LLaMA3.2-3B",
    "Qwen2.5-1.5B",
    "Qwen2.5-3B",
]

# Two CSV tables; the next cell joins them side by side.
info_df = pd.read_csv("results.csv")
outputs_df = pd.read_csv("results-main.csv")

In [None]:
# Put the columns of both tables side by side (axis=1 aligns rows on their index
# labels), then save the combined table without writing the index column.
results_df = pd.concat([info_df, outputs_df], axis=1)
# `index` is documented as a bool; pass False explicitly instead of relying on
# None being falsy.
results_df.to_csv('complete_results.csv', index=False)

In [None]:
# Turn the Python-literal strings in the per-model list columns back into Python
# objects. Cells that already hold a list/tuple are left untouched, so re-running
# this cell does not raise; any other non-string value still fails in
# ast.literal_eval exactly as before.
for name in model_names:
    for suffix in ('context_weights', 'question_weights', 'context_ave', 'question_ave'):
        column = f'{name}_{suffix}'
        results_df[column] = results_df[column].apply(
            lambda cell: cell if isinstance(cell, (list, tuple)) else ast.literal_eval(cell)
        )

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Single shared scaler instance; z_scale re-fits it on every call.
scaler = StandardScaler()

def z_scale(data):
    """Standardise ``data`` with the shared scaler and return it as one row.

    The values are cast to float32 and laid out as a single column for
    ``fit_transform``; the scaled result is reshaped to ``(1, n)``.
    """
    column = np.array(data, dtype='float32')
    column = column.reshape(-1, 1)
    scaled = scaler.fit_transform(column)
    return scaled.reshape(1, -1)

def get_similarity(llm_attn, human_attn):
    """Return the cosine similarity between the z-scaled forms of both inputs.

    Each argument is passed through ``z_scale`` first; the single entry of the
    resulting 1x1 similarity matrix is returned.
    """
    llm_row, human_row = [z_scale(values) for values in (llm_attn, human_attn)]
    score_matrix = cosine_similarity(llm_row, human_row)
    return score_matrix[0][0]


In [None]:
for name in model_names:
    # Row-wise similarity between the `_ave` and `_weights` lists of each kind.
    for kind in ('context', 'question'):
        ave_col = f'{name}_{kind}_ave'
        weights_col = f'{name}_{kind}_weights'
        results_df[f'{name}_{kind}_similarity'] = results_df.apply(
            lambda row: get_similarity(row[ave_col], row[weights_col]),
            axis=1,
        )
    # Lengths are taken from the shared 'context' / 'question' columns, so the
    # values are the same for every model.
    for kind in ('context', 'question'):
        results_df[f'{name}_{kind}_length'] = results_df[kind].apply(len)

In [None]:
# Correlation (Series.corr default) between each similarity column and its
# matching length column, one entry per model in model_names order.
context_corrs = [
    results_df[f'{name}_context_similarity'].corr(results_df[f'{name}_context_length'])
    for name in model_names
]
question_corrs = [
    results_df[f'{name}_question_similarity'].corr(results_df[f'{name}_question_length'])
    for name in model_names
]

In [None]:
# Get similarity average for all models
similarities = []
for name in model_names:
    # (mean context similarity, mean question similarity) for this model
    mean_pair = (
        results_df[f'{name}_context_similarity'].mean(),
        results_df[f'{name}_question_similarity'].mean(),
    )
    similarities.append(mean_pair)
    print(mean_pair)

In [None]:
# One box per model, drawn from that model's context-similarity column.
labels = list(model_names)
cols = [f'{label}_context_similarity' for label in labels]

ax = results_df.boxplot(column=cols)
ax.set_xticklabels(labels)
ax.set_title('Context Attention Cosine Similarity Scores between LLMs and Humans')
plt.show()

In [None]:
# One box per model, drawn from that model's question-similarity column.
labels = [model for model in model_names]
cols = [model + '_question_similarity' for model in labels]

ax = results_df.boxplot(column=cols)
ax.set_xticklabels(labels)
ax.set_title('Question Attention Cosine Similarity Scores between LLMs and Humans')
plt.show()

In [None]:
# Get all grouped types
# types[k] holds the rows of results_df whose 'type' value equals k (k = 0..3).
types = [results_df[results_df['type'] == type_id] for type_id in range(4)]

In [None]:
# Get average attention and similarity to humans on context and question for all types for each model
type_data = []
for i in range(4):
    subset = types[i]
    frame = pd.DataFrame()
    for name in model_names:
        # Mean of each row's `_ave` list, suffixed with the type index.
        for kind in ('context', 'question'):
            frame[f'{name}_{kind}_attention_{i}'] = subset[f'{name}_{kind}_ave'].apply(
                lambda values: sum(values) / len(values)
            )
        # Similarity columns are copied over unchanged.
        for kind in ('context', 'question'):
            frame[f'{name}_{kind}_similarity_{i}'] = subset[f'{name}_{kind}_similarity']
    type_data.append(frame)
        

In [None]:
# One summary row per (type, model): the column means of the per-type frames.
# The rows are collected first and the frame is built once, instead of enlarging
# an empty DataFrame row by row through `.loc`.
summary_rows = [
    {
        'type': i,
        'model': name,
        'avg_context_attn': type_data[i][f'{name}_context_attention_{i}'].mean(),
        'avg_question_attn': type_data[i][f'{name}_question_attention_{i}'].mean(),
        'avg_context_similarity': type_data[i][f'{name}_context_similarity_{i}'].mean(),
        'avg_question_similarity': type_data[i][f'{name}_question_similarity_{i}'].mean(),
    }
    for i in range(4)
    for name in model_names
]
type_summary = pd.DataFrame(
    summary_rows,
    columns=['type', 'model', 'avg_context_attn', 'avg_question_attn',
             'avg_context_similarity', 'avg_question_similarity'],
)

# Displayed as the cell output: models in ascending order, highest average
# context attention first within each model.
type_summary.sort_values(by=['model', 'avg_context_attn'], ascending=[True, False])

In [None]:
# For every model, place its four statistic columns from each per-type frame side
# by side. Indexes are reset so the concat lines rows up by position.
stat_kinds = ('context_attention', 'question_attention', 'context_similarity', 'question_similarity')

model_data = []
for name in model_names:
    per_type_frames = [
        type_data[t][[f'{name}_{kind}_{t}' for kind in stat_kinds]].reset_index(drop=True)
        for t in range(len(type_data))
    ]
    model_data.append(pd.concat(per_type_frames, axis=1))

In [None]:
# 2x2 grid, one panel per model, one box per type (context attention columns).
fig, axes = plt.subplots(2, 2, figsize=(7, 7))

cols = [[f'{name}_context_attention_{i}' for i in range(4)] for name in model_names]
labels = [[f'Type {i}' for i in range(4)] for _ in model_names]

# axes.flat walks the grid row by row, matching the x//2, x%2 panel layout.
for ax, name, frame, columns, tick_text in zip(axes.flat, model_names, model_data, cols, labels):
    frame.boxplot(column=columns, ax=ax)
    ax.set_title(name)
    ax.set_xticklabels(tick_text)

plt.suptitle('Context Attention for Different Context-Question Pair Types')
plt.tight_layout()
plt.show()

In [None]:
# 2x2 grid, one panel per model, one box per type (question attention columns).
fig, axes = plt.subplots(2, 2, figsize=(7, 7))

cols = [[f'{name}_question_attention_{i}' for i in range(4)] for name in model_names]
labels = [[f'Type {i}' for i in range(4)] for _ in model_names]

# axes.flat walks the grid row by row, matching the x//2, x%2 panel layout.
for ax, name, frame, columns, tick_text in zip(axes.flat, model_names, model_data, cols, labels):
    frame.boxplot(column=columns, ax=ax)
    ax.set_title(name)
    ax.set_xticklabels(tick_text)

plt.suptitle('Question Attention for Different Context-Question Pair Types')
plt.tight_layout()
plt.show()

In [None]:
# 2x2 grid, one panel per model, one box per type (context similarity columns).
fig, axes = plt.subplots(2, 2, figsize=(7, 7))

cols = [[f'{name}_context_similarity_{i}' for i in range(4)] for name in model_names]
labels = [[f'Type {i}' for i in range(4)] for _ in model_names]

# axes.flat walks the grid row by row, matching the x//2, x%2 panel layout.
for ax, name, frame, columns, tick_text in zip(axes.flat, model_names, model_data, cols, labels):
    frame.boxplot(column=columns, ax=ax)
    ax.set_title(name)
    ax.set_xticklabels(tick_text)

plt.suptitle('Context Attention Cosine Similarity for Different Context-Question Pair Types')
plt.tight_layout()
plt.show()

In [None]:
# 2x2 grid, one panel per model, one box per type (question similarity columns).
fig, axes = plt.subplots(2, 2, figsize=(7, 7))

cols = [[f'{name}_question_similarity_{i}' for i in range(4)] for name in model_names]
labels = [[f'Type {i}' for i in range(4)] for _ in model_names]

# axes.flat walks the grid row by row, matching the x//2, x%2 panel layout.
for ax, name, frame, columns, tick_text in zip(axes.flat, model_names, model_data, cols, labels):
    frame.boxplot(column=columns, ax=ax)
    ax.set_title(name)
    ax.set_xticklabels(tick_text)

plt.suptitle('Question Attention Cosine Similarity for Different Context-Question Pair Types')
plt.tight_layout()
plt.show()

In [None]:
# Filter out hallucinated model responses
import pandas as pd
import ast
from scipy.special import softmax

# Annotated results table; later cells split it on the `<model>_hallucinated` column.
annotated_results_df = pd.read_csv(filepath_or_buffer='annotated_complete_results.csv')

In [None]:
name = "Qwen2.5-1.5B"
# Keep only the rows whose `<name>_hallucinated` flag is False. The explicit
# .copy() makes filtered_df an independent frame, so the column assignments in
# later cells do not write into a slice of annotated_results_df
# (SettingWithCopyWarning).
filtered_df = annotated_results_df[annotated_results_df[f'{name}_hallucinated'] == False].copy()
filtered_df

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Shared scaler instance, re-fitted by every z_scale call.
# (Same definitions as the earlier z_scale cell in this notebook.)
scaler = StandardScaler()

def z_scale(data):
    """Return ``data`` standardised by the shared scaler, shaped ``(1, n)``.

    The input is cast to float32 and reshaped to a single column before
    ``fit_transform``; the output is reshaped back to a single row.
    """
    as_column = np.array(data, dtype='float32').reshape(-1, 1)
    standardised = scaler.fit_transform(as_column)
    return standardised.reshape(1, -1)

def get_similarity(llm_attn, human_attn):
    """Cosine similarity of the two inputs after each is z-scaled.

    Returns the single entry of the 1x1 matrix produced by
    ``cosine_similarity``.
    """
    llm_row = z_scale(llm_attn)
    human_row = z_scale(human_attn)
    similarity_matrix = cosine_similarity(llm_row, human_row)
    return similarity_matrix[0][0]


In [None]:
# Parse the Python-literal strings of the four list columns for `name`.
# - `assign` returns a new frame, so nothing is written into a slice of
#   annotated_results_df (avoids SettingWithCopyWarning).
# - Cells that already hold a list/tuple are kept as they are, so re-running this
#   cell does not raise; other non-string values still fail in ast.literal_eval.
filtered_df = filtered_df.assign(**{
    f'{name}_{suffix}': filtered_df[f'{name}_{suffix}'].apply(
        lambda cell: cell if isinstance(cell, (list, tuple)) else ast.literal_eval(cell)
    )
    for suffix in ('context_weights', 'question_weights', 'context_ave', 'question_ave')
})

In [None]:
# Row-wise similarity between the `_ave` and `_weights` lists for `name`.
# `assign` builds a new frame (context column first, then question), so no column
# is set on a slice of annotated_results_df.
filtered_df = filtered_df.assign(**{
    f'{name}_context_similarity': filtered_df.apply(
        lambda row: get_similarity(row[f'{name}_context_ave'], row[f'{name}_context_weights']),
        axis=1,
    ),
    f'{name}_question_similarity': filtered_df.apply(
        lambda row: get_similarity(row[f'{name}_question_ave'], row[f'{name}_question_weights']),
        axis=1,
    ),
})

In [None]:
# Get similarity average for non-hallucinated responses
context_mean = filtered_df[f'{name}_context_similarity'].mean()
question_mean = filtered_df[f'{name}_question_similarity'].mean()
# (context mean, question mean)
model = (context_mean, question_mean)
print(f'{name}: {model}')

In [None]:
# Tick labels are the model names. Copying the list, rather than looping with
# `for name in ...`, leaves the notebook-level `name` variable untouched.
labels = list(model_names)

# The series for model_names[2] comes from filtered_df; the other three come
# from results_df.
context_series = [
    results_df[f'{model_names[0]}_context_similarity'],
    results_df[f'{model_names[1]}_context_similarity'],
    filtered_df[f'{model_names[2]}_context_similarity'],
    results_df[f'{model_names[3]}_context_similarity'],
]

fig, ax = plt.subplots()
ax.boxplot(context_series)
# Labels are set on the axis because the `labels=` keyword of boxplot is
# deprecated in Matplotlib 3.9+ (renamed `tick_labels`); this form works on both.
ax.set_xticklabels(labels)
ax.set_title('Context Attention Cosine Similarity Scores between LLMs and Humans')
ax.grid(True)
plt.show()

In [None]:
# The series for model_names[2] comes from filtered_df; the other three come
# from results_df.
question_series = [
    results_df[f'{model_names[0]}_question_similarity'],
    results_df[f'{model_names[1]}_question_similarity'],
    filtered_df[f'{model_names[2]}_question_similarity'],
    results_df[f'{model_names[3]}_question_similarity'],
]

fig, ax = plt.subplots()
ax.boxplot(question_series)
# Tick labels are taken from model_names directly, so this cell does not depend
# on a `labels` variable left over from another cell. They are set on the axis
# because the `labels=` keyword of boxplot is deprecated in Matplotlib 3.9+.
ax.set_xticklabels(list(model_names))
ax.set_title('Question Attention Cosine Similarity Scores between LLMs and Humans')
ax.grid(True)
plt.show()

In [None]:
# Select the third entry of model_names; the per-type cells that follow read `name`.
name = model_names[2]

In [None]:
# Get all grouped types
# types[k] holds the rows of filtered_df whose 'type' value equals k (k = 0..3).
types = [filtered_df[filtered_df['type'] == type_id] for type_id in range(4)]

In [None]:
# Per-type frames for `name`: mean of each row's `_ave` list plus the similarity
# columns, every column suffixed with the type index.
type_data = []
for i in range(4):
    subset = types[i]
    frame = pd.DataFrame()
    for kind in ('context', 'question'):
        frame[f'{name}_{kind}_attention_{i}'] = subset[f'{name}_{kind}_ave'].apply(
            lambda values: sum(values) / len(values)
        )
    for kind in ('context', 'question'):
        frame[f'{name}_{kind}_similarity_{i}'] = subset[f'{name}_{kind}_similarity']
    type_data.append(frame)


In [None]:
# One summary row per type for `name`: the column means of the per-type frames.
# The rows are collected first and the frame is built once, instead of enlarging
# an empty DataFrame row by row through `.loc`.
summary_rows = [
    {
        'type': i,
        'model': name,
        'avg_context_attn': type_data[i][f'{name}_context_attention_{i}'].mean(),
        'avg_question_attn': type_data[i][f'{name}_question_attention_{i}'].mean(),
        'avg_context_similarity': type_data[i][f'{name}_context_similarity_{i}'].mean(),
        'avg_question_similarity': type_data[i][f'{name}_question_similarity_{i}'].mean(),
    }
    for i in range(4)
]
type_summary = pd.DataFrame(
    summary_rows,
    columns=['type', 'model', 'avg_context_attn', 'avg_question_attn',
             'avg_context_similarity', 'avg_question_similarity'],
)

# Displayed as the cell output: highest average context attention first.
type_summary.sort_values(by=['model', 'avg_context_attn'], ascending=[True, False])

In [None]:
name = model_names[2]

# Place the four statistic columns of every per-type frame side by side; indexes
# are reset so the concat lines rows up by position.
stat_kinds = ('context_attention', 'question_attention', 'context_similarity', 'question_similarity')
per_type_frames = []
for t in range(len(type_data)):
    wanted = [f'{name}_{kind}_{t}' for kind in stat_kinds]
    per_type_frames.append(type_data[t][wanted].reset_index(drop=True))
model_data = pd.concat(per_type_frames, axis=1)

In [None]:
# Two panels (context / question attention), one box per type, for `name`.
fig, axes = plt.subplots(1, 2, figsize=(7, 3))

labels = [f'Type {i}' for i in range(4)]
panels = zip(axes, ('context', 'question'), ('Context Attention', 'Question Attention'))
for ax, kind, title in panels:
    cols = [f'{name}_{kind}_attention_{i}' for i in range(4)]
    model_data.boxplot(column=cols, ax=ax)
    ax.set_title(title)
    ax.set_xticklabels(labels)

plt.suptitle('Qwen2.5-1.5B Non-Hallucinated Data')
plt.tight_layout()
plt.show()

In [None]:
name = "Qwen2.5-1.5B"
# Keep only the rows whose `<name>_hallucinated` flag is True. The explicit
# .copy() makes hallucinated_df an independent frame, so the column assignments
# in later cells do not write into a slice of annotated_results_df
# (SettingWithCopyWarning).
hallucinated_df = annotated_results_df[annotated_results_df[f'{name}_hallucinated'] == True].copy()

In [None]:
# Parse the Python-literal strings of the four list columns for `name`.
# - `assign` returns a new frame, so nothing is written into a slice of
#   annotated_results_df (avoids SettingWithCopyWarning).
# - Cells that already hold a list/tuple are kept as they are, so re-running this
#   cell does not raise; other non-string values still fail in ast.literal_eval.
hallucinated_df = hallucinated_df.assign(**{
    f'{name}_{suffix}': hallucinated_df[f'{name}_{suffix}'].apply(
        lambda cell: cell if isinstance(cell, (list, tuple)) else ast.literal_eval(cell)
    )
    for suffix in ('context_weights', 'question_weights', 'context_ave', 'question_ave')
})

In [None]:
# Row-wise similarity between the `_ave` and `_weights` lists for `name`.
# `assign` builds a new frame (context column first, then question), so no column
# is set on a slice of annotated_results_df.
hallucinated_df = hallucinated_df.assign(**{
    f'{name}_context_similarity': hallucinated_df.apply(
        lambda row: get_similarity(row[f'{name}_context_ave'], row[f'{name}_context_weights']),
        axis=1,
    ),
    f'{name}_question_similarity': hallucinated_df.apply(
        lambda row: get_similarity(row[f'{name}_question_ave'], row[f'{name}_question_weights']),
        axis=1,
    ),
})

In [None]:
# Get similarity average for hallucinated responses
context_mean = hallucinated_df[f'{name}_context_similarity'].mean()
question_mean = hallucinated_df[f'{name}_question_similarity'].mean()
# (context mean, question mean)
model = (context_mean, question_mean)
print(f'{name}: {model}')

In [None]:
# Get all grouped types
# hallu_types[k] holds the rows of hallucinated_df whose 'type' value equals k (k = 0..3).
hallu_types = [hallucinated_df[hallucinated_df['type'] == type_id] for type_id in range(4)]

In [None]:
# Per-type frames for `name`: mean of each row's `_ave` list plus the similarity
# columns. Column names carry no type suffix here.
hallu_type_data = []
for i in range(4):
    subset = hallu_types[i]
    frame = pd.DataFrame()
    for kind in ('context', 'question'):
        frame[f'{name}_{kind}_attention'] = subset[f'{name}_{kind}_ave'].apply(
            lambda values: sum(values) / len(values)
        )
    for kind in ('context', 'question'):
        frame[f'{name}_{kind}_similarity'] = subset[f'{name}_{kind}_similarity']
    hallu_type_data.append(frame)


In [None]:
# One summary row per type that has at least one hallucinated row. The rows are
# collected first and the frame is built once, instead of enlarging an empty
# DataFrame row by row through `.loc`.
hallu_summary_rows = []
for i in range(4):
    if hallu_types[i].shape[0] == 0:
        # No hallucinated rows of this type: leave it out of the summary.
        continue

    # All four means are read from the per-type frame; its similarity columns
    # are copies of the ones in hallu_types[i].
    stats = hallu_type_data[i]
    hallu_summary_rows.append({
        'type': i,
        'model': name,
        'avg_context_attn': stats[f'{name}_context_attention'].mean(),
        'avg_question_attn': stats[f'{name}_question_attention'].mean(),
        'avg_context_similarity': stats[f'{name}_context_similarity'].mean(),
        'avg_question_similarity': stats[f'{name}_question_similarity'].mean(),
    })

hallu_type_summary = pd.DataFrame(
    hallu_summary_rows,
    columns=['type', 'model', 'avg_context_attn', 'avg_question_attn',
             'avg_context_similarity', 'avg_question_similarity'],
)

# Displayed as the cell output: highest average context similarity first.
hallu_type_summary.sort_values(by=['model', 'avg_context_similarity'], ascending=[True, False])

In [None]:
# Hallucinated vs non-hallucinated average attention for types 1 and 3.
fig, axes = plt.subplots(1, 2, figsize=(10, 7))

labels = ['Type 1 (Hallucinated)', 'Type 1', 'Type 3 (Hallucinated)', 'Type 3']

panels = zip(axes, ('context', 'question'), ('Context Attention', 'Question Attention'))
for ax, kind, title in panels:
    cols = []
    for i in [1, 3]:
        # Hallucinated rows first, then the non-hallucinated column from
        # model_data (NaN entries dropped).
        cols.append(hallu_type_data[i][f'{name}_{kind}_attention'])
        cols.append(model_data[f'{name}_{kind}_attention_{i}'].dropna())
    ax.boxplot(cols)
    # Labels are set on the axis because the `labels=` keyword of boxplot is
    # deprecated in Matplotlib 3.9+ (renamed `tick_labels`); this works on both.
    ax.set_xticklabels(labels)
    ax.set_title(title)

plt.suptitle('Qwen2.5-1.5B Hallucinated vs Non-Hallucinated Average Attentions')
plt.tight_layout()
plt.show()

In [None]:
# Hallucinated vs non-hallucinated cosine similarity for types 1 and 3.
fig, axes = plt.subplots(1, 2, figsize=(10, 7))

# Defined in this cell so the plot does not depend on a `labels` variable left
# over from another cell.
labels = ['Type 1 (Hallucinated)', 'Type 1', 'Type 3 (Hallucinated)', 'Type 3']

panels = zip(axes, ('context', 'question'), ('Context Similarity', 'Question Similarity'))
for ax, kind, title in panels:
    cols = []
    for i in [1, 3]:
        # Hallucinated rows first, then the non-hallucinated column from
        # model_data (NaN entries dropped).
        cols.append(hallu_type_data[i][f'{name}_{kind}_similarity'])
        cols.append(model_data[f'{name}_{kind}_similarity_{i}'].dropna())
    ax.boxplot(cols)
    # Labels are set on the axis because the `labels=` keyword of boxplot is
    # deprecated in Matplotlib 3.9+ (renamed `tick_labels`); this works on both.
    ax.set_xticklabels(labels)
    ax.set_title(title)

plt.suptitle('Qwen2.5-1.5B Hallucinated vs Non-Hallucinated Cosine Similarities')
plt.tight_layout()
plt.show()