In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
from collections import Counter
import re

In [None]:
# Relative path to a survey results folder (per its name, the Qualtrics exports).
# NOTE(review): no cell visible in this notebook reads folder_path — confirm it is still needed.
folder_path = 'projects/humansVsLLMs/results/qualtrics/surveys'    

In [None]:

# Raw BEST and WORST input data.
# Each dict maps an evaluation dimension to a list of "Name (N)" strings; N is the
# count that count_mentions() adds to that name's tally (a missing "(N)" counts as 1).
best_raw = {
    # "Female Leaders" is spelled out here but abbreviated to "Female" in the other
    # rows; label_map below resolves both spellings to the same label.
    "Accuracy": ["Female Leaders (3)", "DeepSeek (3)", "Gemini (3)"],
    "Relevance": ["Gemini (3)", "Qwen (2)"],
    "Currency": ["Gemini (2)", "Cohere (2)", "Female (1)", "Male (1)"],
    "Comprehensiveness": ["Gemini (4)", "GPT (3)"],
    "Agreement": ["Gemini (2)", "Male (2)"],
    "Usefulness": ["DeepSeek (5)"],
    "Clarity": ["Gemini (4)", "Cohere (2)", "Male (1)", "Female (1)"],
    "Empathy": ["Llama (4)"],
    "Bias": ["Cohere (3)", "Male (3)", "Llama (2)"],
    "Fabrication": ["Gemini (3)", "Mistral (2)", "GPT (2)"],
    "Trust": ["GPT (2)", "Gemini (2)", "Female (1)", "Male (1)"],
    "Satisfaction": ["Qwen (4)", "Cohere (2)", "DeepSeek (2)"]
}

worst_raw = {
    "Accuracy": ["Female (3)", "Male (2)", "DeepSeek (4)"],
    # NOTE(review): the later cells of this notebook record DeepSeek as (4) / 40 for
    # worst Relevance, not (2) — confirm which value is correct.
    "Relevance": ["Female (4)", "Male (2)", "DeepSeek (2)"],
    "Currency": ["Female (3)", "Male (2)"],
    "Comprehensiveness": ["Male (6)"],
    "Agreement": ["Male (4)"],
    "Usefulness": ["Female (3)", "Qwen (2)", "Cohere (2)"],
    "Clarity": ["Female (3)", "Gemini (2)"],
    "Empathy": ["Male (3)", "Female (1)", "DeepSeek (2)", "Cohere (2)"],
    "Bias": ["Male (2)", "GPT (2)", "DeepSeek (2)", "Female (1)"],
    "Fabrication": ["Female (4)", "Male (1)", "Gemini (2)", "Cohere (2)"],
    "Trust": ["Male (3)", "Female (2)", "Mistral (2)"],
    "Satisfaction": ["Female (4)", "Male (2)", "Gemini (2)"]
}

# Map the short names used in the raw data to the full display labels.
# count_mentions() keeps any name that has no entry here unchanged.
label_map = {
    "GPT": "gpt-4o-mini",
    "Gemini": "Gemini-2.0-Flash",
    "Cohere": "Command-a-03-2025",
    "Mistral": "Mistral-Large",
    "Llama": "Llama-3.3-70b",
    "DeepSeek": "DeepSeek-R1",
    "Qwen": "Qwen-Plus",
    "Female": "Female Leaders",
    "Female Leaders": "Female Leaders",
    "Male": "Male Leaders"
}

# Count frequencies
def count_mentions(raw_dict, labels=None):
    """Tally how many mentions each source receives across all dimensions.

    Parameters
    ----------
    raw_dict : dict
        Maps a dimension name to a list of entries written as ``"Name"`` or
        ``"Name (N)"``. An entry without ``(N)`` counts as one mention.
    labels : dict, optional
        Maps raw names to display labels. Names without an entry are kept
        unchanged. Defaults to the notebook-level ``label_map``.

    Returns
    -------
    collections.Counter
        Display label -> total number of mentions.

    Entries that are empty or not of the form ``Name`` / ``Name (N)`` are
    skipped.
    """
    if labels is None:
        labels = label_map
    counter = Counter()
    for items in raw_dict.values():
        for entry in items:
            # Anchor the whole entry and allow any character except parentheses
            # in the name, so names with digits or dots ("Llama-3.3-70b (4)")
            # are not cut short and their count silently dropped.
            match = re.fullmatch(r"([^()]+?)\s*(?:\((\d+)\))?", entry.strip())
            if match is None:
                continue
            label = match.group(1).strip()
            count = int(match.group(2)) if match.group(2) else 1
            counter[labels.get(label, label)] += count
    return counter

# Tally the mentions separately for the "best" and "worst" nominations.
best_counter = count_mentions(best_raw)
worst_counter = count_mentions(worst_raw)

# One summary row per source that was mentioned in either tally.
all_labels = set(best_counter).union(worst_counter)
data = []
for label in sorted(all_labels):
    # Counter lookups return 0 for a source absent from one of the tallies.
    best = best_counter[label]
    worst = worst_counter[label]
    # Never divide by zero: fall back to 1 when a source has no mentions at all.
    total = max(best + worst, 1)
    row = {"Source": label, "Best": best, "Worst": worst}
    row["Best (%)"] = round(100 * best / total, 1)
    row["Worst (%)"] = round(100 * worst / total, 1)
    data.append(row)

# Sources with the largest share of "best" mentions come first.
df = pd.DataFrame(data).sort_values(by="Best (%)", ascending=False)

# Plot the share of "best" vs "worst" mentions for each source.
# The y-axis is labelled as a percentage and df is sorted by "Best (%)", so the
# bars use the percentage columns rather than the raw mention counts.
fig, ax = plt.subplots(figsize=(14, 6))
bar_width = 0.4
index = np.arange(len(df))

# Two bars per source, centred on the tick: best on the left, worst on the right.
ax.bar(index - bar_width / 2, df["Best (%)"], width=bar_width, label="Best", color="green")
ax.bar(index + bar_width / 2, df["Worst (%)"], width=bar_width, label="Worst", color="red")

ax.set_xticks(index)
ax.set_xticklabels(df["Source"], rotation=45, ha='right')
ax.set_ylabel("Percentage")
ax.set_title("Best vs Worst Evaluations by Source")
ax.legend()
ax.grid(axis='y', linestyle='--', alpha=0.6)
fig.tight_layout()
plt.show()


In [None]:
# Best/worst nominations per dimension as comma-separated "Name (N)" strings.
# Every name carries its own "(N)": extract_counts() treats a name without a
# count as a single vote, so a shared trailing count such as "A, B, C (3)"
# would be read as 1, 1 and 3.
data = {
    'Dimension': [
        'Accuracy', 'Relevance', 'Currency', 'Comprehensiveness', 'Agreement', 'Usefulness',
        'Clarity', 'Empathy', 'Bias', 'Fabrication', 'Trust', 'Satisfaction'
    ],
    'Best': [
        'Female Leaders (3), DeepSeek (3), Gemini (3)',
        'Gemini (3), Qwen (2)',
        'Gemini (2), Cohere (2), Female Leaders (1), Male Leaders (1)',
        'Gemini (4), GPT (3)',
        'Gemini (2), Male Leaders (2)',
        'DeepSeek (5)',
        'Gemini (4), Cohere (2), Male Leaders (1), Female Leaders (1)',
        'Llama (4)',
        'Cohere (3), Male Leaders (3), Llama (2)',
        'Gemini (3), Mistral (2), GPT (2)',
        'GPT (2), Gemini (2), Female Leaders (1), Male Leaders (1)',
        'Qwen (4), Cohere (2), DeepSeek (2)'
    ],
    'Worst': [
        'Female Leaders (3), Male Leaders (2), DeepSeek (4)',
        # NOTE(review): worst_raw in the earlier cell records DeepSeek as (2) for
        # Relevance — confirm which value is correct.
        'Female Leaders (4), Male Leaders (2), DeepSeek (4)',
        'Female Leaders (3), Male Leaders (2)',
        'Male Leaders (6)',
        'Male Leaders (4)',
        'Female Leaders (3), Qwen (2), Cohere (2)',
        'Female Leaders (3), Gemini (2)',
        'Male Leaders (3), Female Leaders (1), DeepSeek (2), Cohere (2)',
        'Male Leaders (2), GPT (2), DeepSeek (2), Female Leaders (1)',
        'Female Leaders (4), Male Leaders (1), Gemini (2), Cohere (2)',
        'Male Leaders (3), Female Leaders (2), Mistral (2)',
        'Female Leaders (4), Male Leaders (2), Gemini (2)'
    ]
}

# One row per dimension; the Best/Worst cells are still unparsed strings here.
df = pd.DataFrame(data)

# Helper function to extract model counts
def extract_counts(text):
    """Parse a comma-separated ``"Name (N), Name (N)"`` string into vote counts.

    Parameters
    ----------
    text : str
        Comma-separated entries, each written as ``Name`` or ``Name (N)``.

    Returns
    -------
    dict
        Name -> count. A name without ``(N)`` counts as 1, and a name that is
        listed more than once has its counts added together. Entries that are
        empty or not of the form ``Name`` / ``Name (N)`` are skipped.
    """
    counts = {}
    for entry in text.split(','):
        # Anchor the whole entry and allow any character except parentheses in
        # the name, so names with digits or dots ("Llama-3.3-70b (4)") are not
        # cut short and their count silently dropped.
        match = re.fullmatch(r"([^()]+?)\s*(?:\((\d+)\))?", entry.strip())
        if match is None:
            continue
        model = match.group(1).strip()
        count = int(match.group(2)) if match.group(2) else 1
        # Accumulate rather than overwrite, matching count_mentions().
        counts[model] = counts.get(model, 0) + count
    return counts

# Reshape to long format: one record per (dimension, model, score type).
# For each dimension the "Best" records are emitted before the "Worst" ones.
records = []
for dimension, best_text, worst_text in zip(df['Dimension'], df['Best'], df['Worst']):
    parsed = (('Best', extract_counts(best_text)), ('Worst', extract_counts(worst_text)))
    for score_type, votes in parsed:
        records.extend(
            {'Dimension': dimension, 'Model': model, 'Score Type': score_type, 'Count': count}
            for model, count in votes.items()
        )

plot_df = pd.DataFrame(records)

# Plot the number of best/worst votes cast in each dimension.
# plot_df holds one row per model, so the bars must add the rows up:
# seaborn's default estimator is the mean, which would show the average votes
# per listed model instead of the "Number of Votes" the axis promises.
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=plot_df, x='Count', y='Dimension', hue='Score Type', orient='h',
            estimator='sum', errorbar=None, ax=ax)
ax.set_title('Best-Worst Scaling Results')
ax.set_xlabel('Number of Votes')
ax.set_ylabel('Dimension')
ax.legend(title='Score Type')
fig.tight_layout()
plt.show()


In [None]:
# Write the long-format vote table to CSV without the index column.
# NOTE(review): the target is an absolute path under the filesystem root, unlike the
# relative folder_path defined near the top — confirm the directory exists where this runs.
plot_df.to_csv('/humansVsLLMs/plots/qualtrics_results.csv', index=False)

In [None]:
# Hand-entered per-model summary (LLMs only; no human-leader rows).
# NOTE(review): how these totals were derived is not shown in this notebook —
# confirm their provenance before relying on them.
data = {
    # NOTE(review): 'Llama3.3-70b' and 'GPT-4o mini' are spelled differently from the
    # 'Llama-3.3-70b' / 'GPT-4o-mini' labels used in the other cells — confirm intended.
    'Model': ['DeepSeek-R1', 'Gemini-2.0-Flash', 'Llama3.3-70b', 'Qwen-Plus', 'GPT-4o mini', 'Mistral-Large', 'Command-a-03-2025'],
    'Best Wins': [15, 19, 11, 10, 12, 6, 6],
    'Worst Losses': [10, 16, 9, 7, 14, 8, 14],
    # Each value equals 'Best Wins' minus 'Worst Losses' for the same model.
    'Net Score': [5, 3, 2, 3, -2, -2, -8],
    # Free-text labels; used only as chart annotations further down.
    'Strongest Positive Dimensions': [
        'Usefulness, Trust, Accuracy',
        'Clarity, Relevance, Currency, Comprehensiveness',
        'Empathy, Satisfaction',
        'Satisfaction, Relevance',
        'Currency, Usefulness',
        'Accuracy',
        'Empathy, Coherence'
    ],
    'Biggest Weaknesses': [
        'Bias, Accuracy',
        'Fabrication, Bias',
        'Fabrication, Clarity',
        'Trust, Usefulness',
        'Fabrication, Bias',
        'Bias, Fabrication',
        'Bias, Fabrication, Usefulness'
    ]
}

# One row per model, in the order listed above.
df = pd.DataFrame(data)

# Melt for grouped barplot: one row per (model, metric) pair.
df_melt = df.melt(id_vars='Model', value_vars=['Best Wins', 'Worst Losses', 'Net Score'],
                  var_name='Metric', value_name='Count')

# Compute the extremes once; they drive the annotation height and the y-limits.
count_max = df_melt['Count'].max()
count_min = df_melt['Count'].min()

# Plot
fig, ax = plt.subplots(figsize=(20, 20))
sns.barplot(data=df_melt, x='Model', y='Count', hue='Metric', palette='Set2', ax=ax)
ax.set_title('Best-Worst Scaling Results')
ax.tick_params(axis='x', labelrotation=45)

# Add annotations for strongest positives and weaknesses above each model's bars.
# enumerate() gives the bar position directly instead of relying on df's index labels.
for position, (_, row) in enumerate(df.iterrows()):
    ax.text(position - 0.3, count_max + 1, f"+ {row['Strongest Positive Dimensions']}",
            fontsize=18, color='green', rotation=90)
    ax.text(position + 0.1, count_max + 1, f"- {row['Biggest Weaknesses']}",
            fontsize=18, color='red', rotation=90)

# Net Score can be negative, so extend the axis below zero when needed;
# a fixed lower limit of 0 would hide those bars entirely.
ax.set_ylim(count_min - 1 if count_min < 0 else 0, count_max + 10)
# The plotted values are win/loss counts and their difference, not percentages.
ax.set_ylabel('Count')
ax.legend(title='Metric')
# Lay out last so the final limits, labels and legend are taken into account.
fig.tight_layout()
plt.show()

In [None]:
# Prepare the data: for each dimension, a {source: value} dict of "best" and of
# "worst" ratings. The values match the "(N)" counts of the earlier cells times 10.
# NOTE(review): presumably percentages of respondents — confirm the unit.
data = {
    # NOTE(review): 'Coherence' sits where the earlier cells list 'Agreement' — confirm
    # that both names refer to the same dimension.
    'Dimension': ['Accuracy', 'Relevance', 'Currency', 'Comprehensiveness', 
                 'Coherence', 'Usefulness', 'Clarity', 'Empathy', 'Bias', 
                 'Fabrication', 'Trust', 'Satisfaction'],
    # Entries are positional: the i-th dict belongs to the i-th dimension above.
    'Best': [
        {'Female Leaders': 30, 'DeepSeek-R1': 30, 'Gemini-2.0-Flash': 30},
        {'Gemini-2.0-Flash': 30, 'Qwen-Plus': 20},
        {'Gemini-2.0-Flash': 20, 'Command-a-03-2025': 20, 'Female Leaders': 10, 'Male Leaders': 10},
        {'Gemini-2.0-Flash': 40, 'GPT-4o-mini': 30},
        {'Gemini-2.0-Flash': 20, 'Male Leaders': 20},
        {'DeepSeek-R1': 50},
        {'Gemini-2.0-Flash': 40, 'Command-a-03-2025': 20, 'Male Leaders': 10, 'Female Leaders': 10},
        {'Llama-3.3-70b': 40},
        {'Command-a-03-2025': 30, 'Male Leaders': 30, 'Llama-3.3-70b': 20},
        {'Gemini-2.0-Flash': 30, 'Mistral-Large': 20, 'GPT-4o-mini': 20},
        {'GPT-4o-mini': 20, 'Gemini-2.0-Flash': 20, 'Female Leaders': 10, 'Male Leaders': 10},
        {'Qwen-Plus': 40, 'Command-a-03-2025': 20, 'DeepSeek-R1': 20}
    ],
    'Worst': [
        {'Female Leaders': 30, 'Male Leaders': 20, 'DeepSeek-R1': 40},
        # NOTE(review): worst_raw in the first data cell records DeepSeek as (2) for
        # Relevance, which would be 20 here — confirm which value is correct.
        {'Female Leaders': 40, 'Male Leaders': 20, 'DeepSeek-R1': 40},
        {'Female Leaders': 30, 'Male Leaders': 20},
        {'Male Leaders': 60},
        {'Male Leaders': 40},
        {'Female Leaders': 30, 'Qwen-Plus': 20, 'Command-a-03-2025': 20},
        {'Female Leaders': 30, 'Gemini-2.0-Flash': 20},
        {'Male Leaders': 30, 'Female Leaders': 10, 'DeepSeek-R1': 20, 'Command-a-03-2025': 20},
        {'Male Leaders': 20, 'GPT-4o-mini': 20, 'DeepSeek-R1': 20, 'Female Leaders': 10},
        {'Female Leaders': 40, 'Male Leaders': 10, 'Gemini-2.0-Flash': 20, 'Command-a-03-2025': 20},
        {'Male Leaders': 30, 'Female Leaders': 20, 'Mistral-Large': 20},
        {'Female Leaders': 40, 'Male Leaders': 20, 'Gemini-2.0-Flash': 20}
    ]
}

# Create DataFrame: one row per dimension; the Best/Worst cells hold the dicts as-is.
df = pd.DataFrame(data)

# Collect every source named in any "Best" or "Worst" dict, in alphabetical order.
all_categories = sorted(
    {name for column in ('Best', 'Worst') for votes in df[column] for name in votes}
)

# Give each source a fixed colour from the tab20 palette, wrapping around if the
# palette is shorter than the list of sources.
colors = plt.cm.tab20.colors
palette_size = len(colors)
cmap = {}
for position, category in enumerate(all_categories):
    cmap[category] = colors[position % palette_size]

# Plotting: two stacked bars per dimension, "Best" on the left and "Worst" on the
# right, each split by source using the shared colour map.
fig, ax = plt.subplots(figsize=(14, 10))

bar_width = 0.35
index = np.arange(len(df))


def _stack_bars(column, x_positions, labelled):
    """Draw one stacked bar per dimension from df[column]; return the bar totals.

    `labelled` is the set of categories that already own a legend entry. It is
    updated in place so each category appears in the legend exactly once, even
    if it only ever occurs in the second column drawn.
    """
    bottoms = np.zeros(len(df))
    for category in all_categories:
        values = np.array([votes.get(category, 0) for votes in df[column]], dtype=float)
        if values.sum() > 0:  # Only plot if there are values
            # Labels starting with an underscore are left out of the legend.
            label = '_nolegend_' if category in labelled else category
            ax.bar(x_positions, values, bar_width,
                   bottom=bottoms, label=label, color=cmap[category])
            labelled.add(category)
            bottoms = bottoms + values
    return bottoms


legend_categories = set()
bottom_best = _stack_bars('Best', index - bar_width / 2, legend_categories)
bottom_worst = _stack_bars('Worst', index + bar_width / 2, legend_categories)

# Both bars share one colour scheme, so say which side is which.
ax.set_xlabel('Dimensions (left bar: Best, right bar: Worst)')
ax.set_ylabel('Ratings Percentage')
ax.set_title('Best and Worst Ratings by Dimension')
ax.set_xticks(index)
ax.set_xticklabels(df['Dimension'], rotation=45, ha='right')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()

plt.show()
# Create a balanced score DataFrame: net score = best - worst for every
# (dimension, source) pair that has at least one non-zero rating.
score_data = []
for idx, row in df.iterrows():
    dimension = row['Dimension']
    best = row['Best']
    worst = row['Worst']

    for category in all_categories:
        best_score = best.get(category, 0)
        worst_score = worst.get(category, 0)
        net_score = best_score - worst_score
        if best_score != 0 or worst_score != 0:
            score_data.append([dimension, category, net_score])

score_df = pd.DataFrame(score_data, columns=['Dimension', 'Category', 'Net Score'])

# Pivot for heatmap: dimensions as rows, sources as columns (both sorted by pivot).
pivot_df = score_df.pivot(index='Dimension', columns='Category', values='Net Score')

# Pairs without any rating are shown as a neutral 0.
heat_values = pivot_df.fillna(0).to_numpy()
# Use symmetric colour limits so that 0 falls on the midpoint of the diverging
# colormap; otherwise a neutral score takes on a red or green tint.
color_limit = max(np.abs(heat_values).max(), 1) if heat_values.size else 1

# Plot heatmap
fig, ax = plt.subplots(figsize=(12, 8))
image = ax.imshow(heat_values, cmap='RdYlGn', aspect='auto',
                  vmin=-color_limit, vmax=color_limit)
# Positive (green) means more "best" than "worst" ratings, matching net_score above.
fig.colorbar(image, ax=ax, label='Net Score (Best - Worst)')
ax.set_xticks(range(len(pivot_df.columns)))
ax.set_xticklabels(pivot_df.columns, rotation=45, ha='right')
ax.set_yticks(range(len(pivot_df.index)))
ax.set_yticklabels(pivot_df.index)
ax.set_title('Net Ratings by Dimension and Data Source')
fig.tight_layout()
plt.show()