## Importing files

In [30]:
import pandas as pd

# Per-query result logs for Gemma, one frame per prompt variant
# (baseline, user-group-aware, user-need-aware).
gemma_baseline, gemma_user_group_aware, gemma_user_need_aware = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Results/QueryWise_gemma_resp_baseline.csv",
        "../Results/QueryWise_gemma_resp_user_group_aware.csv",
        "../Results/QueryWise_gemma_resp_user_need_aware.csv",
    )
)

# Same three prompt variants for GPT.
gpt_baseline, gpt_user_group_aware, gpt_user_need_aware = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Results/QueryWise_gpt_resp_baseline.csv",
        "../Results/QueryWise_gpt_resp_user_group_aware.csv",
        "../Results/QueryWise_gpt_resp_user_need_aware.csv",
    )
)

In [31]:
# Sanity check: preview the first rows of the Gemma baseline log to confirm the expected columns loaded.
gemma_baseline.head()

Unnamed: 0,QID,Task Sentiment,IAS,Resp_cleaned,positive,negative,joy,sadness,anger,fear
0,QGEN1,General,GEM,Il tornado è un'evento meteorologico potente e...,99.95777,0.042235,7.092738,0.451405,0.034272,92.421573
1,QGEN2,General,GEM,Il tornado è un'evento meteorologico potente e...,99.95777,0.042235,7.092738,0.451405,0.034272,92.421573
2,QGEN3,General,GEM,La piramide egizia è un monumento funerario di...,99.94092,0.059085,93.032539,2.68619,0.483058,3.798211
3,QGEN4,General,GEM,Le piramidi sono strutture monumentali che si ...,99.883336,0.116666,99.591297,0.231328,0.026861,0.150511
4,QGEN5,General,GEM,La piramide più alta dell'antico Egitto è la P...,99.964011,0.035983,99.835408,0.031344,0.03034,0.102904


## Significance testing

### Change in EP (from baseline) of an LLM across prompts given a type of search task

In [32]:
from scipy.stats import ttest_ind

# Score columns that are tested one by one in compare_logs.
emotions = ["positive", "negative", "joy", "sadness", "anger", "fear"]

def compare_logs(baseline_log, user_aware_log, need_aware_log, task_sent, IAS, alpha=0.05):
    """Test whether each emotion score differs between the baseline log and each aware log.

    Every log is first restricted to the rows whose "Task Sentiment" column
    equals ``task_sent``. Then, for each column listed in the module-level
    ``emotions``, two tests are run with ``scipy.stats.ttest_ind`` using its
    default settings: baseline vs. user-group-aware and baseline vs.
    user-need-aware.

    Parameters
    ----------
    baseline_log, user_aware_log, need_aware_log : pandas.DataFrame
        Logs that contain a "Task Sentiment" column and one column per entry
        in ``emotions``.
    task_sent : str
        Value of "Task Sentiment" to keep (e.g. "General").
    IAS : str
        Label copied verbatim into the "IAS" column of the result.
    alpha : float, default 0.05
        Threshold below which a p-value is flagged as significant.

    Returns
    -------
    pandas.DataFrame
        Two rows per emotion (user-group row first, then user-need row) with
        columns "IAS", "Emotion", "Comparison", "Task Sentiment", "p_val",
        and "is_significant".

    Notes
    -----
    * The samples are treated as independent; rows are not matched across logs.
    * p-values are reported as-is, with no multiple-comparison correction.
    * A NaN p-value is flagged as not significant, because ``NaN < alpha`` is
      False.
    """
    def _rows_for_task(log):
        # Keep only the rows belonging to the requested task sentiment.
        return log.loc[log["Task Sentiment"] == task_sent]

    baseline_task = _rows_for_task(baseline_log)

    # (comparison label, subset compared against baseline); list order fixes
    # the order of the two rows emitted per emotion.
    contrasts = [
        ("baseline_to_user_group", _rows_for_task(user_aware_log)),
        ("baseline_to_user_need", _rows_for_task(need_aware_log)),
    ]

    records = []
    for emo in emotions:
        for comparison, aware_task in contrasts:
            p_val = ttest_ind(baseline_task[emo], aware_task[emo]).pvalue
            records.append({
                "IAS": IAS,
                "Emotion": emo,
                "Comparison": comparison,
                "Task Sentiment": task_sent,
                "p_val": p_val,
                "is_significant": bool(p_val < alpha),
            })

    return pd.DataFrame(
        records,
        columns=["IAS", "Emotion", "Comparison", "Task Sentiment", "p_val", "is_significant"],
    )

In [None]:
task_sentiments = ["General", "Positive", "Negative"]

# One (label, baseline, user-group-aware, user-need-aware) entry per model,
# listed in the order their result tables are appended for each sentiment.
model_logs = [
    ("GEM", gemma_baseline, gemma_user_group_aware, gemma_user_need_aware),
    ("GPT", gpt_baseline, gpt_user_group_aware, gpt_user_need_aware),
]

# Collect one result table per (task sentiment, model) pair.
all_comparisons = []
for sent in task_sentiments:
    for ias_label, baseline, group_aware, need_aware in model_logs:
        all_comparisons.append(
            compare_logs(baseline, group_aware, need_aware, sent, ias_label)
        )

Unnamed: 0,IAS,Emotion,Comparison,Task Sentiment,p_val,is_significant
0,GEM,positive,baseline_to_user_group,General,0.018429,True
1,GEM,positive,baseline_to_user_need,General,0.020889,True
2,GEM,negative,baseline_to_user_group,General,0.018429,True
3,GEM,negative,baseline_to_user_need,General,0.020889,True
4,GEM,joy,baseline_to_user_group,General,0.012244,True
...,...,...,...,...,...,...
67,GPT,sadness,baseline_to_user_need,Negative,0.000062,True
68,GPT,anger,baseline_to_user_group,Negative,0.523126,False
69,GPT,anger,baseline_to_user_need,Negative,0.283187,False
70,GPT,fear,baseline_to_user_group,Negative,0.093637,False


### Difference in EP for child aware vs child+EI aware prompts for General search tasks

In [None]:
# NOTE(review): this cell is a disabled alternative that uses Tukey HSD instead of
# t-tests. Before re-enabling it, be aware that:
#   * it redefines `compare_logs`, which would shadow the t-test version defined earlier;
#   * it writes a "Prompt Type" column onto the frames passed in (in-place mutation);
#   * it needs `statsmodels`, prints one Tukey summary per emotion, and returns nothing
#     (the `return` line is commented out and `results` is never filled).

# from scipy.stats import ttest_ind
# from statsmodels.stats.multicomp import pairwise_tukeyhsd

# emotions = ["positive", "negative", "joy", "sadness", "anger", "fear"]

# def compare_logs(baseline_log, user_aware_log, need_aware_log, task_sent, IAS):

#     baseline_log["Prompt Type"] = ["Baseline"]*len(baseline_log)
#     user_aware_log["Prompt Type"] = ["User Group Aware"]*len(user_aware_log)
#     need_aware_log["Prompt Type"] = ["User Need Aware"]*len(need_aware_log)

#     logs_combined = pd.concat([baseline_log, user_aware_log, need_aware_log], ignore_index=True)
#     logs_combined_task = logs_combined.loc[logs_combined["Task Sentiment"]==task_sent]
    
#     results = []
#     for emo in emotions:
#         tukey = pairwise_tukeyhsd(logs_combined_task[emo], logs_combined_task["Prompt Type"])
#         print(emo, "\n", tukey.summary(), "\n\n\n")

#     # return results_df

In [None]:
# compare_logs(gemma_baseline, gemma_user_group_aware, gemma_user_need_aware, "General", "GEM")

positive 
            Multiple Comparison of Means - Tukey HSD, FWER=0.05           
     group1           group2      meandiff p-adj   lower    upper  reject
-------------------------------------------------------------------------
        Baseline User Group Aware  10.8782 0.0464   0.1366 21.6199   True
        Baseline  User Need Aware  10.6578 0.0523  -0.0839 21.3994  False
User Group Aware  User Need Aware  -0.2205 0.9987 -10.9621 10.5211  False
------------------------------------------------------------------------- 



negative 
            Multiple Comparison of Means - Tukey HSD, FWER=0.05           
     group1           group2      meandiff p-adj   lower    upper  reject
-------------------------------------------------------------------------
        Baseline User Group Aware -10.8782 0.0464 -21.6199 -0.1366   True
        Baseline  User Need Aware -10.6578 0.0523 -21.3994  0.0839  False
User Group Aware  User Need Aware   0.2205 0.9987 -10.5211 10.9621  False
------------

### Putting it all together

In [None]:
# Stack every collected result table into one frame with a fresh 0..n-1 index.
all_comparisons_df = pd.concat(all_comparisons, ignore_index=True)
# Persist the combined table; index=False keeps the positional index out of the CSV.
all_comparisons_df.to_csv("../Results/Stats.csv", index=False)
# Bare last expression so the notebook renders the combined table.
all_comparisons_df