In [38]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from mpmath import mp # To obtain order of magnitude regardless of how small.

In [39]:
# Read the raw PCT results, keep only rows whose context placement is 'user-beginning' or
# 'base', and exclude the 'jail-01' jailbreak option. The placement column is dropped
# once it has been used for filtering.
data_pct_results = (
    pd.read_csv('../data/results_pct/pct_results.csv')
    .loc[
        lambda frame: frame['additional_context_placement'].isin(['user-beginning', 'base'])
        & (frame['jailbreak_option'] != 'jail-01')
    ]
    .reset_index(drop=True)
    .drop(columns=['additional_context_placement'])
)

# Give every distinct prompt an integer id, numbered in order of first appearance.
prompt_to_id = {
    prompt_text: prompt_idx
    for prompt_idx, prompt_text in enumerate(data_pct_results['prompt'].unique())
}
data_pct_results['prompt_id'] = data_pct_results['prompt'].map(prompt_to_id)

# Store the factor columns as categoricals.
data_pct_results = data_pct_results.astype({
    'additional_context_key': 'category',
    'jailbreak_option': 'category',
    'prompt_id': 'category',
})

In [40]:
# Every additional-context key present in the data except the 'base' entry.
all_additional_contexts = [
    context_key
    for context_key in data_pct_results['additional_context_key'].unique().tolist()
    if str(context_key) != 'base'
]
# Jailbreak options present in the data ('jail-01' rows are already filtered out when the
# results are loaded, so it does not appear here).
all_jailbreak_options = data_pct_results['jailbreak_option'].unique().tolist()

### Linear mixed model (LMM)
Fitted separately for each score (`economic` and `social`):
`score ~ additional_context_key + (1 | jailbreak_option) + (1 | prompt)`

In [41]:
def run_wilcoxon_benjamini_yekutieli(actual_values, p_values, round_value_to=3, round_pvalue_to=4, fdr=0.05):
    """Print each coefficient with its p-value, starring those significant under Benjamini-Yekutieli.

    Despite the function name, no Wilcoxon test is performed here: the p-values are
    taken as given and only the multiple-comparison correction is applied. The
    correction is the Benjamini-Yekutieli step-up procedure, i.e. the
    Benjamini-Hochberg thresholds divided by the harmonic number
    ``c(n) = sum_{i=1..n} 1/i``.

    Parameters
    ----------
    actual_values : pandas.Series
        Coefficient values indexed by coefficient name.
    p_values : pandas.Series
        P-values, looked up with the same labels as ``actual_values``.
    round_value_to : int, default 3
        Number of decimals used when printing coefficients.
    round_pvalue_to : int, default 4
        Number of decimals used when printing p-values.
    fdr : float, default 0.05
        Target false discovery rate; must lie in (0, 1].

    Returns
    -------
    None
        Results are printed, one line per coefficient in ascending p-value order;
        a trailing ``*`` marks coefficients rejected at the requested FDR level.

    Raises
    ------
    ValueError
        If ``fdr`` is not in (0, 1].
    """
    if not 0 < fdr <= 1:
        raise ValueError(f"fdr must be in (0, 1], got {fdr!r}")

    # (name, coefficient, p-value) triples in ascending p-value order. The sort is
    # stable, so ties keep the order of ``actual_values.index``.
    ranked = sorted(
        ((key, actual_values[key], p_values[key]) for key in actual_values.index),
        key=lambda item: item[2],
    )
    n = len(ranked)
    harmonic_number = sum(1 / i for i in range(1, n + 1))

    # Step-up rule: find the LARGEST rank k with p_(k) <= k / (n * c(n)) * fdr.
    # Every hypothesis ranked at or below k is rejected, even if its own p-value
    # misses its own threshold.
    k_max = 0
    for rank, (_, _, pvalue) in enumerate(ranked, start=1):
        if pvalue <= rank / (n * harmonic_number) * fdr:
            k_max = rank

    # Display results
    for rank, (key, actual_value, pvalue) in enumerate(ranked, start=1):
        key_formatted = key.replace('C(additional_context_key)[', '').replace(']', '')
        marker = '*' if rank <= k_max else ''
        print(f"context {key_formatted}: coeff {round(actual_value, round_value_to)} - {round(pvalue, round_pvalue_to)}{marker}")

In [42]:
def _fit_prompt_lmm(formula, data):
    """Fit one linear mixed model by maximum likelihood and return the fitted result.

    ``formula`` is the fixed-effects formula; ``data`` must contain the columns it
    references plus ``prompt`` and ``jailbreak_option``.
    """
    model = smf.mixedlm(
        formula,
        data=data,
        groups=data["prompt"],
        # Explicit random intercept for each group (this is also the default).
        re_formula="1",
        # Needed since statsmodels' MixedLM takes a single grouping factor: the second
        # random effect is expressed as a variance component.
        # NOTE(review): with groups=prompt this variance component is estimated within
        # each prompt group (nested), which may differ from a fully crossed
        # `(1 | jailbreak_option)` term - confirm this is the intended model.
        vc_formula={"jailbreak_option": "0 + C(jailbreak_option)"},
    )
    # ML rather than REML, so the log-likelihoods of models with different fixed
    # effects can be compared in the likelihood-ratio test.
    return model.fit(reml=False)


def _lrt_p_value(result_full, result_reduced):
    """Return the chi-square likelihood-ratio-test p-value (an mpmath number) for two nested fits."""
    lr_stat = 2 * (result_full.llf - result_reduced.llf)
    # Both fits share the same random-effects structure, so the difference in the
    # number of parameters equals the number of fixed-effect terms being tested.
    df_diff = len(result_full.params) - len(result_reduced.params)
    # The statistic is non-negative in theory; clamp tiny negative values caused by
    # optimiser noise so the p-value is 1 instead of an invalid number.
    lr_stat = max(lr_stat, 0)
    # Chi-square survival function = regularised upper incomplete gamma Q(df/2, x/2).
    # Computing the upper tail directly avoids the cancellation in `1 - P(...)`, and the
    # local precision context keeps mpmath's global precision untouched.
    with mp.workdps(50):
        return mp.gammainc(df_diff / 2, lr_stat / 2, mp.inf, regularized=True)


def run_lmm_social_economic(data_pct_results_model, return_model=False):
    """
    Run the LMMs for the economic and social scores and print the test results.

    For each score, a full model (`score ~ C(additional_context_key)`) and a reduced
    model (`score ~ 1`) are fitted with the same random-effects structure. The
    function then prints:

    * the likelihood-ratio-test p-value of the full vs. the reduced model (RQ1), and
    * each additional-context coefficient with its Wald p-value, starred when
      flagged by `run_wilcoxon_benjamini_yekutieli`.

    Parameters
    ----------
    data_pct_results_model : pandas.DataFrame
        Results containing the columns `economic`, `social`,
        `additional_context_key`, `prompt` and `jailbreak_option`.
    return_model : bool, default False
        When True, return the two fitted full models.

    Returns
    -------
    tuple or None
        `(result_economic, result_social)` when `return_model` is True, otherwise None.
    """
    # Fit full and reduced (no fixed effects) LMMs for the economic score
    result_economic = _fit_prompt_lmm("economic ~ C(additional_context_key)", data_pct_results_model)
    result_economic_nofixed = _fit_prompt_lmm("economic ~ 1", data_pct_results_model)

    # Fit full and reduced (no fixed effects) LMMs for the social score
    result_social = _fit_prompt_lmm("social ~ C(additional_context_key)", data_pct_results_model)
    result_social_nofixed = _fit_prompt_lmm("social ~ 1", data_pct_results_model)

    # Perform LRT on economic scores to test RQ1 (printed in scientific notation, 3 significant digits)
    p_value_economic = _lrt_p_value(result_economic, result_economic_nofixed)
    print(f"Economic model LRT p-value: {mp.nstr(p_value_economic, n=3, min_fixed=-1, max_fixed=0)}")
    print("-----------------------------")

    # Perform LRT on social scores to test RQ1
    p_value_social = _lrt_p_value(result_social, result_social_nofixed)
    print(f"\nSocial model LRT p-value: {mp.nstr(p_value_social, n=3, min_fixed=-1, max_fixed=0)}")
    print("-----------------------------")

    # Wald tests: keep only the additional-context coefficients (drops the intercept
    # and the variance parameters).
    fixed_effect_actual_values_economic = result_economic.params.filter(like='C(additional_context_key)', axis=0)
    fixed_effect_p_values_economic = result_economic.pvalues.filter(like='C(additional_context_key)', axis=0)
    fixed_effect_actual_values_social = result_social.params.filter(like='C(additional_context_key)', axis=0)
    fixed_effect_p_values_social = result_social.pvalues.filter(like='C(additional_context_key)', axis=0)

    print("\n\nEconomic fixed effects:")
    run_wilcoxon_benjamini_yekutieli(fixed_effect_actual_values_economic, fixed_effect_p_values_economic)
    print("-----------------------------")
    print("\n\nSocial fixed effects:")
    run_wilcoxon_benjamini_yekutieli(fixed_effect_actual_values_social, fixed_effect_p_values_social)

    if return_model:
        return result_economic, result_social

In [44]:
# Fit and report the LMMs for the abliterated Llama model, keeping both fitted results.
model_name = 'Meta-Llama-3.1-8B-Instruct-abliterated'
data_pct_results_model = data_pct_results.loc[data_pct_results['model_id'].eq(model_name)].reset_index()
(
    llama_instruct_abliterated_model_economic,
    llama_instruct_abliterated_model_social,
) = run_lmm_social_economic(data_pct_results_model, return_model=True)

Economic model LRT p-value: 2.53e-112
-----------------------------

Social model LRT p-value: 7.69e-122
-----------------------------


Economic fixed effects:
context T.wiki_pol_trump: coeff 5.252 - 0.0*
context T.wiki_pol_bush: coeff 2.393 - 0.0*
context T.wiki_pol_hwbush: coeff 1.965 - 0.0*
context T.wiki_pol_obama: coeff -1.583 - 0.0*
context T.wiki_pol_biden: coeff -1.297 - 0.0*
context T.wiki_obj_table: coeff 1.192 - 0.0*
context T.wiki_obj_bottle: coeff 1.071 - 0.0002*
context T.wiki_obj_cup: coeff 1.024 - 0.0004*
context T.wiki_mus_classical: coeff 1.007 - 0.0005*
context T.wiki_obj_plate: coeff 0.905 - 0.0018*
context T.wiki_mus_heavy-metal: coeff 0.768 - 0.0081*
context T.wiki_obj_sink: coeff 0.68 - 0.019
context T.wiki_mus_gospel: coeff 0.58 - 0.0455
context T.wiki_obj_chair: coeff 0.547 - 0.0595
context T.wiki_mus_reggae: coeff -0.157 - 0.5876
context T.wiki_mus_hip-hop: coeff -0.07 - 0.8086
context T.wiki_mus_jazz: coeff 0.067 - 0.8166
context T.wiki_pol_clinton: coeff -0

In [33]:
# Display the summary table of the fitted economic-score model for the abliterated Llama run.
llama_instruct_abliterated_model_economic.summary()

0,1,2,3
Model:,MixedLM,Dependent Variable:,economic
No. Observations:,760,Method:,ML
No. Groups:,10,Scale:,1.6814
Min. group size:,76,Log-Likelihood:,-1317.9700
Max. group size:,76,Converged:,Yes
Mean group size:,76.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-4.394,0.340,-12.912,0.000,-5.061,-3.727
C(additional_context_key)[T.wiki_mus_classical],1.007,0.290,3.474,0.001,0.439,1.576
C(additional_context_key)[T.wiki_mus_gospel],0.580,0.290,2.000,0.045,0.012,1.148
C(additional_context_key)[T.wiki_mus_heavy-metal],0.768,0.290,2.647,0.008,0.199,1.336
C(additional_context_key)[T.wiki_mus_hip-hop],-0.070,0.290,-0.242,0.809,-0.639,0.498
C(additional_context_key)[T.wiki_mus_jazz],0.067,0.290,0.232,0.817,-0.501,0.636
C(additional_context_key)[T.wiki_mus_reggae],-0.157,0.290,-0.542,0.588,-0.726,0.411
C(additional_context_key)[T.wiki_obj_bottle],1.071,0.290,3.693,0.000,0.502,1.639
C(additional_context_key)[T.wiki_obj_chair],0.547,0.290,1.885,0.059,-0.022,1.115


In [34]:
# Display the summary table of the fitted social-score model for the abliterated Llama run.
llama_instruct_abliterated_model_social.summary()

0,1,2,3
Model:,MixedLM,Dependent Variable:,social
No. Observations:,760,Method:,ML
No. Groups:,10,Scale:,0.6235
Min. group size:,76,Log-Likelihood:,-948.8456
Max. group size:,76,Converged:,Yes
Mean group size:,76.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-6.340,0.194,-32.602,0.000,-6.721,-5.959
C(additional_context_key)[T.wiki_mus_classical],0.790,0.177,4.476,0.000,0.444,1.136
C(additional_context_key)[T.wiki_mus_gospel],1.455,0.177,8.241,0.000,1.109,1.801
C(additional_context_key)[T.wiki_mus_heavy-metal],0.474,0.177,2.685,0.007,0.128,0.820
C(additional_context_key)[T.wiki_mus_hip-hop],-0.324,0.177,-1.834,0.067,-0.670,0.022
C(additional_context_key)[T.wiki_mus_jazz],-0.172,0.177,-0.971,0.331,-0.518,0.175
C(additional_context_key)[T.wiki_mus_reggae],-0.243,0.177,-1.378,0.168,-0.589,0.103
C(additional_context_key)[T.wiki_obj_bottle],0.347,0.177,1.968,0.049,0.001,0.694
C(additional_context_key)[T.wiki_obj_chair],0.269,0.177,1.524,0.128,-0.077,0.615


In [8]:
# Fit and report the LMMs for the original Llama instruct model.
model_name = 'Llama-3.1-8B-Instruct'
data_pct_results_model = data_pct_results.loc[data_pct_results['model_id'].eq(model_name)].reset_index()
run_lmm_social_economic(data_pct_results_model)

Economic model LRT p-value: 6.43e-55
-----------------------------

Social model LRT p-value: 4.18e-93
-----------------------------


Economic fixed effects:
context T.wiki_pol_hwbush: coeff 3.981 - 0.0*
context T.wiki_pol_trump: coeff 3.721 - 0.0*
context T.wiki_pol_bush: coeff 3.671 - 0.0*
context T.wiki_obj_table: coeff 1.884 - 0.0*
context T.wiki_obj_sink: coeff 1.737 - 0.0*
context T.wiki_pol_biden: coeff 1.634 - 0.0*
context T.wiki_mus_gospel: coeff 1.565 - 0.0*
context T.wiki_obj_bottle: coeff 1.562 - 0.0*
context T.wiki_obj_cup: coeff 1.328 - 0.0002*
context T.wiki_obj_chair: coeff 1.166 - 0.001*
context T.wiki_mus_jazz: coeff 1.072 - 0.0024*
context T.wiki_mus_heavy-metal: coeff 1.071 - 0.0025*
context T.wiki_pol_clinton: coeff 0.974 - 0.0059*
context T.wiki_obj_plate: coeff 0.828 - 0.0192
context T.wiki_mus_hip-hop: coeff 0.615 - 0.0819
context T.wiki_mus_classical: coeff 0.59 - 0.0955
context T.wiki_mus_reggae: coeff 0.287 - 0.417
context T.wiki_pol_obama: coeff 0.256 - 0.4

In [9]:
# Fit and report the LMMs for the original Mistral instruct model.
model_name = 'Mistral-7B-Instruct-v0.3'
data_pct_results_model = data_pct_results.loc[data_pct_results['model_id'].eq(model_name)].reset_index()
run_lmm_social_economic(data_pct_results_model)

Economic model LRT p-value: 5.7e-62
-----------------------------

Social model LRT p-value: 9.92e-97
-----------------------------


Economic fixed effects:
context T.wiki_pol_trump: coeff 2.315 - 0.0*
context T.wiki_pol_bush: coeff 1.855 - 0.0*
context T.wiki_pol_hwbush: coeff 1.414 - 0.0*
context T.wiki_mus_reggae: coeff -1.267 - 0.0*
context T.wiki_mus_hip-hop: coeff -1.145 - 0.0*
context T.wiki_mus_gospel: coeff -0.805 - 0.0028*
context T.wiki_obj_plate: coeff -0.583 - 0.0306
context T.wiki_mus_jazz: coeff -0.527 - 0.0507
context T.wiki_mus_classical: coeff -0.482 - 0.0738
context T.wiki_mus_heavy-metal: coeff -0.476 - 0.0773
context T.wiki_pol_biden: coeff -0.455 - 0.0916
context T.wiki_obj_chair: coeff -0.307 - 0.2549
context T.wiki_obj_cup: coeff -0.307 - 0.2553
context T.wiki_obj_sink: coeff 0.255 - 0.3434
context T.wiki_pol_obama: coeff -0.204 - 0.4489
context T.wiki_obj_table: coeff -0.195 - 0.4698
context T.wiki_pol_clinton: coeff 0.127 - 0.6367
context T.wiki_obj_bottle: c

In [10]:
# Fit and report the LMMs for the abliterated Mistral instruct model.
model_name = 'Mistral-7B-Instruct-v0.3-abliterated'
data_pct_results_model = data_pct_results.loc[data_pct_results['model_id'].eq(model_name)].reset_index()
run_lmm_social_economic(data_pct_results_model)

Economic model LRT p-value: 1.81e-71
-----------------------------

Social model LRT p-value: 4.44e-98
-----------------------------


Economic fixed effects:
context T.wiki_pol_trump: coeff 2.574 - 0.0*
context T.wiki_pol_bush: coeff 2.294 - 0.0*
context T.wiki_pol_hwbush: coeff 1.656 - 0.0*
context T.wiki_mus_reggae: coeff -1.353 - 0.0*
context T.wiki_mus_hip-hop: coeff -0.994 - 0.0002*
context T.wiki_mus_gospel: coeff -0.741 - 0.006
context T.wiki_mus_jazz: coeff -0.544 - 0.0436
context T.wiki_mus_classical: coeff -0.529 - 0.05
context T.wiki_pol_clinton: coeff 0.415 - 0.124
context T.wiki_mus_heavy-metal: coeff -0.351 - 0.193
context T.wiki_obj_bottle: coeff 0.24 - 0.3725
context T.wiki_obj_table: coeff -0.191 - 0.4782
context T.wiki_pol_biden: coeff -0.162 - 0.548
context T.wiki_obj_plate: coeff -0.141 - 0.6011
context T.wiki_pol_obama: coeff 0.122 - 0.6516
context T.wiki_obj_chair: coeff -0.075 - 0.7809
context T.wiki_obj_cup: coeff 0.071 - 0.7923
context T.wiki_obj_sink: coeff 0