In [1]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_rel

In [7]:
# Print the mean of the 'Cost/Reward' column of every model's
# `vanilla_table.csv`, one line per (dataset, model) pair.
models = ['Llama-2-7b-hf', 'Mistral-7B-v0.1', 'gemma-7b']
datasets = ['jailbreak_llms', 'beavertails', 'harmbench', 'red_team_attempts']

# Results root expressed relative to the notebook, in line with the
# '../results/...' folder templates used by the other cells, rather than a
# machine-specific absolute home-directory path.
# NOTE(review): assumes the notebook is run from a directory that is a sibling
# of `results/` (as the other cells already require) -- confirm.
safety_eval_root = '../results/safety_eval'
# Suffix shared by every per-model results folder read in this cell.
run_suffix = '_sharegpt_ia3_ff_1_hh_harmless_dpo_ia3_ff'

for dataset in datasets:
    for model in models:
        file_path = f'{safety_eval_root}/{dataset}/{model}{run_suffix}/vanilla_table.csv'
        df = pd.read_csv(file_path)
        print(f"model: {model}, dataset: {dataset}, mean cost: {df['Cost/Reward'].mean()}")

model: Llama-2-7b-hf, dataset: jailbreak_llms, mean cost: -10.4712939453125
model: Mistral-7B-v0.1, dataset: jailbreak_llms, mean cost: -8.19462890625
model: gemma-7b, dataset: jailbreak_llms, mean cost: -10.62402587890625
model: Llama-2-7b-hf, dataset: beavertails, mean cost: -11.8398046875
model: Mistral-7B-v0.1, dataset: beavertails, mean cost: -13.506953125
model: gemma-7b, dataset: beavertails, mean cost: -13.57283203125
model: Llama-2-7b-hf, dataset: harmbench, mean cost: -10.92185546875
model: Mistral-7B-v0.1, dataset: harmbench, mean cost: -6.1049267578125
model: gemma-7b, dataset: harmbench, mean cost: -11.84928955078125
model: Llama-2-7b-hf, dataset: red_team_attempts, mean cost: -11.7631640625
model: Mistral-7B-v0.1, dataset: red_team_attempts, mean cost: -13.4420703125
model: gemma-7b, dataset: red_team_attempts, mean cost: -14.0812890625


In [3]:
# Gather one cost column per (dataset, patched model, neuron-index method)
# CSV and stack them into a single long-format frame named `df`.
models = ['Llama-2-7b-hf']
model_name_map = {'Llama-2-7b-hf': 'Llama2'}
# An empty suffix is labelled 'Base' below; a non-empty one is labelled 'SFT'.
pefts = ['', '_sharegpt_ia3_ff_1']
# titles = ['Patch Base with DPO', 'Patch SFT with DPO']
data_sets = ['jailbreak_llms', 'beavertails', 'harmbench', 'red_team_attempts']
# `neuron_indexes` and `labels` are parallel lists: file-name fragment and the
# human-readable method name stored in the 'Method' column.
neuron_indexes = [
    'difference_on_hh_prompt_last_token',
    'std_on_hh_prompt_last_token',
]
labels = [
    'Prompt Difference',
    'Activation Variance',
]
folder_template = '../results/safety_eval/{}/{}{}_vs_{}_sharegpt_ia3_ff_1_hh_harmless_dpo_ia3_ff'

dfs = []
for dataset in data_sets:
    for model in models:
        for peft in pefts:
            # Both values depend only on the three outer loop variables, so
            # compute them once instead of once per neuron index.
            folder = folder_template.format(dataset, model, peft, model)
            patched_model = 'SFT' if peft else 'Base'
            for index, label in zip(neuron_indexes, labels):
                file_path = folder + f'/guided_by_{model}_sharegpt_ia3_ff_1_hh_harmless_dpo_ia3_ff_idx_{model}_{index}.csv'
                df = pd.read_csv(file_path)
                # NOTE(review): the cost is taken positionally (4th column);
                # confirm the CSV column order is stable.
                cost_column = df.iloc[:, 3]
                new_df = pd.DataFrame({
                    'cost': cost_column,
                    'Model': model_name_map[model],
                    'Patched Model': patched_model,
                    'Method': label,
                    'Dataset': dataset,
                })
                dfs.append(new_df)

# Row labels of the individual frames are kept as-is (no re-indexing).
df = pd.concat(dfs)

In [9]:
# Show the rows for the 'SFT' patched model on the first entry of `data_sets`.
df.loc[
    df['Patched Model'].eq('SFT')
    & df['Dataset'].eq(data_sets[0])
]

Unnamed: 0,cost,Model,Patched Model,Method,Dataset
0,-4.366733,Llama2,SFT,Prompt Difference,jailbreak_llms
0,-3.707187,Llama2,SFT,Activation Variance,jailbreak_llms


In [17]:
# For every (model, peft) pair, run a paired t-test between the last column of
# the guided-neuron table and the last column of its `random_neurons`
# counterpart, print the p-value and collect it in `pvalue`.
models = ['Llama-2-7b-hf', 'Mistral-7B-v0.1', 'gemma-7b']
model_name_map = {
    'Llama-2-7b-hf': 'Llama2',
    'Mistral-7B-v0.1': 'Mistral',
    'gemma-7b': 'Gemma',
}
pefts = ['', '_sharegpt_ia3_ff_1']
data_sets = ['hh_harmless']
neuron_indexes = ['sft_vs_dpo_on_hh_harmless_sft_completion']
folder_template = '../results/arena/{}{}_vs_{}_sharegpt_ia3_ff_1_hh_harmless_dpo_ia3_ff'

pvalue = []
for model in models:
    for peft in pefts:
        # The folder depends only on (model, peft); build it once per pair.
        folder = folder_template.format(model, peft, model)
        for index in neuron_indexes:
            # Only the first 10 rows of each table enter the test.
            # NOTE(review): the meaning of the last column and of the 10-row
            # cut-off is not visible here -- confirm against the table writer.
            file_path = folder + f'/top20000_guided_by_{model}_sharegpt_ia3_ff_1_hh_harmless_dpo_ia3_ff_idx_{model}_sharegpt_ia3_ff_1_hh_harmless_dpo_ia3_ff_{index}_table.csv'
            df = pd.read_csv(file_path)
            safety_scores = df.iloc[:10, -1]

            file_path = folder + f'/top20000_guided_by_{model}_sharegpt_ia3_ff_1_hh_harmless_dpo_ia3_ff_idx_{model}_sharegpt_ia3_ff_1_hh_harmless_dpo_ia3_ff_{index}_random_neurons_table.csv'
            df = pd.read_csv(file_path)
            random_scores = df.iloc[:10, -1]

            # Paired test: row i of one table is matched with row i of the other.
            result = ttest_rel(safety_scores, random_scores)
            pvalue.append(result.pvalue)
            print(f'{model}{peft}', result.pvalue)

Llama-2-7b-hf 0.03877784942626909
Llama-2-7b-hf_sharegpt_ia3_ff_1 0.08075275889265345
Mistral-7B-v0.1 0.06518549372083222
Mistral-7B-v0.1_sharegpt_ia3_ff_1 0.4855730736508147
gemma-7b 0.07574285702013972
gemma-7b_sharegpt_ia3_ff_1 0.10123429399487287


In [16]:
# Display the p-values gathered by the paired t-test loop, in the order in
# which they were appended (one per model/peft combination).
# NOTE(review): the saved output of this cell differs from the p-values printed
# by the t-test cell, and this cell's execution count is lower, so the saved
# output appears to come from an earlier run -- re-run the notebook to refresh.
pvalue 

[1.6654461777227642e-18,
 8.752397395988581e-12,
 2.2781450657061972e-15,
 1.289224957006062e-09,
 7.720960875414595e-14,
 1.1494755332623734e-06]