# Postprocessing code

### Imports

In [1]:
from contamination import GSM8K, MMLU, ARC, TruthfulQA
import pandas as pd
import numpy as np
import copy
import os
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


### Performance tables (Table 1, Table 4, Table 6)

In [2]:
def get_performance(model_name, task, dataset_name, types=['', '/epochs_1']):
    """Assemble the Table 1 row of scores for one model/dataset pair.

    Generated CSVs are read from ``../output/{model_name}/...``, scored with
    ``task.compute_performance`` and summarised as the mean of the ``score``
    column times 100, split by the ``was_trained`` flag.

    Args:
        model_name: model identifier used as the folder name under ``../output``.
        task: object exposing ``compute_performance(df)``; the returned frame
            must contain a ``score`` column.
        dataset_name: dataset folder name.
        types: path suffixes appended to the dataset folder. The row is built
            from ``types[0]`` and ``types[1]``, so at least two entries are
            required.

    Returns:
        A dict with the single key ``'table_1'`` holding a ``' & '``-separated
        string: baseline (contaminated, uncontaminated), then runs 0 and 1 of
        ``types[1]``, then runs 0 and 1 of ``types[0]``.

    A run whose CSV cannot be read or scored is reported as ``nan`` after its
    error message has been printed. Only the Table 1 row is produced here.
    """
    def run_csv(variant, run_index, data_index=0):
        # Location of one fine-tuned run's generations.
        return f'../output/{model_name}/test/{dataset_name}{variant}/{run_index}/generated_{data_index}.csv'

    # Baseline: the seed model's generations, split with the was_trained flags
    # stored alongside run 0 of the un-suffixed test folder.
    seed_frame = pd.read_csv(f'../output/{model_name}/seed/0/{dataset_name}/generated_0.csv')
    trained_flags = pd.read_csv(f'../output/{model_name}/test/{dataset_name}/0/generated_0.csv')['was_trained']
    baseline_score_contaminated = task.compute_performance(seed_frame[trained_flags == True])['score'].mean() * 100
    baseline_score_uncontaminated = task.compute_performance(seed_frame[trained_flags == False])['score'].mean() * 100

    data_index = 0
    per_variant = []
    for variant in types:
        entry = {}
        for run_index in (0, 1):
            try:
                graded = task.compute_performance(pd.read_csv(run_csv(variant, run_index, data_index)))
                clean_score = graded[graded['was_trained'] == False]['score'].mean() * 100
                contaminated_score = graded[graded['was_trained'] == True]['score'].mean() * 100
            except Exception as err:
                # Best effort: a missing or malformed run becomes NaN in the table.
                print(err)
                clean_score = np.nan
                contaminated_score = np.nan
            entry[f'test_{run_index}_score_uncontaminated_{data_index}'] = clean_score
            entry[f'test_{run_index}_score_contaminated_{data_index}'] = contaminated_score
        per_variant.append(entry)

    second, first = per_variant[1], per_variant[0]
    # Column groups are separated by '  & ' (two spaces), columns within a group by ' & '.
    column_groups = [
        f'{baseline_score_contaminated} & {baseline_score_uncontaminated} & {second["test_0_score_contaminated_0"]} & {second["test_0_score_uncontaminated_0"]}',
        f'{second["test_1_score_contaminated_0"]} & {second["test_1_score_uncontaminated_0"]}',
        f'{first["test_0_score_contaminated_0"]} & {first["test_0_score_uncontaminated_0"]}',
        f'{first["test_1_score_contaminated_0"]} & {first["test_1_score_uncontaminated_0"]}',
    ]
    return {'table_1': '  & '.join(column_groups)}

In [3]:
# Print the performance table row(s) for every (model, task) pair that is
# currently enabled; the other models/tasks are listed in the trailing comments.
for model in ['microsoft/phi-2']:#, 'gpt2-xl', 'mistralai/Mistral-7B-v0.1'
    print(model)
    for task in [GSM8K()]:#, MMLU(), ARC(), TruthfulQA()
        print(task.dataset_name)
        performance = get_performance(model, task, task.dataset_name)
        # One entry per table name (currently only 'table_1').
        for key, value in performance.items():
            print(key)
            print(value)
        print('-----------------')

microsoft/phi-2
gsm8k


KeyError: True

### Sample-level Detection Rate (Table 2)

In [16]:
def sample_level_methods(df, df_reference):
    """Derive one per-sample detection score per method from precomputed columns.

    Args:
        df: frame with columns ``topkmin``, ``perplexity_output``, ``lowercase``
            and ``rouge`` for the model under test.
        df_reference: frame providing ``perplexity_output`` for the reference
            model; it is divided element-wise into ``df``'s perplexity.

    Returns:
        Dict mapping method name (``shi``, ``mireshgallah``, ``yeom``,
        ``carlini``, ``rouge``) to its score column. The perplexity- and
        lowercase-based scores are negated.
    """
    return {
        'shi': df['topkmin'],
        'mireshgallah': - df['perplexity_output'] / df_reference['perplexity_output'],
        'yeom': - df['perplexity_output'],
        'carlini': - df['lowercase'],
        'rouge': df['rouge'],
    }

def compute_tpr(scores, was_trained, fpr=0.01, method='yeom'):
    """Return the true-positive rate at a fixed false-positive rate.

    The threshold is read from the sorted scores of the samples with
    ``was_trained == False`` at position ``int(n_negatives * (1 - fpr))``.
    The TPR is the fraction of ``was_trained == True`` samples whose score is
    strictly greater than that threshold.

    Args:
        scores: per-sample detection scores (array-like); larger values count
            as detections.
        was_trained: per-sample flags (array-like) aligned with ``scores``.
        fpr: target false-positive rate used to place the threshold.
        method: unused; kept so existing callers that pass it keep working.

    Returns:
        The TPR as a float, or ``nan`` when either class has no samples.
    """
    # Accept lists and pandas Series as well as arrays: comparing a plain list
    # with ``== False`` yields a scalar rather than an element-wise mask.
    scores = np.asarray(scores)
    was_trained = np.asarray(was_trained)

    negatives = np.sort(scores[was_trained == False])
    positives = scores[was_trained == True]
    if len(negatives) == 0 or len(positives) == 0:
        # No threshold (or no positives) can be defined; report "no result".
        return np.nan

    # ``int(n * (1 - fpr))`` equals ``n`` when fpr == 0 and is negative when
    # fpr > 1; clamp so the lookup always lands on a real negative score.
    cutoff = int(len(negatives) * (1 - fpr))
    cutoff = max(0, min(cutoff, len(negatives) - 1))
    threshold = negatives[cutoff]

    return (positives > threshold).mean()

def detect(model_name, dataset_name, type='v1'):
    """Compute per-method sample-level TPRs for one model/dataset pair.

    Args:
        model_name: model identifier used as the folder name under ``../output``.
        dataset_name: dataset folder name.
        type: ``'v2'`` reads from the ``testv2`` folder layout; any other value
            uses the default ``test`` layout.

    Returns:
        A pair ``(results_all, reference)``:

        * ``results_all`` has one ``(tpr, tpr_rephrased)`` tuple per epochs
          setting, in the order ``''`` then ``'/epochs_1'``. ``tpr`` comes from
          run index 0 (trained on actual samples) and ``tpr_rephrased`` from run
          index 1 (trained on rephrased samples). Each is a dict mapping method
          name to TPR.
        * ``reference`` is ``[(tpr_ref, tpr_ref)]``, the TPRs of the seed model
          scored against itself with the ``was_trained`` flags of run 0.
    """
    def csv_path(variant, run_index, data_index=0):
        if type == 'v2':
            return f'../output/{model_name}/testv2{variant}/{run_index}/{dataset_name}/generated_{data_index}.csv'
        return f'../output/{model_name}/test/{dataset_name}{variant}/{run_index}/generated_{data_index}.csv'

    reference_frame = pd.read_csv(f'../output/{model_name}/seed/0/{dataset_name}/generated_0.csv')

    def tpr_by_method(frame, labels=None):
        # Score the frame against the seed model, then turn every method's
        # scores into a TPR. Labels default to the frame's own flags.
        method_scores = sample_level_methods(frame, reference_frame)
        if labels is None:
            labels = frame['was_trained']
        return {
            name: compute_tpr(np.array(values), np.array(labels), method=name)
            for name, values in method_scores.items()
        }

    seed_labels = pd.read_csv(csv_path('', 0, 0))['was_trained']
    reference_tpr = tpr_by_method(reference_frame, seed_labels)

    results_all = []
    for epochs in ['', '/epochs_1']:
        # Run 0: trained on actual samples; run 1: trained on rephrased samples.
        tpr_actual = tpr_by_method(pd.read_csv(csv_path(epochs, 0, 0)))
        tpr_rephrased = tpr_by_method(pd.read_csv(csv_path(epochs, 1, 0)))
        results_all.append((tpr_actual, tpr_rephrased))

    return results_all, [(reference_tpr, reference_tpr)]

def compute_average_performance(performances):
    """Average per-dataset TPR results and express them in percent.

    Args:
        performances: non-empty list with one entry per dataset. Each entry is
            a sequence of settings, each setting a sequence of dicts mapping
            method name to a number. All entries must share the same layout.

    Returns:
        A deep copy of the first entry's structure in which every value is the
        sum over datasets divided by ``len(performances) / 100``. The input is
        left untouched.
    """
    totals = copy.deepcopy(performances[0])

    # Accumulate the remaining datasets onto the copy of the first one.
    for dataset_result in performances[1:]:
        for setting_idx, setting in enumerate(dataset_result):
            for variant_idx, method_values in enumerate(setting):
                for name, value in method_values.items():
                    totals[setting_idx][variant_idx][name] += value

    # Dividing by (count / 100) both averages and converts to percent.
    divisor = len(performances) / 100
    for setting in totals:
        for method_values in setting:
            for name in method_values:
                method_values[name] /= divisor
    return totals
    

In [20]:
# Print one LaTeX row per detection method with the TPRs averaged over the
# enabled datasets (only gsm8k is active; the others are commented out).
for model_name in ['microsoft/phi-2']:#, 'gpt2-xl', 'mistralai/Mistral-7B-v0.1'
    performances = [
        detect(model_name, 'gsm8k')[0],
        #detect(model_name, 'mmlu')[0],
        #detect(model_name, 'arc')[0],
        #detect(model_name, 'truthfulqa')[0],
    ]
    print(model_name)
    average_performance = compute_average_performance(performances)
    table = ''
    # First index: 0 = '' epochs setting, 1 = '/epochs_1' setting (detect's loop order).
    # Second index: 0 = trained on actual samples, 1 = trained on rephrased samples.
    for method in average_performance[0][0]:
        table += f'{method} & {average_performance[1][0][method]} & {average_performance[1][1][method]} & {average_performance[0][0][method]} & {average_performance[0][1][method]} \\\\ \n'
    print(table)
    print('-----------------')
    

microsoft/phi-2
shi & 6.468595376866381 & 1.0478427746474923 & 20.308269070481177 & 0.9756495738658882 \\ 
mireshgallah & 2.3807782489151115 & 1.0196591544545694 & 4.717623460868156 & 1.5880434100474488 \\ 
yeom & 6.718994677723683 & 1.3078656455060624 & 21.163622546431988 & 1.1806190540083665 \\ 
carlini & 3.6078120690208504 & 0.8875460685876481 & 14.296482463373444 & 0.7086767750179973 \\ 
rouge & 1.4840182648401825 & 0.5289977875064726 & 5.5936073059360725 & 0.8017819161146066 \\ 

-----------------
gpt2-xl
shi & 7.117070624901999 & 1.4911248503904386 & 36.12007083954277 & 1.420071426157288 \\ 
mireshgallah & 2.228424746945121 & 1.8806894314425573 & 5.049677466461462 & 2.845490136338471 \\ 
yeom & 7.670684352546693 & 1.3250168145703516 & 22.25192114934848 & 1.1885251763874818 \\ 
carlini & 4.853560770484148 & 1.2604379135913968 & 19.61031620366341 & 1.344399096022032 \\ 
rouge & 0.228310502283105 & 0.7214380327675611 & 5.0228310502283104 & 1.257630308449681 \\ 

-----------------
mi

### Benchmark-level Detection Rate (Table 3)

In [6]:
def extract_kim_file(filename):
    """Parse the number stored on the third line of a log file.

    The third line is split on ``':'`` and the second field is stripped of
    surrounding whitespace and converted to ``float``.

    Args:
        filename: path of the log file to read.

    Returns:
        The parsed value as a float.
    """
    with open(filename, 'r') as log:
        third_line = log.readlines()[2]
    fields = third_line.split(':')
    return float(fields[1].strip())
def extract_kim(model_name, dataset_name, dataset_name_alternative):
    """Build the benchmark-level detection row for one model/dataset pair.

    Each value is read from ``log.txt`` inside a run folder under
    ``../code-contamination-output``. The folder name is
    ``{model}_{dataset_name}_{setting}[-{dataset_name_alternative}]{epochs}-{index}``,
    with ``/`` in the model name replaced by ``-``; the alternative dataset
    name is omitted for the ``seed`` setting.

    Args:
        model_name: model identifier (``/`` is replaced by ``-``).
        dataset_name: dataset name as it appears in the run folder name.
        dataset_name_alternative: short dataset name; used in the folder name
            of the ``test`` runs and as the first column of the row.

    Returns:
        A string of the form
        ``name & baseline  & test_negligent & rephrase_negligent & test_malicious & rephrase_malicious``.
    """
    model_tag = model_name.replace("/", "-")

    def read_value(setting, epochs, index):
        # Seed runs carry no alternative dataset name in their folder name.
        suffix = "" if setting == "seed" else "-" + dataset_name_alternative
        run_folder = f'{model_tag}_{dataset_name}_{setting}{suffix}{epochs}-{index}'
        return extract_kim_file(os.path.join('../code-contamination-output', run_folder, 'log.txt'))

    baseline = read_value('seed', '', '0')
    # Runs without an epochs suffix are the "malicious" columns,
    # runs with '-epochs_1' the "negligent" ones.
    test_malicious = read_value('test', '', '0')
    rephrase_malicious = read_value('test', '', '1')
    test_negligent = read_value('test', '-epochs_1', '0')
    rephrase_negligent = read_value('test', '-epochs_1', '1')
    return f'{dataset_name_alternative} & {baseline}  & {test_negligent} & {rephrase_negligent} & {test_malicious} & {rephrase_malicious}'

In [7]:
# Print the benchmark-level detection row of every enabled model/dataset pair;
# disabled models and datasets are kept in the comments below.
for model in ['microsoft/phi-2']:#, 'gpt2-xl', 'mistralai/Mistral-7B-v0.1'
    print(model)
    print(extract_kim(model, 'gsm8k', 'gsm8k'))
    #print(extract_kim(model, 'truthful_qa', 'truthfulqa'))
    #print(extract_kim(model, 'cais/mmlu', 'mmlu'))
    #print(extract_kim(model, 'ai2_arc', 'arc'))
    print('-----------------')

#extract_kim('mistralai/Mistral-7B-v0.1', 'gsm8k', 'gsm8k')

microsoft/phi-2
gsm8k & 0.5493171471927162  & 0.8270106221547799 & 0.4188163884673748 & 0.9878603945371776 & 0.37025796661608495
truthfulqa & 0.41277641277641275  & 0.5798525798525799 & 0.3832923832923833 & 0.800982800982801 & 0.40540540540540543
mmlu & 0.07  & 0.062 & 0.096 & 0.072 & 0.142
arc & 0.025906735751295335  & 0.017271157167530225 & 0.037996545768566495 & 0.018998272884283247 & 0.0535405872193437
-----------------
gpt2-xl
gsm8k & 0.5584218512898331  & 0.9817905918057663 & 0.5356600910470409 & 1.0 & 0.5083459787556904
truthfulqa & 0.3857493857493858  & 0.5773955773955773 & 0.4275184275184275 & 0.7936117936117936 & 0.45454545454545453
mmlu & 0.076  & 0.076 & 0.112 & 0.074 & 0.152
arc & 0.03281519861830743  & 0.03626943005181347 & 0.044905008635578586 & 0.039723661485319514 & 0.06390328151986183
-----------------
mistralai/Mistral-7B-v0.1
gsm8k & 0.8907435508345979  & 0.9984825493171472 & 0.9180576631259484 & 1.0 & 0.9074355083459787
truthfulqa & 0.5995085995085995  & 0.82555282

'gsm8k & 0.8907435508345979  & 0.9984825493171472 & 0.9180576631259484 & 1.0 & 0.9074355083459787'

### Oracle Access Detection Rate (Table 5)

In [8]:
def extract_oracle(dataset_name, index=2, ngram_threshold=7):
    """Summarise the oracle-access detectors stored in an overlap CSV.

    Reads ``../data/{dataset_name}/overlap_{index}.csv`` and reports, in
    percent, the mean of the ``llm_decontaminator`` column and the share of
    rows whose ``ngram`` value exceeds ``ngram_threshold``.

    Args:
        dataset_name: dataset folder name under ``../data``.
        index: numeric suffix of the overlap file to read.
        ngram_threshold: a row counts as detected by the n-gram method when its
            ``ngram`` value is strictly greater than this. Defaults to 7, the
            cutoff that was previously hard-coded.

    Returns:
        Dict with keys ``'LLM_decontaminator'`` and ``'ngram'``, both in percent.
    """
    overlap = pd.read_csv(f'../data/{dataset_name}/overlap_{index}.csv')
    return {
        'LLM_decontaminator': overlap['llm_decontaminator'].mean() * 100,
        'ngram': (overlap['ngram'] > ngram_threshold).mean() * 100,
    }

In [9]:
# Oracle-access detection rates (percent) for gsm8k, read from overlap_2.csv.
scores = extract_oracle('gsm8k', 2)
scores

{'LLM_decontaminator': 21.37983320697498, 'ngram': 0.6065200909780136}

In [10]:
# Oracle-access detection rates (percent) for mmlu, read from overlap_2.csv.
scores = extract_oracle('mmlu', 2)
scores

{'LLM_decontaminator': 11.93124368048534, 'ngram': 0.7077856420626896}

In [11]:
# Oracle-access detection rates (percent) for arc, read from overlap_2.csv.
scores = extract_oracle('arc', 2)
scores

{'LLM_decontaminator': 28.888888888888886, 'ngram': 0.08547008547008547}

In [12]:
# Oracle-access detection rates (percent) for truthfulqa, read from overlap_2.csv.
scores = extract_oracle('truthfulqa', 2)
scores

{'LLM_decontaminator': 50.18359853121175, 'ngram': 0.12239902080783352}

In [13]:
# Same as the previous truthfulqa cell, but reading overlap_3.csv instead of overlap_2.csv.
scores = extract_oracle('truthfulqa', index=3)
scores

{'LLM_decontaminator': 24.96940024479804, 'ngram': 0.36719706242350064}