<a href="https://colab.research.google.com/github/friederrr/proof_contamination/blob/main/code/CDM_eval/CD_pipeline_CV_CDD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CD PIPELINE (with CV) CDD

In [None]:
%%capture
!pip install datasets

In [None]:
# Make Google Drive available under /content/drive (Colab only).
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [None]:
import random
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import torch
import json
from datetime import datetime
import itertools
from itertools import cycle, product
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from pathlib import Path
import statistics
from sklearn.model_selection import StratifiedKFold

In [None]:
from CDMs_functions_v1 import *

## Model Selection

**OLMO**:

- "allenai/OLMo-7B-0724-hf": Download (F32): (27 GB), GPU-RAM (bfloat16): 14 GB (T4 sufficient)

- "allenai/OLMo-7B-0724-SFT-hf": Download (BF16) (14 GB), GPU-RAM (14 GB)

- "allenai/OLMo-7B-0724-Instruct-hf": Download (BF16) (14 GB), GPU-RAM (14 GB)

- "allenai/OLMo-2-1124-13B-Instruct": Download (BF16) (28 GB), GPU-RAM (27 GB) -> works fine with A100!

**DEEPSEEK**:
- "deepseek-ai/deepseek-math-7b-instruct":

**LLEMMA**:
- "EleutherAI/llemma_7b"

In [None]:
#!Parameter
# Hugging Face id of the model whose CDD generation data is processed by the single-model cells.
model_id = "allenai/OLMo-7B-0724-hf"
# Root folder on the mounted Google Drive; data and log paths below are derived from it.
DRIVE_PATH = Path('/content/drive/MyDrive/Masterarbeit25/')

In [None]:
# Folder on Drive that holds the CDD generation data (one sub-folder per model id).
dir_token_path = DRIVE_PATH / "MathCONTA_cdd_generation_data"
# List the files stored for the selected model.
file_paths = list_files_in_directory(
    dir_token_path,
    model_id,
)

Files in /content/drive/MyDrive/Masterarbeit25/MathCONTA_cdd_generation_data/allenai/OLMo-7B-0724-hf:

OLMo-7B-0724-hf/AIME_sample50_max200_seed42.json
OLMo-7B-0724-hf/Forum_sample50_max200_seed42.json
OLMo-7B-0724-hf/word-problems_sample50_max200_seed42.json
OLMo-7B-0724-hf/AMC8_sample50_max200_seed42.json
OLMo-7B-0724-hf/MathCONTA_sample50_max200_seed42.json
OLMo-7B-0724-hf/MathCONTA_train_sample50_max200_seed42.json
OLMo-7B-0724-hf/MathCONTA_test_sample50_max200_seed42.json


In [None]:
# Disabled one-off step. NOTE(review): presumably merges the files listed in `file_paths` into the
# single MathCONTA file named below - confirm against CDMs_functions_v1 before re-enabling.
#merge_json_files(file_paths, dir_token_path / model_id / "MathCONTA_sample50_max200_seed42.json")

In [None]:
# If the generation-data file already exists, the model weights do not have to be loaded.
load_models = False

# The tokenizer is required in both cases, so it is loaded exactly once
# (previously the same call was duplicated in both branches).
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Only load the model (bfloat16, automatic device placement) when it is actually needed.
model = (
    AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map='auto')
    if load_models
    else None
)

# Placeholder so that `ds_conta` is defined regardless of `load_models`
# (previously it was only set when `load_models` was False).
# The dataset-loading cell assigns the real dataset.
ds_conta = None

## DATA Processing/Loading MathCONTA

In [None]:
#!Parameter
# Hugging Face Hub coordinates of the MathCONTA dataset: repository, configuration and split.
repo_id = "Tobstar001/MathCONTA"
split = "test"
config_name = "core"

ds_conta = load_dataset(
    path=repo_id,
    name=config_name,
    split=split,
)


In [None]:
# Filter ds_conta down to the rows of one category.
# The category is taken from `data_name` instead of repeating the literal "AMC8" inside the
# lambda, so changing the line below is enough to inspect a different category.
data_name = "AMC8"
filtered_dataset = ds_conta.filter(lambda example: example["CATEGORY"] == data_name)
# Number of rows in the selected category (displayed as cell output).
len(filtered_dataset)

In [None]:
# CDD generation data for the complete MathCONTA set of the selected model.
# force_reprocess=False: no regeneration is requested; the recorded output shows an existing file being loaded.
data_name = "MathCONTA"
MathCONTA_cdd_data = create_cdd_generation_data(
    model_id=model_id,
    ds_conta=ds_conta,
    model=model,
    tokenizer=tokenizer,
    data_name=data_name,
    force_reprocess=False,
    max_new_tokens=200,
    sample_size=50,
    base_path_token=dir_token_path,
    verbose=False,
)





Loading existing cdd generation data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_cdd_generation_data/allenai/OLMo-2-1124-13B-Instruct/MathCONTA_sample50_max200_seed42.json


In [None]:
# Build the CDD table from the generation data for alpha=0.0 and display it.
cdd_df = get_cdd_df_from_generation_data(
    MathCONTA_cdd_data,
    alpha=0.0,
)
cdd_df

Unnamed: 0,ID,CATEGORY,LABEL,LABEL_BINARY,alpha,cdd_value
0,owm-amc8-9,AMC8,Conta,1,0.3,0.0
1,owm-amc8-10,AMC8,Conta,1,0.3,0.0
2,owm-amc8-15,AMC8,Conta,1,0.3,0.36
3,owm-amc8-17,AMC8,Conta,1,0.3,0.0
4,owm-aime-4,AIME,Conta,1,0.3,0.0
5,owm-aime-7,AIME,Conta,1,0.3,0.0
6,clean-aime-3,AIME,Clean,0,0.3,0.0
7,clean-aime-7,AIME,Clean,0,0.3,0.0
8,clean-amc8-1,AMC8,Clean,0,0.3,0.76
9,clean-amc8-3,AMC8,Clean,0,0.3,0.12


## Train Test Split (Only relevant once)
Stratified train/test split applied after the fact to the generation data already stored on Google Drive.
For new data, please filter into train/test before generating the data.

In [None]:
# 70/30 split of the generation data with a fixed seed.
train, test = stratified_dict_split(
    MathCONTA_cdd_data,
    test_ratio=0.3,
    seed=42,
)

Train size: 70, Test size: 30
{'owm-word-2', 'clean-word-9', 'clean-amc8-3', 'owm-aime-7', 'clean-aime-3', 'clean-amc8-14', 'owm-word-10', 'clean-amc8-10', 'owm-amc8-17', 'owm-forum-10', 'clean-amc8-18', 'owm-word-5', 'owm-amc8-9', 'clean-word-7', 'owm-amc8-12', 'clean-forum-7', 'owm-amc8-10', 'clean-forum-6', 'owm-forum-3', 'owm-aime-3', 'owm-aime-4', 'clean-word-12', 'owm-amc8-15', 'clean-aime-4', 'clean-amc8-1', 'clean-forum-9', 'clean-word-10', 'owm-word-9', 'clean-aime-7', 'owm-forum-5'}


In [None]:
# Persist both splits as JSON in the model's generation-data folder:
#   dir_token_path / model_id / "MathCONTA_{train,test}_sample50_max200_seed42.json"
file_path_train = dir_token_path / model_id / "MathCONTA_train_sample50_max200_seed42.json"
file_path_test = dir_token_path / model_id / "MathCONTA_test_sample50_max200_seed42.json"

for split_path, split_data in ((file_path_train, train), (file_path_test, test)):
  with open(split_path, 'w') as handle:
    json.dump(split_data, handle)

## TRAIN SET CV
Find best parameters - CV

In [None]:
# Optional (kept disabled): restrict the generation data to one category before tuning.
# The `exp_id` parameter below carries a matching commented-out "_"+category suffix.
#how to filter on categories
#category='Forum'
#MathCONTA_cdd_data_f = [entry for entry in MathCONTA_cdd_data if entry.get('CATEGORY') == category]
#len(MathCONTA_cdd_data_f)

In [None]:
# CDD generation data of the train split for the selected model (no regeneration requested).
data_name = "MathCONTA_train"
MathCONTA_cdd_data = create_cdd_generation_data(
    model_id=model_id,
    ds_conta=ds_conta,
    model=model,
    tokenizer=tokenizer,
    data_name=data_name,
    force_reprocess=False,
    max_new_tokens=200,
    sample_size=50,
    base_path_token=dir_token_path,
    verbose=False,
)

Loading existing cdd generation data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_cdd_generation_data/allenai/OLMo-7B-0724-hf/MathCONTA_train_sample50_max200_seed42.json


In [None]:
#!Parameter 2D
# Inputs for the cross-validated tuning call in the next cell.
cdd_generation_data = MathCONTA_cdd_data
feature_col = "cdd_value"
label_col = "LABEL_BINARY"
# Candidate alpha values; rounding removes floating-point noise from np.arange.
alpha_range = [round(float(a), 3) for a in np.arange(0, 0.31, 0.1)]
cv_folds = 5
cv_seed = 42
# Base folder for the accuracy logs.
log_path_base = DRIVE_PATH / "cdm_data" / "MathCONTA_v1"
exp_id = "overleaf2"  #+"_"+category
method_name = "CDD"


In [None]:
# Cross-validated search over `alpha_range` on the train split; the result is kept in `out_dict`.
out_dict = tune_cdd(
    cdd_generation_data=cdd_generation_data,
    feature_col=feature_col,
    label_col=label_col,
    cv_folds=cv_folds,
    cv_seed=cv_seed,
    alpha_range=alpha_range,
    model_id=model_id,
    data_name=data_name,
    method_name=method_name,
    log_path_base=log_path_base,
    exp_id=exp_id,
)

Testing alpha=0.0 | median_threshold=0.000000 | mean_cvacc_test=0.485714
Testing alpha=0.1 | median_threshold=0.000000 | mean_cvacc_test=0.500000
Testing alpha=0.2 | median_threshold=0.000000 | mean_cvacc_test=0.500000
Testing alpha=0.3 | median_threshold=0.000000 | mean_cvacc_test=0.500000
Best log entry:
{'parameter': {'alpha': 0.1}, 'global_threshold': 0.0, 'median_threshold': 0.0, 'mean_cvacc_train': 0.5, 'mean_cvacc_test': 0.5}
Accuracy log saved in /content/drive/MyDrive/Masterarbeit25/cdm_data/MathCONTA_v1/allenai/OLMo-7B-0724-hf/CDD/MathCONTA_train_accuracylog_overleaf2.json


## TEST SET EVALUATION

In [None]:
# CDD generation data of the test split for the selected model (no regeneration requested).
data_name = "MathCONTA_test"
MathCONTA_cdd_data = create_cdd_generation_data(
    model_id=model_id,
    ds_conta=ds_conta,
    model=model,
    tokenizer=tokenizer,
    data_name=data_name,
    force_reprocess=False,
    max_new_tokens=200,
    sample_size=50,
    base_path_token=dir_token_path,
    verbose=False,
)

Loading existing cdd generation data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_cdd_generation_data/allenai/OLMo-2-1124-13B-Instruct/MathCONTA_test_sample50_max200_seed42.json


In [None]:
#!Parameter
# parameter1 is used as `alpha` when building the CDD table below;
# theta is handed to evaluate_method_standard in the next cell.
parameter1 = 0.3
theta = 0.01
data_name = "MathCONTA_test"
# The test log reuses the experiment id of the tuning run.
exp_id_test = exp_id

cdd_df = get_cdd_df_from_generation_data(
    MathCONTA_cdd_data,
    alpha=parameter1,
)
len(cdd_df)

30

In [None]:
# Evaluate the CDD feature on the test table with the fixed alpha/theta chosen above;
# 1000 bootstrap resamples with seed 42 are requested.
test_dict = evaluate_method_standard(
    cdd_df,
    metric_col=feature_col, theta=theta,
    model_id=model_id, data_name=data_name, method_name=method_name,
    parameter1=parameter1,
    n_bootstrap=1000, seed=42,
)
test_dict

{'model_id': 'allenai/OLMo-2-1124-13B-Instruct',
 'data_name': 'MathCONTA_test',
 'datetime': '2025-04-11T13:39:56.434730',
 'method_name': 'CDD',
 'parameter': {'parameter1': 0.3, 'parameter2': None, 'theta': '0.01'},
 'metrics': {'accuracy': 0.43333333333333335,
  'accuracy_95CI': (0.26666666666666666, 0.6333333333333333),
  'precision': 0.375,
  'recall': 0.2,
  'f1_score': 0.2608695652173913,
  'confusion_matrix': [[10, 5], [12, 3]],
  'mcnemar_b': 10,
  'mcnemar_c': 12,
  'mcnemar_p_value': 0.8318119049072266},
 'n_bootstrap': 1000,
 'seed': 42}

In [None]:
# Store the test-set result dictionary under the log base path.
save_accuracy_log(
    log_path_base=log_path_base,
    model_id=model_id,
    method_name=method_name,
    exp_id=exp_id_test,
    data_name=data_name,
    out_dict=test_dict,
)

Accuracy log saved in /content/drive/MyDrive/Masterarbeit25/cdm_data/MathCONTA_v1/allenai/OLMo-2-1124-13B-Instruct/CDD/MathCONTA_test_accuracylog_overleaf2.json


## RUN END2END

In [None]:
#full
# Single-call variant of the pipeline on the test split with fixed alpha and theta.
data_name = 'MathCONTA_test'
run_CDD(
    model_id=model_id,
    ds_conta=ds_conta,
    model=model,
    tokenizer=tokenizer,
    force_reprocess=False,
    dir_token_path=dir_token_path,
    max_new_tokens=200,
    sample_size=50,
    alpha=0.3,
    theta=0.31,
    data_name=data_name,
)

Loading existing cdd generation data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_cdd_generation_data/deepseek-ai/deepseek-math-7b-instruct/MathCONTA_test_sample50_max200_seed42.json


{'accuracy': 0.45,
 'precision': 0.4,
 'recall': 0.2,
 'f1_score': 0.26666666666666666}

## Combined Evaluation across LLMs

In [None]:
# Models whose generation data is pooled for the combined evaluation.
model_ids = [
    "deepseek-ai/deepseek-math-7b-instruct",
    "EleutherAI/llemma_7b",
    "allenai/OLMo-7B-0724-Instruct-hf",
    "allenai/OLMo-2-1124-13B-Instruct",
]
# Short identifier used in place of a model id for the pooled runs.
model_id_combi = "DS_LE_OL_OL2"

### Combined TRAIN SET CV
Find best parameters - CV

In [None]:
data_name = "MathCONTA_train"

# Pool the train-split generation data of every model in `model_ids` into one flat list.
# The loop variable is deliberately not named `model_id`: reusing that name silently overwrote
# the notebook-wide `model_id` setting with the last entry of `model_ids`.
# NOTE(review): `model` and `tokenizer` belong to the single model configured at the top of the
# notebook; passing them for every entry is presumably only safe while the data is loaded from
# existing files (force_reprocess=False) - confirm before regenerating anything here.
MathCONTA_cdd_data_combined = []
for combi_model_id in model_ids:
  MathCONTA_cdd_data = create_cdd_generation_data(
      model_id=combi_model_id,
      ds_conta=ds_conta,
      model=model,
      tokenizer=tokenizer,
      data_name=data_name,
      force_reprocess=False,
      max_new_tokens=200,
      sample_size=50,
      base_path_token=dir_token_path,
      verbose=False,
  )
  MathCONTA_cdd_data_combined += MathCONTA_cdd_data

# Total number of pooled entries (displayed as cell output).
len(MathCONTA_cdd_data_combined)

Loading existing cdd generation data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_cdd_generation_data/deepseek-ai/deepseek-math-7b-instruct/MathCONTA_train_sample50_max200_seed42.json
Loading existing cdd generation data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_cdd_generation_data/EleutherAI/llemma_7b/MathCONTA_train_sample50_max200_seed42.json
Loading existing cdd generation data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_cdd_generation_data/allenai/OLMo-7B-0724-Instruct-hf/MathCONTA_train_sample50_max200_seed42.json
Loading existing cdd generation data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_cdd_generation_data/allenai/OLMo-2-1124-13B-Instruct/MathCONTA_train_sample50_max200_seed42.json


280

In [None]:
#!Parameter 2D
# Inputs for the cross-validated tuning call on the pooled train data.
method_name = "CDD"
cdd_generation_data = MathCONTA_cdd_data_combined
feature_col = "cdd_value"
label_col = "LABEL_BINARY"
# Candidate alpha values; rounding removes floating-point noise from np.arange.
alpha_range = [round(float(a), 3) for a in np.arange(0, 0.31, 0.1)]
cv_folds = 5
cv_seed = 42
log_path_base = DRIVE_PATH / "cdm_data" / "MathCONTA_v1"

exp_id = "overleaf2combi"  #+"_"+category
# From here on `model_id` is the short combined identifier, not a Hugging Face model id.
model_id = model_id_combi

In [None]:
# Cross-validated search over `alpha_range` on the pooled train data; the result is kept in `out_dict`.
out_dict = tune_cdd(
    cdd_generation_data=cdd_generation_data,
    feature_col=feature_col,
    label_col=label_col,
    cv_folds=cv_folds,
    cv_seed=cv_seed,
    alpha_range=alpha_range,
    model_id=model_id,
    data_name=data_name,
    method_name=method_name,
    log_path_base=log_path_base,
    exp_id=exp_id,
)

Testing alpha=0.0 | median_threshold=0.000000 | mean_cvacc_test=0.500000
Testing alpha=0.1 | median_threshold=0.010000 | mean_cvacc_test=0.485714
Testing alpha=0.2 | median_threshold=0.230000 | mean_cvacc_test=0.517857
Testing alpha=0.3 | median_threshold=0.330000 | mean_cvacc_test=0.525000
Best log entry:
{'parameter': {'alpha': 0.3}, 'global_threshold': 0.33, 'median_threshold': 0.33, 'mean_cvacc_train': 0.528571, 'mean_cvacc_test': 0.525}
Accuracy log saved in /content/drive/MyDrive/Masterarbeit25/cdm_data/MathCONTA_v1/DS_LE_OL_OL2/CDD/MathCONTA_train_accuracylog_overleaf2combi.json


### Combined TEST SET EVALUATION

In [None]:
data_name = "MathCONTA_test"

# Pool the test-split generation data of every model in `model_ids` into one flat list.
# The loop variable is deliberately not named `model_id`: reusing that name silently overwrote
# the notebook-wide `model_id` setting with the last entry of `model_ids`.
# NOTE(review): `model` and `tokenizer` belong to the single model configured at the top of the
# notebook; passing them for every entry is presumably only safe while the data is loaded from
# existing files (force_reprocess=False) - confirm before regenerating anything here.
MathCONTA_cdd_data_combined = []
for combi_model_id in model_ids:
  MathCONTA_cdd_data = create_cdd_generation_data(
      model_id=combi_model_id,
      ds_conta=ds_conta,
      model=model,
      tokenizer=tokenizer,
      data_name=data_name,
      force_reprocess=False,
      max_new_tokens=200,
      sample_size=50,
      base_path_token=dir_token_path,
      verbose=False,
  )
  MathCONTA_cdd_data_combined += MathCONTA_cdd_data

# Total number of pooled entries (displayed as cell output).
len(MathCONTA_cdd_data_combined)

Loading existing cdd generation data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_cdd_generation_data/deepseek-ai/deepseek-math-7b-instruct/MathCONTA_test_sample50_max200_seed42.json
Loading existing cdd generation data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_cdd_generation_data/EleutherAI/llemma_7b/MathCONTA_test_sample50_max200_seed42.json
Loading existing cdd generation data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_cdd_generation_data/allenai/OLMo-7B-0724-Instruct-hf/MathCONTA_test_sample50_max200_seed42.json
Loading existing cdd generation data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_cdd_generation_data/allenai/OLMo-2-1124-13B-Instruct/MathCONTA_test_sample50_max200_seed42.json


120

In [None]:
#!Parameter
# parameter1 is used as `alpha` when building the CDD table below;
# theta is handed to evaluate_method_standard in the next cell.
parameter1 = 0.3
theta = 0.33
data_name = "MathCONTA_test"
exp_id_test = exp_id
# Results are reported under the short combined identifier.
model_id = model_id_combi

cdd_df = get_cdd_df_from_generation_data(
    MathCONTA_cdd_data_combined,
    alpha=parameter1,
)
len(cdd_df)

120

In [None]:
# Evaluate the CDD feature on the pooled test table with the fixed alpha/theta chosen above;
# 1000 bootstrap resamples with seed 42 are requested.
test_dict = evaluate_method_standard(
    cdd_df,
    metric_col=feature_col, theta=theta,
    model_id=model_id, data_name=data_name, method_name=method_name,
    parameter1=parameter1,
    n_bootstrap=1000, seed=42,
)
test_dict

{'model_id': 'DS_LE_OL_OL2',
 'data_name': 'MathCONTA_test',
 'datetime': '2025-04-12T18:34:41.239154',
 'method_name': 'CDD',
 'parameter': {'parameter1': 0.3, 'parameter2': None, 'theta': '0.33'},
 'metrics': {'accuracy': 0.48333333333333334,
  'accuracy_95CI': (0.39166666666666666, 0.5833333333333334),
  'precision': 0.375,
  'recall': 0.05,
  'f1_score': 0.08823529411764706,
  'confusion_matrix': [[55, 5], [57, 3]],
  'mcnemar_b': 55,
  'mcnemar_c': 57,
  'mcnemar_p_value': 0.9247750936572267},
 'n_bootstrap': 1000,
 'seed': 42}

In [None]:
# Store the pooled test-set result dictionary under the log base path.
save_accuracy_log(
    log_path_base=log_path_base,
    model_id=model_id,
    method_name=method_name,
    exp_id=exp_id_test,
    data_name=data_name,
    out_dict=test_dict,
)

Accuracy log saved in /content/drive/MyDrive/Masterarbeit25/cdm_data/MathCONTA_v1/DS_LE_OL_OL2/CDD/MathCONTA_test_accuracylog_overleaf2combi.json
