<a href="https://colab.research.google.com/github/friederrr/proof_contamination/blob/main/code/CDM_eval/CD_pipeline_CV_ContaTraces.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CD PIPELINE (with CV) ContaTraces

In [None]:
%%capture
!pip install datasets

In [None]:
import random
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import torch
import json
from datetime import datetime
import itertools
from itertools import cycle, product
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from pathlib import Path
import statistics
from sklearn.model_selection import StratifiedKFold

In [None]:
# Mount Google Drive at /content/drive; DRIVE_PATH (token data, accuracy logs)
# used throughout this notebook lives below this mount point.
# Colab-only: the `google.colab` package is not available in a plain Jupyter kernel.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from CDMs_functions_v1 import *

## Model Selection

**OLMO**:

- "allenai/OLMo-7B-0724-hf": Download (F32): (27 GB), GPU-RAM (bfloat16): 14 GB (T4 sufficient)

- "allenai/OLMo-7B-0724-SFT-hf": Download (BF16) (14 GB), GPU-RAM (14 GB)

- "allenai/OLMo-7B-0724-Instruct-hf": Download (BF16) (14 GB), GPU-RAM (14 GB)

- "allenai/OLMo-2-1124-13B-Instruct": Download (BF16) (28 GB), GPU-RAM (27 GB) -> works fine with A100!


**DEEPSEEK**:
- "deepseek-ai/deepseek-math-7b-instruct":

**LLEMMA**:
- "EleutherAI/llemma_7b"

In [None]:
#!Parameter
# Hugging Face model identifier of the model under test (one of the ids listed above).
# The paths printed by later cells show it is also used as the sub-directory name
# for token data and accuracy logs.
model_id = "allenai/OLMo-7B-0724-Instruct-hf"
# Project root on the mounted Google Drive; all token data and logs are read/written below it.
DRIVE_PATH=Path('/content/drive/MyDrive/Masterarbeit25/')

In [None]:
# Directory holding the pre-computed token data files (one sub-directory per model id).
dir_token_path=DRIVE_PATH / "MathCONTA_tokens"
# List the token files already available for `model_id`; the return value is bound to `_`
# so that only the printed listing appears as cell output.
_=list_files_in_directory(dir_token_path, model_id)

Files in /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/allenai/OLMo-7B-0724-Instruct-hf:

OLMo-7B-0724-Instruct-hf/MathCONTA_token_data.json
OLMo-7B-0724-Instruct-hf/MathCONTA_train_token_data.json
OLMo-7B-0724-Instruct-hf/MathCONTA_test_token_data.json


In [None]:
# Toggle for loading the LLM and its tokenizer.
# If the token-data file already exists you don't have to load the models,
# so this can stay False.
load_models = False

# Start from the "nothing loaded" state and only fill in what the toggle asks for.
model, tokenizer = None, None
if not load_models:
  ds_conta = None
else:
  # bfloat16 weights, placed automatically on the available device(s).
  model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map='auto')
  tokenizer = AutoTokenizer.from_pretrained(model_id)

## DATA Processing/Loading

In [None]:
#!Parameter
# Hugging Face Hub dataset repository, configuration and split to load.
repo_id = "Tobstar001/MathCONTA"
config_name = "core"
split = "test"

# Load the MathCONTA dataset; it is passed to the token-data helper in the cells below.
ds_conta = load_dataset(
    path=repo_id,
    name=config_name,
    split=split,
)

In [None]:
# Token data for the full MathCONTA set (no `data_name` given, so the helper's default is used).
# The recorded output shows an existing token file being loaded; with model/tokenizer set to
# None this cell presumably depends on that file being present -- TODO confirm in the helper.
MathCONTA_token_data = create_mathconta_token_data(
    model_id=model_id,
    ds_conta=ds_conta,
    model=model,
    tokenizer=tokenizer,
    only_problem=False,
    force_reprocess=False,
    base_path_token=dir_token_path,
)

Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/deepseek-ai/deepseek-math-7b-instruct/MathCONTA_token_data.json


## Train Test Split (Only relevant once)
A stratified train/test split was applied after the fact to the data stored on Google Drive.
For new data, please filter into train/test before generating the token data.
This has already been done for minK!

## TRAIN SET CV

In [None]:
# Token data of the TRAIN split for the selected model (used for cross-validated tuning).
data_name = "MathCONTA_train"
MathCONTA_token_data = create_mathconta_token_data(
    model_id=model_id,
    ds_conta=ds_conta,
    model=model,
    tokenizer=tokenizer,
    data_name=data_name,
    only_problem=False,
    force_reprocess=False,
    base_path_token=dir_token_path,
)

Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/allenai/OLMo-7B-0724-Instruct-hf/MathCONTA_train_token_data.json


In [None]:
#!Parameter 2D
# --- Inputs for the cross-validated tuning run ---
token_data = MathCONTA_token_data
data_name = "MathCONTA_full_train"   # name under which the tuning log is stored
label_col = "LABEL_BINARY"

# Curve fits to compare and the feature column(s) each fit provides
# (two columns for the exponential fit, a single column for the linear fit).
fit_range = ["exponential", "linear"]
feature_cols = {"exponential": ("A_value", "B_value"), "linear": "m_value"}

# Cross-validation setup.
cv_folds = 5
cv_seed = 42

# Logging: base directory, experiment id and method name.
log_path_base = DRIVE_PATH / "cdm_data" / "MathCONTA_v1"
exp_id = "overleaf2"
method_name = "ContaTraces"


In [None]:
# Run the cross-validated threshold tuning over every fit in `fit_range`;
# per the recorded output, the best entry is printed and an accuracy log is saved.
out_dict = tune_ContaTraces(
    tokendata=token_data,
    feature_cols=feature_cols,
    label_col=label_col,
    cv_folds=cv_folds,
    cv_seed=cv_seed,
    fit_range=fit_range,
    model_id=model_id,
    data_name=data_name,
    method_name=method_name,
    log_path_base=log_path_base,
    exp_id=exp_id,
)

Testing fit=exponential | median_threshold_1=2.353882 | median_threshold_2=0.012741 | mean_cvacc_test=0.557143
Testing fit=linear | median_threshold_1=-0.007425 | median_threshold_2=N/A | mean_cvacc_test=0.428571
Best log entry:
{'parameter': {'fit': 'exponential'}, 'median_threshold_1': 2.353882, 'median_threshold_2': 0.012741, 'global_threshold': (2.353882, 0.012741), 'mean_cvacc_train': 0.625, 'mean_cvacc_test': 0.557143}
Accuracy log saved in /content/drive/MyDrive/Masterarbeit25/cdm_data/MathCONTA_v1/allenai/OLMo-7B-0724-Instruct-hf/ContaTraces/MathCONTA_full_train_accuracylog_overleaf2.json


## TEST SET EVALUATION

In [None]:
# Token data of the held-out TEST split for the selected model.
data_name = "MathCONTA_test"
MathCONTA_token_data = create_mathconta_token_data(
    model_id=model_id,
    ds_conta=ds_conta,
    model=model,
    tokenizer=tokenizer,
    data_name=data_name,
    only_problem=False,
    force_reprocess=False,
    base_path_token=dir_token_path,
)

Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/allenai/OLMo-7B-0724-Instruct-hf/MathCONTA_test_token_data.json


In [None]:
#!Parameter
# Fit type and threshold(s) selected by the cross-validated tuning on the train split
# (copied by hand from the "Best log entry" printed by the tuning cell).
parameter1 = 'exponential'
theta=(2.353882, 0.012741) # for the exponential fit theta is a tuple, e.g. theta=(1.89,0.017)

# Reuse the tuning experiment id for the test log; `method_name` and `log_path_base`
# are taken unchanged from the tuning parameter cell (no re-assignment needed).
exp_id_test=exp_id
# Feature column(s) that belong to the chosen fit.
metric_col=feature_cols[parameter1]

# Build the ContaTraces feature DataFrame from the test token data.
Conta_traces_df=get_ContaTraces_df_from_tokendata(MathCONTA_token_data,fit=parameter1)

len(Conta_traces_df)

30

In [None]:
# Evaluate the tuned threshold(s) `theta` on the test DataFrame built in the previous cell.
# Per the recorded output, the returned dict holds run metadata plus a 'metrics' entry
# (accuracy with a 95% CI, precision, recall, F1, confusion matrix, McNemar statistics).
test_dict=evaluate_method_standard(
    Conta_traces_df,
    metric_col=metric_col,
    theta=theta,
    model_id=model_id,
    data_name=data_name,
    method_name=method_name,
    parameter1=parameter1,
    n_bootstrap=1000,  # number of bootstrap resamples (presumably for the accuracy CI -- TODO confirm)
    seed=42  # fixed seed so the evaluation is reproducible
)
# Last expression: display the result dict as the cell output.
test_dict

{'model_id': 'allenai/OLMo-7B-0724-Instruct-hf',
 'data_name': 'MathCONTA_test',
 'datetime': '2025-04-11T13:50:57.619328',
 'method_name': 'ContaTraces',
 'parameter': {'parameter1': 'exponential',
  'parameter2': None,
  'theta': '(2.353882, 0.012741)'},
 'metrics': {'accuracy': 0.3,
  'accuracy_95CI': (0.16666666666666666, 0.4666666666666667),
  'precision': 0.2,
  'recall': 0.13333333333333333,
  'f1_score': 0.16,
  'confusion_matrix': [[7, 8], [13, 2]],
  'mcnemar_b': 7,
  'mcnemar_c': 13,
  'mcnemar_p_value': 0.26317596435546875},
 'n_bootstrap': 1000,
 'seed': 42}

In [None]:
# Persist the single-model test results; the helper prints the path of the written log file.
save_accuracy_log(
    log_path_base=log_path_base,
    model_id=model_id,
    method_name=method_name,
    exp_id=exp_id_test,
    data_name=data_name,
    out_dict=test_dict,
)

Accuracy log saved in /content/drive/MyDrive/Masterarbeit25/cdm_data/MathCONTA_v1/allenai/OLMo-7B-0724-Instruct-hf/ContaTraces/MathCONTA_test_accuracylog_overleaf2.json


## RUN END2END

In [None]:
# One-shot pipeline run with a fixed linear-fit threshold (theta2 is not used here, hence None).
# The call is the last expression, so the returned metrics are shown as the cell output.
run_ContaTraces(
    model_id=model_id,
    ds_conta=ds_conta,
    model=model,
    tokenizer=tokenizer,
    only_problem=False,
    force_reprocess=False,
    dir_token_path=dir_token_path,
    fit="linear",
    theta1=-0.006,
    theta2=None,
)

Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/deepseek-ai/deepseek-math-7b-instruct/MathCONTA_token_data.json


{'accuracy': 0.61,
 'precision': 0.5797101449275363,
 'recall': 0.8,
 'f1_score': 0.6722689075630253,
 'confusion_matrix': array([[21, 29],
        [10, 40]])}

## Combined Evaluation across LLMs

**Prerequisite:** Train/test splits are already stored on Drive for the relevant models.

In [None]:
# Models whose token data is pooled for the combined evaluation.
model_ids = [
    "deepseek-ai/deepseek-math-7b-instruct",
    "EleutherAI/llemma_7b",
    "allenai/OLMo-7B-0724-Instruct-hf",
    "allenai/OLMo-2-1124-13B-Instruct",
]

### Combi CV TRAIN

In [None]:
data_name="MathCONTA_train"

# Pool the TRAIN token data of every model in `model_ids` into one list.
# The loop variable is deliberately not named `model_id`: that would overwrite the
# single-model `model_id` parameter and silently change what the single-model cells
# do when they are re-run.
MathCONTA_token_data_combined = []
for combi_model_id in model_ids:
  MathCONTA_token_data=create_mathconta_token_data(model_id=combi_model_id,
                                                 ds_conta=ds_conta,
                                                 model=model, tokenizer=tokenizer,data_name=data_name,
                                                 only_problem=False, force_reprocess=False,
                                                 base_path_token=dir_token_path)
  MathCONTA_token_data_combined += MathCONTA_token_data

# NOTE: after the loop `MathCONTA_token_data` holds only the LAST model's data;
# use `MathCONTA_token_data_combined` for anything that should cover all models.
len(MathCONTA_token_data_combined)

Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/deepseek-ai/deepseek-math-7b-instruct/MathCONTA_train_token_data.json
Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/EleutherAI/llemma_7b/MathCONTA_train_token_data.json
Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/allenai/OLMo-7B-0724-Instruct-hf/MathCONTA_train_token_data.json
Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/allenai/OLMo-2-1124-13B-Instruct/MathCONTA_train_token_data.json


280

In [None]:
# Label under which results pooled over all models in `model_ids` are logged.
model_id_combi = "DS_LE_OL_OL2"

#!Parameter 2D
# --- Inputs for the cross-validated tuning run on the pooled data ---
token_data = MathCONTA_token_data_combined
data_name = "MathCONTA_full_train"   # name under which the tuning log is stored
label_col = "LABEL_BINARY"

# Curve fits to compare and the feature column(s) each fit provides.
fit_range = ["exponential", "linear"]
feature_cols = {"exponential": ("A_value", "B_value"), "linear": "m_value"}

# Cross-validation setup.
cv_folds = 5
cv_seed = 42

# Logging: base directory, experiment id and method name.
log_path_base = DRIVE_PATH / "cdm_data" / "MathCONTA_v1"
exp_id = "overleaf2combi"
method_name = "ContaTraces"


In [None]:
# Cross-validated threshold tuning on the pooled train data, logged under `model_id_combi`.
out_dict = tune_ContaTraces(
    tokendata=token_data,
    feature_cols=feature_cols,
    label_col=label_col,
    cv_folds=cv_folds,
    cv_seed=cv_seed,
    fit_range=fit_range,
    model_id=model_id_combi,
    data_name=data_name,
    method_name=method_name,
    log_path_base=log_path_base,
    exp_id=exp_id,
)

Testing fit=exponential | median_threshold_1=1.688216 | median_threshold_2=0.017004 | mean_cvacc_test=0.557143
Testing fit=linear | median_threshold_1=-0.005402 | median_threshold_2=N/A | mean_cvacc_test=0.525000
Best log entry:
{'parameter': {'fit': 'exponential'}, 'median_threshold_1': 1.688216, 'median_threshold_2': 0.017004, 'global_threshold': (1.688216, 0.017004), 'mean_cvacc_train': 0.5875, 'mean_cvacc_test': 0.557143}
Accuracy log saved in /content/drive/MyDrive/Masterarbeit25/cdm_data/MathCONTA_v1/DS_LE_OL_OL2/ContaTraces/MathCONTA_full_train_accuracylog_overleaf2combi.json


### Combi TEST

In [None]:
data_name="MathCONTA_test"
MathCONTA_token_data_combined = []

# Pool the TEST token data of every model in `model_ids` into one list.
# The loop variable is deliberately not named `model_id`: that would overwrite the
# single-model `model_id` parameter and silently change what the single-model cells
# do when they are re-run.
for combi_model_id in model_ids:
    MathCONTA_token_data=create_mathconta_token_data(model_id=combi_model_id,
                                                  ds_conta=ds_conta,
                                                  model=model, tokenizer=tokenizer,data_name=data_name,
                                                  only_problem=False, force_reprocess=False,
                                                  base_path_token=dir_token_path)
    MathCONTA_token_data_combined += MathCONTA_token_data

# NOTE: after the loop `MathCONTA_token_data` holds only the LAST model's data;
# use `MathCONTA_token_data_combined` for anything that should cover all models.
len(MathCONTA_token_data_combined)

Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/deepseek-ai/deepseek-math-7b-instruct/MathCONTA_test_token_data.json
Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/EleutherAI/llemma_7b/MathCONTA_test_token_data.json
Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/allenai/OLMo-7B-0724-Instruct-hf/MathCONTA_test_token_data.json
Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/allenai/OLMo-2-1124-13B-Instruct/MathCONTA_test_token_data.json


120

In [None]:
#!Parameter
# Fit type and threshold(s) selected by the cross-validated tuning on the pooled train data
# (copied by hand from the "Best log entry" printed by the combined tuning cell).
parameter1 = 'exponential'
theta=(1.688216, 0.017004) # for the exponential fit theta is a tuple, e.g. theta=(1.89,0.017)

# Reuse the tuning experiment id for the test log; `method_name` and `log_path_base`
# are taken unchanged from the combined tuning parameter cell.
exp_id_test=exp_id
# Feature column(s) that belong to the chosen fit.
metric_col=feature_cols[parameter1]

# Build the feature DataFrame from the POOLED test token data of all models.
# FIX: this previously used `MathCONTA_token_data`, which after the loading loop holds only
# the last model's data (30 entries instead of the 120 pooled ones), so the "combined"
# evaluation silently ran on a single model. Recorded outputs from before this fix
# (row count 30 and the metrics derived from it) are stale until the cells are re-run.
Conta_traces_df=get_ContaTraces_df_from_tokendata(MathCONTA_token_data_combined,fit=parameter1)

len(Conta_traces_df)

30

In [None]:
# Evaluate the tuned threshold(s) `theta` on the DataFrame built in the previous cell;
# the result is labelled with `model_id_combi` rather than a single model id.
test_dict=evaluate_method_standard(
    Conta_traces_df,
    metric_col=metric_col,
    theta=theta,
    model_id=model_id_combi,
    data_name=data_name,
    method_name=method_name,
    parameter1=parameter1,
    n_bootstrap=1000,  # number of bootstrap resamples (presumably for the accuracy CI -- TODO confirm)
    seed=42  # fixed seed so the evaluation is reproducible
)
# Last expression: display the result dict as the cell output.
test_dict

{'model_id': 'DS_LE_OL_OL2',
 'data_name': 'MathCONTA_test',
 'datetime': '2025-04-12T17:38:36.934938',
 'method_name': 'ContaTraces',
 'parameter': {'parameter1': 'exponential',
  'parameter2': None,
  'theta': '(1.688216, 0.017004)'},
 'metrics': {'accuracy': 0.5,
  'accuracy_95CI': (0.3333333333333333, 0.6666666666666666),
  'precision': 0.5,
  'recall': 0.2,
  'f1_score': 0.2857142857142857,
  'confusion_matrix': [[12, 3], [12, 3]],
  'mcnemar_b': 12,
  'mcnemar_c': 12,
  'mcnemar_p_value': 1.0},
 'n_bootstrap': 1000,
 'seed': 42}

In [None]:
# Persist the combined test results under `model_id_combi`; the helper prints the log path.
save_accuracy_log(
    log_path_base=log_path_base,
    model_id=model_id_combi,
    method_name=method_name,
    exp_id=exp_id_test,
    data_name=data_name,
    out_dict=test_dict,
)

Accuracy log saved in /content/drive/MyDrive/Masterarbeit25/cdm_data/MathCONTA_v1/DS_LE_OL_OL2/ContaTraces/MathCONTA_test_accuracylog_overleaf2combi.json
