<a href="https://colab.research.google.com/github/friederrr/proof_contamination/blob/main/code/CDM_eval/CD_pipeline_CV_minK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CD PIPELINE (with CV) minK

In [None]:
%%capture
!pip install datasets

In [None]:
import random
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import torch
import json
from datetime import datetime
import itertools
from itertools import cycle
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from pathlib import Path
import statistics
from sklearn.model_selection import StratifiedKFold

In [None]:
# Mount Google Drive at /content/drive (Colab only); DRIVE_PATH below points into this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from CDMs_functions_v1 import *

## Model Selection for minK

**OLMO**:

- "allenai/OLMo-7B-0724-hf": Download (F32): (27 GB), GPU-RAM (bfloat16): 14 GB (T4 sufficient)

- "allenai/OLMo-7B-0724-SFT-hf": Download (BF16) (14 GB), GPU-RAM (14 GB)

- "allenai/OLMo-7B-0724-Instruct-hf": Download (BF16) (14 GB), GPU-RAM (14 GB)

- "allenai/OLMo-2-1124-13B-Instruct": Download (BF16) (28 GB), GPU-RAM (27 GB) -> works fine with A100!

- "allenai/OLMo-2-0325-32B-Instruct": Download (BF16) (65 GB), GPU-RAM (34 GB). With an A100 it needs about 15 min to answer one question (meta-device offloading). Apart from that, the pipeline is compatible, but running this model on an A100 is not recommended.

**DEEPSEEK**:
- "deepseek-ai/deepseek-math-7b-instruct":

**LLEMMA**:
- "EleutherAI/llemma_7b"

In [None]:
#!Parameter
# Hugging Face id of the model under evaluation.
model_id = "allenai/OLMo-2-1124-13B-Instruct"
# Name of the detection method; forwarded to the tuning, evaluation and logging calls.
method_name = "minK"
# Project root on the mounted Google Drive.
DRIVE_PATH = Path('/content/drive/MyDrive/Masterarbeit25/')

In [None]:
# Folder on Drive under which the per-model token files are stored.
dir_token_path = DRIVE_PATH / "MathCONTA_tokens"

# The return value is intentionally discarded (bound to `_`).
_ = list_files_in_directory(
    dir_token_path,
    model_id,
)

/content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/allenai/OLMo-2-1124-13B-Instruct is not a valid directory.


In [None]:
# When the token file already exists, the model does not have to be loaded.
# Switch to True only if the tokens must be computed.
load_models = False

if not load_models:
    # Placeholders so that the cells below can still pass these names along.
    model = None
    tokenizer = None
    ds_conta = None
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map='auto',
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)

## DATA Processing/Loading MathCONTA

In [None]:
#!Parameter
repo_id = "Tobstar001/MathCONTA"
split = "test"
config_name = "core"

# Load the selected config and split of the dataset.
ds_conta = load_dataset(
    path=repo_id,
    name=config_name,
    split=split,
)

In [None]:
# Token data for the complete MathCONTA set, stored below `dir_token_path`.
data_name = "MathCONTA"
MathCONTA_token_data = create_mathconta_token_data(
    model_id=model_id,
    ds_conta=ds_conta,
    model=model,
    tokenizer=tokenizer,
    data_name=data_name,
    only_problem=False,
    force_reprocess=False,
    base_path_token=dir_token_path,
)

Processing token data from scratch...
Processed token data saved to: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/allenai/OLMo-2-1124-13B-Instruct/MathCONTA_token_data.json


## Train Test Split (Only relevant once)
Retroactive stratified train/test split of the token data that is already stored on Google Drive.
For new data, please split into train/test before generating the data.

In [None]:
# 70/30 split with a fixed seed so the partition is reproducible.
train, test = stratified_dict_split(
    MathCONTA_token_data,
    test_ratio=0.3,
    seed=42,
)

Train size: 70, Test size: 30
{'owm-word-9', 'owm-amc8-17', 'owm-word-2', 'owm-word-10', 'clean-forum-9', 'owm-forum-10', 'clean-amc8-14', 'clean-forum-6', 'owm-forum-5', 'owm-aime-3', 'owm-amc8-15', 'clean-amc8-1', 'owm-word-5', 'clean-aime-3', 'clean-amc8-18', 'clean-word-7', 'clean-aime-7', 'owm-forum-3', 'clean-amc8-10', 'clean-amc8-3', 'owm-aime-7', 'clean-aime-4', 'owm-amc8-10', 'clean-word-9', 'clean-forum-7', 'owm-amc8-9', 'owm-aime-4', 'clean-word-10', 'clean-word-12', 'owm-amc8-12'}


In [None]:

# Persist both partitions next to the model's other token files.
file_path_train = dir_token_path / model_id / "MathCONTA_train_token_data.json"
file_path_test = dir_token_path / model_id / "MathCONTA_test_token_data.json"

for split_path, split_data in ((file_path_train, train), (file_path_test, test)):
    with open(split_path, 'w') as f:
        json.dump(split_data, f)

## TRAIN SET CV
Find best parameters - CV

In [None]:
# Optional (currently disabled) filter: keep only the entries of one CATEGORY.
#category='AMC8'
#MathCONTA_token_f = [entry for entry in MathCONTA_token_data if entry.get('CATEGORY') == category]
#len(MathCONTA_token_f)

36

In [None]:
# Token data of the train split; from here on MathCONTA_token_data holds the train split only.
data_name = "MathCONTA_train"
MathCONTA_token_data = create_mathconta_token_data(
    model_id=model_id,
    ds_conta=ds_conta,
    model=model,
    tokenizer=tokenizer,
    data_name=data_name,
    only_problem=False,
    force_reprocess=False,
    base_path_token=dir_token_path,
)

Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/allenai/OLMo-2-1124-13B-Instruct/MathCONTA_train_token_data.json


In [None]:
#!Parameter
# Data and column names used by the cross-validation.
token_data = MathCONTA_token_data
data_name = "MathCONTA_train"
feature_col = "minK_value"
label_col = "LABEL_BINARY"

# Search grid and CV setup.
k_range = [5, 10, 20, 30]
cv_folds = 5
cv_seed = 42

# Where the accuracy log is written and under which experiment id.
log_path_base = DRIVE_PATH / "cdm_data" / "MathCONTA_v1"
exp_id = "overleaf2"

In [None]:
# Cross-validated search over k_range on the train split.
out_dict = tune_minK(
    token_data,
    feature_col,
    label_col,
    k_range,
    cv_folds,
    model_id,
    data_name,
    cv_seed,
    method_name,
    log_path_base,
    exp_id,
)

Testing k=5 | median_threshold=0.000163 | mean_cvacc_test=0.685714
Testing k=10 | median_threshold=0.000708 | mean_cvacc_test=0.542857
Testing k=20 | median_threshold=0.004856 | mean_cvacc_test=0.528571
Testing k=30 | median_threshold=0.017989 | mean_cvacc_test=0.557143
Best log entry:
{'parameter': {'k': 5}, 'global_threshold': 0.000163, 'median_threshold': 0.000163, 'mean_cvacc_train': 0.685714, 'mean_cvacc_test': 0.685714}
Accuracy log saved in /content/drive/MyDrive/Masterarbeit25/cdm_data/MathCONTA_v1/allenai/OLMo-2-1124-13B-Instruct/minK/MathCONTA_train_accuracylog_overleaf2.json


## TEST SET EVALUATION

In [None]:
# Token data of the held-out test split; MathCONTA_token_data now holds the test split.
data_name = "MathCONTA_test"
MathCONTA_token_data = create_mathconta_token_data(
    model_id=model_id,
    ds_conta=ds_conta,
    model=model,
    tokenizer=tokenizer,
    data_name=data_name,
    only_problem=False,
    force_reprocess=False,
    base_path_token=dir_token_path,
)

Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/allenai/OLMo-2-1124-13B-Instruct/MathCONTA_test_token_data.json


In [None]:
#!parameter
# k and threshold are copied by hand from the "Best log entry" printed by the
# tuning cell above (k=5, global_threshold=0.000163); update both after re-tuning.
parameter1 = 5
theta = 0.000163
data_name = "MathCONTA_test"
exp_id_test = exp_id
metric_col = feature_col
# method_name and log_path_base are reused unchanged from the cells above;
# the former no-op self-assignments (x = x) were dropped.

# Build the minK frame for the test split with the selected k.
minK_df = get_minK_df_from_tokendata(MathCONTA_token_data, k=parameter1)
len(minK_df)

30

In [None]:
# Evaluate the fixed threshold on the test split.
test_dict = evaluate_method_standard(
    minK_df,
    model_id=model_id,
    data_name=data_name,
    method_name=method_name,
    metric_col=feature_col,
    parameter1=parameter1,
    theta=theta,
    n_bootstrap=1000,
    seed=42,
)
test_dict

{'model_id': 'allenai/OLMo-2-1124-13B-Instruct',
 'data_name': 'MathCONTA_test',
 'datetime': '2025-04-11T12:08:06.599004',
 'method_name': 'minK',
 'parameter': {'parameter1': 5, 'parameter2': None, 'theta': '0.000163'},
 'metrics': {'accuracy': 0.5333333333333333,
  'accuracy_95CI': (0.36666666666666664, 0.7333333333333333),
  'precision': 0.5384615384615384,
  'recall': 0.4666666666666667,
  'f1_score': 0.5,
  'confusion_matrix': [[9, 6], [8, 7]],
  'mcnemar_b': 9,
  'mcnemar_c': 8,
  'mcnemar_p_value': 1.0},
 'n_bootstrap': 1000,
 'seed': 42}

In [None]:
# Store the test metrics under the same experiment id as the CV run.
save_accuracy_log(
    log_path_base=log_path_base,
    model_id=model_id,
    method_name=method_name,
    exp_id=exp_id_test,
    data_name=data_name,
    out_dict=test_dict,
)

Accuracy log saved in /content/drive/MyDrive/Masterarbeit25/cdm_data/MathCONTA_v1/allenai/OLMo-2-1124-13B-Instruct/minK/MathCONTA_test_accuracylog_overleaf2.json


## RUN END2END

In [None]:
# One-call variant of the pipeline with explicitly chosen k and theta.
run_minK(
    model_id=model_id,
    ds_conta=ds_conta,
    model=model,
    tokenizer=tokenizer,
    only_problem=False,
    force_reprocess=False,
    dir_token_path=dir_token_path,
    k=5,
    theta=0.0002,
)



Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/deepseek-ai/deepseek-math-7b-instruct/MathCONTA_token_data.json


{'accuracy': 0.63,
 'precision': 0.6226415094339622,
 'recall': 0.66,
 'f1_score': 0.6407766990291263}

## Combined Evaluation across LLMs

**Prerequisite:** the train/test splits are already stored on Drive for the relevant models.

In [None]:
# Models whose token data is pooled for the combined evaluation.
model_ids = [
    "deepseek-ai/deepseek-math-7b-instruct",
    "EleutherAI/llemma_7b",
    "allenai/OLMo-7B-0724-Instruct-hf",
    "allenai/OLMo-2-1124-13B-Instruct",
]
# Short tag for the pooled model set (used in place of a model id when logging).
model_id_combi = "DS_LE_OL_OL2"

### Combi CV TRAIN

In [None]:
method_name = "minK"
data_name = "MathCONTA_train"
feature_col = "minK_value"
label_col = "LABEL_BINARY"

# Pool the train-split token data of every model in `model_ids`.
# A dedicated loop variable is used on purpose: looping with `model_id` would
# silently overwrite the global `model_id` parameter (and MathCONTA_token_data)
# for every cell executed afterwards.
# NOTE(review): the same `model`/`tokenizer` objects are passed for every model id;
# this presumably relies on the token files already being stored - confirm.
MathCONTA_token_data_combined = []
for combi_model_id in model_ids:
    MathCONTA_token_data_combined += create_mathconta_token_data(
        model_id=combi_model_id,
        ds_conta=ds_conta,
        model=model,
        tokenizer=tokenizer,
        data_name=data_name,
        only_problem=False,
        force_reprocess=False,
        base_path_token=dir_token_path,
    )

len(MathCONTA_token_data_combined)

Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/deepseek-ai/deepseek-math-7b-instruct/MathCONTA_train_token_data.json
Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/EleutherAI/llemma_7b/MathCONTA_train_token_data.json
Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/allenai/OLMo-7B-0724-Instruct-hf/MathCONTA_train_token_data.json
Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/allenai/OLMo-2-1124-13B-Instruct/MathCONTA_train_token_data.json


280

In [None]:

#!Parameter
# Data and column names used by the combined cross-validation.
token_data = MathCONTA_token_data_combined
data_name = "MathCONTA_train"
feature_col = "minK_value"
label_col = "LABEL_BINARY"

# Search grid and CV setup.
k_range = [5, 10, 20, 30]
cv_folds = 5
cv_seed = 42

# Where the accuracy log is written and under which experiment id.
log_path_base = DRIVE_PATH / "cdm_data" / "MathCONTA_v1"
exp_id = "overleaf2combi"

In [None]:
# Cross-validated search over k_range on the pooled train data,
# logged under the combined model tag.
out_dict = tune_minK(
    token_data,
    feature_col,
    label_col,
    k_range,
    cv_folds,
    model_id_combi,
    data_name,
    cv_seed,
    method_name,
    log_path_base,
    exp_id,
)

Testing k=5 | median_threshold=0.000165 | mean_cvacc_test=0.550000
Testing k=10 | median_threshold=0.002847 | mean_cvacc_test=0.535714
Testing k=20 | median_threshold=0.017388 | mean_cvacc_test=0.550000
Testing k=30 | median_threshold=0.069660 | mean_cvacc_test=0.546429
Best log entry:
{'parameter': {'k': 5}, 'global_threshold': 0.000164, 'median_threshold': 0.000165, 'mean_cvacc_train': 0.583929, 'mean_cvacc_test': 0.55}
Accuracy log saved in /content/drive/MyDrive/Masterarbeit25/cdm_data/MathCONTA_v1/DS_LE_OL_OL2/minK/MathCONTA_train_accuracylog_overleaf2combi.json


### Combi TEST

In [None]:
data_name = "MathCONTA_test"

# Pool the test-split token data of every model in `model_ids`.
# A dedicated loop variable is used on purpose: looping with `model_id` would
# silently overwrite the global `model_id` parameter (and MathCONTA_token_data)
# for every cell executed afterwards.
# NOTE(review): the same `model`/`tokenizer` objects are passed for every model id;
# this presumably relies on the token files already being stored - confirm.
MathCONTA_token_data_combined = []
for combi_model_id in model_ids:
    MathCONTA_token_data_combined += create_mathconta_token_data(
        model_id=combi_model_id,
        ds_conta=ds_conta,
        model=model,
        tokenizer=tokenizer,
        data_name=data_name,
        only_problem=False,
        force_reprocess=False,
        base_path_token=dir_token_path,
    )

len(MathCONTA_token_data_combined)

Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/deepseek-ai/deepseek-math-7b-instruct/MathCONTA_test_token_data.json
Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/EleutherAI/llemma_7b/MathCONTA_test_token_data.json
Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/allenai/OLMo-7B-0724-Instruct-hf/MathCONTA_test_token_data.json
Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/allenai/OLMo-2-1124-13B-Instruct/MathCONTA_test_token_data.json


120

In [None]:
#!parameter
# k and threshold are copied by hand from the "Best log entry" printed by the
# combined tuning cell (k=5, global_threshold=0.000164); update both after re-tuning.
parameter1 = 5
theta = 0.000164
data_name = "MathCONTA_test"

exp_id_test = exp_id
metric_col = feature_col
# method_name and log_path_base are reused unchanged from the cells above;
# the former no-op self-assignments (x = x) were dropped.

# Build the minK frame for the pooled test data with the selected k.
minK_df = get_minK_df_from_tokendata(MathCONTA_token_data_combined, k=parameter1)
len(minK_df)

120

In [None]:
# Evaluate the fixed threshold on the pooled test data under the combined model tag.
test_dict = evaluate_method_standard(
    minK_df,
    model_id=model_id_combi,
    data_name=data_name,
    method_name=method_name,
    metric_col=feature_col,
    parameter1=parameter1,
    theta=theta,
    n_bootstrap=1000,
    seed=42,
)
test_dict

{'model_id': 'DS_LE_OL_OL2',
 'data_name': 'MathCONTA_test',
 'datetime': '2025-04-12T17:24:12.689370',
 'method_name': 'minK',
 'parameter': {'parameter1': 5, 'parameter2': None, 'theta': '0.000164'},
 'metrics': {'accuracy': 0.5583333333333333,
  'accuracy_95CI': (0.475, 0.6418749999999999),
  'precision': 0.5432098765432098,
  'recall': 0.7333333333333333,
  'f1_score': 0.624113475177305,
  'confusion_matrix': [[23, 37], [16, 44]],
  'mcnemar_b': 23,
  'mcnemar_c': 16,
  'mcnemar_p_value': 0.3367836351899315},
 'n_bootstrap': 1000,
 'seed': 42}

In [None]:
# Store the combined test metrics under the combined model tag.
save_accuracy_log(
    log_path_base=log_path_base,
    model_id=model_id_combi,
    method_name=method_name,
    exp_id=exp_id_test,
    data_name=data_name,
    out_dict=test_dict,
)

Accuracy log saved in /content/drive/MyDrive/Masterarbeit25/cdm_data/MathCONTA_v1/DS_LE_OL_OL2/minK/MathCONTA_test_accuracylog_overleaf2combi.json
