<a href="https://colab.research.google.com/github/friederrr/proof_contamination/blob/main/code/CDM_eval/CD_pipeline_CV_ngram_loglike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CD PIPELINE (with CV): n-gram log-likelihood

In [None]:
%%capture
# Install the Hugging Face `datasets` package (it provides `load_dataset`, imported below); the %%capture magic above hides pip's output.
!pip install datasets

In [None]:
# Mount Google Drive at /content/drive; DRIVE_PATH (defined below) points into this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import random
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import torch
import json
from datetime import datetime
import itertools
from itertools import cycle, product
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from pathlib import Path
import statistics
from sklearn.model_selection import StratifiedKFold

In [None]:
from CDMs_functions_v1 import *

## Model Selection

**OLMO**:

- "allenai/OLMo-7B-0724-hf": Download (F32): (27 GB), GPU-RAM (bfloat16): 14 GB (T4 sufficient)

- "allenai/OLMo-7B-0724-SFT-hf": Download (BF16) (14 GB), GPU-RAM (14 GB)

- "allenai/OLMo-7B-0724-Instruct-hf": Download (BF16) (14 GB), GPU-RAM (14 GB)

- "allenai/OLMo-2-1124-13B-Instruct": Download (BF16) (28 GB), GPU-RAM (27 GB) -> works fine with A100!

- "allenai/OLMo-2-0325-32B-Instruct": Download (BF16) (65 GB), GPU-RAM (34 GB). On an A100 it needs about 15 min to answer one question (meta-device offloading). Apart from that, the pipeline is compatible with this model, but running it on an A100 is not recommended.

**DEEPSEEK**:
- "deepseek-ai/deepseek-math-7b-instruct":

**LLEMMA**:
- "EleutherAI/llemma_7b"

In [None]:
#!Parameter
# Root folder on the mounted Google Drive that holds the pipeline's cached data and logs.
DRIVE_PATH = Path('/content/drive/MyDrive/Masterarbeit25/')
# Hugging Face id of the model evaluated in the single-model part of this notebook.
model_id = "deepseek-ai/deepseek-math-7b-instruct"

**COMMENT**: With respect to model access, ngram_loglike only depends on the token data. Therefore, we list the available token_data files here.

In [None]:
# Folder with the per-model token data; list what is available for the selected model.
dir_token_path = DRIVE_PATH.joinpath("MathCONTA_tokens")
# The return value is bound to `_` so that the cell shows only the printed listing.
_ = list_files_in_directory(dir_token_path, model_id)

Files in /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/deepseek-ai/deepseek-math-7b-instruct:

deepseek-math-7b-instruct/MathCONTA_token_data.json


In [None]:
# If the cached data file already exists, the model itself does not have to be loaded.
load_models = False

model = (
    AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map='auto')
    if load_models
    else None
)
# The tokenizer is loaded in either case (it is needed for counting).
tokenizer = AutoTokenizer.from_pretrained(model_id)
if not load_models:
    ds_conta = None

## DATA Processing/Loading (Standard)

In [None]:
#!Parameter
# Hugging Face coordinates of the MathCONTA dataset: repository, configuration and split.
repo_id = "Tobstar001/MathCONTA"
config_name = "core"
split = "test"

ds_conta = load_dataset(path=repo_id, name=config_name, split=split)


In [None]:
# Name under which the generation data is cached (please add the suffix full | problem).
data_name = "MathCONTA_full"
only_problem = False

# n-gram sampling settings
n_range = [1, 2, 3, 5, 7]
n_starts = 20
start_offset = 5
random_starts = True
seed_rand = 42

# Execution flags
force_reprocess = False
verbose = False

In [None]:
# Build the n-gram log-likelihood generation data; the printed log shows that an
# existing file on Drive is loaded when one is found.
ngram_loglike_data = create_ngram_loglike_generation_data(
    model_id=model_id,
    ds_conta=ds_conta,
    model=model,
    tokenizer=tokenizer,
    n_starts=n_starts,
    n=n_range,
    random_starts=random_starts,
    seed_rand=seed_rand,
    start_offset=start_offset,
    only_problem=only_problem,
    force_reprocess=force_reprocess,
    verbose=verbose,
    data_name=data_name,
    target_path=DRIVE_PATH / "MathCONTA_ngram_loglike_generation_data",
    base_path_token=DRIVE_PATH / "MathCONTA_tokens",
)

File path: /content/drive/MyDrive/Masterarbeit25/MathCONTA_ngram_loglike_generation_data/deepseek-ai/deepseek-math-7b-instruct/MathCONTA_full_nstarts20_n1_2_3_5_7.json
Loading existing n-gram loglike generation data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_ngram_loglike_generation_data/deepseek-ai/deepseek-math-7b-instruct/MathCONTA_full_nstarts20_n1_2_3_5_7.json


In [None]:
# Tabular view of the generation data for a single n-gram length (n=5); shown as the cell output.
get_ngram_loglike_df_from_generation_data(ngram_loglike_data, n=5)

Unnamed: 0,ID,CATEGORY,LABEL,LABEL_BINARY,n,ngram_loglike
0,owm-amc8-1,AMC8,Conta,1,5,-2.547906
1,owm-amc8-2,AMC8,Conta,1,5,-5.084937
2,owm-amc8-3,AMC8,Conta,1,5,-5.382425
3,owm-amc8-4,AMC8,Conta,1,5,-7.412471
4,owm-amc8-5,AMC8,Conta,1,5,-5.969972
...,...,...,...,...,...,...
95,clean-forum-6,Forum,Clean,0,5,-4.697241
96,clean-forum-7,Forum,Clean,0,5,-3.243642
97,clean-forum-8,Forum,Clean,0,5,-3.128943
98,clean-forum-9,Forum,Clean,0,5,-4.807599


## Find best parameters

In [None]:
# Keep only the entries of one category and report how many remain.
category = 'word-problems'
ngram_loglike_data_f = list(
    filter(lambda entry: entry.get('CATEGORY') == category, ngram_loglike_data)
)
len(ngram_loglike_data_f)

24

In [None]:
#!Parameter 2D
# Input records plus the feature/label columns used for tuning.
data = ngram_loglike_data_f
feature_col = "ngram_loglike"
label_col = "LABEL_BINARY"
n_range = n_range  # no-op: the n-gram lengths from the generation-parameter cell are reused

# Cross-validation setup.
cv_folds = 5
cv_seed = 42

# Logging: target folder, experiment id (includes the category) and method label.
log_path_base = DRIVE_PATH / "cdm_data" / "MathCONTA_v1"
exp_id = "_".join(("overleaf1", category))
method_name = "ngram_loglike"


In [None]:
# Cross-validated tuning over all n in n_range; the printed log reports, per n, the
# median threshold and the mean CV test accuracy, followed by the best entry.
out_dict = tune_ngram_loglike(
    ngram_data=data,
    feature_col=feature_col,
    label_col=label_col,
    cv_folds=cv_folds,
    cv_seed=cv_seed,
    n_range=n_range,
    model_id=model_id,
    data_name=data_name,
    method_name=method_name,
    log_path_base=log_path_base,
    exp_id=exp_id,
)

Testing n=1 | median_threshold=-1.774062 | mean_cvacc_test=0.590000
Testing n=2 | median_threshold=-2.564381 | mean_cvacc_test=0.580000
Testing n=3 | median_threshold=-3.451643 | mean_cvacc_test=0.670000
Testing n=5 | median_threshold=-8.636543 | mean_cvacc_test=0.570000
Testing n=7 | median_threshold=-9.955922 | mean_cvacc_test=0.700000
Best log entry:
{'parameter': {'n': 7}, 'median_threshold': -9.955922, 'mean_cvacc_train': 0.749474, 'mean_cvacc_test': 0.7}
Accuracy log saved in /content/drive/MyDrive/Masterarbeit25/cdm_data/MathCONTA_v1/deepseek-ai/deepseek-math-7b-instruct/ngram_loglike/exp_MathCONTA_full_accurcylog_overleaf1_word-problems.json


## Train Test Split (Only relevant once)
Post-hoc stratified train/test split of the already generated data, saved to Google Drive.
For new data, please filter train/test before generating the data.

In [None]:
# Display the token-data folder path as the cell output.
dir_token_path

In [None]:
# One-off split of the loaded generation data with test_ratio=0.3 and a fixed seed.
# NOTE(review): presumably stratified by label and/or category — confirm in stratified_dict_split.
train, test = stratified_dict_split(ngram_loglike_data, test_ratio=0.3, seed=42)

In [None]:

# Persist both parts of the split as JSON next to the full generation data of this model.
target_path = DRIVE_PATH / "MathCONTA_ngram_loglike_generation_data"
file_path_train = target_path / model_id / "MathCONTA_train_full_nstarts20_n1_2_3_5_7.json"
file_path_test = target_path / model_id / "MathCONTA_test_full_nstarts20_n1_2_3_5_7.json"

# Train file first, then test file (same order as the assignments above).
for out_path, records in ((file_path_train, train), (file_path_test, test)):
    with open(out_path, 'w') as f:
        json.dump(records, f)

## TRAIN SET CV
Find best parameters - CV

In [None]:
# Example (kept commented out): how to restrict the data to a single category before tuning.
# NOTE(review): MathCONTA_cdd_data is not defined in the visible part of this notebook;
# adapt the variable name (e.g. to ngram_loglike_data) before enabling these lines.
#category='Forum'
#MathCONTA_cdd_data_f = [entry for entry in MathCONTA_cdd_data if entry.get('CATEGORY') == category]
#len(MathCONTA_cdd_data_f)

In [None]:
# Name of the cached train split (please add the suffix full | problem).
data_name = "MathCONTA_train_full"
only_problem = False

# n-gram sampling settings
n_range = [1, 2, 3, 5, 7]
n_starts = 20
start_offset = 5
random_starts = True
seed_rand = 42

# Execution flags
force_reprocess = False
verbose = False

In [None]:
# Obtain the generation data for the train split selected via data_name.
ngram_loglike_data = create_ngram_loglike_generation_data(
    model_id=model_id,
    ds_conta=ds_conta,
    model=model,
    tokenizer=tokenizer,
    n_starts=n_starts,
    n=n_range,
    random_starts=random_starts,
    seed_rand=seed_rand,
    start_offset=start_offset,
    only_problem=only_problem,
    force_reprocess=force_reprocess,
    verbose=verbose,
    data_name=data_name,
    target_path=DRIVE_PATH / "MathCONTA_ngram_loglike_generation_data",
    base_path_token=DRIVE_PATH / "MathCONTA_tokens",
)

In [None]:
#!Parameter accuracy
# Input records plus the feature/label columns used for tuning.
data = ngram_loglike_data
feature_col = "ngram_loglike"
label_col = "LABEL_BINARY"
n_range = n_range  # no-op: the n-gram lengths from the generation-parameter cell are reused

# Cross-validation setup.
cv_folds = 5
cv_seed = 42

# Logging: target folder, experiment id and method label.
log_path_base = DRIVE_PATH / "cdm_data" / "MathCONTA_v1"
exp_id = "overleaf2"
method_name = "ngram_loglike"


In [None]:
# Cross-validated tuning on the train split over all n in n_range.
out_dict = tune_ngram_loglike(
    ngram_data=data,
    feature_col=feature_col,
    label_col=label_col,
    cv_folds=cv_folds,
    cv_seed=cv_seed,
    n_range=n_range,
    model_id=model_id,
    data_name=data_name,
    method_name=method_name,
    log_path_base=log_path_base,
    exp_id=exp_id,
)

## TEST SET EVALUATION

In [None]:
# Switch to the held-out test split; all other generation parameters stay as set before.
data_name = "MathCONTA_test_full"
ngram_loglike_data = create_ngram_loglike_generation_data(
    model_id=model_id,
    ds_conta=ds_conta,
    model=model,
    tokenizer=tokenizer,
    n_starts=n_starts,
    n=n_range,
    random_starts=random_starts,
    seed_rand=seed_rand,
    start_offset=start_offset,
    only_problem=only_problem,
    force_reprocess=force_reprocess,
    verbose=verbose,
    data_name=data_name,
    target_path=DRIVE_PATH / "MathCONTA_ngram_loglike_generation_data",
    base_path_token=DRIVE_PATH / "MathCONTA_tokens",
)

In [None]:
#!Parameter
# n-gram length (parameter1) and decision threshold (theta) applied to the test split.
# NOTE(review): presumably chosen from the train-set CV results — confirm.
parameter1, theta = 1, -1.40

ngram_ll_df = get_ngram_loglike_df_from_generation_data(ngram_loglike_data, n=parameter1)
len(ngram_ll_df)

In [None]:
# Evaluate the fixed threshold on the test split (1000 bootstrap samples, seed 42)
# and display the resulting dictionary.
test_dict = evaluate_method_standard(
    ngram_ll_df,
    metric_col=feature_col, theta=theta,
    model_id=model_id, data_name=data_name,
    method_name=method_name, parameter1=parameter1,
    n_bootstrap=1000, seed=42,
)
test_dict

In [None]:
# Store the test-set evaluation under the current experiment id.
save_accuracy_log(
    log_path_base=log_path_base,
    model_id=model_id,
    method_name=method_name,
    exp_id=exp_id,
    data_name=data_name,
    out_dict=test_dict,
)
## RUN END2END

In [None]:
# Fixed n-gram length and decision threshold for the end-to-end run.
n, theta = 1, -1.4

In [None]:
# Complete pipeline in a single call; the returned value is displayed as the cell output.
run_ngram_loglike(
    # model and data access
    model_id=model_id, ds_conta=ds_conta, model=model, tokenizer=tokenizer,
    # generation-data settings
    data_name=data_name, force_reprocess=force_reprocess,
    n_starts=n_starts, n_range=n_range, seed_rand=seed_rand,
    start_offset=start_offset, random_starts=random_starts,
    only_problem=only_problem, verbose=verbose,
    # decision rule
    n=n, theta=theta,
    # storage locations
    target_path=DRIVE_PATH / "MathCONTA_ngram_loglike_generation_data",
    base_path_token=DRIVE_PATH / "MathCONTA_tokens",
)


File path: /content/drive/MyDrive/Masterarbeit25/MathCONTA_ngram_loglike_generation_data/deepseek-ai/deepseek-math-7b-instruct/MathCONTA_full_nstarts20_n1_2_3_5_7.json
Loading existing n-gram loglike generation data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_ngram_loglike_generation_data/deepseek-ai/deepseek-math-7b-instruct/MathCONTA_full_nstarts20_n1_2_3_5_7.json


{'accuracy': 0.66,
 'precision': 0.6538461538461539,
 'recall': 0.68,
 'f1_score': 0.6666666666666666,
 'confusion_matrix': array([[32, 18],
        [16, 34]])}

## Combined Evaluation across LLMs

In [None]:
# Models whose generation data is pooled for the combined evaluation.
# The former bare `AutoTokenizer.from_pretrained(model_id)` call was removed: its result
# was discarded, and it raised when this cell was re-run after `model_id` had been set to
# the combined label below, which is not a model repository id.
model_ids = [
    "deepseek-ai/deepseek-math-7b-instruct",
    "EleutherAI/llemma_7b",
    "allenai/OLMo-7B-0724-Instruct-hf",
    "allenai/OLMo-2-1124-13B-Instruct",
]
# Short label that stands in for a model id when the combined results are tuned and logged.
model_id_combi = "DS_LE_OL_OL2"

## COMBI TRAIN SET CV
Find best parameters - CV

In [None]:
# Name of the cached train split (please add the suffix full | problem).
data_name = "MathCONTA_train_full"
only_problem = False

# n-gram sampling settings
n_range = [1, 2, 3, 5, 7]
n_starts = 20
start_offset = 5
random_starts = True
seed_rand = 42

# Execution flags
force_reprocess = False
verbose = False

# Collect the train-split generation data of every model into one flat list.
# Each model gets its own tokenizer; `model_id` is rebound on every iteration.
ngram_loglike_data_combined = []
for model_id in model_ids:
    ngram_loglike_data = create_ngram_loglike_generation_data(
        model_id=model_id,
        ds_conta=ds_conta,
        model=model,
        tokenizer=AutoTokenizer.from_pretrained(model_id),
        n_starts=n_starts,
        n=n_range,
        random_starts=random_starts,
        seed_rand=seed_rand,
        start_offset=start_offset,
        only_problem=only_problem,
        force_reprocess=force_reprocess,
        verbose=verbose,
        data_name=data_name,
        target_path=DRIVE_PATH / "MathCONTA_ngram_loglike_generation_data",
        base_path_token=DRIVE_PATH / "MathCONTA_tokens",
    )
    ngram_loglike_data_combined.extend(ngram_loglike_data)

len(ngram_loglike_data_combined)

In [None]:
#!Parameter accuracy
# Pooled records of all models plus the feature/label columns used for tuning.
data = ngram_loglike_data_combined
feature_col = "ngram_loglike"
label_col = "LABEL_BINARY"
n_range = n_range  # no-op: the n-gram lengths from the generation-parameter cell are reused

# Cross-validation setup.
cv_folds = 5
cv_seed = 42

# Logging: target folder, experiment id and method label.
log_path_base = DRIVE_PATH / "cdm_data" / "MathCONTA_v1"
exp_id = "overleaf2combi"
method_name = "ngram_loglike"
# From here on the combined label is used in place of a single model id.
model_id = model_id_combi


In [None]:
# Cross-validated tuning on the pooled train data over all n in n_range.
out_dict = tune_ngram_loglike(
    ngram_data=data,
    feature_col=feature_col,
    label_col=label_col,
    cv_folds=cv_folds,
    cv_seed=cv_seed,
    n_range=n_range,
    model_id=model_id,
    data_name=data_name,
    method_name=method_name,
    log_path_base=log_path_base,
    exp_id=exp_id,
)

## TEST SET EVALUATION

In [None]:
# Switch to the held-out test split and pool the generation data of every model again.
data_name = "MathCONTA_test_full"

# Each model gets its own tokenizer; `model_id` is rebound on every iteration.
ngram_loglike_data_combined = []
for model_id in model_ids:
    ngram_loglike_data = create_ngram_loglike_generation_data(
        model_id=model_id,
        ds_conta=ds_conta,
        model=model,
        tokenizer=AutoTokenizer.from_pretrained(model_id),
        n_starts=n_starts,
        n=n_range,
        random_starts=random_starts,
        seed_rand=seed_rand,
        start_offset=start_offset,
        only_problem=only_problem,
        force_reprocess=force_reprocess,
        verbose=verbose,
        data_name=data_name,
        target_path=DRIVE_PATH / "MathCONTA_ngram_loglike_generation_data",
        base_path_token=DRIVE_PATH / "MathCONTA_tokens",
    )
    ngram_loglike_data_combined.extend(ngram_loglike_data)

len(ngram_loglike_data_combined)

In [None]:
#!Parameter
# n-gram length (parameter1) and decision threshold (theta) applied to the pooled test data.
# NOTE(review): presumably chosen from the combined train-set CV results — confirm.
parameter1, theta = 1, -1.34
# Results are attributed to the combined label rather than to a single model.
model_id = model_id_combi

ngram_ll_df = get_ngram_loglike_df_from_generation_data(ngram_loglike_data_combined, n=parameter1)
len(ngram_ll_df)

In [None]:
# Evaluate the fixed threshold on the pooled test data (1000 bootstrap samples, seed 42)
# and display the resulting dictionary.
test_dict = evaluate_method_standard(
    ngram_ll_df,
    metric_col=feature_col, theta=theta,
    model_id=model_id, data_name=data_name,
    method_name=method_name, parameter1=parameter1,
    n_bootstrap=1000, seed=42,
)
test_dict

In [None]:
# Store the combined test-set evaluation under the current experiment id.
save_accuracy_log(
    log_path_base=log_path_base,
    model_id=model_id,
    method_name=method_name,
    exp_id=exp_id,
    data_name=data_name,
    out_dict=test_dict,
)