<a href="https://colab.research.google.com/github/friederrr/proof_contamination/blob/main/code/CDMs/CD_pipeline_CV_ContaTraces.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CD PIPELINE (with CV) ContaTraces

In [1]:
%%capture
# Install the `datasets` package used below for `load_dataset`; %%capture hides pip's output.
!pip install datasets

In [None]:
import random
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import torch
import json
from datetime import datetime
import itertools
from itertools import cycle, product
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from pathlib import Path
import statistics
from sklearn.model_selection import StratifiedKFold

In [3]:
# Mount Google Drive at /content/drive (requires a Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from CDMs_functions_v1 import *

## Model Selection

**OLMO**:

- "allenai/OLMo-7B-0724-hf": Download (F32): (27 GB), GPU-RAM (bfloat16): 14 GB (T4 sufficient)

- "allenai/OLMo-7B-0724-SFT-hf": Download (BF16) (14 GB), GPU-RAM (14 GB)

- "allenai/OLMo-7B-0724-Instruct-hf": Download (BF16) (14 GB), GPU-RAM (14 GB)

- "allenai/OLMo-2-1124-13B-Instruct": Download (BF16) (28 GB), GPU-RAM (27 GB) -> works fine with A100!

- "allenai/OLMo-2-0325-32B-Instruct": Download (BF16) (65 GB), GPU-RAM (34 GB). Answering a single question takes about 15 minutes on an A100 (meta-device offloading). Apart from that, the pipeline is compatible with this model, but running it on an A100 is not recommended.

**DEEPSEEK**:
- "deepseek-ai/deepseek-math-7b-instruct":

**LLEMMA**:
- "EleutherAI/llemma_7b"

In [25]:
#!Parameter
# Hugging Face identifier of the model under test (one of the models listed above).
model_id = "allenai/OLMo-7B-0724-hf"
# Base folder on the mounted Google Drive; token data and logs are stored beneath it.
DRIVE_PATH = Path('/content/drive/MyDrive/Masterarbeit25/')

In [26]:
# Folder that holds the cached token data, organised per model.
dir_token_path = DRIVE_PATH / "MathCONTA_tokens"
# Print the files already stored for `model_id`; binding the result to `_`
# keeps the return value out of the cell output.
_ = list_files_in_directory(dir_token_path, model_id)

Files in /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/allenai/OLMo-7B-0724-hf:

OLMo-7B-0724-hf/MathCONTA_token_data.json


In [27]:
# If the cached token file already exists, the model and tokenizer are not needed.
load_models = False
if not load_models:
  # Placeholders so that later cells can still pass these names along.
  model = tokenizer = ds_conta = None
else:
  # bfloat16 weights, placed automatically on the available device(s).
  model = AutoModelForCausalLM.from_pretrained(
      model_id,
      torch_dtype=torch.bfloat16,
      device_map='auto',
  )
  tokenizer = AutoTokenizer.from_pretrained(model_id)

## DATA Processing/Loading

In [None]:
#!Parameter
# Hugging Face Hub coordinates of the MathCONTA dataset.
repo_id = "Tobstar001/MathCONTA"
config_name = "core"
split = "test"

# Only needed when the token data has to be (re)computed from the raw dataset.
ds_conta = load_dataset(
    path=repo_id,
    name=config_name,
    split=split,
)

In [28]:
# Obtain the token data for `model_id`. An existing file under `dir_token_path`
# is loaded instead of being recomputed (see the "Loading existing token data"
# message); presumably `force_reprocess=True` would override that -- TODO confirm.
MathCONTA_token_data = create_mathconta_token_data(
    model_id=model_id,
    ds_conta=ds_conta,
    model=model,
    tokenizer=tokenizer,
    only_problem=False,
    force_reprocess=False,
    base_path_token=dir_token_path,
)

Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/allenai/OLMo-7B-0724-hf/MathCONTA_token_data.json


## Find best parameters

In [40]:
# Keep only the entries of one category and show how many remain.
category = 'Forum'
MathCONTA_token_f = list(
    filter(lambda item: item.get('CATEGORY') == category, MathCONTA_token_data)
)
len(MathCONTA_token_f)

20

In [41]:
#!Parameter 2D
# One switch drives the data, its name and the log folder so they cannot disagree.
# Previously the full data set was tuned while the log folder was named after
# `category`, which filed full-data results under "ContaTraces_<category>".
#   False -> tune on all MathCONTA entries
#   True  -> tune only on the `category` subset (MathCONTA_token_f)
use_category_subset = False
if use_category_subset:
  token_data = MathCONTA_token_f
  data_name = "MathCONTA_" + category
  method_name = "ContaTraces" + "_" + category
else:
  token_data = MathCONTA_token_data
  data_name = "MathCONTA_full"
  method_name = "ContaTraces"

# Feature column(s) per fit type: two for the exponential fit, one for the linear fit.
feature_cols = {"exponential": ("A_value", "B_value"), "linear": "m_value"}
label_col = "LABEL_BINARY"
# Fit types compared during tuning.
fit_range = ["exponential", "linear"]
# Cross-validation setup (fixed seed for reproducible folds).
cv_folds = 5
cv_seed = 42
# Root folder for the accuracy logs and an identifier for this experiment.
log_path_base = DRIVE_PATH / "cdm_data" / "MathCONTA_v1"
exp_id = "overleaf1"


In [42]:
# Cross-validated tuning of ContaTraces over every fit type in `fit_range`;
# the accuracy log is written below `log_path_base`.
out_dict = tune_ContaTraces(
    tokendata=token_data,
    feature_cols=feature_cols,
    label_col=label_col,
    cv_folds=cv_folds,
    cv_seed=cv_seed,
    fit_range=fit_range,
    model_id=model_id,
    data_name=data_name,
    method_name=method_name,
    log_path_base=log_path_base,
    exp_id=exp_id,
)

Testing fit=exponential | median_threshold_1=3.241929 | median_threshold_2=0.01578 | mean_cvacc_test=0.450000
Testing fit=linear | median_threshold_1=-0.009576 | median_threshold_2=N/A | mean_cvacc_test=0.510000
Best log entry:
{'parameter': {'fit': 'linear'}, 'median_threshold_1': -0.009576, 'median_threshold_2': None, 'mean_cvacc_train': 0.5675, 'mean_cvacc_test': 0.51}
Accuracy log saved in /content/drive/MyDrive/Masterarbeit25/cdm_data/MathCONTA_v1/allenai/OLMo-7B-0724-hf/ContaTraces_Forum/exp_MathCONTA_full_accurcylog_overleaf1.json


## RUN END2END

In [None]:
# End-to-end run with fixed settings; the returned metrics are shown as cell output.
# NOTE(review): theta1 is hard-coded to -0.006, whereas the tuning cell above
# reported median_threshold_1=-0.009576 for the linear fit -- confirm which
# threshold is intended for the current `model_id`.
run_ContaTraces(
    model_id=model_id,
    ds_conta=ds_conta,
    model=model,
    tokenizer=tokenizer,
    only_problem=False,
    force_reprocess=False,
    dir_token_path=dir_token_path,
    fit="linear",
    theta1=-0.006,
    theta2=None,
)

Loading existing token data from: /content/drive/MyDrive/Masterarbeit25/MathCONTA_tokens/deepseek-ai/deepseek-math-7b-instruct/MathCONTA_token_data.json


{'accuracy': 0.61,
 'precision': 0.5797101449275363,
 'recall': 0.8,
 'f1_score': 0.6722689075630253,
 'confusion_matrix': array([[21, 29],
        [10, 40]])}