In [None]:
import os

'''
from google.colab import userdata
userdata.get('GIT_TOKEN')
userdata.get('GIT_TOKEN')
userdata.get('PT_OPENAI_TOKEN')
'''


# Use it to clone
!git clone https://github.com/ShouryaBatra/SALT.git


In [None]:
# install

!pip install -r SALT/leak_eval/requirements.txt
!pip install matplotlib
!pip install scikit-learn
!pip install accelerate
!pip install python-dotenv

from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

In [None]:
# OpenAI API key used by the later cells that pass --gpt_eval.
#
# The key is never written into the notebook: if OPENAI_API_KEY is already set
# in the environment it is kept as-is, otherwise it is asked for interactively
# (the typed value is not echoed and is not stored in the cell source/output).

import os
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# Download the QwQ-32B weights into SALT/leak_eval/qwq-32b.

import os

# This variable only silences the hub's "symlinks not supported" warning.
# It is set before importing huggingface_hub because the library reads its
# HF_HUB_* settings when it is first imported; if an earlier cell already
# imported it in this kernel, the setting may not take effect.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

from huggingface_hub import snapshot_download

# Download fresh copy
print("Downloading QwQ-32B model...")
snapshot_download(
    repo_id="Qwen/QwQ-32B",
    local_dir="SALT/leak_eval/qwq-32b",
    # Re-fetch every file even if a (possibly corrupted) copy already exists.
    # The deprecated `resume_download` argument and the default-valued
    # `local_files_only=False` are intentionally not passed.
    force_download=True,
)
print("Download completed!")

In [None]:
# Recursively copy the repository's top-level prompts/ folder into leak_eval/
# (the copy ends up at SALT/leak_eval/prompts).
# NOTE(review): presumably needed because the later cells run the scripts from
# inside SALT/leak_eval — confirm against the scripts' prompt lookup.

!cp -r SALT/prompts SALT/leak_eval/

In [None]:
# Fetch the airgapagent-r benchmark files from the Hugging Face Hub.

import os
from huggingface_hub import snapshot_download

# Make sure the target folder is there before anything is written into it.
os.makedirs("SALT/leak_eval/datasets", exist_ok=True)

# Pull the dataset repository into that folder; *.arrow and *.lock files are
# left out of the download.
snapshot_download(
    repo_type="dataset",
    repo_id="parameterlab/leaky_thoughts",
    ignore_patterns=["*.arrow", "*.lock"],
    local_dir="./SALT/leak_eval/datasets",
)

In [None]:
# create datasets 

!cd SALT && python leak_eval/scripts/checkLenOfAirgapeagent.py

!python SALT/leak_eval/scripts/split_dataset.py \
  --input_file SALT/leak_eval/datasets/airgapagent-r.json \
  --output_dir SALT/leak_eval/datasets \
  --train_ratio 0.05 \ # up to you
  --val_ratio 0.05 \ # up to you
  --seed 221097

In [None]:
# Collect activations on baseline


!cd SALT/leak_eval && python eval_cp.py \
--model qwq-32b \
--input_file datasets/airgapagent-r-train.json \
--output_file results/activations.json \
--prompt_type cot_explicit_unk \
--max_tokens 500 \
--gpt_eval \
--temperature 0.4 \
--layers all \ # up to you
--layer_step 1 \ # up to you
--enable_gpt_eval \
# --resume \ # if it stops running, add this



In [None]:
# Find leaky layers
# This finds the layers that contribute to leaky thoughts based on thresholds
# This also creates steering vectors

!cd SALT/leak_eval && python find_leak_layers.py \
--activations_dir results/activations \
--results_json results/activations.json \
--segment reasoning_avg \ # can change this, could be last input token, think token, etc
--thresholds 1.2 1.3 1.4 \ 
--min_examples 5 \
--output_dir results/leak_layer_analysis_vectors \
--vector_kind delta #--save_vectors \

In [None]:
# sweep through strengths to find best performance
!cd SALT/leak_eval && python scripts/val_sweep.py \
--model qwq-32b \
--input_file datasets/airgapagent-r-val.json \
--output_dir results/val_sweep \
--prompt_type cot_explicit_unk \
--max_tokens 500 \
--gpt_eval \
--enable_gpt_eval \
--temperature 0.4 \
--steering_layers 63 \ # up to you as well
--strengths=-0.5,-0.75,-1.0,-1.25,-1.5,-1.75,-2.0,-2.25,-2.5,-2.75,-3.0 \ # test as many as you would like
--vector_dir results/leak_layer_analysis_vectors/steering_vectors \
--steer_only_last_input \ # also up to you
--batch_size 5 \
# --limit 350 \ # can add a limit
    

In [None]:
# Run full steering method 
!cd SALT/leak_eval && python steered_eval_cp_resume.py \
  --model qwq-32b \
  --input_file datasets/airgapagent-r-test.json \
  --output_file results/final_results/steered.json \
  --prompt_type cot_explicit_unk \
  --batch_size 5 \
  --max_tokens 500 \
  --temperature 0.4 \
  --enable_gpt_eval \
  --gpt_eval \
  --gpt_eval_model gpt-4o-mini \
  --steering_layers 63 \
  --steering_strengths 2.25 \ # use the strength that worked best on the earlier step
  --steering_vector_dir results/leak_layer_analysis_vectors/steering_vectors \
  --steer_only_last_input \ # up to you
  # --resume # add if it stops in the middle

In [None]:
# Run full unsteered baseline

!cd SALT/leak_eval && python eval_cp.py \
--model qwq-32b \
--input_file datasets/airgapagent-r-test.json \
--output_file results/final_results/unsteered_baseline.json \
--prompt_type cot_explicit_unk \
--max_tokens 500 \
--temperature 0.4 \
--enable_gpt_eval \
--gpt_eval \
--layers range:0--1 # collect no activations

In [None]:
# Count leaks for steered results
!cd SALT/leak_eval && python scripts/count_gpt_leaks.py \
--results_file results/final_results/steered.json \
--component reasoning \
--print_ids

In [None]:
# Count leaks for unsteered results

!cd SALT/leak_eval && python scripts/count_gpt_leaks.py \
--results_file results/final_results/unsteered_baseline.json \
--component reasoning \
--print_ids