# Self‑Improving LLM Project

This notebook implements Parts 2 and 3 of the project plan for the **Self‑Improving LLM** final project.  Specifically, it covers:

- **Dataset Acquisition & Sampling:** download the StrategyQA dataset, sample training examples (the plan recommends ~2 000; the configuration cell defaults `TRAIN_SAMPLES` to 100), and save them to disk for subsequent processing.
- **Prompt Engineering & Teacher Generation:** generate a baseline *student draft* for each question, compose prompts according to the plan (question, student draft, and a teacher instruction), call GPT‑4 (or run in dry‑run mode), and build two parallel corpora for baseline and CoT training.

The plan specifies a data‑generation loop where each question is paired with a student draft and a teacher chain‑of‑thought, resulting in two training tracks.  The baseline model is trained on `(Q → answer)` pairs, while the CoT model is trained on `(Q + teacher CoT → answer)` pairs.

> **Note:** Running the full pipeline (especially calling GPT‑4) requires an OpenAI API key and may incur costs.  A dry‑run mode is provided for testing the notebook without external API calls.


In [None]:
!pip install -q datasets transformers openai bitsandbytes accelerate python-dotenv huggingface_hub huggingface_hub[hf_xet]


In [None]:
import os
from dotenv import load_dotenv

# Pick up overrides from a local .env file when one is present
load_dotenv()

# Spellings (compared case-insensitively) that switch a boolean setting on
_TRUTHY_VALUES = ('true', '1', 't')


def _env_flag(name: str, default: str) -> bool:
    """Read environment variable `name` (or `default` when unset) as a boolean."""
    return os.getenv(name, default).lower() in _TRUTHY_VALUES


# --- Dataset ---
DATASET_NAME = os.getenv('DATASET_NAME', 'voidful/StrategyQA')
TRAIN_SAMPLES = int(os.getenv('TRAIN_SAMPLES', '100'))
RANDOM_SEED = int(os.getenv('RANDOM_SEED', '42'))

# --- Student model ---
MODEL_NAME = os.getenv('MODEL_NAME', 'microsoft/phi-2')
MAX_NEW_TOKENS = int(os.getenv('MAX_NEW_TOKENS', '35'))
BATCH_SIZE = int(os.getenv('BATCH_SIZE', '8'))
USE_4BIT = _env_flag('USE_4BIT', 'True')
MAX_SEQ_LENGTH = int(os.getenv('MAX_SEQ_LENGTH', '512'))
HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN', '')

# --- Student generation ---
DO_SAMPLE = _env_flag('DO_SAMPLE', 'False')
TEMPERATURE = float(os.getenv('TEMPERATURE', '0.7'))

# --- Teacher (GPT-4) ---
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')
GPT4_MODEL = os.getenv('GPT4_MODEL', 'gpt-4')
GPT4_MAX_TOKENS = int(os.getenv('GPT4_MAX_TOKENS', '150'))
GPT4_TEMPERATURE = float(os.getenv('GPT4_TEMPERATURE', '0.3'))
DRY_RUN = _env_flag('DRY_RUN', 'True')

# --- File locations (everything lives under DATA_DIR) ---
DATA_DIR = os.getenv('DATA_DIR', 'data')
RAW_DIR = os.path.join(DATA_DIR, 'raw')
SAMPLE_TRAIN_PATH = os.path.join(DATA_DIR, 'sample_train.jsonl')
STUDENT_DRAFTS_PATH = os.path.join(DATA_DIR, 'student_drafts.jsonl')
TEACHER_OUTPUTS_PATH = os.path.join(DATA_DIR, 'teacher_outputs.jsonl')
BASELINE_PATH = os.path.join(DATA_DIR, 'train_baseline.jsonl')
COT_PATH = os.path.join(DATA_DIR, 'train_cot.jsonl')

# Echo the settings that matter most for a run
print("=== Configuration ===")
print(f"Dataset: {DATASET_NAME}")
print(f"Model: {MODEL_NAME}")
print(f"Batch size: {BATCH_SIZE}")
print(f"4-bit quantization: {USE_4BIT}")
print(f"GPT-4 dry run: {DRY_RUN}")
print("====================")


In [None]:
# from huggingface_hub import login, notebook_login

# def smart_hf_login():
#     """Use HF_TOKEN env/secret if present, else fall back to interactive login."""
#     if HUGGINGFACE_TOKEN:         # works for Colab secrets, CI, docker, …
#         login(HUGGINGFACE_TOKEN)
#     elif 'google.colab' in sys.modules:   # inside a Colab kernel but no secret set
#         notebook_login()
#     else:                                 # local Jupyter; will prompt only once
#         login()

# smart_hf_login()


In [None]:

from datasets import load_dataset
import os
import json
import sys
import subprocess

# Locations of the raw StrategyQA splits on disk
raw_dir = os.path.join(DATA_DIR, 'raw')
train_path = os.path.join(raw_dir, 'strategyqa_train.jsonl')
val_path = os.path.join(raw_dir, 'strategyqa_validation.jsonl')  # "validation" (not "dev") to match the download script
test_path = os.path.join(raw_dir, 'strategyqa_test.jsonl')

print(f"Looking for files in:")
print(f"- Train: {train_path}")
print(f"- Val: {val_path}")
print(f"- Test: {test_path}")

# Create raw directory if it doesn't exist
os.makedirs(raw_dir, exist_ok=True)
print(f"Created directory: {raw_dir}")

# Only train and validation are mandatory; the test split is optional
required_paths = [train_path, val_path]
files_exist = all(os.path.exists(p) for p in required_paths)
print(f"Files exist: {files_exist}")

if not files_exist:
    print("Dataset files not found. Running download script...")
    script_path = os.path.join('scripts', 'download_strategyqa.py')
    print(f"Running: {sys.executable} {script_path} --output-dir {raw_dir}")
    try:
        result = subprocess.run(
            [sys.executable, script_path, '--output-dir', raw_dir],
            check=True,
            capture_output=True,
            text=True
        )
    except subprocess.CalledProcessError as err:
        # Output is captured, so without this the traceback would only show the
        # exit code and hide why the script failed.
        print("Download script failed. Captured output:")
        print(err.stdout)
        print("Errors:")
        print(err.stderr)
        raise
    print("Download script output:")
    print(result.stdout)
    if result.stderr:
        print("Errors:")
        print(result.stderr)

    # Verify files were created
    print("\nChecking if files were created:")
    for path in [train_path, val_path, test_path]:
        exists = os.path.exists(path)
        print(f"- {path}: {'✓' if exists else '✗'}")
        if exists:
            size = os.path.getsize(path)
            print(f"  Size: {size:,} bytes")

    # Fail here with a clear message instead of deeper inside load_dataset
    missing_paths = [p for p in required_paths if not os.path.exists(p)]
    if missing_paths:
        raise FileNotFoundError(
            f"Download script finished but required files are missing: {missing_paths}"
        )

# Load the dataset from local JSONL files
print("Loading dataset from local files...")
data_files = {
    'train': train_path,
    'validation': val_path,
}
if os.path.exists(test_path):
    data_files['test'] = test_path

dataset = load_dataset('json', data_files=data_files)
train = dataset['train']
validation = dataset['validation']

def sample_train_set(train_dataset, n_samples=TRAIN_SAMPLES, seed=RANDOM_SEED):
    """Shuffle `train_dataset` with `seed` and keep the first `n_samples` rows.

    When the dataset holds fewer than `n_samples` rows, every row is kept.
    """
    randomized = train_dataset.shuffle(seed=seed)
    keep_count = min(n_samples, len(randomized))
    return randomized.select(range(keep_count))

# Draw the seeded training sample
print(f"Sampling {TRAIN_SAMPLES} examples with seed {RANDOM_SEED}")
target_train = sample_train_set(train)

# Both output directories must exist before anything is written
for _out_dir in (DATA_DIR, raw_dir):
    os.makedirs(_out_dir, exist_ok=True)

# Destinations: the sampled training subset plus the full raw splits
sample_train_path = SAMPLE_TRAIN_PATH
test_path = os.path.join(raw_dir, 'strategyqa_test.jsonl')
val_path = os.path.join(raw_dir, 'strategyqa_validation.jsonl')  # "validation" (not "dev") to match the download script
train_path = os.path.join(raw_dir, 'strategyqa_train.jsonl')

def save_jsonl(dataset_split, path):
    """Write every item of `dataset_split` to `path` as one JSON object per line.

    Args:
        dataset_split: iterable of JSON-serialisable records (e.g. dicts).
        path: destination file; it is created or overwritten.

    The file is UTF-8 encoded, so non-ASCII text is written verbatim
    (`ensure_ascii=False`) rather than as \\uXXXX escapes, matching the other
    JSONL writers in this notebook.
    """
    with open(path, 'w', encoding='utf-8') as f:
        for item in dataset_split:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

# Persist only the sampled subset. The full train/validation/test splits were
# loaded from train_path / val_path / test_path, so writing them back would
# merely truncate and rewrite the raw source files (and an interrupted run
# would leave the raw data corrupted).
save_jsonl(target_train, sample_train_path)

print(f"Full training set available at {train_path}")
print(f"Validation set available at {val_path}")
print(f"Sampled train set (≈{TRAIN_SAMPLES} entries) saved to {sample_train_path}")


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from torch.utils.data import DataLoader
from datasets import load_dataset
import torch
import json
import os
from tqdm import tqdm

def setup_dataset(input_path: str, tokenizer, batch_size: int = BATCH_SIZE):
    """Build an order-preserving DataLoader over the questions in a JSONL file.

    Args:
        input_path: JSON-lines file whose records have a 'question' field.
        tokenizer: tokenizer used to encode each question.
        batch_size: number of examples per batch.

    Returns:
        A DataLoader whose items carry 'input_ids', 'attention_mask' and the
        raw 'question' text. Questions are truncated/padded to MAX_SEQ_LENGTH.
    """
    records = load_dataset('json', data_files=input_path, split='train')

    # Raw question text travels with the encoded tensors so outputs can be matched
    question_texts = records['question']

    def _encode(examples):
        # Plain Python lists are requested here; tensors are built per item below
        return tokenizer(
            examples['question'],
            truncation=True,
            padding='max_length',
            max_length=MAX_SEQ_LENGTH,
            return_tensors=None
        )

    encoded_records = records.map(_encode, batched=True)

    class QADataset(torch.utils.data.Dataset):
        """Pairs each encoded row with its original question string."""

        def __init__(self, tokenized_data, original_questions):
            self.tokenized_data = tokenized_data
            self.original_questions = original_questions

        def __len__(self):
            return len(self.tokenized_data)

        def __getitem__(self, idx):
            row = self.tokenized_data[idx]
            return {
                'input_ids': torch.tensor(row['input_ids']),
                'attention_mask': torch.tensor(row['attention_mask']),
                'question': self.original_questions[idx]
            }

    # shuffle=False keeps batches in file order so outputs line up with inputs
    return DataLoader(
        QADataset(encoded_records, question_texts),
        batch_size=batch_size,
        shuffle=False
    )

# Run on the GPU when one is visible, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

print(f"Loading model: {MODEL_NAME}")

# 4-bit loading is only attempted on a GPU and only when enabled in the config
if USE_4BIT and device.type == 'cuda':
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    print("Loading model in 4-bit quantization...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto"
    )
else:
    print("Loading model in standard precision...")
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Batched padding needs a pad token; reuse EOS when the tokenizer defines none
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.eval()

if device.type == 'cuda':
    print(f"GPU Memory after model load: {torch.cuda.memory_allocated()/1e9:.2f} GB")

# Build the loader over the sampled training questions
print(f"Loading dataset from {SAMPLE_TRAIN_PATH} with batch size {BATCH_SIZE}")
train_loader = setup_dataset(SAMPLE_TRAIN_PATH, tokenizer, batch_size=BATCH_SIZE)


## Generate Student Drafts

In this section we use the base language model loaded above (selected via `MODEL_NAME`, default `microsoft/phi-2`; alternatives include `meta-llama/Llama-2-7b-hf` or `gpt2`) to generate a short *student draft* for each question in the sampled training set.  A draft consists of a yes/no answer followed by one or two clarifying questions, as specified in the data‑generation loop.  Adjust the model name based on your available hardware and licences.

> **Tip:** On Colab, you can enable a GPU via *Runtime → Change runtime type → GPU*; with `USE_4BIT` enabled the model is then loaded with 4‑bit quantization to reduce memory usage.  To keep the example runnable on CPU, set `MODEL_NAME` to a small model such as `gpt2`.


In [None]:
# Two-line response contract for the student model: a Yes/No answer on the
# first line and clarifying questions on the second.
# NOTE(review): SYSTEM_PROMPT is not referenced anywhere else in the visible
# cells; build_messages uses its own, slightly different system text.
SYSTEM_PROMPT = (
    "You are an assistant that ALWAYS replies in exactly two lines.\n"
    "Line 1: Answer: <Yes/No>\n"
    "Line 2: Clarifying questions: <Q1>? <Q2>?\n"
    "Never repeat the user's question or the instructions."
)

# Worked example in plain-text form (question, answer, clarifying questions).
# NOTE(review): DEMO is likewise unused in the visible cells; build_messages
# encodes the same example as separate user/assistant turns.
DEMO = (
    "Example\n"
    "Question: Is the sky blue?\n"
    "Answer: Yes\n"
    "Clarifying questions: At what altitude? Under clear-sky conditions?\n"
)

def build_messages(q: str):
    """Return the chat transcript used to prompt the student model for `q`.

    The transcript is: a system instruction fixing the two-line format, one
    worked example (a user turn and its assistant reply), then `q` as the
    final user turn.
    """
    system_turn = {
        "role": "system",
        "content": "Answer ONLY in two lines.\n"
                   "Line 1: Answer: <Yes/No>\n"
                   "Line 2: Clarifying questions: <Q1>? <Q2>?",
    }
    # The worked example keeps the assistant reply as its own turn
    example_question = {"role": "user", "content": "Is the sky blue?"}
    example_reply = {
        "role": "assistant",
        "content": "Answer: Yes\n"
                   "Clarifying questions: "
                   "At what altitude? Under clear-sky conditions?",
    }
    actual_question = {"role": "user", "content": q}
    return [system_turn, example_question, example_reply, actual_question]

def _render_prompt(messages):
    """Turn a chat-style message list into a single prompt string.

    Uses the tokenizer's chat template when it has one. Some base checkpoints
    ship without a template, in which case `apply_chat_template` raises in
    recent transformers releases; for those a plain-text transcript is built.
    """
    if getattr(tokenizer, "chat_template", None):
        return tokenizer.apply_chat_template(messages,
                                             tokenize=False,
                                             add_generation_prompt=True)
    segments = []
    for message in messages:
        if message["role"] == "user":
            segments.append(f"Question: {message['content']}\n")
        else:
            # System instruction and example answers are followed by a blank line
            segments.append(f"{message['content']}\n\n")
    return "".join(segments)


def _trim_to_answer(text):
    """Drop anything before the first 'Answer:' marker; keep `text` as-is when absent."""
    marker_at = text.find("Answer:")
    # find() returns -1 on a miss; slicing with -1 would keep only the last character
    return text[marker_at:] if marker_at != -1 else text


def generate_batch_drafts(batch):
    """Generate one student draft per question in `batch`.

    Args:
        batch: mapping with a 'question' entry holding an iterable of strings.

    Returns:
        A list of draft strings, in the same order as the questions.
    """
    # Set padding side to left for generation (decoder-only models need this)
    tokenizer.padding_side = 'left'
    prompts = [_render_prompt(build_messages(q)) for q in batch["question"]]
    inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(device)

    # Track memory usage before generation
    if device.type == 'cuda':
        print(f"GPU Memory before generation: {torch.cuda.memory_allocated()/1e9:.2f} GB", end='\r')

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=DO_SAMPLE,
            temperature=TEMPERATURE if DO_SAMPLE else 1.0,
            pad_token_id=tokenizer.pad_token_id,
            return_dict_in_generate=True
        )

    # With left padding every row's prompt fills the full padded width, so new
    # tokens start at that shared offset. Counting non-pad tokens per row would
    # leak the tail of shorter prompts into the decoded draft.
    prompt_width = inputs["input_ids"].shape[1]
    decoded = [
        tokenizer.decode(seq[prompt_width:], skip_special_tokens=True).strip()
        for seq in outputs.sequences
    ]
    return [_trim_to_answer(text) for text in decoded]


In [None]:
# Make sure the destination directory exists ('.' covers a bare filename,
# where dirname() is empty and makedirs('') would raise)
os.makedirs(os.path.dirname(STUDENT_DRAFTS_PATH) or '.', exist_ok=True)

# Stream drafts to disk batch by batch, one JSON object per line
print(f"Writing student drafts to {STUDENT_DRAFTS_PATH}")
with open(STUDENT_DRAFTS_PATH, 'w', encoding='utf-8') as out_f:
    for batch in tqdm(train_loader, desc='Generating drafts'):
        # Only the raw questions are needed: generate_batch_drafts builds and
        # tokenizes its own prompts, so the loader's pre-tokenized tensors are
        # not moved to the device.
        questions = batch['question']

        drafts = generate_batch_drafts({'question': questions})

        for q, draft in zip(questions, drafts):
            out_rec = {
                'question': q,
                'student_draft': draft
            }
            out_f.write(json.dumps(out_rec, ensure_ascii=False) + '\n')

        # Print memory usage after each batch
        if device.type == 'cuda':
            print(f"Current GPU Memory: {torch.cuda.memory_allocated()/1e9:.2f} GB", end='\r')

print(f"Student drafts written to {STUDENT_DRAFTS_PATH}")


NameError: name 'os' is not defined

## Generate Teacher Responses

We now call GPT‑4 to obtain chain‑of‑thought (CoT) reasoning and final yes/no answers for each question/draft pair.  The prompt format follows the plan:

```
Q: <original yes/no question>
Student draft: <answer + clarifying questions>
Teacher: Please think step-by-step and provide your thought process and final Yes/No answer.
```

To run the actual API calls, you must provide a valid OpenAI API key (`OPENAI_API_KEY`) and set `DRY_RUN=False`.  When `DRY_RUN` is true (the default) or no key is set, dummy responses are generated for testing purposes.


In [None]:
import os
import json
import openai
import re
from tqdm import tqdm

# Read back the student drafts, one JSON object per line
print(f"Loading student drafts from {STUDENT_DRAFTS_PATH}")
drafts = []
with open(STUDENT_DRAFTS_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        drafts.append(json.loads(line))

# Live calls need a key; warn up front when one is missing outside dry-run mode
if not (OPENAI_API_KEY or DRY_RUN):
    print("Warning: OPENAI_API_KEY not set. Set DRY_RUN=True or provide an API key.")

# "answer: Yes", "final answer is no", "**Answer:** Yes", ...
_ANSWER_CUE_RE = re.compile(r'answer\b[^A-Za-z]*(?:is\b[^A-Za-z]*)?(yes|no)\b', re.IGNORECASE)
# A standalone yes/no word (does not match inside "not", "know", "another", ...)
_YES_NO_WORD_RE = re.compile(r'\b(yes|no)\b', re.IGNORECASE)


def extract_yes_no(text: str) -> str:
    """Pull the final Yes/No verdict out of a free-form teacher response.

    Preference order:
      1. the last yes/no that directly follows the word "answer";
      2. otherwise the last standalone yes/no word, since chain-of-thought
         responses state the verdict after the reasoning;
      3. otherwise the stripped input text, unchanged.

    Returns:
        'Yes' or 'No' when a verdict is found, else `text.strip()`.
    """
    for pattern in (_ANSWER_CUE_RE, _YES_NO_WORD_RE):
        hits = pattern.findall(text)
        if hits:
            return hits[-1].capitalize()
    return text.strip()

def call_gpt4(prompt: str) -> str:
    """Send `prompt` to the configured teacher model and return its reply text.

    Works with both generations of the `openai` package: the client-based API
    (openai>=1.0, where `openai.ChatCompletion` no longer exists) and the
    legacy module-level API.

    Args:
        prompt: the user message (question, student draft, teacher instruction).

    Returns:
        The stripped content of the first choice; '' when the reply has no content.
    """
    messages = [
        {"role": "system", "content": "You are an expert teacher providing chain-of-thought reasoning and final yes/no answers."},
        {"role": "user", "content": prompt}
    ]
    if hasattr(openai, "OpenAI"):
        # openai>=1.0: requests go through an explicit client object
        client = openai.OpenAI(api_key=OPENAI_API_KEY)
        response = client.chat.completions.create(
            model=GPT4_MODEL,
            messages=messages,
            max_tokens=GPT4_MAX_TOKENS,
            temperature=GPT4_TEMPERATURE
        )
        content = response.choices[0].message.content
    else:
        # Legacy openai<1.0 interface
        openai.api_key = OPENAI_API_KEY
        response = openai.ChatCompletion.create(
            model=GPT4_MODEL,
            messages=messages,
            max_tokens=GPT4_MAX_TOKENS,
            temperature=GPT4_TEMPERATURE
        )
        content = response['choices'][0]['message']['content']
    return (content or '').strip()

# The teacher output file's directory must exist before the file is opened
os.makedirs(os.path.dirname(TEACHER_OUTPUTS_PATH), exist_ok=True)

print(f"Generating teacher responses with model: {GPT4_MODEL} (dry_run: {DRY_RUN})")
with open(TEACHER_OUTPUTS_PATH, 'w', encoding='utf-8') as out_f:
    for rec in tqdm(drafts, desc='Generating teacher responses'):
        q, draft = rec['question'], rec['student_draft']
        prompt = f"Q: {q} \nStudent draft: {draft} \nTeacher: Please think step-by-step and provide your thought process and final Yes/No answer."

        # A live call is made only outside dry-run mode and when a key is configured
        if not DRY_RUN and OPENAI_API_KEY:
            response_text = call_gpt4(prompt)
        else:
            response_text = '[Dummy CoT] This is a placeholder reasoning; replace DRY_RUN with False for real calls.'

        answer = extract_yes_no(response_text)
        out_record = dict(
            question=q,
            student_draft=draft,
            teacher_thought=response_text,
            teacher_answer=answer,
        )
        out_f.write(json.dumps(out_record, ensure_ascii=False) + '\n')

print(f"Teacher outputs written to {TEACHER_OUTPUTS_PATH}")


In [None]:
import json

# Read teacher outputs from the path the teacher cell wrote to. Earlier cells
# resolve every data path relative to the working directory, so resolving
# against the parent directory here would look in a different folder.
teacher_outputs_path_full = TEACHER_OUTPUTS_PATH
print(f"Loading teacher outputs from {teacher_outputs_path_full}")
with open(teacher_outputs_path_full, 'r', encoding='utf-8') as f:
    teacher_data = [json.loads(line) for line in f if line.strip()]

# Create baseline and CoT records
baseline_records = []
cot_records = []
for rec in teacher_data:
    q = rec['question']
    draft = rec['student_draft']
    thought = rec['teacher_thought']
    answer = rec['teacher_answer']

    # Track A: Baseline (question → answer)
    baseline_records.append({'prompt': q, 'answer': answer})

    # Track B: CoT with student draft context (self-improvement format)
    cot_prompt = f"Question: {q}\nStudent draft: {draft}\nTeacher reasoning: {thought}"
    cot_records.append({'prompt': cot_prompt, 'answer': answer})

# Output locations come straight from the configuration cell
baseline_path_full = BASELINE_PATH
cot_path_full = COT_PATH
# '.' covers a bare filename, where dirname() is empty and makedirs('') would raise
os.makedirs(os.path.dirname(baseline_path_full) or '.', exist_ok=True)
os.makedirs(os.path.dirname(cot_path_full) or '.', exist_ok=True)

# Write output files (UTF-8, non-ASCII kept verbatim like the upstream JSONL files)
print(f"Writing baseline corpus to {baseline_path_full}")
with open(baseline_path_full, 'w', encoding='utf-8') as f:
    for r in baseline_records:
        f.write(json.dumps(r, ensure_ascii=False) + '\n')

print(f"Writing CoT corpus to {cot_path_full}")
with open(cot_path_full, 'w', encoding='utf-8') as f:
    for r in cot_records:
        f.write(json.dumps(r, ensure_ascii=False) + '\n')

print(f"Baseline corpus saved to {baseline_path_full}")
print(f"CoT corpus saved to {cot_path_full}")
print(f"\nTraining data generation complete!")
print(f"Summary:")
print(f"- {len(baseline_records)} examples in baseline corpus")
print(f"- {len(cot_records)} examples in CoT corpus")
# The preview needs at least one record; an empty input must not raise IndexError
if cot_records:
    print(f"\nCoT training format preview:")
    print("Input:", cot_records[0]['prompt'][:200] + "...")
    print("Target:", cot_records[0]['answer'])

## Build Training Corpora

Finally, we build two parallel training corpora:

1. **Baseline (Track A)** – pairs of `(question → answer)` for training a basic model.
2. **CoT (Track B)** – pairs of `(question + teacher chain-of-thought → answer)` for CoT distillation.

These files will be used in later steps for model fine‑tuning.


In [None]:
import json

# Use the paths from the configuration cell. The teacher cell writes to
# TEACHER_OUTPUTS_PATH relative to the working directory, so a hard-coded
# "<parent of cwd>/data" would point elsewhere and would also ignore DATA_DIR.
teacher_outputs_path = TEACHER_OUTPUTS_PATH

with open(teacher_outputs_path, 'r', encoding='utf-8') as f:
    teacher_data = [json.loads(line) for line in f if line.strip()]

baseline_records = []
cot_records = []
for rec in teacher_data:
    q = rec['question']
    draft = rec['student_draft']
    thought = rec['teacher_thought']
    answer = rec['teacher_answer']

    # Track A: Baseline (question → answer)
    baseline_records.append({'prompt': q, 'answer': answer})

    # Track B: CoT with student draft context (self-improvement format)
    cot_prompt = f"Question: {q}\nStudent draft: {draft}\nTeacher reasoning: {thought}"
    cot_records.append({'prompt': cot_prompt, 'answer': answer})

baseline_path = BASELINE_PATH
cot_path = COT_PATH
# Ensure the output directories exist ('.' covers a bare filename)
os.makedirs(os.path.dirname(baseline_path) or '.', exist_ok=True)
os.makedirs(os.path.dirname(cot_path) or '.', exist_ok=True)

# UTF-8 files with non-ASCII kept verbatim, like the upstream JSONL files
with open(baseline_path, 'w', encoding='utf-8') as f:
    for r in baseline_records:
        f.write(json.dumps(r, ensure_ascii=False) + '\n')

with open(cot_path, 'w', encoding='utf-8') as f:
    for r in cot_records:
        f.write(json.dumps(r, ensure_ascii=False) + '\n')

print(f"Baseline corpus saved to {baseline_path}")
print(f"CoT corpus saved to {cot_path}")
print(f"\nSelf-improvement CoT format:")
print("The student model will learn to improve its own drafts by seeing:")
print("Input: Question + Student draft + Teacher reasoning")
print("Target: Final answer")