### Step 1: Install necessary packages


In [1]:
!pip install matplotlib
!pip install torch numpy transformers datasets tiktoken wandb tqdm



### Step 2: Package imports and configuration


In [2]:
import sys
import os
sys.path.append(os.path.abspath("..")) 
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import pickle
from model import GPT, GPTConfig
import random
from tqdm import tqdm
import time
import json
import matplotlib.pyplot as plt

# Configuration
# These are module-level names that later cells read directly, so the values
# in effect depend on which cells have been executed.
beta = 0.5  # divisor applied to (pos_logprob - neg_logprob) in the example DPO loss of Step 7
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # falls back to CPU when no CUDA device is visible
base_lr = 1e-4  # not read by any code visible in this notebook; presumably the optimizer learning rate for Step 6 — confirm
epochs = 5  # number of passes of the Step 7 training loop
batch_size = 64  # batch size passed to get_batches() in Step 7
max_length = 64  # fixed sequence length produced by pad_or_truncate() inside get_batches()
num_samples = 1  # NOTE(review): rebound by the negative-response generation cell
max_new_tokens = 200  # presumably generation settings for Step 8 (see the gpt.generate hint there) — confirm
temperature = 0.8
top_k = 200

print(f"Using device: {device}")
if torch.cuda.is_available():
    # Report the visible GPU so the run is easier to reproduce/debug.
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Load tokenizer with error handling
# meta.pkl supplies the two lookup tables used by encode()/decode():
# "stoi" (character -> token id) and "itos" (token id -> character).
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load a meta.pkl that you produced yourself or otherwise trust.
try:
    with open("../sft/meta.pkl", "rb") as f:
        meta = pickle.load(f)
    stoi, itos = meta["stoi"], meta["itos"]
    print(f"✅ Loaded tokenizer with {len(itos)} tokens")
except Exception as e:
    # Report the failure, then re-raise: nothing below works without the tokenizer.
    print(f"❌ Error loading tokenizer: {e}")
    raise

def encode(s):
    """Map each character of ``s`` to its token id.

    Characters missing from ``stoi`` are mapped to id 0. If iterating ``s``
    fails for any reason, the error is printed and ``[0]`` is returned.
    """
    try:
        token_ids = []
        for ch in s:
            # .get() keeps unknown characters from raising KeyError
            token_ids.append(stoi.get(ch, 0))
        return token_ids
    except Exception as err:
        print(f"Encoding error for '{s}': {err}")
        return [0]  # single padding token as a best-effort result

def decode(l):
    """Turn a sequence of token ids back into text.

    Entries that are not ``int`` or that fall outside ``[0, len(itos))`` are
    skipped. On any error the problem is printed and ``""`` is returned.
    """
    try:
        pieces = []
        for tok in l:
            # Drop anything that cannot be a valid vocabulary index
            if not isinstance(tok, int):
                continue
            if not (0 <= tok < len(itos)):
                continue
            pieces.append(tok)
        return ''.join(itos[tok] for tok in pieces)
    except Exception as err:
        print(f"Decoding error for {l}: {err}")
        return ""

print("✅ Configuration and tokenizer loaded successfully!")

Using device: cuda
GPU: NVIDIA GeForce RTX 4060 Laptop GPU
GPU Memory: 8.6 GB
✅ Loaded tokenizer with 74 tokens
✅ Configuration and tokenizer loaded successfully!


### Step 3: Define helper functions


In [3]:
def compute_logprob(input_ids):
    """Return the mean per-token log-probability of each sequence under ``gpt``.

    Args:
        input_ids: 2-D LongTensor of token ids, one padded sequence per row.
            Token id 0 is treated as padding and excluded from the average.

    Returns:
        1-D tensor with one value per row: the average log-probability of the
        non-padding target tokens. A row without any non-padding target
        yields 0.0 instead of NaN.
    """
    # Next-token prediction: position t predicts token t + 1.
    inputs = input_ids[:, :-1]
    targets = input_ids[:, 1:]
    logits, _ = gpt(inputs, full_seq=True)
    B, T, V = logits.size()
    logits_flat = logits.reshape(-1, V)
    targets_flat = targets.reshape(-1)
    # reduction='none' keeps one loss per token so rows can be averaged separately.
    loss = F.cross_entropy(logits_flat, targets_flat, ignore_index=0, reduction='none')
    loss = loss.reshape(B, T)
    attention_mask = (targets != 0).float()
    # Clamp the token count: an all-padding row would otherwise divide 0 by 0
    # and inject NaN into the batch loss.
    token_count = attention_mask.sum(dim=1).clamp(min=1.0)
    loss = (loss * attention_mask).sum(dim=1) / token_count
    return -loss

def pad_or_truncate(seq, max_length):
    """Force ``seq`` (a list of token ids) to exactly ``max_length`` items.

    Longer sequences keep their LAST ``max_length`` items (truncation happens
    on the left); shorter ones are right-padded with 0.

    Args:
        seq: list of token ids.
        max_length: target length; a non-positive value yields an empty list
            for any sequence longer than it.

    Returns:
        A new list; ``seq`` itself is not modified.
    """
    if len(seq) > max_length:
        if max_length <= 0:
            # seq[-0:] would return the whole sequence, so handle this explicitly.
            return []
        return seq[-max_length:]
    return seq + [0] * (max_length - len(seq))

def get_batches(lines, batch_size):
    """Yield ``(neg_tensor, pos_tensor)`` pairs of full batches.

    ``lines`` is shuffled IN PLACE, then walked in steps of ``batch_size``.
    A trailing chunk with fewer than ``batch_size`` items is skipped. Each
    item's 'negative' / 'positive' text gets four newlines appended, is
    encoded, and is padded or truncated to the global ``max_length``.
    """
    def build_tensor(examples, field):
        # One fixed-length row of token ids per example
        rows = []
        for example in examples:
            rows.append(pad_or_truncate(encode(example[field] + '\n\n\n\n'), max_length))
        return torch.tensor(rows, dtype=torch.long, device=device)

    random.shuffle(lines)
    for start in range(0, len(lines), batch_size):
        chunk = lines[start:start + batch_size]
        if len(chunk) >= batch_size:
            neg_tensor = build_tensor(chunk, 'negative')
            pos_tensor = build_tensor(chunk, 'positive')
            yield neg_tensor, pos_tensor

### Step 4: Load the pretrained NanoGPT model


In [4]:
# Load the SFT checkpoint into a fresh GPT instance and leave it in training mode.
try:
    print("Loading pretrained NanoGPT model...")
    # SECURITY: weights_only=False lets torch.load unpickle arbitrary objects,
    # which can execute code; only load checkpoints you trust.
    ckpt = torch.load("../sft/gpt.pt", map_location=device, weights_only=False)
    
    # Rebuild the architecture from the hyper-parameters stored with the weights.
    gptconf = GPTConfig(**ckpt['model_args'])
    gpt = GPT(gptconf)
    
    # Clean state dict
    # Keys carrying the '_orig_mod.' prefix are renamed to the bare parameter
    # name so that load_state_dict() finds them.
    state_dict = ckpt['model']
    unwanted_prefix = '_orig_mod.'
    for k in list(state_dict.keys()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    
    gpt.load_state_dict(state_dict)
    # .train() is the mode the fine-tuning loop in Step 7 expects.
    gpt.to(device).train()
    
    print("✅ Model loaded successfully!")
    print(f"Model parameters: {sum(p.numel() for p in gpt.parameters()):,}")
    print(f"Model config: {gptconf}")
    
    # Test model with a simple forward pass
    # Random token ids drawn from the tokenizer's vocabulary; only the output shape is checked.
    test_input = torch.randint(0, len(itos), (1, 10), device=device)
    with torch.no_grad():
        logits, _ = gpt(test_input, full_seq=True)
        print(f"✅ Model forward pass test successful! Output shape: {logits.shape}")
    
except Exception as e:
    # Report the failure, then re-raise so the notebook stops at this cell.
    print(f"❌ Error loading model: {e}")
    print("Please check if ../sft/gpt.pt exists and is accessible")
    raise

Loading pretrained NanoGPT model...
✅ Model loaded successfully!
Model parameters: 8,838,852
Model config: GPTConfig(block_size=256, vocab_size=74, n_layer=6, n_head=6, n_embd=348, dropout=0.2, bias=False)
✅ Model loaded successfully!
Model parameters: 8,838,852
Model config: GPTConfig(block_size=256, vocab_size=74, n_layer=6, n_head=6, n_embd=348, dropout=0.2, bias=False)
✅ Model forward pass test successful! Output shape: torch.Size([1, 10, 74])
✅ Model forward pass test successful! Output shape: torch.Size([1, 10, 74])


  y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)


### Generate 100K Negative Responses with GPT Model


In [None]:
import time
from tqdm import tqdm
import gc

print("=== GENERATING 100K NEGATIVE RESPONSES USING GPT MODEL ===")

# Load all questions
# The file must hold a JSON list; each entry is later read through its
# "negative" and "positive" keys by the generation loop.
try:
    with open("questions_empty.json", "r") as f:
        questions = json.load(f)
    print(f"✅ Loaded {len(questions)} questions from questions_empty.json")
except Exception as e:
    # Report, then re-raise: nothing below can run without the questions.
    print(f"❌ Error loading questions: {e}")
    raise

# Configuration for 100K generation
# Cap at 100,000 or the number of questions loaded, whichever is smaller.
# NOTE(review): this rebinds `num_samples`, which the Step 2 configuration
# cell set to 1; any later cell reading `num_samples` sees this value instead.
num_samples = min(100000, len(questions))
print(f"🚀 Generating {num_samples:,} negative responses with GPU acceleration")

def generate_model_negative_response(prompt, max_new_tokens=20, temperature=0.9, top_k=100):
    """Generate a refusal-style ("negative") response for ``prompt`` with ``gpt``.

    Only the first 50 characters of ``prompt`` are fed to the model, and
    prompts that encode to zero or more than 30 tokens are not sent to the
    model at all. Generated text is accepted only when it is at least 3
    characters long and contains one of 'sorry', 'cannot', "don't" or
    'unable'; otherwise a canned refusal is used.

    The model is switched to eval mode while sampling and is put back into
    training mode afterwards if it was training on entry, so that calling this
    function does not silently disable dropout for a later training loop.

    Args:
        prompt: question text; it is always reproduced in full in the result.
        max_new_tokens, temperature, top_k: forwarded to ``gpt.generate``.

    Returns:
        ``"<prompt> <response>"``. This function never raises: any failure
        falls back to ``"<prompt> Sorry, I do not know."`` (best-effort by design).
    """
    was_training = False
    try:
        was_training = gpt.training
        gpt.eval()
        with torch.no_grad():
            # Encode the prompt with error handling
            prompt_tokens = encode(prompt[:50])  # Limit prompt length
            prompt_tokens = [t for t in prompt_tokens if isinstance(t, int) and 0 <= t < len(itos)]

            if len(prompt_tokens) == 0 or len(prompt_tokens) > 30:
                return f"{prompt} Sorry, I cannot help with this."

            input_ids = torch.tensor([prompt_tokens], dtype=torch.long, device=device)

            try:
                # NOTE(review): assumes gpt.generate returns a 2-tuple whose first
                # item holds prompt + continuation token ids — confirm in model.py
                generated_ids, _ = gpt.generate(input_ids, max_new_tokens=max_new_tokens,
                                                temperature=temperature, top_k=top_k)

                # Keep only the newly generated tokens, dropping out-of-vocabulary ids
                response_tokens = generated_ids[0].tolist()[len(prompt_tokens):]
                valid_tokens = [t for t in response_tokens if isinstance(t, int) and 0 <= t < len(itos)]

                if len(valid_tokens) == 0:
                    return f"{prompt} Sorry, I do not know."

                new_text = decode(valid_tokens).strip()

                # Ensure it's a refusal response
                refusal_words = ['sorry', 'cannot', 'don\'t', 'unable']
                if len(new_text) < 3 or not any(word in new_text.lower() for word in refusal_words):
                    return f"{prompt} Sorry, I do not know."

                return f"{prompt} {new_text}"

            except RuntimeError as e:
                if "out of memory" in str(e).lower():
                    # Release cached blocks so the next sample has a chance to fit
                    torch.cuda.empty_cache()
                    return f"{prompt} Sorry, I cannot help with this."
                raise

    except Exception:
        # Deliberate best-effort: one bad sample must not abort a long generation run.
        return f"{prompt} Sorry, I do not know."
    finally:
        # Undo the eval() switch so a later training loop keeps dropout enabled.
        if was_training:
            gpt.train()

# Memory management function
def clear_memory():
    """Release unreferenced Python objects and cached CUDA memory.

    The garbage collector runs first so that tensors kept alive only by
    reference cycles are freed before the CUDA cache is emptied; in the
    opposite order their memory would stay cached until the next call.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

print(f"\n🚀 Starting generation of {num_samples:,} negative responses...")
start_time = time.time()
negative_responses = []

# Batch processing configuration
# Deliberately NOT named `batch_size`: that name belongs to the training
# configuration from Step 2 and is read by the Step 7 training loop, so
# rebinding it here would silently change the training batch size.
gen_batch_size = 100  # questions processed between progress/checkpoint updates
save_frequency = 5000  # Save progress every 5000 samples

try:
    with tqdm(total=num_samples, desc="GPT Generation", unit="samples") as pbar:
        for i in range(0, num_samples, gen_batch_size):
            batch_end = min(i + gen_batch_size, num_samples)
            batch_questions = questions[i:batch_end]

            # Process each question in the batch (the model is sampled one prompt at a time)
            batch_results = []
            for q in batch_questions:
                question = q["negative"].strip()[:100]  # Limit question length

                try:
                    negative_response = generate_model_negative_response(
                        question, max_new_tokens=15, temperature=0.9, top_k=50
                    )
                except Exception:
                    # Fallback keeps the dataset aligned with the question list
                    negative_response = f"{question} Sorry, I do not know."

                batch_results.append({
                    "question": question,
                    "negative_response": negative_response,
                    "original_positive": q["positive"].strip()
                })

            # Add batch results to main list
            negative_responses.extend(batch_results)

            # Update progress
            pbar.update(len(batch_results))

            # Calculate and show stats
            elapsed = time.time() - start_time
            if elapsed > 0:
                rate = len(negative_responses) / elapsed
                remaining = (num_samples - len(negative_responses)) / rate if rate > 0 else 0
                pbar.set_postfix({
                    'rate': f'{rate:.1f}/s',
                    'ETA': f'{remaining/60:.1f}min',
                    'GPU': f'{torch.cuda.memory_allocated()/1e9:.1f}GB'
                })

            # Periodic saves to prevent data loss
            if len(negative_responses) % save_frequency == 0:
                temp_file = f"temp_responses_{len(negative_responses)}.json"
                with open(temp_file, "w") as f:
                    json.dump(negative_responses, f, indent=2)
                print(f"\n💾 Saved checkpoint: {temp_file}")

            # Clear GPU memory every batch
            clear_memory()

    # Final processing
    end_time = time.time()
    total_time = end_time - start_time
    # Guard the rate against a zero-length run (e.g. an empty question list)
    average_rate = len(negative_responses) / total_time if total_time > 0 else 0.0

    print(f"\n✅ Generated {len(negative_responses):,} negative responses!")
    print(f"⏱️  Total time: {total_time:.2f} seconds ({total_time/60:.2f} minutes)")
    print(f"🚀 Average rate: {average_rate:.2f} samples/second")

    # Save final results
    output_path = "negative_responses_100k_gpt.json"
    print(f"\n💾 Saving final results to {output_path}...")

    with open(output_path, "w") as f:
        json.dump(negative_responses, f, indent=2)

    print(f"✅ Saved {len(negative_responses):,} responses to {output_path}")

    # Show sample results
    print(f"\n📋 Sample generated responses:")
    for i in range(min(3, len(negative_responses))):
        print(f"{i+1}. Q: {negative_responses[i]['question']}")
        print(f"   A: {negative_responses[i]['negative_response']}")

    # Quality analysis
    # NOTE(review): canned fallback responses also have more than 3 words, so
    # this counts them as successes too.
    successful_generations = sum(1 for r in negative_responses if len(r['negative_response'].split()) > 3)
    success_pct = 100 * successful_generations / len(negative_responses) if negative_responses else 0.0
    print(f"\n📊 Success Rate: {successful_generations}/{len(negative_responses)} ({success_pct:.1f}%)")

    print(f"\n🎉 SUCCESS: {len(negative_responses):,} negative responses generated!")
    print(f"📄 Output: {output_path}")

except Exception as e:
    print(f"\n❌ Generation failed: {e}")
    print(f"📊 Generated {len(negative_responses):,} samples before error")

    # Emergency save so partial progress survives the failure
    if negative_responses:
        emergency_save = f"emergency_save_{len(negative_responses)}.json"
        with open(emergency_save, "w") as f:
            json.dump(negative_responses, f, indent=2)
        print(f"💾 Emergency save: {emergency_save}")

    # Memory cleanup
    clear_memory()
    raise

=== GENERATING NEGATIVE RESPONSES USING GPT MODEL (MEMORY OPTIMIZED) ===
✅ Loaded 100000 questions from questions_empty.json
🧪 TEST MODE: Generating 1,000 samples

🚀 Auto-proceeding with test generation of 1,000 samples...
⚡ Using memory-optimized approach with frequent cleanup


GPT Generation:  50%|█████     | 500/1000 [00:31<00:30, 16.19samples/s, rate=15.8/s, ETA=0.5min, mem=CPU]


💾 Saved checkpoint: temp_responses_500.json


GPT Generation: 100%|██████████| 1000/1000 [01:02<00:00, 16.09samples/s, rate=16.1/s, ETA=0.0min, mem=CPU]


💾 Saved checkpoint: temp_responses_1000.json

✅ Generated 1,000 negative responses!
⏱️  Total time: 62.17 seconds (1.04 minutes)
🚀 Average rate: 16.09 samples/second

💾 Saving final results to negative_responses_1000_gpt.json...
✅ Saved 1,000 responses to negative_responses_1000_gpt.json

📋 Sample GPT-generated responses:

1. Question: 56-26=?
   Response: 56-26=? Sorry, I don't

2. Question: 25+98=?
   Response: 25+98=? Sory, I don't

3. Question: x*14=98,x=?
   Response: x*14=98,x=? Sorry, I don't

📊 Success Rate: 998/1000 (99.8%)

🎉 SUCCESS: Memory-optimized generation completed!
📄 Output: negative_responses_1000_gpt.json

🔄 To generate full 100k dataset:
1. Set test_mode = False in the code above
2. Re-run this cell (will take 2-4 hours)





### Convert to DPO Format with Mathematical Solutions


In [None]:
print("=== CONVERTING GPT RESPONSES TO DPO FORMAT WITH POSITIVE SOLUTIONS ===")

# The solver lives in its own cell. Without this check a missing definition
# would raise NameError inside the per-item try/except below and every pair
# would silently receive the generic fallback text.
if "solve_math_question" not in globals():
    raise NameError(
        "solve_math_question is not defined - run the 'Mathematical Solver Function' cell first."
    )

# Load the GPT-generated negative responses
input_file = "negative_responses_100k_gpt.json"
output_file = "pos_neg_pairs_100k_final.json"

try:
    with open(input_file, "r") as f:
        gpt_responses = json.load(f)
    print(f"✅ Loaded {len(gpt_responses):,} GPT-generated responses from {input_file}")
except FileNotFoundError:
    print(f"❌ {input_file} not found! Please run the GPT generation cell first.")
    raise

print(f"\n🔄 Converting to DPO format and generating positive responses...")
start_time = time.time()

dpo_pairs = []
fallback_count = 0  # questions for which the solver itself raised
for item in tqdm(gpt_responses, desc="Converting to DPO format"):
    question = item["question"]

    # Negative side: the GPT-generated response, used as-is
    negative_response = item["negative_response"]

    # Positive side: worked mathematical solution
    try:
        positive_response = solve_math_question(question)
    except Exception:
        # Best-effort fallback if the solver fails on this question
        fallback_count += 1
        positive_response = f"{question} Let me solve this problem step by step."

    dpo_pairs.append({
        "negative": negative_response,
        "positive": positive_response
    })

convert_time = time.time() - start_time
print(f"\n✅ Converted {len(dpo_pairs):,} pairs in {convert_time:.2f} seconds")
if fallback_count:
    print(f"⚠️  Solver failed on {fallback_count:,} questions; generic fallback text was used for them")

# Save DPO format
print(f"\n💾 Saving DPO pairs to {output_file}...")
save_start = time.time()

with open(output_file, "w") as f:
    json.dump(dpo_pairs, f, indent=2)

save_time = time.time() - save_start
total_time = time.time() - start_time

print(f"✅ Saved to {output_file}")
print(f"💾 Save time: {save_time:.2f} seconds")
print(f"⏱️  Total time: {total_time:.2f} seconds")

# Show examples
print(f"\n📋 Examples of final DPO pairs:")
for i in range(min(3, len(dpo_pairs))):
    print(f"\n{i+1}:")
    print(f"  Negative: {dpo_pairs[i]['negative']}")
    print(f"  Positive: {dpo_pairs[i]['positive']}")

print(f"\n🎉 SUCCESS: {len(dpo_pairs):,} DPO pairs ready for training!")
print(f"📄 Final dataset: {output_file}")
print(f"🤖 Ready for DPO training!")

### Mathematical Solver Function (run this cell before the DPO conversion cell above)


In [None]:
import re
import json
import time
from tqdm import tqdm

def solve_math_question(question):
    """
    Solve a simple arithmetic or one-variable question and explain the answer.

    Supported forms (whitespace around operators is optional; operands are
    non-negative integers unless noted):
      * ``a+b=?``, ``a-b=?``, ``a*b=?``, ``a/b=?``
      * ``x+b=c,x=?``, ``x-b=c,x=?`` (``c`` may be negative), ``x*b=c,x=?``
      * ``a+x=c,x=?``, ``a-x=c,x=?``, ``a*x=c,x=?``, ``a/x=c,x=?``

    Quotients are exact: an integer when the division is exact (``98 ÷ 14``
    gives ``7``) and a reduced fraction otherwise (``7 ÷ 2`` gives ``7/2``),
    so the explanation never states a false equality.

    Args:
        question: the question text; surrounding whitespace is ignored.

    Returns:
        ``"<question> <explanation>"``. Questions that match no supported
        form, or equations that would require dividing by zero, get a generic
        explanation; ``a/0=?`` is reported as undefined.
    """
    # Function-scope import so this block does not depend on the cell's import list.
    from fractions import Fraction

    def exact_quotient(numerator, denominator):
        # str(Fraction) renders whole numbers as "7" and the rest as "7/2"
        return str(Fraction(numerator, denominator))

    question = question.strip()

    # Basic arithmetic: number op number = ?
    arithmetic = re.match(r'(\d+)\s*([\+\-\*/])\s*(\d+)\s*=\s*\?', question)
    if arithmetic:
        num1, op, num2 = int(arithmetic.group(1)), arithmetic.group(2), int(arithmetic.group(3))
        if op == '+':
            result, symbol = num1 + num2, '+'
        elif op == '-':
            result, symbol = num1 - num2, '-'
        elif op == '*':
            result, symbol = num1 * num2, '×'
        elif num2 != 0:
            result, symbol = exact_quotient(num1, num2), '÷'
        else:
            return f"{question} Division by zero is undefined."
        return f"{question} The answer is {result} because {num1} {symbol} {num2} equals {result}."

    # Equation forms. Each rule maps the two captured integers to the
    # rearranged expression for x as (left operand, operator, right operand).
    equation_rules = (
        # x + b = c  ->  x = c - b
        (r'x\s*\+\s*(\d+)\s*=\s*(\d+)\s*,\s*x\s*=\s*\?', lambda b, c: (c, '-', b)),
        # x - b = c  ->  x = c + b
        (r'x\s*-\s*(\d+)\s*=\s*(-?\d+)\s*,\s*x\s*=\s*\?', lambda b, c: (c, '+', b)),
        # x * b = c  ->  x = c ÷ b
        (r'x\s*\*\s*(\d+)\s*=\s*(\d+)\s*,\s*x\s*=\s*\?', lambda b, c: (c, '÷', b)),
        # a / x = c  ->  x = a ÷ c
        (r'(\d+)/x\s*=\s*(\d+)\s*,\s*x\s*=\s*\?', lambda a, c: (a, '÷', c)),
        # a * x = c  ->  x = c ÷ a
        (r'(\d+)\s*\*\s*x\s*=\s*(\d+)\s*,\s*x\s*=\s*\?', lambda a, c: (c, '÷', a)),
        # a + x = c  ->  x = c - a
        (r'(\d+)\s*\+\s*x\s*=\s*(\d+)\s*,\s*x\s*=\s*\?', lambda a, c: (c, '-', a)),
        # a - x = c  ->  x = a - c
        (r'(\d+)\s*-\s*x\s*=\s*(\d+)\s*,\s*x\s*=\s*\?', lambda a, c: (a, '-', c)),
    )
    for pattern, rearrange in equation_rules:
        found = re.match(pattern, question)
        if not found:
            continue
        left, symbol, right = rearrange(int(found.group(1)), int(found.group(2)))
        if symbol == '÷':
            if right == 0:
                # No finite solution; fall through to the generic response
                continue
            x = exact_quotient(left, right)
        elif symbol == '+':
            x = left + right
        else:
            x = left - right
        return f"{question} The answer is {x} because x = {left} {symbol} {right} = {x}."

    # If no pattern matches, return a generic positive response
    return f"{question} Let me solve this step by step and provide the answer."

# Test the solver function
# Prints the solver's answer for one sample of several question forms so the
# output can be checked by eye; nothing is asserted.
print("=== TESTING MATH SOLVER ===")
test_questions = [
    "56-26=?",
    "25+98=?", 
    "x*14=98,x=?",
    "x-9=11,x=?",
    "40/10=?",
    "117/x=9,x=?"
]

print("Testing solver on sample questions:")
for i, q in enumerate(test_questions, 1):
    result = solve_math_question(q)
    print(f"{i}. {result}")

print("\n✅ Math solver ready!")
# NOTE(review): nothing in this cell processes the dataset; the message below looks stale.
print("Now processing the full 100k dataset...")

=== TESTING MATH SOLVER ===
Testing solver on sample questions:
1. 56-26=? The answer is 30 because 56 - 26 equals 30.
2. 25+98=? The answer is 123 because 25 + 98 equals 123.
3. x*14=98,x=? The answer is 7 because x = 98 ÷ 14 = 7.
4. x-9=11,x=? The answer is 20 because x = 11 + 9 = 20.
5. 40/10=? The answer is 4 because 40 ÷ 10 equals 4.
6. 117/x=9,x=? The answer is 13 because x = 117 ÷ 9 = 13.

✅ Math solver ready!
Now processing the full 100k dataset...


In [None]:
# This cell is now handled by the DPO conversion cell above
# The mathematical solver is defined in the previous cell and used automatically
# NOTE(review): the output saved beneath this cell does not correspond to the
# two print statements below — presumably left over from an earlier revision;
# re-run or clear it.
print("ℹ️  Positive response generation is now integrated into the DPO conversion process.")
print("✅ Mathematical solutions are generated automatically when converting to DPO format.")

=== GENERATING POSITIVE RESPONSES FOR 100K QUESTIONS ===
✅ Loaded 100,000 pos-neg pairs from pos_neg_pairs_100k.json

📋 Current format (first entry):
  Negative: 56-26=? Sorry, I do not know!
  Positive: 56-26=?

🧮 Extracting questions and generating positive responses...


Generating positive responses:  30%|███       | 30000/100000 [00:00<00:00, 252015.67questions/s, rate=237658/s, solved=30000/30000]


Example 1:
  Question: 56-26=?
  Negative: 56-26=? Sorry, I do not know!
  Positive: 56-26=? The answer is 30 because 56 - 26 equals 30.

Example 2:
  Question: 25+98=?
  Negative: 25+98=? Sorry, I do not know!
  Positive: 25+98=? The answer is 123 because 25 + 98 equals 123.

Example 3:
  Question: x*14=98,x=?
  Negative: x*14=98,x=? Sorry, I do not know!
  Positive: x*14=98,x=? The answer is 7 because x = 98 ÷ 14 = 7.

Example 4:
  Question: x-9=11,x=?
  Negative: x-9=11,x=? Sorry, I do not know!
  Positive: x-9=11,x=? The answer is 20 because x = 11 + 9 = 20.

Example 5:
  Question: x+1=15,x=?
  Negative: x+1=15,x=? Sorry, I do not know!
  Positive: x+1=15,x=? The answer is 14 because x = 15 - 1 = 14.


Generating positive responses: 100%|██████████| 100000/100000 [00:00<00:00, 279070.31questions/s, rate=272739/s, solved=100000/100000]



✅ Generated positive responses for 100,000 questions!
🧮 Successfully solved: 100,000/100,000 (100.0%)
⏱️  Processing time: 0.37 seconds (0.0 minutes)
🚀 Average rate: 271998 questions/second

💾 Saving updated data to pos_neg_pairs_100k_with_solutions.json...
✅ Saved 100,000 complete pos-neg pairs to pos_neg_pairs_100k_with_solutions.json
💾 Save time: 0.29 seconds

🔄 Updating original file pos_neg_pairs_100k.json...
✅ Updated pos_neg_pairs_100k.json

📋 Final examples with solutions:

1:
  Negative: 56-26=? Sorry, I do not know!
  Positive: 56-26=? The answer is 30 because 56 - 26 equals 30.

2:
  Negative: 25+98=? Sorry, I do not know!
  Positive: 25+98=? The answer is 123 because 25 + 98 equals 123.

3:
  Negative: x*14=98,x=? Sorry, I do not know!
  Positive: x*14=98,x=? The answer is 7 because x = 98 ÷ 14 = 7.

🎉 SUCCESS: All 100k questions now have proper positive responses!
📄 Output files:
  - pos_neg_pairs_100k.json (updated)
  - pos_neg_pairs_100k_with_solutions.json (backup)
🤖 R

### Step 5: Load Data (**students are required to complete this part!**)


In [None]:
# Load data from ./data/pos_neg_pairs.json

### Step 6: Build the optimizer and scheduler (**students are required to complete this part!**)


In [None]:
# It is recommended to use the AdamW optimizer

### Step 7: Begin training (**students are required to complete this part!**)


In [None]:
# Training loop template (student exercise).
# Requires `lines` (Step 5) to be defined; an optimizer (Step 6) is presumably
# needed by the code to be written below. Also reads `gpt` and `ckpt` from
# Step 4 and `epochs` / `batch_size` / `beta` from the configuration cell.
# get_batches() skips a trailing partial batch, so each epoch yields
# len(lines) // batch_size batches — the value stored in total_steps.
total_steps = len(lines) // batch_size
for epoch in range(epochs):
    # get_batches() reshuffles `lines` in place at the start of every epoch.
    pbar = tqdm(get_batches(lines, batch_size))
    # NOTE(review): the loop below has no statement yet (comments only), so this
    # cell does not parse until the training code is filled in.
    for step, (neg_tensor,pos_tensor) in enumerate(pbar):
        ###########################################################
        # Please complete the training code here!
        # Examples: 
        # ...
        # neg_logprob
        # pos_logprob 
        # loss = -F.logsigmoid((pos_logprob - neg_logprob) / beta).mean() - pos_logprob.mean() * 0.1 
        # ...
        ###########################################################
    # Overwrites the same checkpoint file at the end of every epoch; model_args
    # is copied from the Step 4 checkpoint so the model can be rebuilt later.
    ckpt_path = f"./dpo.pt"
    torch.save({
        "model_state_dict": gpt.state_dict(),
        "model_args": ckpt['model_args'],
    }, ckpt_path)
    print(f"Saved checkpoint to {ckpt_path}")

### Step 8: Begin testing (**students are required to complete this part!**)


In [None]:
# Load the fine-tuned model
ckpt_path = "../dpo/dpo.pt"
checkpoint = torch.load(ckpt_path, map_location=device)
gptconf = GPTConfig(**checkpoint['model_args'])
# Use the configured device instead of a hard-coded .cuda(), which fails on
# machines without a GPU even though `device` falls back to 'cpu'.
gpt = GPT(gptconf).to(device)
# The weights are stored under 'model' (Step 4 checkpoint layout) or under
# 'model_state_dict' (Step 7 checkpoint layout).
if 'model' in checkpoint:
    state_dict = checkpoint['model']
else:
    state_dict = checkpoint['model_state_dict']
# Rename keys that carry the '_orig_mod.' prefix so load_state_dict() finds them.
unwanted_prefix = '_orig_mod.'
for k in list(state_dict.keys()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
gpt.load_state_dict(state_dict)
# Test
gpt.eval()
test_set = ["17+19=?", "3*17=?", "72/4=?", "72-x=34,x=?", "x*11=44,x=?", "3*17=?", "72/4=?", "72-x=34,x=?"]
with torch.no_grad():
    for prompt in test_set: 
        prompt_ids = encode(prompt)
        ###########################################################
        # Please complete the test code here!
        # ...
        # gpt.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
        # ...
        ###########################################################