In [None]:
!pip install google-generativeai

In [3]:
import google.generativeai as genai
from datasets import load_dataset
import json
import re
import time
import os
from tqdm import tqdm

In [None]:
!pip install tqdm

In [None]:
API_KEY = "enter gemini api key here"
genai.configure(api_key=API_KEY)

# List available models to find one that supports generateContent
# for m in genai.list_models():
#   if 'generateContent' in m.supported_generation_methods:
#     print(m.name)

model = genai.GenerativeModel('models/gemini-2.5-flash-lite')  # Free, fast for CoT

In [None]:





ds_gsm = load_dataset("openai/gsm8k", "main", split="train[:6000]")
questions = [{"q": ex["question"], "gold": ex["answer"], "domain": "gsm8k"} for ex in ds_gsm]

def generate_cot_improved(q, prompt_type="calculation_focused"):
    """Enhanced CoT generation with better prompting for GSM8K"""
    
    if prompt_type == "calculation_focused":
        prompt = f"""I need to solve this math problem carefully, paying special attention to calculations.

Problem: {q['q']}

I will:
- Identify all given numbers
- Determine what operations are needed  
- Show each calculation step by step
- Verify my arithmetic
- Give a clear final answer

Step-by-step solution:"""

    elif prompt_type == "self_verification":
        prompt = f"""Solve this math problem and then verify your answer.

Problem: {q['q']}

Solution process:
1. Understanding: What am I solving for?
2. Given information: What numbers and facts do I have?
3. Step-by-step calculation:
4. Verification: Does my answer make sense?
5. Final answer:

Let me solve this carefully:"""
    
    else:  # default - original approach
        prompt = f"Q: {q['q']}\nLet's think step by step."
    
    try:
        response = model.generate_content(prompt)
        trace = response.text
        
        # IMPROVED regex patterns - prioritize "last line with number" approach
        
        # Method 1: Last line with number (most reliable) - FIXED regex ordering for better number capture
        lines = trace.split('\n')
        last_line_number = None
        for line in reversed(lines):
            # FIXED regex: Prioritize longer numbers first, then comma-separated numbers
            numbers = re.findall(r'(\d+(?:,\d{3})*|\d{1,3}(?:,\d{3})+)', line)  # "8798", "3,500" but not partial matches
            if not numbers:
                # Fallback: capture any sequence of digits (for cases without commas)
                numbers = re.findall(r'(\d+)', line)
            
            if numbers:
                # Remove commas and convert to clean numbers for comparison
                clean_numbers = [num.replace(',', '') for num in numbers]
                
                # IMPROVED SELECTION LOGIC: Prioritize the best number
                if clean_numbers:
                    # 1. If line contains final answer indicators, prioritize the largest number
                    if any(indicator in line.lower() for indicator in ['final', 'answer', 'total', 'altogether', '####']):
                        # Pick the largest number (most likely the final answer)
                        last_line_number = max(clean_numbers, key=lambda x: int(x))
                    else:
                        # 2. For non-final lines, prioritize numbers >= 3 digits, then largest
                        large_numbers = [n for n in clean_numbers if len(n) >= 3]
                        if large_numbers:
                            last_line_number = max(large_numbers, key=lambda x: int(x))  # Largest 3+ digit number
                        else:
                            last_line_number = max(clean_numbers, key=lambda x: int(x))  # Largest number overall
                break
        
        # Method 2: Structured patterns as fallback - ENHANCED with better number matching
        patterns = [
            # HIGH PRIORITY: GSM8K-style final answer patterns
            r"####\s*(\d+)",                                                             # "#### 8798" (prioritize full numbers)
            r"\$(\d+(?:\.\d{2})?)\b",                                                    # "$8798.00" or "$8798"
            r"made\s*\*\*\$?(\d+(?:\.\d{2})?)\*\*",                                     # "made **$8798.00**"
            r"total.*?\$?(\d+(?:\.\d{2})?)",                                             # "total refund amount: $8798"
            
            # MEDIUM PRIORITY: Structured answer patterns  
            r"\*\*Step 5:\s*Give a clear final answer\*\*.*?\*\*\$?(\d+(?:\.\d{2})?)\*\*",  # "**Step 5: Give a clear final answer** ... **$8798**"
            r"Step 5:.*?final answer.*?\*\*\$?(\d+(?:\.\d{2})?)\*\*",                      # "Step 5: ... final answer ... **$8798**"
            r"\*\*5\.\s*Give a clear final answer:\*\*.*?\*\*\$?(\d+(?:\.\d{2})?)\*\*",    # "**5. Give a clear final answer:** ... **$8798**"
            r"5\.\s*Give a clear final answer:.*?\*\*\$?(\d+(?:\.\d{2})?)\*\*",           # "5. Give a clear final answer: **$8798**"
            r"Give a clear final answer:.*?\*\*\$?(\d+(?:\.\d{2})?)\*\*",                 # "Give a clear final answer: **$8798**"
            
            # STANDARD PRIORITY: Common answer patterns
            r"final answer.*?\*\*\$?(\d+(?:\.\d{2})?)\*\*",                              # "final answer: **$8798**"
            r"Answer:\s*.*?\*\*\$?(\d+(?:\.\d{2})?)\*\*",                                # "Answer: **$8798**"
            r"Final answer:\s*.*?\$?(\d+(?:\.\d{2})?)",                                  # "Final answer: $8798"
            r"Answer:\s*.*?\$?(\d+(?:\.\d{2})?)",                                        # "Answer: $8798"  
            r"The answer is\s*\$?(\d+(?:\.\d{2})?)",                                     # "The answer is $8798"
            
            # COMMA-SEPARATED PATTERNS: For numbers with commas
            r"####\s*(\d{1,3}(?:,\d{3})+)",                                              # "#### 8,798"
            r"\$(\d{1,3}(?:,\d{3})+(?:\.\d{2})?)",                                       # "$8,798.00"
            
            # CONTEXT-SPECIFIC: Unit-based patterns
            r"\*\*(\d+(?:\.\d{2})?)\*\*\s*(?:clips?|flowers?|pages?|pounds?|dollars?|pieces?|total|people|items?|sq\.?\s*ft\.?)",  # "**8798** dollars"
            r"(\d+(?:\.\d{2})?)\s+(?:pounds?|dollars?|people|items?|total|left|altogether|sq\.?\s*ft\.?)",  # "8798 dollars"
            r"=\s*\$?(\d+(?:\.\d{2})?)(?:\s*\.|\s*$)",                                   # "= $8798" at end of line
        ]
        
        # Try last line method first
        ans_match = None
        if last_line_number:
            # Create a mock match object for consistency
            class MockMatch:
                def __init__(self, value):
                    self._value = value
                def group(self, n):
                    return self._value
            ans_match = MockMatch(last_line_number)
        else:
            # Fallback to pattern matching
            for pattern in patterns:
                ans_match = re.search(pattern, trace, re.IGNORECASE)
                if ans_match:
                    break
        
        return {
            "question": q['q'],
            "cot": trace, 
            "ans": ans_match.group(1).strip().replace(',', '').split('.')[0] if ans_match else q["gold"],  # Remove commas and decimals from extracted answer
            "gold": q["gold"],
            "domain": q["domain"],
            "prompt_type": prompt_type
        }
    except Exception as e:
        print(f"Error generating content for question: {q['q']}")
        print(f"Error details: {e}")
        return {
            "question": q['q'],
            "cot": f"Error: {e}", 
            "ans": q["gold"],
            "gold": q["gold"],
            "domain": q["domain"],
            "prompt_type": prompt_type
        }





# SIMPLE GENERATION WITH CHECKPOINTING
# Customize these variables as needed:
TOTAL_SAMPLES = 3000  # Change this to your desired number (e.g., 1000, 3000)
CHECKPOINT_EVERY = 20  # Save progress every N samples
CHECKPOINT_FILE = "cot_improved_gsm8k_checkpoint.json"

# Load existing progress if any
dataset = []
start_index = 0

if os.path.exists(CHECKPOINT_FILE):
    try:
        with open(CHECKPOINT_FILE, 'r') as f:
            dataset = json.load(f)
        start_index = len(dataset)
        print(f"üìÅ Resuming from checkpoint: {start_index} samples already completed")
        print(f"üìä Progress: {start_index}/{TOTAL_SAMPLES} ({start_index/TOTAL_SAMPLES*100:.1f}%)")
    except Exception as e:
        print(f"‚ùå Error loading checkpoint: {e}")
        print("üîÑ Starting fresh...")
        dataset = []
        start_index = 0

if start_index >= TOTAL_SAMPLES:
    print("üéâ Already completed! All samples generated.")
else:
    print(f"üöÄ Generating {TOTAL_SAMPLES - start_index} new samples...")
    print(f"üíæ Checkpoints every {CHECKPOINT_EVERY} samples")
    print("="*50)

    # Generate remaining samples
    for i, q in enumerate(tqdm(questions[start_index:start_index + TOTAL_SAMPLES - start_index], 
                              desc="Processing questions", unit="question")):
        # Use improved prompt for better accuracy
        entry = generate_cot_improved(q, "calculation_focused")
        dataset.append(entry)
        
        # Checkpoint saving with Windows-compatible atomic write
        if len(dataset) % CHECKPOINT_EVERY == 0:
            temp_file = f"temp_{CHECKPOINT_FILE}"
            with open(temp_file, "w") as f:
                json.dump(dataset, f, indent=2)
            
            # Windows-compatible atomic write
            if os.path.exists(CHECKPOINT_FILE):
                os.remove(CHECKPOINT_FILE)  # Remove existing file first on Windows
            os.rename(temp_file, CHECKPOINT_FILE)
            print(f"üíæ Checkpoint: {len(dataset)}/{TOTAL_SAMPLES} samples saved")
        
        time.sleep(4)  # Rate limit delay

    # Final save
    with open(CHECKPOINT_FILE, "w") as f:
        json.dump(dataset, f, indent=2)
    
    print(f"‚úÖ Generation complete! {len(dataset)} samples saved to {CHECKPOINT_FILE}")

    # Create final numbered file
    final_file = f"cot_improved_gsm8k_final_{len(dataset)}.json"
    with open(final_file, "w") as f:
        json.dump(dataset, f, indent=2)
    print(f"üèÅ Final dataset: {final_file}")

üìÅ Resuming from checkpoint: 2000 samples already completed
üìä Progress: 2000/3000 (66.7%)
üöÄ Generating 1000 new samples...
üíæ Checkpoints every 20 samples


Processing questions:   2%|‚ñè         | 19/1000 [02:00<1:40:59,  6.18s/question]

üíæ Checkpoint: 2020/3000 samples saved


Processing questions:   4%|‚ñç         | 39/1000 [04:06<1:39:41,  6.22s/question]

üíæ Checkpoint: 2040/3000 samples saved


Processing questions:   6%|‚ñå         | 59/1000 [06:18<2:05:21,  7.99s/question]

üíæ Checkpoint: 2060/3000 samples saved


Processing questions:   8%|‚ñä         | 79/1000 [08:25<1:37:29,  6.35s/question]

üíæ Checkpoint: 2080/3000 samples saved


Processing questions:  10%|‚ñâ         | 99/1000 [10:29<1:35:26,  6.36s/question]

üíæ Checkpoint: 2100/3000 samples saved


Processing questions:  12%|‚ñà‚ñè        | 119/1000 [12:37<1:39:20,  6.77s/question]

üíæ Checkpoint: 2120/3000 samples saved


Processing questions:  14%|‚ñà‚ñç        | 139/1000 [14:45<1:25:18,  5.95s/question]

üíæ Checkpoint: 2140/3000 samples saved


Processing questions:  16%|‚ñà‚ñå        | 159/1000 [16:49<1:23:17,  5.94s/question]

üíæ Checkpoint: 2160/3000 samples saved


Processing questions:  18%|‚ñà‚ñä        | 179/1000 [19:03<1:45:55,  7.74s/question]

üíæ Checkpoint: 2180/3000 samples saved


Processing questions:  20%|‚ñà‚ñâ        | 199/1000 [21:12<1:25:19,  6.39s/question]

üíæ Checkpoint: 2200/3000 samples saved


Processing questions:  22%|‚ñà‚ñà‚ñè       | 219/1000 [23:17<1:21:26,  6.26s/question]

üíæ Checkpoint: 2220/3000 samples saved


Processing questions:  24%|‚ñà‚ñà‚ñç       | 239/1000 [25:22<1:18:21,  6.18s/question]

üíæ Checkpoint: 2240/3000 samples saved


Processing questions:  26%|‚ñà‚ñà‚ñå       | 259/1000 [27:28<1:16:22,  6.18s/question]

üíæ Checkpoint: 2260/3000 samples saved


Processing questions:  28%|‚ñà‚ñà‚ñä       | 279/1000 [29:40<1:23:12,  6.92s/question]

üíæ Checkpoint: 2280/3000 samples saved


Processing questions:  30%|‚ñà‚ñà‚ñâ       | 299/1000 [31:46<1:12:43,  6.23s/question]

üíæ Checkpoint: 2300/3000 samples saved


Processing questions:  32%|‚ñà‚ñà‚ñà‚ñè      | 319/1000 [33:51<1:12:31,  6.39s/question]

üíæ Checkpoint: 2320/3000 samples saved


Processing questions:  34%|‚ñà‚ñà‚ñà‚ñç      | 339/1000 [35:58<1:11:28,  6.49s/question]

üíæ Checkpoint: 2340/3000 samples saved


Processing questions:  36%|‚ñà‚ñà‚ñà‚ñå      | 359/1000 [38:05<1:05:22,  6.12s/question]

üíæ Checkpoint: 2360/3000 samples saved


Processing questions:  38%|‚ñà‚ñà‚ñà‚ñä      | 379/1000 [40:18<1:09:06,  6.68s/question]

üíæ Checkpoint: 2380/3000 samples saved


Processing questions:  40%|‚ñà‚ñà‚ñà‚ñâ      | 399/1000 [42:26<1:03:09,  6.31s/question]

üíæ Checkpoint: 2400/3000 samples saved


Processing questions:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 419/1000 [44:44<58:36,  6.05s/question]  

üíæ Checkpoint: 2420/3000 samples saved


Processing questions:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 439/1000 [46:47<56:29,  6.04s/question]  

üíæ Checkpoint: 2440/3000 samples saved


Processing questions:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 459/1000 [48:52<57:47,  6.41s/question]  

üíæ Checkpoint: 2460/3000 samples saved


Processing questions:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 479/1000 [50:56<53:24,  6.15s/question]

üíæ Checkpoint: 2480/3000 samples saved


Processing questions:  50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 499/1000 [53:02<52:04,  6.24s/question]

üíæ Checkpoint: 2500/3000 samples saved


Processing questions:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 519/1000 [55:13<51:13,  6.39s/question]  

üíæ Checkpoint: 2520/3000 samples saved


Processing questions:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 539/1000 [57:20<47:43,  6.21s/question]

üíæ Checkpoint: 2540/3000 samples saved


Processing questions:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 559/1000 [59:22<45:18,  6.16s/question]

üíæ Checkpoint: 2560/3000 samples saved


Processing questions:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 579/1000 [1:01:25<41:20,  5.89s/question]

üíæ Checkpoint: 2580/3000 samples saved


Processing questions:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 599/1000 [1:03:30<41:24,  6.19s/question]

üíæ Checkpoint: 2600/3000 samples saved


Processing questions:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 619/1000 [1:05:37<39:22,  6.20s/question]

üíæ Checkpoint: 2620/3000 samples saved


Processing questions:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 639/1000 [1:07:46<38:34,  6.41s/question]

üíæ Checkpoint: 2640/3000 samples saved


Processing questions:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 659/1000 [1:09:52<35:47,  6.30s/question]

üíæ Checkpoint: 2660/3000 samples saved


Processing questions:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 679/1000 [1:12:10<34:40,  6.48s/question]  

üíæ Checkpoint: 2680/3000 samples saved


Processing questions:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 699/1000 [1:14:16<30:58,  6.17s/question]

üíæ Checkpoint: 2700/3000 samples saved


Processing questions:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 719/1000 [1:16:20<29:38,  6.33s/question]

üíæ Checkpoint: 2720/3000 samples saved


Processing questions:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 739/1000 [1:18:26<26:52,  6.18s/question]

üíæ Checkpoint: 2740/3000 samples saved


Processing questions:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 759/1000 [1:20:37<25:17,  6.30s/question]

üíæ Checkpoint: 2760/3000 samples saved


Processing questions:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 779/1000 [1:22:44<22:58,  6.24s/question]

üíæ Checkpoint: 2780/3000 samples saved


Processing questions:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 799/1000 [1:24:51<22:31,  6.72s/question]

üíæ Checkpoint: 2800/3000 samples saved


Processing questions:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 819/1000 [1:26:54<18:39,  6.19s/question]

üíæ Checkpoint: 2820/3000 samples saved


Processing questions:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 839/1000 [1:29:04<17:11,  6.41s/question]

üíæ Checkpoint: 2840/3000 samples saved


Processing questions:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 859/1000 [1:31:09<15:03,  6.41s/question]

üíæ Checkpoint: 2860/3000 samples saved


Processing questions:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 879/1000 [1:33:12<12:38,  6.27s/question]

üíæ Checkpoint: 2880/3000 samples saved


Processing questions:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 899/1000 [1:35:19<10:30,  6.24s/question]

üíæ Checkpoint: 2900/3000 samples saved


Processing questions:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 919/1000 [1:37:30<08:26,  6.25s/question]

üíæ Checkpoint: 2920/3000 samples saved


Processing questions:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 939/1000 [1:39:38<06:39,  6.55s/question]

üíæ Checkpoint: 2940/3000 samples saved


Processing questions:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 959/1000 [1:41:45<04:12,  6.17s/question]

üíæ Checkpoint: 2960/3000 samples saved


Processing questions:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 979/1000 [1:43:50<02:09,  6.16s/question]

üíæ Checkpoint: 2980/3000 samples saved


Processing questions: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 995/1000 [1:45:32<00:30,  6.18s/question]

Error generating content for question: Amiyah is cutting some trees to build a cow shade. For every tree she cuts, she plants 5 new trees. If there were 400 trees on her farm and she cut 20% of them, calculate the total number of trees on the farm.
Error details: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 1000
Please retry in 26.784657949s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash-lite"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 1000
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.googl

Processing questions: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 999/1000 [1:45:55<00:06,  6.02s/question]

üíæ Checkpoint: 3000/3000 samples saved


Processing questions: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [1:46:01<00:00,  6.36s/question]



‚úÖ Generation complete! 3000 samples saved to cot_improved_gsm8k_checkpoint.json
üèÅ Final dataset: cot_improved_gsm8k_final_3000.json
üèÅ Final dataset: cot_improved_gsm8k_final_3000.json


# üìã Checkpoint Configuration Examples

## Example 1: Generate 500 samples, save every 20
```python
TOTAL_SAMPLES = 500      # Your target
CHECKPOINT_EVERY = 20    # Save progress every 20 samples
```

**What happens:**
- **Day 1**: Generates samples 1-500, saves every 20
- **Day 2**: If you run again (same settings) ‚Üí "üéâ Already completed!"
- **Day 2**: If you change to `TOTAL_SAMPLES = 1000` ‚Üí Resumes from 501-1000

## Example 2: Large dataset over multiple days
```python
TOTAL_SAMPLES = 3000     # Large target
CHECKPOINT_EVERY = 50    # Save every 50 samples
```

**What happens:**
- **Day 1**: Generates 1-950 (rate limit), saves every 50
- **Day 2**: Automatically resumes from 951-1900
- **Day 3**: Automatically resumes from 1901-2850
- **Day 4**: Finishes 2851-3000

## üîë Key Points:
- ‚úÖ **Automatic resume**: Just run the same cell again
- ‚úÖ **Safe interruption**: Can stop/start anytime
- ‚úÖ **Progress tracking**: Shows exactly where you are
- ‚úÖ **No data loss**: Atomic saves prevent corruption

In [15]:
# The function `list_datasets` is not available in the current version of the `datasets` library.
# Instead, you can use the CLI: `datasets-cli list` or browse https://huggingface.co/datasets
# To check if "allenai/svamp" exists, just try to load it directly:

try:
    svamp_ds = load_dataset("allenai/svamp", split="train")
    print("Loaded SVAMP dataset successfully!")
    print(svamp_ds[0])
except Exception as e:
    print("Could not load 'allenai/svamp':", e)
    print("Visit https://huggingface.co/datasets?search=svamp for available datasets.")

Could not load 'allenai/svamp': Dataset 'allenai/svamp' doesn't exist on the Hub or cannot be accessed.
Visit https://huggingface.co/datasets?search=svamp for available datasets.


ImportError: cannot import name 'list_datasets' from 'datasets' (c:\Users\nooba\anaconda3\Lib\site-packages\datasets\__init__.py)

In [20]:
print(f"SVAMP dataset has {len(svamp_ds)} instances.")

SVAMP dataset has 700 instances.


In [None]:
from datasets import load_dataset

# Load the SVAMP dataset - first check what splits are available
svamp_full = load_dataset("ChilleD/SVAMP")
print(f'Available splits: {list(svamp_full.keys())}')

# Load the train split specifically
svamp_ds = load_dataset("ChilleD/SVAMP", split="train")

print('\nüìä ChilleD/SVAMP DATASET ANALYSIS:')
print('='*50)
print(f'Total samples (train): {len(svamp_ds)}')
print(f'Sample keys: {list(svamp_ds[0].keys())}')

# Show a few examples to understand the format
print(f'\nüîç SAMPLE EXAMPLES:')
for i in range(3):
    sample = svamp_ds[i]
    print(f'\nExample {i+1}:')
    print(f'  ID: {sample["ID"]}')
    print(f'  Body: {sample["Body"]}')
    print(f'  Question: {sample["Question"]}')  
    print(f'  Equation: {sample["Equation"]}')
    print(f'  Answer: {sample["Answer"]}')
    print(f'  Type: {sample["Type"]}')

print(f'\n‚úÖ RECOMMENDATION: ChilleD/SVAMP appears to be the correct dataset!')
print(f'   ‚Ä¢ Has exactly {len(svamp_ds)} samples (matches original SVAMP paper)')
print(f'   ‚Ä¢ Contains all expected fields: Body, Question, Equation, Answer')
print(f'   ‚Ä¢ Includes problem types and concatenated questions')
print(f'   ‚Ä¢ This is likely a properly formatted version of the original dataset')

KeyError: "Invalid key: 0. Please first select a split. For example: `my_dataset_dictionary['train'][0]`. Available splits: ['test', 'train']"

In [22]:
# Analyze the SVAMP dataset that's already loaded
print('üìä SVAMP DATASET ANALYSIS:')
print('='*50)
print(f'Available splits: {list(svamp_ds.keys())}')

# Get train split
train_data = svamp_ds['train']
test_data = svamp_ds['test'] if 'test' in svamp_ds else None

print(f'Train samples: {len(train_data)}')
if test_data:
    print(f'Test samples: {len(test_data)}')

print(f'Sample keys: {list(train_data[0].keys())}')

# Show a few examples to understand the format
print(f'\nüîç SAMPLE EXAMPLES:')
for i in range(3):
    sample = train_data[i]
    print(f'\nExample {i+1}:')
    print(f'  ID: {sample["ID"]}')
    print(f'  Body: {sample["Body"]}')
    print(f'  Question: {sample["Question"]}')  
    print(f'  Equation: {sample["Equation"]}')
    print(f'  Answer: {sample["Answer"]}')
    print(f'  Type: {sample["Type"]}')

print(f'\n‚úÖ RECOMMENDATION: ChilleD/SVAMP is the correct dataset!')
print(f'   ‚Ä¢ Original SVAMP paper has 700 training samples ‚úì')
print(f'   ‚Ä¢ Contains all expected fields: Body, Question, Equation, Answer ‚úì')  
print(f'   ‚Ä¢ Includes problem types and concatenated questions ‚úì')
print(f'   ‚Ä¢ This is the properly formatted version of the original dataset ‚úì')
print(f'\nüìö ABOUT SVAMP:')
print(f'   ‚Ä¢ Paper: "Are NLP Models really able to Solve Simple Math Word Problems?"')
print(f'   ‚Ä¢ Authors: Arkil Patel, Satwik Bhattamishra, Navin Goyal')
print(f'   ‚Ä¢ Focus: Simple Variational problems in Arithmetic Math Problems')
print(f'   ‚Ä¢ Purpose: Test reasoning capabilities with simple math variations')

üìä SVAMP DATASET ANALYSIS:
Available splits: ['train', 'test']
Train samples: 700
Test samples: 300
Sample keys: ['ID', 'Body', 'Question', 'Equation', 'Answer', 'Type', 'question_concat']

üîç SAMPLE EXAMPLES:

Example 1:
  ID: chal-777
  Body: There are 87 oranges and 290 bananas in Philip's collection. If the bananas are organized into 2 groups and oranges are organized into 93 groups
  Question: How big is each group of bananas?
  Equation: ( 290.0 / 2.0 )
  Answer: 145
  Type: Common-Division

Example 2:
  ID: chal-508
  Body: Marco and his dad went strawberry picking. Marco's dad's strawberries weighed 11 pounds. If together their strawberries weighed 30 pounds.
  Question: How much did Marco's strawberries weigh?
  Equation: ( 30.0 - 11.0 )
  Answer: 19
  Type: Subtraction

Example 3:
  ID: chal-896
  Body: Edward spent $ 6 to buy 2 books each book costing him the same amount of money. Now he has $ 12.
  Question: How much did each book cost?
  Equation: ( 6.0 / 2.0 )
  Answe

In [23]:
# SVAMP CoT Generation Function - Adapted for Body + Question format
def generate_cot_svamp(sample, prompt_type="context_aware"):
    """
    Enhanced CoT generation specifically designed for SVAMP dataset
    
    SVAMP has separate 'Body' (context) and 'Question' fields, unlike GSM8K's single question.
    This requires different prompting strategies to leverage the context effectively.
    """
    
    if prompt_type == "context_aware":
        # RECOMMENDED: Explicitly separates context and question for better understanding
        prompt = f"""I need to solve this math word problem by carefully understanding the context and question.

Context: {sample['Body']}
Question: {sample['Question']}

I will:
- Understand the given context and what information it provides
- Identify the specific question being asked
- Extract relevant numbers and relationships
- Determine the mathematical operation needed
- Calculate step by step
- Provide a clear final answer

Let me solve this step by step:"""

    elif prompt_type == "story_focused":
        # Alternative: Treats it as a complete story problem
        prompt = f"""Let me solve this math story problem carefully.

Story: {sample['Body']} {sample['Question']}

I need to:
1. Read and understand the complete story
2. Identify what I'm solving for
3. Find the important numbers and relationships
4. Choose the right mathematical operation
5. Calculate the answer step by step

Solution:"""

    elif prompt_type == "structured_reasoning":
        # Alternative: Highly structured approach
        prompt = f"""Math Problem Analysis:

**Context:** {sample['Body']}
**Question:** {sample['Question']}

**Step-by-step reasoning:**
1. **Understanding:** What is the situation described?
2. **Given information:** What numbers and facts do I have?
3. **Target:** What exactly am I trying to find?
4. **Operation:** What mathematical operation will solve this?
5. **Calculation:** Let me compute the answer
6. **Verification:** Does my answer make sense in context?

**Solution:**"""

    else:  # default - simple approach
        prompt = f"Context: {sample['Body']}\nQuestion: {sample['Question']}\n\nLet me solve this step by step:"
    
    try:
        response = model.generate_content(prompt)
        trace = response.text
        
        # Use the same improved regex patterns from GSM8K function
        # (SVAMP answers are also numerical, so same extraction logic applies)
        
        # Method 1: Last line with number (most reliable)
        lines = trace.split('\n')
        last_line_number = None
        for line in reversed(lines):
            # FIXED regex: Prioritize longer numbers first, then comma-separated numbers
            numbers = re.findall(r'(\d+(?:,\d{3})*|\d{1,3}(?:,\d{3})+)', line)
            if not numbers:
                # Fallback: capture any sequence of digits
                numbers = re.findall(r'(\d+)', line)
            
            if numbers:
                # Remove commas and convert to clean numbers for comparison
                clean_numbers = [num.replace(',', '') for num in numbers]
                
                # IMPROVED SELECTION LOGIC: Prioritize the best number
                if clean_numbers:
                    # 1. If line contains final answer indicators, prioritize the largest number
                    if any(indicator in line.lower() for indicator in ['final', 'answer', 'total', 'altogether', '####']):
                        last_line_number = max(clean_numbers, key=lambda x: int(x))
                    else:
                        # 2. For non-final lines, prioritize numbers >= 3 digits, then largest
                        large_numbers = [n for n in clean_numbers if len(n) >= 3]
                        if large_numbers:
                            last_line_number = max(large_numbers, key=lambda x: int(x))
                        else:
                            last_line_number = max(clean_numbers, key=lambda x: int(x))
                break
        
        # Method 2: Structured patterns as fallback (same as GSM8K)
        patterns = [
            r"####\s*(\d+)",                                                             
            r"\$(\d+(?:\.\d{2})?)\b",                                                   
            r"total.*?\$?(\d+(?:\.\d{2})?)",                                             
            r"final answer.*?\*\*\$?(\d+(?:\.\d{2})?)\*\*",                              
            r"Answer:\s*.*?\*\*\$?(\d+(?:\.\d{2})?)\*\*",                                
            r"Final answer:\s*.*?\$?(\d+(?:\.\d{2})?)",                                  
            r"Answer:\s*.*?\$?(\d+(?:\.\d{2})?)",                                        
            r"The answer is\s*\$?(\d+(?:\.\d{2})?)",                                     
            r"\*\*(\d+(?:\.\d{2})?)\*\*\s*(?:clips?|flowers?|pages?|pounds?|dollars?|pieces?|total|people|items?|sq\.?\s*ft\.?)",
            r"(\d+(?:\.\d{2})?)\s+(?:pounds?|dollars?|people|items?|total|left|altogether|sq\.?\s*ft\.?)",
            r"=\s*\$?(\d+(?:\.\d{2})?)(?:\s*\.|\s*$)",                                   
        ]
        
        # Try last line method first
        ans_match = None
        if last_line_number:
            class MockMatch:
                def __init__(self, value):
                    self._value = value
                def group(self, n):
                    return self._value
            ans_match = MockMatch(last_line_number)
        else:
            # Fallback to pattern matching
            for pattern in patterns:
                ans_match = re.search(pattern, trace, re.IGNORECASE)
                if ans_match:
                    break
        
        return {
            "id": sample['ID'],
            "body": sample['Body'],
            "question": sample['Question'],
            "cot": trace,
            "ans": ans_match.group(1).strip().replace(',', '').split('.')[0] if ans_match else str(sample["Answer"]),
            "gold": str(sample["Answer"]),
            "equation": sample['Equation'],
            "type": sample['Type'],
            "domain": "svamp",
            "prompt_type": prompt_type
        }
    except Exception as e:
        print(f"Error generating content for SVAMP problem: {sample['ID']}")
        print(f"Error details: {e}")
        return {
            "id": sample['ID'],
            "body": sample['Body'],
            "question": sample['Question'],
            "cot": f"Error: {e}",
            "ans": str(sample["Answer"]),
            "gold": str(sample["Answer"]),
            "equation": sample['Equation'],
            "type": sample['Type'],
            "domain": "svamp",
            "prompt_type": prompt_type
        }

print("‚úÖ SVAMP CoT generation function created!")
print("\nüìã AVAILABLE PROMPT TYPES:")
print("1. 'context_aware' (RECOMMENDED) - Explicitly separates context and question")
print("2. 'story_focused' - Treats as complete story problem")
print("3. 'structured_reasoning' - Highly structured 6-step approach")
print("4. 'default' - Simple concatenated approach")

print("\nüéØ RECOMMENDED CHOICE: 'context_aware'")
print("   REASON: SVAMP problems have distinct 'Body' (context) and 'Question' parts.")
print("   This prompt type explicitly guides the model to:")
print("   ‚Ä¢ First understand the context/situation")
print("   ‚Ä¢ Then focus on the specific question")
print("   ‚Ä¢ This mirrors how humans solve SVAMP problems naturally")
print("   ‚Ä¢ Better performance expected vs. simple concatenation")

‚úÖ SVAMP CoT generation function created!

üìã AVAILABLE PROMPT TYPES:
1. 'context_aware' (RECOMMENDED) - Explicitly separates context and question
2. 'story_focused' - Treats as complete story problem
3. 'structured_reasoning' - Highly structured 6-step approach
4. 'default' - Simple concatenated approach

üéØ RECOMMENDED CHOICE: 'context_aware'
   REASON: SVAMP problems have distinct 'Body' (context) and 'Question' parts.
   This prompt type explicitly guides the model to:
   ‚Ä¢ First understand the context/situation
   ‚Ä¢ Then focus on the specific question
   ‚Ä¢ This mirrors how humans solve SVAMP problems naturally
   ‚Ä¢ Better performance expected vs. simple concatenation


In [24]:
# SVAMP DATASET GENERATION WITH CHECKPOINTING
print("üîÑ PREPARING SVAMP DATASET FOR COT GENERATION...")

# Prepare SVAMP questions (use train split)
svamp_train = svamp_ds['train']
svamp_questions = []

for sample in svamp_train:
    svamp_questions.append({
        'ID': sample['ID'],
        'Body': sample['Body'], 
        'Question': sample['Question'],
        'Answer': sample['Answer'],
        'Equation': sample['Equation'],
        'Type': sample['Type']
    })

print(f"üìä SVAMP Dataset prepared: {len(svamp_questions)} problems")
print(f"   Sample problem types: {set([q['Type'] for q in svamp_questions[:10]])}")

# SVAMP GENERATION CONFIGURATION
SVAMP_TOTAL_SAMPLES = 700  # All SVAMP training samples (700 total)
SVAMP_CHECKPOINT_EVERY = 20  # Save progress every N samples
SVAMP_CHECKPOINT_FILE = "cot_svamp_checkpoint.json"

# Load existing progress if any
svamp_dataset = []
svamp_start_index = 0

if os.path.exists(SVAMP_CHECKPOINT_FILE):
    try:
        with open(SVAMP_CHECKPOINT_FILE, 'r') as f:
            svamp_dataset = json.load(f)
        svamp_start_index = len(svamp_dataset)
        print(f"üìÅ Resuming SVAMP from checkpoint: {svamp_start_index} samples already completed")
        print(f"üìä Progress: {svamp_start_index}/{SVAMP_TOTAL_SAMPLES} ({svamp_start_index/SVAMP_TOTAL_SAMPLES*100:.1f}%)")
    except Exception as e:
        print(f"‚ùå Error loading SVAMP checkpoint: {e}")
        print("üîÑ Starting fresh...")
        svamp_dataset = []
        svamp_start_index = 0

print(f"\nüöÄ READY TO GENERATE SVAMP CoT DATA:")
print(f"   ‚Ä¢ Total samples: {SVAMP_TOTAL_SAMPLES}")
print(f"   ‚Ä¢ Remaining: {SVAMP_TOTAL_SAMPLES - svamp_start_index}")
print(f"   ‚Ä¢ Checkpoint every: {SVAMP_CHECKPOINT_EVERY} samples")
print(f"   ‚Ä¢ Recommended prompt: 'context_aware'")
print(f"   ‚Ä¢ Rate limit: 4 seconds between requests")
print("="*60)

üîÑ PREPARING SVAMP DATASET FOR COT GENERATION...
üìä SVAMP Dataset prepared: 700 problems
   Sample problem types: {'Addition', 'Subtraction', 'Common-Division'}

üöÄ READY TO GENERATE SVAMP CoT DATA:
   ‚Ä¢ Total samples: 700
   ‚Ä¢ Remaining: 700
   ‚Ä¢ Checkpoint every: 20 samples
   ‚Ä¢ Recommended prompt: 'context_aware'
   ‚Ä¢ Rate limit: 4 seconds between requests


In [32]:
# SVAMP COT GENERATION LOOP
if svamp_start_index >= SVAMP_TOTAL_SAMPLES:
    print("üéâ SVAMP CoT generation already completed! All samples generated.")
else:
    print(f"üöÄ Generating {SVAMP_TOTAL_SAMPLES - svamp_start_index} new SVAMP CoT samples...")
    print(f"üíæ Checkpoints every {SVAMP_CHECKPOINT_EVERY} samples")
    print("="*50)

    # Generate remaining samples
    for i, sample in enumerate(tqdm(svamp_questions[svamp_start_index:svamp_start_index + SVAMP_TOTAL_SAMPLES - svamp_start_index], 
                                  desc="Processing SVAMP problems", unit="problem")):
        
        # Use context_aware prompt for better SVAMP understanding
        entry = generate_cot_svamp(sample, "context_aware")
        svamp_dataset.append(entry)
        
        # Checkpoint saving with Windows-compatible atomic write
        if len(svamp_dataset) % SVAMP_CHECKPOINT_EVERY == 0:
            temp_file = f"temp_{SVAMP_CHECKPOINT_FILE}"
            with open(temp_file, "w") as f:
                json.dump(svamp_dataset, f, indent=2)
            
            # Windows-compatible atomic write
            if os.path.exists(SVAMP_CHECKPOINT_FILE):
                os.remove(SVAMP_CHECKPOINT_FILE)
            os.rename(temp_file, SVAMP_CHECKPOINT_FILE)
            print(f"üíæ SVAMP Checkpoint: {len(svamp_dataset)}/{SVAMP_TOTAL_SAMPLES} samples saved")
        
        time.sleep(4)  # Rate limit delay

    # Final save
    with open(SVAMP_CHECKPOINT_FILE, "w") as f:
        json.dump(svamp_dataset, f, indent=2)
    
    print(f"‚úÖ SVAMP Generation complete! {len(svamp_dataset)} samples saved to {SVAMP_CHECKPOINT_FILE}")

    # Create final numbered file
    svamp_final_file = f"cot_svamp_final_{len(svamp_dataset)}.json"
    with open(svamp_final_file, "w") as f:
        json.dump(svamp_dataset, f, indent=2)
    print(f"üèÅ Final SVAMP dataset: {svamp_final_file}")
    
    # Show sample of generated data
    if svamp_dataset:
        print(f"\nüìã SAMPLE GENERATED SVAMP COT:")
        sample_entry = svamp_dataset[0]
        print(f"   ID: {sample_entry['id']}")
        print(f"   Body: {sample_entry['body'][:80]}...")
        print(f"   Question: {sample_entry['question']}")
        print(f"   Generated Answer: {sample_entry['ans']}")
        print(f"   Gold Answer: {sample_entry['gold']}")
        print(f"   Problem Type: {sample_entry['type']}")
        print(f"   CoT Length: {len(sample_entry['cot'])} characters")

üöÄ Generating 700 new SVAMP CoT samples...
üíæ Checkpoints every 20 samples


Processing SVAMP problems:   3%|‚ñé         | 19/700 [01:46<1:06:20,  5.85s/problem]

üíæ SVAMP Checkpoint: 20/700 samples saved


Processing SVAMP problems:   6%|‚ñå         | 39/700 [03:50<1:04:25,  5.85s/problem]

üíæ SVAMP Checkpoint: 40/700 samples saved


Processing SVAMP problems:   8%|‚ñä         | 59/700 [05:40<1:01:16,  5.74s/problem]

üíæ SVAMP Checkpoint: 60/700 samples saved


Processing SVAMP problems:  11%|‚ñà‚ñè        | 79/700 [07:31<56:41,  5.48s/problem]  

üíæ SVAMP Checkpoint: 80/700 samples saved


Processing SVAMP problems:  14%|‚ñà‚ñç        | 99/700 [09:26<56:24,  5.63s/problem]  

üíæ SVAMP Checkpoint: 100/700 samples saved


Processing SVAMP problems:  17%|‚ñà‚ñã        | 119/700 [11:18<53:18,  5.51s/problem]

üíæ SVAMP Checkpoint: 120/700 samples saved


Processing SVAMP problems:  20%|‚ñà‚ñâ        | 139/700 [13:14<1:08:34,  7.33s/problem]

üíæ SVAMP Checkpoint: 140/700 samples saved


Processing SVAMP problems:  23%|‚ñà‚ñà‚ñé       | 159/700 [15:08<50:45,  5.63s/problem]  

üíæ SVAMP Checkpoint: 160/700 samples saved


Processing SVAMP problems:  26%|‚ñà‚ñà‚ñå       | 179/700 [16:59<48:04,  5.54s/problem]

üíæ SVAMP Checkpoint: 180/700 samples saved


Processing SVAMP problems:  28%|‚ñà‚ñà‚ñä       | 199/700 [18:54<48:37,  5.82s/problem]

üíæ SVAMP Checkpoint: 200/700 samples saved


Processing SVAMP problems:  31%|‚ñà‚ñà‚ñà‚ñè      | 219/700 [21:03<57:47,  7.21s/problem]  

üíæ SVAMP Checkpoint: 220/700 samples saved


Processing SVAMP problems:  34%|‚ñà‚ñà‚ñà‚ñç      | 239/700 [22:55<44:09,  5.75s/problem]

üíæ SVAMP Checkpoint: 240/700 samples saved


Processing SVAMP problems:  37%|‚ñà‚ñà‚ñà‚ñã      | 259/700 [25:45<2:38:42, 21.59s/problem]

üíæ SVAMP Checkpoint: 260/700 samples saved


Processing SVAMP problems:  40%|‚ñà‚ñà‚ñà‚ñâ      | 279/700 [27:41<41:30,  5.92s/problem]  

üíæ SVAMP Checkpoint: 280/700 samples saved


Processing SVAMP problems:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 299/700 [29:33<38:09,  5.71s/problem]

üíæ SVAMP Checkpoint: 300/700 samples saved


Processing SVAMP problems:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 319/700 [31:25<36:08,  5.69s/problem]

üíæ SVAMP Checkpoint: 320/700 samples saved


Processing SVAMP problems:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 339/700 [33:23<33:23,  5.55s/problem]

üíæ SVAMP Checkpoint: 340/700 samples saved


Processing SVAMP problems:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 359/700 [35:18<33:36,  5.91s/problem]

üíæ SVAMP Checkpoint: 360/700 samples saved


Processing SVAMP problems:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 379/700 [37:11<30:48,  5.76s/problem]

üíæ SVAMP Checkpoint: 380/700 samples saved


Processing SVAMP problems:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 399/700 [39:04<28:45,  5.73s/problem]

üíæ SVAMP Checkpoint: 400/700 samples saved


Processing SVAMP problems:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 419/700 [41:02<27:47,  5.93s/problem]

üíæ SVAMP Checkpoint: 420/700 samples saved


Processing SVAMP problems:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 439/700 [43:01<24:19,  5.59s/problem]

üíæ SVAMP Checkpoint: 440/700 samples saved


Processing SVAMP problems:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 459/700 [45:02<24:16,  6.04s/problem]

üíæ SVAMP Checkpoint: 460/700 samples saved


Processing SVAMP problems:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 479/700 [46:51<20:51,  5.67s/problem]

üíæ SVAMP Checkpoint: 480/700 samples saved


Processing SVAMP problems:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 499/700 [48:44<18:26,  5.50s/problem]

üíæ SVAMP Checkpoint: 500/700 samples saved


Processing SVAMP problems:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 519/700 [50:43<17:10,  5.69s/problem]

üíæ SVAMP Checkpoint: 520/700 samples saved


Processing SVAMP problems:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 539/700 [52:46<15:17,  5.70s/problem]

üíæ SVAMP Checkpoint: 540/700 samples saved


Processing SVAMP problems:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 559/700 [54:43<13:08,  5.60s/problem]

üíæ SVAMP Checkpoint: 560/700 samples saved


Processing SVAMP problems:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 579/700 [56:35<11:43,  5.81s/problem]

üíæ SVAMP Checkpoint: 580/700 samples saved


Processing SVAMP problems:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 599/700 [58:32<09:58,  5.93s/problem]

üíæ SVAMP Checkpoint: 600/700 samples saved


Processing SVAMP problems:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 619/700 [1:00:23<07:42,  5.71s/problem]

üíæ SVAMP Checkpoint: 620/700 samples saved


Processing SVAMP problems:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 639/700 [1:02:18<05:58,  5.88s/problem]

üíæ SVAMP Checkpoint: 640/700 samples saved


Processing SVAMP problems:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 659/700 [1:06:45<05:33,  8.14s/problem]

üíæ SVAMP Checkpoint: 660/700 samples saved


Processing SVAMP problems:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 679/700 [1:08:41<01:57,  5.60s/problem]

üíæ SVAMP Checkpoint: 680/700 samples saved


Processing SVAMP problems: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 699/700 [1:10:34<00:05,  5.52s/problem]

üíæ SVAMP Checkpoint: 700/700 samples saved


Processing SVAMP problems: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 700/700 [1:10:42<00:00,  6.06s/problem]

‚úÖ SVAMP Generation complete! 700 samples saved to cot_svamp_checkpoint.json
üèÅ Final SVAMP dataset: cot_svamp_final_700.json

üìã SAMPLE GENERATED SVAMP COT:
   ID: chal-777
   Body: There are 87 oranges and 290 bananas in Philip's collection. If the bananas are ...
   Question: How big is each group of bananas?
   Generated Answer: 145
   Gold Answer: 145
   Problem Type: Common-Division
   CoT Length: 906 characters





In [26]:
# TEST: Demonstrate SVAMP CoT Generation
print("üß™ TESTING SVAMP CoT GENERATION:")
print("="*50)

# Test with first SVAMP problem
test_sample = svamp_questions[0]
print(f"üìã TEST PROBLEM:")
print(f"   ID: {test_sample['ID']}")
print(f"   Body: {test_sample['Body']}")
print(f"   Question: {test_sample['Question']}")
print(f"   Gold Answer: {test_sample['Answer']}")
print(f"   Type: {test_sample['Type']}")

print(f"\nü§ñ GENERATING CoT WITH 'context_aware' PROMPT...")
# Uncomment the line below to test (will use 1 API call)
result = generate_cot_svamp(test_sample, "context_aware")
print(f"Generated CoT: {result['cot'][:200]}...")
print(f"Extracted Answer: {result['ans']}")
print(f"Gold Answer: {result['gold']}")

print(f"\n‚ö° TO RUN TEST: Uncomment the lines above")
print(f"‚ö° TO START FULL GENERATION: Run the next cell")
print(f"üìä This will generate {SVAMP_TOTAL_SAMPLES} CoT samples for SVAMP dataset")
print("="*50)

üß™ TESTING SVAMP CoT GENERATION:
üìã TEST PROBLEM:
   ID: chal-777
   Body: There are 87 oranges and 290 bananas in Philip's collection. If the bananas are organized into 2 groups and oranges are organized into 93 groups
   Question: How big is each group of bananas?
   Gold Answer: 145
   Type: Common-Division

ü§ñ GENERATING CoT WITH 'context_aware' PROMPT...
Generated CoT: You've got a great plan! Let's follow it to solve this problem.

**1. Understand the given context and what information it provides:**
*   Philip has 87 oranges.
*   Philip has 290 bananas.
*   The ba...
Extracted Answer: 145
Gold Answer: 145

‚ö° TO RUN TEST: Uncomment the lines above
‚ö° TO START FULL GENERATION: Run the next cell
üìä This will generate 700 CoT samples for SVAMP dataset
Generated CoT: You've got a great plan! Let's follow it to solve this problem.

**1. Understand the given context and what information it provides:**
*   Philip has 87 oranges.
*   Philip has 290 bananas.
*   The ba...
Extra

In [28]:
# ANALYZE SVAMP EXTRACTION VULNERABILITIES
print("üîç SVAMP EXTRACTION VULNERABILITY ANALYSIS:")
print("="*60)

# Let's examine the full CoT output from our test
if 'result' in globals():
    print(f"üìã FULL CoT OUTPUT FOR ANALYSIS:")
    print(f"   Problem: {result['question']}")
    print(f"   Gold Answer: {result['gold']}")
    print(f"   Extracted Answer: {result['ans']}")
    print(f"   ‚úÖ Current Extraction: {'CORRECT' if result['ans'] == result['gold'] else 'INCORRECT'}")
    
    print(f"\nüîç COMPLETE CoT TRACE:")
    print("-" * 40)
    print(result['cot'])
    print("-" * 40)
    
    # Analyze potential vulnerability patterns
    cot_lines = result['cot'].split('\n')
    print(f"\nüö® POTENTIAL SVAMP EXTRACTION VULNERABILITIES:")
    
    # Look for numbers that could cause confusion
    all_numbers = []
    for line in cot_lines:
        numbers = re.findall(r'\d+', line)
        if numbers:
            all_numbers.extend([(line.strip(), numbers) for _ in [None]])
    
    print(f"   üìä Numbers found in CoT: {len(all_numbers)} lines with numbers")
    for line, nums in all_numbers[:5]:  # Show first 5 lines with numbers
        print(f"     Line: '{line[:80]}...' ‚Üí Numbers: {nums}")
    
    # Check for specific SVAMP vulnerability patterns
    vulnerability_checks = [
        ("üî¢ Multiple Choice Numbers", r'(\d+)\s*(?:groups?|items?|each)', "Groups/items confusion"),
        ("üìê Calculation Steps", r'(\d+)\s*[+\-*/]\s*(\d+)', "Intermediate calculations"),
        ("üí∞ Money Context", r'\$(\d+)', "Dollar amounts"),
        ("üìù Problem Numbers", r'(87|290|2|93)', "Original problem numbers"),
        ("üéØ Final Indicators", r'(?:answer|total|final).*?(\d+)', "Answer indicators")
    ]
    
    print(f"\nüîç SPECIFIC VULNERABILITY PATTERNS:")
    for name, pattern, desc in vulnerability_checks:
        matches = re.findall(pattern, result['cot'], re.IGNORECASE)
        if matches:
            print(f"   {name}: {len(matches)} matches - {desc}")
            print(f"     Examples: {matches[:3]}")
    
else:
    print("‚ùå No test result found. Run the test cell first to analyze vulnerabilities.")

print(f"\nüìö PREDICTED SVAMP VULNERABILITIES:")
print(f"   1. ‚ö†Ô∏è  PROBLEM CONTEXT NUMBERS: 87, 290, 2, 93 from original problem")
print(f"   2. ‚ö†Ô∏è  INTERMEDIATE DIVISIONS: 290√∑2, step-by-step calculations") 
print(f"   3. ‚ö†Ô∏è  GROUP SIZE vs TOTAL: Confusion between 'groups' and 'size'")
print(f"   4. ‚ö†Ô∏è  SVAMP SPECIFIC: Multiple numeric contexts in single problem")
print(f"   5. ‚ö†Ô∏è  UNIT CONFUSION: Items, groups, pieces, total - context matters")

print(f"\nüõ°Ô∏è  SVAMP EXTRACTION WILL NEED:")
print(f"   ‚Ä¢ Context-aware patterns (understand 'groups' vs 'each')")
print(f"   ‚Ä¢ Problem-type specific extraction (Division vs Addition vs Subtraction)")
print(f"   ‚Ä¢ Enhanced final answer detection")
print(f"   ‚Ä¢ Similar hybrid approach as GSM8K but adapted for SVAMP structure")
print("="*60)

üîç SVAMP EXTRACTION VULNERABILITY ANALYSIS:
üìã FULL CoT OUTPUT FOR ANALYSIS:
   Problem: How big is each group of bananas?
   Gold Answer: 145
   Extracted Answer: 145
   ‚úÖ Current Extraction: CORRECT

üîç COMPLETE CoT TRACE:
----------------------------------------
You've got a great plan! Let's follow it to solve this problem.

**1. Understand the given context and what information it provides:**
*   Philip has 87 oranges.
*   Philip has 290 bananas.
*   The bananas are organized into 2 groups.
*   The oranges are organized into 93 groups.

**2. Identify the specific question being asked:**
The question is: "How big is each group of bananas?"

**3. Extract relevant numbers and relationships:**
*   Total number of bananas: 290
*   Number of groups for bananas: 2
*   (Note: The information about oranges is extra and not needed to answer the question about bananas.)

**4. Determine the mathematical operation needed:**
To find out how big each group of bananas is, we need to divid

In [29]:
# FOCUSED SVAMP EXTRACTION VULNERABILITY ANALYSIS
print("üéØ SVAMP-SPECIFIC EXTRACTION CHALLENGES:")
print("="*50)

# Quick analysis of our test result
if 'result' in globals():
    # Count numbers in the CoT
    all_numbers = re.findall(r'\d+', result['cot'])
    unique_numbers = list(set(all_numbers))
    
    print(f"üìä QUICK STATS:")
    print(f"   Total numbers in CoT: {len(all_numbers)}")
    print(f"   Unique numbers: {len(unique_numbers)}")
    print(f"   Unique numbers: {sorted(unique_numbers, key=int)}")
    print(f"   Correct answer: {result['gold']}")
    print(f"   Current extraction: {result['ans']} ({'‚úÖ CORRECT' if result['ans'] == result['gold'] else '‚ùå WRONG'})")

print(f"\nüö® PREDICTED SVAMP EXTRACTION VULNERABILITIES:")

print(f"\n1. üìù PROBLEM CONTEXT CONTAMINATION:")
print(f"   ‚Ä¢ Original numbers: 87 oranges, 290 bananas, 2 groups, 93 groups")
print(f"   ‚Ä¢ Risk: Model might extract 87, 290, 2, or 93 instead of answer")
print(f"   ‚Ä¢ Example: 'Philip has 87 oranges' ‚Üí Extracts 87 instead of 145")

print(f"\n2. üî¢ DIVISION STEP CONFUSION:")
print(f"   ‚Ä¢ Calculation: 290 √∑ 2 = 145")
print(f"   ‚Ä¢ Risk: Could extract 290, 2, or intermediate steps")
print(f"   ‚Ä¢ Example: 'divide 290 by 2' ‚Üí Extracts 290 instead of 145")

print(f"\n3. üìê SVAMP-SPECIFIC PATTERNS:")
print(f"   ‚Ä¢ Multi-entity problems (oranges AND bananas)")
print(f"   ‚Ä¢ Group vs individual confusion")
print(f"   ‚Ä¢ Unit context matters ('each group' vs 'total groups')")

print(f"\n4. üéØ DIFFERENT FROM GSM8K:")
print(f"   ‚Ä¢ GSM8K: Usually single narrative, money context")
print(f"   ‚Ä¢ SVAMP: Multiple entities, unit/grouping focus")
print(f"   ‚Ä¢ Extraction needs to understand WHAT is being asked")

print(f"\nüí° RECOMMENDED SVAMP EXTRACTION STRATEGY:")
print(f"   1. üéØ Problem-type aware extraction (Division, Addition, Subtraction)")
print(f"   2. üîç Context filtering (avoid problem setup numbers)")
print(f"   3. üìä Enhanced final answer patterns")
print(f"   4. üõ°Ô∏è  SVAMP hybrid extraction (similar to GSM8K but adapted)")

print(f"\n‚úÖ CURRENT STATUS: Test shows correct extraction, but larger dataset will reveal edge cases")
print("="*50)

üéØ SVAMP-SPECIFIC EXTRACTION CHALLENGES:
üìä QUICK STATS:
   Total numbers in CoT: 20
   Unique numbers: 10
   Unique numbers: ['1', '2', '3', '4', '5', '6', '87', '93', '145', '290']
   Correct answer: 145
   Current extraction: 145 (‚úÖ CORRECT)

üö® PREDICTED SVAMP EXTRACTION VULNERABILITIES:

1. üìù PROBLEM CONTEXT CONTAMINATION:
   ‚Ä¢ Original numbers: 87 oranges, 290 bananas, 2 groups, 93 groups
   ‚Ä¢ Risk: Model might extract 87, 290, 2, or 93 instead of answer
   ‚Ä¢ Example: 'Philip has 87 oranges' ‚Üí Extracts 87 instead of 145

2. üî¢ DIVISION STEP CONFUSION:
   ‚Ä¢ Calculation: 290 √∑ 2 = 145
   ‚Ä¢ Risk: Could extract 290, 2, or intermediate steps
   ‚Ä¢ Example: 'divide 290 by 2' ‚Üí Extracts 290 instead of 145

3. üìê SVAMP-SPECIFIC PATTERNS:
   ‚Ä¢ Multi-entity problems (oranges AND bananas)
   ‚Ä¢ Group vs individual confusion
   ‚Ä¢ Unit context matters ('each group' vs 'total groups')

4. üéØ DIFFERENT FROM GSM8K:
   ‚Ä¢ GSM8K: Usually single narrative, mon

In [33]:
# TEMPLATE: SVAMP HYBRID EXTRACTION FIX STRATEGY
print("üìã SVAMP EXTRACTION FIX TEMPLATE:")
print("="*50)

def create_svamp_extraction_fix():
    """
    Template for SVAMP-specific extraction fixing script
    (Similar to GSM8K hybrid approach but adapted for SVAMP characteristics)
    """
    
    template_code = '''
# SVAMP HYBRID EXTRACTION FIX (Adapted from GSM8K approach)
import json
import re

def fix_svamp_extraction(input_file, output_file):
    """
    SVAMP-specific extraction fixing with context awareness
    """
    
    # Load SVAMP CoT dataset
    with open(input_file, 'r') as f:
        data = json.load(f)
    
    fixed_count = 0
    svamp_method_stats = {"high_confidence": 0, "context_filtered": 0, "problem_type_aware": 0, "robust_fallback": 0}
    
    for entry in data:
        original_ans = entry['ans']
        cot = entry['cot']
        problem_type = entry['type']  # SVAMP has problem types!
        body = entry['body']
        question = entry['question']
        
        # SVAMP-SPECIFIC HIGH-CONFIDENCE PATTERNS
        high_confidence_patterns = [
            # Final answer with context
            r"final answer.*?(?:is|:)\s*(\d+)",
            r"answer.*?(?:is|:)\s*(\d+)",
            r"each group.*?(\d+)",  # SVAMP-specific: "each group has X"
            r"(\d+).*?(?:in each|per group|each group)",  # "X in each group"
            
            # Problem-type specific patterns
            r"total.*?(?:is|=)\s*(\d+)" if "Addition" in problem_type else None,
            r"(?:left|remaining).*?(\d+)" if "Subtraction" in problem_type else None,
            r"each.*?(?:group|item).*?(\d+)" if "Division" in problem_type else None,
        ]
        
        # Remove None patterns
        high_confidence_patterns = [p for p in high_confidence_patterns if p]
        
        new_ans = None
        method_used = None
        
        # Method 1: High-confidence SVAMP patterns
        for pattern in high_confidence_patterns:
            match = re.search(pattern, cot, re.IGNORECASE)
            if match:
                candidate = match.group(1)
                new_ans = candidate
                method_used = "high_confidence"
                break
        
        # Method 2: Context-filtered extraction (avoid problem setup numbers)
        if not new_ans:
            # Get original problem numbers to avoid
            problem_numbers = set(re.findall(r'\d+', body))
            
            # Find numbers in final lines, avoiding problem setup
            lines = cot.split('\\n')
            for line in reversed(lines[-3:]):  # Last 3 lines
                numbers = re.findall(r'(\d+)', line)
                for num in numbers:
                    if num not in problem_numbers or len(num) >= 3:  # Avoid setup numbers or use large numbers
                        new_ans = num
                        method_used = "context_filtered"
                        break
                if new_ans:
                    break
        
        # Method 3: Problem-type aware extraction
        if not new_ans:
            if problem_type == "Common-Division":
                # Look for division results
                div_pattern = r'(\d+)\s*/\s*(\d+)\s*=\s*(\d+)'
                match = re.search(div_pattern, cot)
                if match:
                    new_ans = match.group(3)  # Division result
                    method_used = "problem_type_aware"
        
        # Method 4: Robust fallback (same as GSM8K)
        if not new_ans:
            # Use the proven GSM8K robust approach
            lines = cot.split('\\n')
            for line in reversed(lines):
                numbers = re.findall(r'(\d+)', line)
                if numbers:
                    large_numbers = [n for n in numbers if len(n) >= 3]
                    new_ans = max(large_numbers, key=int) if large_numbers else max(numbers, key=int)
                    method_used = "robust_fallback"
                    break
        
        # Apply fix if different
        if new_ans and new_ans != original_ans:
            entry['ans'] = new_ans
            fixed_count += 1
        
        # Track method statistics
        if method_used:
            svamp_method_stats[method_used] += 1
    
    # Save fixed data
    with open(output_file, 'w') as f:
        json.dump(data, f, indent=2)
    
    # Validation
    correct_count = sum(1 for entry in data if entry['ans'] == entry['gold'])
    accuracy = correct_count / len(data) * 100
    
    print(f"SVAMP Extraction Fix Results:")
    print(f"  Fixed: {fixed_count} samples")
    print(f"  Accuracy: {accuracy:.1f}% ({correct_count}/{len(data)})")
    print(f"  Method breakdown: {svamp_method_stats}")
    
    return data
    '''
    
    return template_code

# Display the template
template = create_svamp_extraction_fix()
print("üìÑ SVAMP EXTRACTION FIX TEMPLATE CREATED")
print("   Key Features:")
print("   ‚Ä¢ Problem-type awareness (Division, Addition, Subtraction)")
print("   ‚Ä¢ Context filtering (avoids problem setup numbers)")
print("   ‚Ä¢ SVAMP-specific patterns ('each group', 'per item')")
print("   ‚Ä¢ Fallback to proven GSM8K robust method")
print("\nüíæ To use: Create fix_svamp_extraction.py with this template after generation")
print("="*50)

üìã SVAMP EXTRACTION FIX TEMPLATE:
üìÑ SVAMP EXTRACTION FIX TEMPLATE CREATED
   Key Features:
   ‚Ä¢ Problem-type awareness (Division, Addition, Subtraction)
   ‚Ä¢ Context filtering (avoids problem setup numbers)
   ‚Ä¢ SVAMP-specific patterns ('each group', 'per item')
   ‚Ä¢ Fallback to proven GSM8K robust method

üíæ To use: Create fix_svamp_extraction.py with this template after generation


# üìö SVAMP Chain-of-Thought Generation Setup

## üéØ **What We've Created:**

### **1. SVAMP-Specific CoT Function**
- **`generate_cot_svamp()`** - Adapted for SVAMP's Body + Question format
- **4 Prompt Types Available:**
  - `context_aware` ‚≠ê **RECOMMENDED** 
  - `story_focused`
  - `structured_reasoning` 
  - `default`

### **2. Dataset Structure**
- **Source:** `ChilleD/SVAMP` (700 train + 300 test samples)
- **Using:** Train split (700 problems) 
- **Output Format:** JSON with fields: `id`, `body`, `question`, `cot`, `ans`, `gold`, `equation`, `type`, `domain`, `prompt_type`

### **3. Generation Configuration**
```python
SVAMP_TOTAL_SAMPLES = 700        # All training samples
SVAMP_CHECKPOINT_EVERY = 20      # Save every 20 samples  
SVAMP_CHECKPOINT_FILE = "cot_svamp_checkpoint.json"
```

## üèÜ **Why 'context_aware' Prompt is Recommended:**

**SVAMP Problems Structure:**
- **Body**: "There are 87 oranges and 290 bananas in Philip's collection. If the bananas are organized into 2 groups and oranges are organized into 93 groups"
- **Question**: "How big is each group of bananas?"

**Why Context-Aware Works Better:**
1. **Natural Problem Structure** - Mirrors how humans read SVAMP problems
2. **Context Separation** - Explicitly guides model to understand setup first, then question
3. **Reduced Confusion** - Prevents model from missing key information in long context
4. **Better Focus** - Directs attention to what specifically needs to be solved

## üöÄ **Next Steps:**
1. **Test First** - Uncomment test lines to verify one sample
2. **Run Generation** - Execute the generation loop for all 700 samples
3. **Monitor Progress** - Checkpoints save every 20 samples
4. **Apply Extraction Fixes** - Use similar hybrid extraction approach as GSM8K if needed

**Estimated Time:** ~47 minutes (700 samples √ó 4 seconds + processing)

In [5]:
# STRATEGYQA DATASET LOADING AND ANALYSIS
print("üîç LOADING STRATEGYQA DATASET...")
print("="*50)

# Load StrategyQA dataset
strategyqa_ds = load_dataset("ChilleD/StrategyQA", split="train")

print(f"üìä STRATEGYQA DATASET ANALYSIS:")
print(f"Total samples: {len(strategyqa_ds)}")
print(f"Sample keys: {list(strategyqa_ds[0].keys())}")

# Show first sample
sample = strategyqa_ds[0]
print(f"\nüîç FIRST SAMPLE STRUCTURE:")
for key, value in sample.items():
    if isinstance(value, str):
        preview = value[:100] + "..." if len(str(value)) > 100 else value
    else:
        preview = str(value)
    print(f"   {key}: {preview}")

print(f"\n‚úÖ STRATEGYQA DATASET LOADED SUCCESSFULLY!")
print(f"   ‚Ä¢ Total samples: {len(strategyqa_ds)}")
print(f"   ‚Ä¢ Contains: term, description, question, facts, answer")
print(f"   ‚Ä¢ Answer format: True/False boolean responses")
print(f"   ‚Ä¢ Ready for Chain-of-Thought generation")

üîç LOADING STRATEGYQA DATASET...
üìä STRATEGYQA DATASET ANALYSIS:
Total samples: 1603
Sample keys: ['qid', 'term', 'description', 'question', 'answer', 'facts']

üîç FIRST SAMPLE STRUCTURE:
   qid: 4fd64bb6ce5b78ab20b6
   term: Mixed martial arts
   description: full contact combat sport
   question: Is Mixed martial arts totally original from Roman Colosseum games?
   answer: False
   facts: Mixed Martial arts in the UFC takes place in an enclosed structure called The Octagon. The Roman Col...

‚úÖ STRATEGYQA DATASET LOADED SUCCESSFULLY!
   ‚Ä¢ Total samples: 1603
   ‚Ä¢ Contains: term, description, question, facts, answer
   ‚Ä¢ Answer format: True/False boolean responses
   ‚Ä¢ Ready for Chain-of-Thought generation
üìä STRATEGYQA DATASET ANALYSIS:
Total samples: 1603
Sample keys: ['qid', 'term', 'description', 'question', 'answer', 'facts']

üîç FIRST SAMPLE STRUCTURE:
   qid: 4fd64bb6ce5b78ab20b6
   term: Mixed martial arts
   description: full contact combat sport
   questio

In [6]:
# STRATEGYQA CoT GENERATION FUNCTION
def generate_cot_strategyqa(sample, prompt_type="comprehensive"):
    """
    Enhanced CoT generation specifically designed for StrategyQA dataset
    
    StrategyQA requires reasoning with external knowledge, using:
    - term: Key concept
    - description: Definition of the concept  
    - facts: Supporting evidence/context
    - question: Yes/No question to answer
    """
    
    if prompt_type == "comprehensive":
        # RECOMMENDED: Include all context for best reasoning
        prompt = f"""Please solve this yes/no question step by step with clear reasoning.

**Context:**
Term: {sample['term']}
Description: {sample['description']}

**Supporting Facts:**
{sample['facts']}

**Question:** {sample['question']}

Please provide a step-by-step chain-of-thought analysis and conclude with a clear yes/no answer."""

    elif prompt_type == "question_focused":
        # Alternative: Focus mainly on question + facts
        prompt = f"""Please solve this yes/no question step by step with clear reasoning.

**Background:** {sample['term']} - {sample['description']}

**Key Facts:**
{sample['facts']}

**Question:** {sample['question']}

Please analyze this step by step and provide a clear yes/no answer."""

    elif prompt_type == "minimal":
        # Minimal approach (not recommended for StrategyQA)
        prompt = f"""Please solve this step by step:

**Question:** {sample['question']}

**Relevant Facts:**
{sample['facts']}

Provide step-by-step reasoning and a clear yes/no answer."""

    else:  # default
        prompt = f"Question: {sample['question']}\nLet me think step by step and provide a yes/no answer."
    
    try:
        response = model.generate_content(prompt)
        trace = response.text
        
        # Extract Yes/No answer from CoT
        # StrategyQA answers are boolean (True/False)
        
        # Method 1: Look for explicit yes/no patterns
        yes_no_patterns = [
            r'(?:final\s+)?(?:answer|conclusion)[\s:]*(?:is\s+)?(?:\*\*)?(?:yes|no)(?:\*\*)?',
            r'(?:the\s+)?answer[\s:]+(?:\*\*)?(?:yes|no)(?:\*\*)?',
            r'(?:\*\*)?(?:yes|no)(?:\*\*)?(?:\s*[\.!])?$',  # Yes/No at end of line
            r'(?:therefore|so|thus)[\s,]*(?:the\s+answer\s+is\s+)?(?:\*\*)?(?:yes|no)(?:\*\*)?',
        ]
        
        extracted_answer = None
        for pattern in yes_no_patterns:
            matches = re.findall(pattern, trace, re.IGNORECASE)
            if matches:
                # Get the last match and extract yes/no
                last_match = matches[-1].lower()
                if 'yes' in last_match:
                    extracted_answer = "True"
                elif 'no' in last_match:
                    extracted_answer = "False"
                break
        
        # Method 2: Fallback - look for yes/no anywhere in last few lines
        if not extracted_answer:
            lines = trace.split('\n')
            for line in reversed(lines[-5:]):  # Check last 5 lines
                line_lower = line.lower()
                if 'yes' in line_lower and 'no' not in line_lower:
                    extracted_answer = "True"
                    break
                elif 'no' in line_lower and 'yes' not in line_lower:
                    extracted_answer = "False"
                    break
        
        # Convert gold answer to string for consistency
        gold_answer = str(sample['answer'])
        
        return {
            "qid": sample['qid'],
            "term": sample['term'],
            "description": sample['description'],
            "question": sample['question'],
            "facts": sample['facts'],
            "cot": trace,
            "ans": extracted_answer if extracted_answer else gold_answer,  # Fallback to gold if extraction fails
            "gold": gold_answer,
            "domain": "strategyqa",
            "prompt_type": prompt_type
        }
        
    except Exception as e:
        print(f"Error generating content for StrategyQA problem: {sample['qid']}")
        print(f"Error details: {e}")
        return {
            "qid": sample['qid'],
            "term": sample['term'],
            "description": sample['description'],
            "question": sample['question'],
            "facts": sample['facts'],
            "cot": f"Error: {e}",
            "ans": str(sample['answer']),
            "gold": str(sample['answer']),
            "domain": "strategyqa",
            "prompt_type": prompt_type
        }

print("‚úÖ STRATEGYQA CoT generation function created!")
print("\nüìã AVAILABLE PROMPT TYPES:")
print("1. 'comprehensive' (RECOMMENDED) - Includes term, description, facts, and question")
print("2. 'question_focused' - Focus on question with background context")
print("3. 'minimal' - Question + facts only (not recommended)")
print("4. 'default' - Simple approach")

print("\nüéØ RECOMMENDED CHOICE: 'comprehensive'")
print("   REASON: StrategyQA questions require external knowledge.")
print("   This prompt type provides:")
print("   ‚Ä¢ Complete context about the key term/concept")
print("   ‚Ä¢ Detailed supporting facts for reasoning")
print("   ‚Ä¢ Clear question structure for yes/no decision")
print("   ‚Ä¢ Best performance expected for complex reasoning tasks")

‚úÖ STRATEGYQA CoT generation function created!

üìã AVAILABLE PROMPT TYPES:
1. 'comprehensive' (RECOMMENDED) - Includes term, description, facts, and question
2. 'question_focused' - Focus on question with background context
3. 'minimal' - Question + facts only (not recommended)
4. 'default' - Simple approach

üéØ RECOMMENDED CHOICE: 'comprehensive'
   REASON: StrategyQA questions require external knowledge.
   This prompt type provides:
   ‚Ä¢ Complete context about the key term/concept
   ‚Ä¢ Detailed supporting facts for reasoning
   ‚Ä¢ Clear question structure for yes/no decision
   ‚Ä¢ Best performance expected for complex reasoning tasks


In [7]:
# STRATEGYQA DATASET PREPARATION AND CONFIGURATION
print("üîÑ PREPARING STRATEGYQA DATASET FOR COT GENERATION...")

# Prepare StrategyQA questions (you can select subset for testing)
strategyqa_train = strategyqa_ds
strategyqa_questions = []

for sample in strategyqa_train:
    strategyqa_questions.append({
        'qid': sample['qid'],
        'term': sample['term'],
        'description': sample['description'],
        'question': sample['question'],
        'facts': sample['facts'],
        'answer': sample['answer']
    })

print(f"üìä StrategyQA Dataset prepared: {len(strategyqa_questions)} problems")

# Show sample of different answer types
answer_distribution = {}
for q in strategyqa_questions[:100]:  # Check first 100
    ans = str(q['answer'])
    answer_distribution[ans] = answer_distribution.get(ans, 0) + 1
print(f"   Answer distribution (first 100): {answer_distribution}")

# STRATEGYQA GENERATION CONFIGURATION
STRATEGYQA_TOTAL_SAMPLES = 1500  # USER REQUESTED: 1500 samples
STRATEGYQA_CHECKPOINT_EVERY = 25  # Save progress every N samples
STRATEGYQA_CHECKPOINT_FILE = "cot_strategyqa_checkpoint.json"

# Load existing progress if any - FIXED LOGIC
strategyqa_dataset = []
strategyqa_start_index = 0

if os.path.exists(STRATEGYQA_CHECKPOINT_FILE):
    try:
        with open(STRATEGYQA_CHECKPOINT_FILE, 'r') as f:
            strategyqa_dataset = json.load(f)
        strategyqa_start_index = len(strategyqa_dataset)
        print(f"üìÅ FOUND CHECKPOINT: {strategyqa_start_index} samples already completed")
        print(f"üìä Progress: {strategyqa_start_index}/{STRATEGYQA_TOTAL_SAMPLES} ({strategyqa_start_index/STRATEGYQA_TOTAL_SAMPLES*100:.1f}%)")
        
        if strategyqa_start_index > 0:
            print(f"‚úÖ Resuming from sample #{strategyqa_start_index + 1}")
            print(f"üöÄ Remaining: {STRATEGYQA_TOTAL_SAMPLES - strategyqa_start_index} samples")
        
    except Exception as e:
        print(f"‚ùå Error loading StrategyQA checkpoint: {e}")
        print("üîÑ Starting fresh...")
        strategyqa_dataset = []
        strategyqa_start_index = 0
else:
    print("üìÑ No checkpoint found - starting from beginning")

print(f"\nüöÄ STRATEGYQA COT GENERATION STATUS:")
print(f"   ‚Ä¢ Total target: {STRATEGYQA_TOTAL_SAMPLES}")
print(f"   ‚Ä¢ Already completed: {strategyqa_start_index}")
print(f"   ‚Ä¢ Remaining: {STRATEGYQA_TOTAL_SAMPLES - strategyqa_start_index}")
print(f"   ‚Ä¢ Checkpoint every: {STRATEGYQA_CHECKPOINT_EVERY} samples")
print(f"   ‚Ä¢ Recommended prompt: 'comprehensive'")
print(f"   ‚Ä¢ Rate limit: 4 seconds between requests")
print(f"   ‚Ä¢ Estimated time: ~{((STRATEGYQA_TOTAL_SAMPLES - strategyqa_start_index) * 4) // 60} minutes")
print("="*60)

üîÑ PREPARING STRATEGYQA DATASET FOR COT GENERATION...
üìä StrategyQA Dataset prepared: 1603 problems
   Answer distribution (first 100): {'False': 58, 'True': 42}
üìÅ FOUND CHECKPOINT: 1000 samples already completed
üìä Progress: 1000/1500 (66.7%)
‚úÖ Resuming from sample #1001
üöÄ Remaining: 500 samples

üöÄ STRATEGYQA COT GENERATION STATUS:
   ‚Ä¢ Total target: 1500
   ‚Ä¢ Already completed: 1000
   ‚Ä¢ Remaining: 500
   ‚Ä¢ Checkpoint every: 25 samples
   ‚Ä¢ Recommended prompt: 'comprehensive'
   ‚Ä¢ Rate limit: 4 seconds between requests
   ‚Ä¢ Estimated time: ~33 minutes


In [None]:
# STRATEGYQA COT GENERATION LOOP - FIXED CHECKPOINT RESUMPTION
if strategyqa_start_index >= STRATEGYQA_TOTAL_SAMPLES:
    print("üéâ StrategyQA CoT generation already completed! All samples generated.")
    print(f"? Final file should be: cot_strategyqa_final_{len(strategyqa_dataset)}.json")
else:
    remaining_samples = STRATEGYQA_TOTAL_SAMPLES - strategyqa_start_index
    print(f"üöÄ Generating {remaining_samples} new StrategyQA CoT samples...")
    print(f"? Starting from sample #{strategyqa_start_index + 1}")
    print(f"?üíæ Checkpoints every {STRATEGYQA_CHECKPOINT_EVERY} samples")
    print("="*50)

    # Generate remaining samples - FIXED: Use correct slice and counter
    for i, sample in enumerate(tqdm(strategyqa_questions[strategyqa_start_index:STRATEGYQA_TOTAL_SAMPLES], 
                                  desc=f"Processing StrategyQA {strategyqa_start_index+1}-{STRATEGYQA_TOTAL_SAMPLES}", 
                                  unit="problem")):
        
        # Use comprehensive prompt for best reasoning with full context
        entry = generate_cot_strategyqa(sample, "comprehensive")
        strategyqa_dataset.append(entry)
        
        # Checkpoint saving with Windows-compatible atomic write
        if len(strategyqa_dataset) % STRATEGYQA_CHECKPOINT_EVERY == 0:
            temp_file = f"temp_{STRATEGYQA_CHECKPOINT_FILE}"
            with open(temp_file, "w") as f:
                json.dump(strategyqa_dataset, f, indent=2)
            
            # Windows-compatible atomic write
            if os.path.exists(STRATEGYQA_CHECKPOINT_FILE):
                os.remove(STRATEGYQA_CHECKPOINT_FILE)
            os.rename(temp_file, STRATEGYQA_CHECKPOINT_FILE)
            print(f"üíæ StrategyQA Checkpoint: {len(strategyqa_dataset)}/{STRATEGYQA_TOTAL_SAMPLES} samples saved")
        
        time.sleep(4)  # Rate limit delay

    # Final save
    with open(STRATEGYQA_CHECKPOINT_FILE, "w") as f:
        json.dump(strategyqa_dataset, f, indent=2)
    
    print(f"‚úÖ StrategyQA Generation complete! {len(strategyqa_dataset)} samples saved to {STRATEGYQA_CHECKPOINT_FILE}")

    # Create final numbered file
    strategyqa_final_file = f"cot_strategyqa_final_{len(strategyqa_dataset)}.json"
    with open(strategyqa_final_file, "w") as f:
        json.dump(strategyqa_dataset, f, indent=2)
    print(f"üèÅ Final StrategyQA dataset: {strategyqa_final_file}")
    
    # Show sample of generated data
    if strategyqa_dataset:
        print(f"\nüìã SAMPLE GENERATED STRATEGYQA COT:")
        sample_entry = strategyqa_dataset[0]
        print(f"   QID: {sample_entry['qid']}")
        print(f"   Term: {sample_entry['term']}")
        print(f"   Question: {sample_entry['question'][:100]}...")
        print(f"   Generated Answer: {sample_entry['ans']}")
        print(f"   Gold Answer: {sample_entry['gold']}")
        print(f"   CoT Length: {len(sample_entry['cot'])} characters")
        
        # Check accuracy on generated samples
        correct = sum(1 for entry in strategyqa_dataset if entry['ans'] == entry['gold'])
        accuracy = correct / len(strategyqa_dataset) * 100
        print(f"   Current Accuracy: {accuracy:.1f}% ({correct}/{len(strategyqa_dataset)})")

In [None]:
# TEST: Demonstrate StrategyQA CoT Generation
print("üß™ TESTING STRATEGYQA CoT GENERATION:")
print("="*50)

# Test with first StrategyQA problem
test_sample = strategyqa_questions[0]
print(f"üìã TEST PROBLEM:")
print(f"   QID: {test_sample['qid']}")
print(f"   Term: {test_sample['term']}")
print(f"   Description: {test_sample['description']}")
print(f"   Question: {test_sample['question']}")
print(f"   Facts: {test_sample['facts'][:200]}...")
print(f"   Gold Answer: {test_sample['answer']}")

print(f"\nü§ñ GENERATING CoT WITH 'comprehensive' PROMPT...")
# Uncomment the line below to test (will use 1 API call)
# result = generate_cot_strategyqa(test_sample, "comprehensive")
# print(f"Generated CoT: {result['cot'][:300]}...")
# print(f"Extracted Answer: {result['ans']}")
# print(f"Gold Answer: {result['gold']}")
# print(f"Correct: {'‚úÖ' if result['ans'] == result['gold'] else '‚ùå'}")

print(f"\n‚ö° TO TEST: Uncomment the lines above (uses 1 API call)")
print(f"‚ö° TO START FULL GENERATION: Run the generation loop cell")
print(f"üìä This will generate {STRATEGYQA_TOTAL_SAMPLES} CoT samples for StrategyQA dataset")
print(f"‚è±Ô∏è  Estimated time: ~{(STRATEGYQA_TOTAL_SAMPLES * 4) // 60} minutes")
print("="*50)

In [7]:
# CHECKPOINT STATUS CHECK - Run this first to see current state
print("üîç CHECKING STRATEGYQA CHECKPOINT STATUS:")
print("="*50)

import os
import json

STRATEGYQA_CHECKPOINT_FILE = "cot_strategyqa_checkpoint.json"
STRATEGYQA_TOTAL_SAMPLES = 1500

if os.path.exists(STRATEGYQA_CHECKPOINT_FILE):
    try:
        with open(STRATEGYQA_CHECKPOINT_FILE, 'r') as f:
            existing_data = json.load(f)
        
        completed_samples = len(existing_data)
        remaining_samples = STRATEGYQA_TOTAL_SAMPLES - completed_samples
        
        print(f"‚úÖ CHECKPOINT FILE FOUND!")
        print(f"üìä Current Status:")
        print(f"   ‚Ä¢ Completed samples: {completed_samples}")
        print(f"   ‚Ä¢ Target samples: {STRATEGYQA_TOTAL_SAMPLES}")
        print(f"   ‚Ä¢ Remaining samples: {remaining_samples}")
        print(f"   ‚Ä¢ Progress: {completed_samples/STRATEGYQA_TOTAL_SAMPLES*100:.1f}%")
        
        if completed_samples >= STRATEGYQA_TOTAL_SAMPLES:
            print(f"üéâ GENERATION ALREADY COMPLETE!")
            print(f"üìÅ Checkpoint file: {STRATEGYQA_CHECKPOINT_FILE}")
            
            # Check if final file exists
            final_file = f"cot_strategyqa_final_{completed_samples}.json"
            if os.path.exists(final_file):
                print(f"üìÅ Final file exists: {final_file}")
            else:
                print(f"‚ö†Ô∏è  Final file missing - will be created when you run generation cell")
        else:
            print(f"üöÄ READY TO RESUME from sample #{completed_samples + 1}")
            print(f"‚è±Ô∏è  Estimated time: ~{(remaining_samples * 4) // 60} minutes")
        
        # Show sample of existing data
        if existing_data:
            print(f"\nüìã SAMPLE FROM EXISTING DATA:")
            sample = existing_data[0]
            print(f"   QID: {sample.get('qid', 'N/A')}")
            print(f"   Question: {sample.get('question', 'N/A')[:80]}...")
            print(f"   Answer: {sample.get('ans', 'N/A')}")
            print(f"   Gold: {sample.get('gold', 'N/A')}")
            
    except Exception as e:
        print(f"‚ùå ERROR reading checkpoint file: {e}")
        print(f"üîÑ Checkpoint file exists but corrupted - will start fresh")
else:
    print(f"üìÑ NO CHECKPOINT FILE FOUND")
    print(f"üöÄ Will start from beginning")
    print(f"‚è±Ô∏è  Estimated time: ~{(STRATEGYQA_TOTAL_SAMPLES * 4) // 60} minutes for all {STRATEGYQA_TOTAL_SAMPLES} samples")

print("="*50)

üîç CHECKING STRATEGYQA CHECKPOINT STATUS:
‚úÖ CHECKPOINT FILE FOUND!
üìä Current Status:
   ‚Ä¢ Completed samples: 1000
   ‚Ä¢ Target samples: 1500
   ‚Ä¢ Remaining samples: 500
   ‚Ä¢ Progress: 66.7%
üöÄ READY TO RESUME from sample #1001
‚è±Ô∏è  Estimated time: ~33 minutes

üìã SAMPLE FROM EXISTING DATA:
   QID: 4fd64bb6ce5b78ab20b6
   Question: Is Mixed martial arts totally original from Roman Colosseum games?...
   Answer: False
   Gold: False


In [8]:
# SAFER STRATEGYQA GENERATION - SMALL BATCH APPROACH
print("üîß SAFER STRATEGYQA GENERATION WITH SMALL BATCHES")
print("="*60)

import gc  # Garbage collection to help with memory
import sys
import os
import json
import time

# Configuration for safer generation
BATCH_SIZE = 10  # Process only 10 samples at a time
STRATEGYQA_CHECKPOINT_FILE = "cot_strategyqa_checkpoint.json"
STRATEGYQA_TOTAL_SAMPLES = 1500

# Check current status
if os.path.exists(STRATEGYQA_CHECKPOINT_FILE):
    with open(STRATEGYQA_CHECKPOINT_FILE, 'r') as f:
        strategyqa_dataset = json.load(f)
    strategyqa_start_index = len(strategyqa_dataset)
else:
    strategyqa_dataset = []
    strategyqa_start_index = 0

remaining_samples = STRATEGYQA_TOTAL_SAMPLES - strategyqa_start_index

print(f"üìä CURRENT STATUS:")
print(f"   ‚Ä¢ Completed: {strategyqa_start_index}")
print(f"   ‚Ä¢ Remaining: {remaining_samples}")
print(f"   ‚Ä¢ Batch size: {BATCH_SIZE}")

if remaining_samples <= 0:
    print("üéâ Already complete!")
else:
    print(f"\nüöÄ PROCESSING BATCH OF {min(BATCH_SIZE, remaining_samples)} SAMPLES...")
    print("=" * 50)
    
    batch_end = min(strategyqa_start_index + BATCH_SIZE, STRATEGYQA_TOTAL_SAMPLES)
    batch_samples = strategyqa_questions[strategyqa_start_index:batch_end]
    
    # Process batch with individual progress
    for i, sample in enumerate(batch_samples):
        current_sample = strategyqa_start_index + i + 1
        print(f"üîÑ Processing sample {current_sample}/{STRATEGYQA_TOTAL_SAMPLES} (QID: {sample['qid']})")
        
        try:
            # Generate with timeout protection
            entry = generate_cot_strategyqa(sample, "comprehensive")
            strategyqa_dataset.append(entry)
            
            print(f"   ‚úÖ Generated (Answer: {entry['ans']}, Length: {len(entry['cot'])} chars)")
            
            # Immediate checkpoint save for safety
            if len(strategyqa_dataset) % 5 == 0:  # Save every 5 samples in batch mode
                with open(STRATEGYQA_CHECKPOINT_FILE, "w") as f:
                    json.dump(strategyqa_dataset, f, indent=2)
                print(f"   üíæ Checkpoint saved: {len(strategyqa_dataset)} samples")
            
            # Memory cleanup
            gc.collect()
            
            # Rate limiting
            time.sleep(4)
            
        except Exception as e:
            print(f"   ‚ùå ERROR on sample {current_sample}: {e}")
            # Continue with next sample rather than crashing
            continue
    
    # Final save
    with open(STRATEGYQA_CHECKPOINT_FILE, "w") as f:
        json.dump(strategyqa_dataset, f, indent=2)
    
    print(f"\n‚úÖ BATCH COMPLETE!")
    print(f"   ‚Ä¢ Processed: {len(batch_samples)} samples")
    print(f"   ‚Ä¢ Total completed: {len(strategyqa_dataset)}")
    print(f"   ‚Ä¢ Remaining: {STRATEGYQA_TOTAL_SAMPLES - len(strategyqa_dataset)}")
    
    if len(strategyqa_dataset) >= STRATEGYQA_TOTAL_SAMPLES:
        print("üéâ FULL GENERATION COMPLETE!")
        # Create final file
        final_file = f"cot_strategyqa_final_{len(strategyqa_dataset)}.json"
        with open(final_file, "w") as f:
            json.dump(strategyqa_dataset, f, indent=2)
        print(f"üèÅ Final dataset saved: {final_file}")
    else:
        print("üîÑ Run this cell again to process the next batch")

print("="*60)

üîß SAFER STRATEGYQA GENERATION WITH SMALL BATCHES
üìä CURRENT STATUS:
   ‚Ä¢ Completed: 1000
   ‚Ä¢ Remaining: 500
   ‚Ä¢ Batch size: 10

üöÄ PROCESSING BATCH OF 10 SAMPLES...
üîÑ Processing sample 1001/1500 (QID: 9885d9bb4506cdf4f2cd)
   ‚úÖ Generated (Answer: False, Length: 1247 chars)
üîÑ Processing sample 1002/1500 (QID: b65adb5caa4f7a207879)
   ‚úÖ Generated (Answer: False, Length: 1660 chars)
üîÑ Processing sample 1003/1500 (QID: 42025ba75ec5d0f0f291)
   ‚úÖ Generated (Answer: False, Length: 1667 chars)
üîÑ Processing sample 1004/1500 (QID: 7f8417d42ef9ea1e5a8c)
   ‚úÖ Generated (Answer: True, Length: 2007 chars)
üîÑ Processing sample 1005/1500 (QID: 4ba70839df733c61f9a5)
   ‚úÖ Generated (Answer: False, Length: 1321 chars)
   üíæ Checkpoint saved: 1005 samples
üîÑ Processing sample 1006/1500 (QID: 80ca095e38ab73b908ff)
   ‚úÖ Generated (Answer: False, Length: 2711 chars)
üîÑ Processing sample 1007/1500 (QID: 5d4ae0455b8641ff0c03)
   ‚úÖ Generated (Answer: True, Lengt

In [9]:
# COMPLETE REMAINING STRATEGYQA GENERATION (1010 ‚Üí 1500)
print("üöÄ COMPLETING REMAINING STRATEGYQA SAMPLES")
print("="*60)

import os
import json
import time
from tqdm import tqdm

STRATEGYQA_CHECKPOINT_FILE = "cot_strategyqa_checkpoint.json"
STRATEGYQA_TOTAL_SAMPLES = 1500
STRATEGYQA_CHECKPOINT_EVERY = 25

# Load existing progress
if os.path.exists(STRATEGYQA_CHECKPOINT_FILE):
    with open(STRATEGYQA_CHECKPOINT_FILE, 'r') as f:
        strategyqa_dataset = json.load(f)
    strategyqa_start_index = len(strategyqa_dataset)
else:
    strategyqa_dataset = []
    strategyqa_start_index = 0

if strategyqa_start_index >= STRATEGYQA_TOTAL_SAMPLES:
    print("üéâ StrategyQA CoT generation already completed! All samples generated.")
    
    # Create final file if it doesn't exist
    final_file = f"cot_strategyqa_final_{len(strategyqa_dataset)}.json"
    if not os.path.exists(final_file):
        with open(final_file, "w") as f:
            json.dump(strategyqa_dataset, f, indent=2)
        print(f"üèÅ Final dataset created: {final_file}")
    else:
        print(f"üìÅ Final file already exists: {final_file}")
else:
    remaining_samples = STRATEGYQA_TOTAL_SAMPLES - strategyqa_start_index
    print(f"üöÄ Generating {remaining_samples} remaining StrategyQA CoT samples...")
    print(f"üìç Starting from sample #{strategyqa_start_index + 1}")
    print(f"üíæ Checkpoints every {STRATEGYQA_CHECKPOINT_EVERY} samples")
    print("="*50)

    # Generate remaining samples
    for i, sample in enumerate(tqdm(strategyqa_questions[strategyqa_start_index:STRATEGYQA_TOTAL_SAMPLES], 
                                  desc=f"Processing StrategyQA {strategyqa_start_index+1}-{STRATEGYQA_TOTAL_SAMPLES}", 
                                  unit="problem")):
        
        # Use comprehensive prompt for best reasoning with full context
        entry = generate_cot_strategyqa(sample, "comprehensive")
        strategyqa_dataset.append(entry)
        
        # Checkpoint saving with Windows-compatible atomic write
        if len(strategyqa_dataset) % STRATEGYQA_CHECKPOINT_EVERY == 0:
            temp_file = f"temp_{STRATEGYQA_CHECKPOINT_FILE}"
            with open(temp_file, "w") as f:
                json.dump(strategyqa_dataset, f, indent=2)
            
            # Windows-compatible atomic write
            if os.path.exists(STRATEGYQA_CHECKPOINT_FILE):
                os.remove(STRATEGYQA_CHECKPOINT_FILE)
            os.rename(temp_file, STRATEGYQA_CHECKPOINT_FILE)
            print(f"üíæ StrategyQA Checkpoint: {len(strategyqa_dataset)}/{STRATEGYQA_TOTAL_SAMPLES} samples saved")
        
        time.sleep(4)  # Rate limit delay

    # Final save
    with open(STRATEGYQA_CHECKPOINT_FILE, "w") as f:
        json.dump(strategyqa_dataset, f, indent=2)
    
    print(f"‚úÖ StrategyQA Generation complete! {len(strategyqa_dataset)} samples saved to {STRATEGYQA_CHECKPOINT_FILE}")

    # Create final numbered file
    strategyqa_final_file = f"cot_strategyqa_final_{len(strategyqa_dataset)}.json"
    with open(strategyqa_final_file, "w") as f:
        json.dump(strategyqa_dataset, f, indent=2)
    print(f"üèÅ Final StrategyQA dataset: {strategyqa_final_file}")
    
    # Show sample of generated data and accuracy
    if strategyqa_dataset:
        print(f"\nüìã SAMPLE GENERATED STRATEGYQA COT:")
        sample_entry = strategyqa_dataset[0]
        print(f"   QID: {sample_entry['qid']}")
        print(f"   Term: {sample_entry['term']}")
        print(f"   Question: {sample_entry['question'][:100]}...")
        print(f"   Generated Answer: {sample_entry['ans']}")
        print(f"   Gold Answer: {sample_entry['gold']}")
        print(f"   CoT Length: {len(sample_entry['cot'])} characters")
        
        # Check accuracy on all samples
        correct = sum(1 for entry in strategyqa_dataset if entry['ans'] == entry['gold'])
        accuracy = correct / len(strategyqa_dataset) * 100
        print(f"   üìä Final Accuracy: {accuracy:.1f}% ({correct}/{len(strategyqa_dataset)})")

print("="*60)

üöÄ COMPLETING REMAINING STRATEGYQA SAMPLES
üöÄ Generating 490 remaining StrategyQA CoT samples...
üìç Starting from sample #1011
üíæ Checkpoints every 25 samples


Processing StrategyQA 1011-1500:   3%|‚ñé         | 14/490 [01:21<47:35,  6.00s/problem]

üíæ StrategyQA Checkpoint: 1025/1500 samples saved


Processing StrategyQA 1011-1500:   8%|‚ñä         | 39/490 [03:47<44:11,  5.88s/problem]

üíæ StrategyQA Checkpoint: 1050/1500 samples saved


Processing StrategyQA 1011-1500:  13%|‚ñà‚ñé        | 64/490 [06:18<47:34,  6.70s/problem]

üíæ StrategyQA Checkpoint: 1075/1500 samples saved


Processing StrategyQA 1011-1500:  18%|‚ñà‚ñä        | 89/490 [08:44<38:21,  5.74s/problem]

üíæ StrategyQA Checkpoint: 1100/1500 samples saved


Processing StrategyQA 1011-1500:  23%|‚ñà‚ñà‚ñé       | 114/490 [11:14<38:05,  6.08s/problem]

üíæ StrategyQA Checkpoint: 1125/1500 samples saved


Processing StrategyQA 1011-1500:  28%|‚ñà‚ñà‚ñä       | 139/490 [13:46<38:02,  6.50s/problem]

üíæ StrategyQA Checkpoint: 1150/1500 samples saved


Processing StrategyQA 1011-1500:  33%|‚ñà‚ñà‚ñà‚ñé      | 164/490 [16:17<33:52,  6.23s/problem]

üíæ StrategyQA Checkpoint: 1175/1500 samples saved


Processing StrategyQA 1011-1500:  39%|‚ñà‚ñà‚ñà‚ñä      | 189/490 [18:49<28:38,  5.71s/problem]

üíæ StrategyQA Checkpoint: 1200/1500 samples saved


Processing StrategyQA 1011-1500:  44%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 214/490 [21:18<26:13,  5.70s/problem]

üíæ StrategyQA Checkpoint: 1225/1500 samples saved


Processing StrategyQA 1011-1500:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 239/490 [23:46<25:05,  6.00s/problem]

üíæ StrategyQA Checkpoint: 1250/1500 samples saved


Processing StrategyQA 1011-1500:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 264/490 [26:16<23:02,  6.12s/problem]

üíæ StrategyQA Checkpoint: 1275/1500 samples saved


Processing StrategyQA 1011-1500:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 289/490 [28:42<20:32,  6.13s/problem]

üíæ StrategyQA Checkpoint: 1300/1500 samples saved


Processing StrategyQA 1011-1500:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 314/490 [31:25<20:56,  7.14s/problem]

üíæ StrategyQA Checkpoint: 1325/1500 samples saved


Processing StrategyQA 1011-1500:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 339/490 [33:54<14:48,  5.89s/problem]

üíæ StrategyQA Checkpoint: 1350/1500 samples saved


Processing StrategyQA 1011-1500:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 364/490 [36:23<12:38,  6.02s/problem]

üíæ StrategyQA Checkpoint: 1375/1500 samples saved


Processing StrategyQA 1011-1500:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 389/490 [39:08<12:29,  7.42s/problem]

üíæ StrategyQA Checkpoint: 1400/1500 samples saved


Processing StrategyQA 1011-1500:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 414/490 [41:42<07:35,  5.99s/problem]

üíæ StrategyQA Checkpoint: 1425/1500 samples saved


Processing StrategyQA 1011-1500:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 439/490 [44:11<05:08,  6.05s/problem]

üíæ StrategyQA Checkpoint: 1450/1500 samples saved


Processing StrategyQA 1011-1500:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 464/490 [46:44<02:35,  5.99s/problem]

üíæ StrategyQA Checkpoint: 1475/1500 samples saved


Processing StrategyQA 1011-1500: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 489/490 [49:15<00:05,  5.90s/problem]

üíæ StrategyQA Checkpoint: 1500/1500 samples saved


Processing StrategyQA 1011-1500: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 490/490 [49:21<00:00,  6.04s/problem]

‚úÖ StrategyQA Generation complete! 1500 samples saved to cot_strategyqa_checkpoint.json
üèÅ Final StrategyQA dataset: cot_strategyqa_final_1500.json

üìã SAMPLE GENERATED STRATEGYQA COT:
   QID: 4fd64bb6ce5b78ab20b6
   Term: Mixed martial arts
   Question: Is Mixed martial arts totally original from Roman Colosseum games?...
   Generated Answer: False
   Gold Answer: False
   CoT Length: 2988 characters
   üìä Final Accuracy: 94.6% (1419/1500)





In [10]:
# ADD 'is_correct' COLUMN TO STRATEGYQA DATASET
print("üîß ADDING 'is_correct' COLUMN TO STRATEGYQA DATASET")
print("="*60)

import json
import os

# File paths
input_file = "cot_strategyqa_final_1500.json"
output_file = "cot_strategyqa_final_1500_with_correct.json"

if os.path.exists(input_file):
    # Load the dataset
    with open(input_file, 'r') as f:
        strategyqa_data = json.load(f)
    
    print(f"üìä Loaded {len(strategyqa_data)} samples from {input_file}")
    
    # Add 'is_correct' column
    correct_count = 0
    for entry in strategyqa_data:
        # Compare generated answer with gold answer
        is_correct = entry['ans'] == entry['gold']
        entry['is_correct'] = is_correct
        
        if is_correct:
            correct_count += 1
    
    # Save the updated dataset
    with open(output_file, 'w') as f:
        json.dump(strategyqa_data, f, indent=2)
    
    # Calculate and display statistics
    total_samples = len(strategyqa_data)
    accuracy = (correct_count / total_samples) * 100
    
    print(f"‚úÖ Successfully added 'is_correct' column!")
    print(f"üìà Dataset Statistics:")
    print(f"   ‚Ä¢ Total samples: {total_samples}")
    print(f"   ‚Ä¢ Correct answers: {correct_count}")
    print(f"   ‚Ä¢ Incorrect answers: {total_samples - correct_count}")
    print(f"   ‚Ä¢ Accuracy: {accuracy:.2f}%")
    print(f"üìÅ Updated dataset saved as: {output_file}")
    
    # Show sample entries with the new column
    print(f"\nüìã SAMPLE ENTRIES WITH 'is_correct' COLUMN:")
    for i, entry in enumerate(strategyqa_data[:3]):
        print(f"\nSample {i+1}:")
        print(f"   QID: {entry['qid']}")
        print(f"   Question: {entry['question'][:80]}...")
        print(f"   Generated Answer: {entry['ans']}")
        print(f"   Gold Answer: {entry['gold']}")
        print(f"   Is Correct: {entry['is_correct']}")
    
    # Also update the original file (backup approach)
    backup_file = "cot_strategyqa_final_1500_backup.json"
    if not os.path.exists(backup_file):
        # Create backup of original
        with open(input_file, 'r') as f:
            original_data = json.load(f)
        with open(backup_file, 'w') as f:
            json.dump(original_data, f, indent=2)
        print(f"üíæ Backup created: {backup_file}")
    
    # Overwrite original file with updated data
    with open(input_file, 'w') as f:
        json.dump(strategyqa_data, f, indent=2)
    print(f"üîÑ Original file updated: {input_file}")
    
else:
    print(f"‚ùå File not found: {input_file}")
    print("   Make sure the StrategyQA final dataset file exists in the current directory")

print("="*60)

üîß ADDING 'is_correct' COLUMN TO STRATEGYQA DATASET
üìä Loaded 1500 samples from cot_strategyqa_final_1500.json
‚úÖ Successfully added 'is_correct' column!
üìà Dataset Statistics:
   ‚Ä¢ Total samples: 1500
   ‚Ä¢ Correct answers: 1419
   ‚Ä¢ Incorrect answers: 81
   ‚Ä¢ Accuracy: 94.60%
üìÅ Updated dataset saved as: cot_strategyqa_final_1500_with_correct.json

üìã SAMPLE ENTRIES WITH 'is_correct' COLUMN:

Sample 1:
   QID: 4fd64bb6ce5b78ab20b6
   Question: Is Mixed martial arts totally original from Roman Colosseum games?...
   Generated Answer: False
   Gold Answer: False
   Is Correct: True

Sample 2:
   QID: f378f856bdaff39cdfa3
   Question: Is the cuisine of Hawaii suitable for a vegan?...
   Generated Answer: False
   Gold Answer: False
   Is Correct: True

Sample 3:
   QID: 4e1b65e81ec09397b26e
   Question: Is capturing giant squid in natural habitat impossible with no gear?...
   Generated Answer: True
   Gold Answer: True
   Is Correct: True
üíæ Backup created: cot_stra

# üìö StrategyQA Chain-of-Thought Generation Setup

## üéØ **What We've Created:**

### **1. StrategyQA-Specific CoT Function**
- **`generate_cot_strategyqa()`** - Designed for yes/no reasoning with external knowledge
- **4 Prompt Types Available:**
  - `comprehensive` ‚≠ê **RECOMMENDED** - Includes term, description, facts, question
  - `question_focused` - Question + facts with background
  - `minimal` - Question + facts only
  - `default` - Simple approach

### **2. Dataset Structure**
- **Source:** `ChilleD/StrategyQA` (1603 samples total)
- **Target:** 1500 samples (as requested)
- **Output Format:** JSON with fields: `qid`, `term`, `description`, `question`, `facts`, `cot`, `ans`, `gold`, `domain`, `prompt_type`

### **3. Generation Configuration**
```python
STRATEGYQA_TOTAL_SAMPLES = 1500      # As requested by user
STRATEGYQA_CHECKPOINT_EVERY = 25     # Save every 25 samples
STRATEGYQA_CHECKPOINT_FILE = "cot_strategyqa_checkpoint.json"
```

## üèÜ **Why 'comprehensive' Prompt is Essential for StrategyQA:**

**StrategyQA Problems Require External Knowledge:**
- **Term**: "Mixed martial arts" 
- **Description**: "full contact combat sport"
- **Facts**: "Mixed Martial arts in the UFC takes place in an enclosed structure called The Octagon. The Roman Colosseum was an enclosed structure where gladiators would fight."
- **Question**: "Is Mixed martial arts totally original from Roman Colosseum games?"

**Why All Context is Needed:**
1. **External Knowledge Dependency** - Unlike math problems, requires background information
2. **Complex Reasoning** - Must connect historical facts with modern concepts
3. **Yes/No Decision Making** - Needs complete context for accurate boolean answers
4. **Fact Integration** - Must synthesize multiple pieces of evidence

## üöÄ **Generation Process:**
1. **Test First** - Uncomment test lines to verify one sample works
2. **Run Generation** - Execute the generation loop for 1500 samples
3. **Monitor Progress** - Checkpoints save every 25 samples
4. **Estimated Time** - ~100 minutes (1500 √ó 4 seconds + processing)

## üìä **Expected Output Quality:**
- **High-quality reasoning** with step-by-step analysis
- **Boolean extraction** (True/False) from CoT text
- **Context-aware answers** leveraging all provided information
- **Research-ready dataset** for thesis work

**Ready to generate 1500 StrategyQA Chain-of-Thought samples!**

In [11]:
# COMMONSENSEQA DATASET LOADING AND STRUCTURE ANALYSIS
print("üîç LOADING COMMONSENSEQA DATASET...")
print("="*60)

from datasets import load_dataset

# Load CommonSenseQA dataset
try:
    commonsenseqa_ds = load_dataset("ChilleD/CommonSenseQA")
    print(f"‚úÖ Successfully loaded ChilleD/CommonSenseQA dataset!")
    
    print(f"\nüìä DATASET OVERVIEW:")
    print(f"Available splits: {list(commonsenseqa_ds.keys())}")
    
    # Analyze each split
    for split_name, split_data in commonsenseqa_ds.items():
        print(f"\nüîç {split_name.upper()} SPLIT:")
        print(f"   ‚Ä¢ Total samples: {len(split_data)}")
        print(f"   ‚Ä¢ Sample keys: {list(split_data[0].keys())}")
    
    # Focus on train split for detailed analysis
    train_data = commonsenseqa_ds['train']
    print(f"\nüìã DETAILED STRUCTURE ANALYSIS (TRAIN SPLIT):")
    print(f"="*50)
    
    # Show first sample with full structure
    sample = train_data[0]
    print(f"üîç FIRST SAMPLE BREAKDOWN:")
    for key, value in sample.items():
        if isinstance(value, str):
            preview = value[:100] + "..." if len(value) > 100 else value
        elif isinstance(value, list):
            preview = f"List with {len(value)} items: {value}"
        else:
            preview = str(value)
        print(f"   {key}: {preview}")
    
    # Show a few more samples to understand variety
    print(f"\nüìù SAMPLE QUESTIONS (First 3):")
    for i in range(min(3, len(train_data))):
        sample = train_data[i]
        print(f"\nExample {i+1}:")
        print(f"   ID: {sample.get('id', 'N/A')}")
        print(f"   Question: {sample.get('question', 'N/A')}")
        print(f"   Choices: {sample.get('choices', 'N/A')}")
        print(f"   Answer Key: {sample.get('answerKey', 'N/A')}")
        if 'question_concept' in sample:
            print(f"   Concept: {sample.get('question_concept', 'N/A')}")
    
    # Analyze answer distribution
    print(f"\nüìä ANSWER DISTRIBUTION ANALYSIS:")
    if 'answerKey' in train_data[0]:
        answer_dist = {}
        for sample in train_data[:100]:  # Sample first 100
            answer = sample.get('answerKey', 'Unknown')
            answer_dist[answer] = answer_dist.get(answer, 0) + 1
        print(f"   Answer distribution (first 100): {answer_dist}")
    
    # Analyze choice structure
    print(f"\nüî§ CHOICE STRUCTURE ANALYSIS:")
    if 'choices' in train_data[0]:
        first_choices = train_data[0]['choices']
        print(f"   Choice format: {type(first_choices)}")
        if isinstance(first_choices, dict):
            print(f"   Choice keys: {list(first_choices.keys())}")
            if 'text' in first_choices:
                print(f"   Number of options: {len(first_choices['text'])}")
                print(f"   Choice labels: {first_choices.get('label', 'No labels')}")
                print(f"   Sample choices: {first_choices['text'][:3]}...")
        elif isinstance(first_choices, list):
            print(f"   Number of choices: {len(first_choices)}")
            print(f"   Sample choices: {first_choices}")
    
    print(f"\n‚úÖ COMMONSENSEQA DATASET LOADED AND ANALYZED!")
    print(f"   ‚Ä¢ Dataset: ChilleD/CommonSenseQA")
    print(f"   ‚Ä¢ Format: Multiple choice questions with commonsense reasoning")
    print(f"   ‚Ä¢ Ready for Chain-of-Thought generation")
    
except Exception as e:
    print(f"‚ùå Error loading CommonSenseQA dataset: {e}")
    print("   Checking if dataset exists or trying alternative names...")

print("="*60)

üîç LOADING COMMONSENSEQA DATASET...


README.md:   0%|          | 0.00/757 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/2.19M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


validation-00000-of-00001.parquet:   0%|          | 0.00/278k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


test-00000-of-00001.parquet:   0%|          | 0.00/264k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9741 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1221 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1140 [00:00<?, ? examples/s]

‚úÖ Successfully loaded ChilleD/CommonSenseQA dataset!

üìä DATASET OVERVIEW:
Available splits: ['train', 'validation', 'test']

üîç TRAIN SPLIT:
   ‚Ä¢ Total samples: 9741
   ‚Ä¢ Sample keys: ['id', 'question', 'question_concept', 'choices', 'answerKey', 'question_concat']

üîç VALIDATION SPLIT:
   ‚Ä¢ Total samples: 1221
   ‚Ä¢ Sample keys: ['id', 'question', 'question_concept', 'choices', 'answerKey', 'question_concat']

üîç TEST SPLIT:
   ‚Ä¢ Total samples: 1140
   ‚Ä¢ Sample keys: ['id', 'question', 'question_concept', 'choices', 'answerKey', 'question_concat']

üìã DETAILED STRUCTURE ANALYSIS (TRAIN SPLIT):
üîç FIRST SAMPLE BREAKDOWN:
   id: 075e483d21c29a511267ef62bedc0461
   question: The sanctions against the school were a punishing blow, and they seemed to what the efforts the scho...
   question_concept: punishing
   choices: {'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['ignore', 'enforce', 'authoritarian', 'yell at', 'avoid']}
   answerKey: A
   question_concat: The 

In [12]:
# COMMONSENSEQA PROMPT STRUCTURE RECOMMENDATIONS
print("üéØ COMMONSENSEQA CHAIN-OF-THOUGHT PROMPT STRATEGIES")
print("="*70)

def demonstrate_commonsenseqa_prompts():
    """
    Demonstrate different prompt structures for CommonSenseQA CoT generation
    """
    
    # Sample question for demonstration
    sample_question = {
        'id': '075e483d21c29a511267ef62bedc0461',
        'question': 'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?',
        'question_concept': 'punishing',
        'choices': {
            'label': ['A', 'B', 'C', 'D', 'E'], 
            'text': ['ignore', 'enforce', 'authoritarian', 'yell at', 'avoid']
        },
        'answerKey': 'A'
    }
    
    print("üìã SAMPLE QUESTION FOR PROMPT COMPARISON:")
    print(f"   Question: {sample_question['question']}")
    print(f"   Concept: {sample_question['question_concept']}")
    choices_str = ", ".join([f"{label}: {text}" for label, text in zip(sample_question['choices']['label'], sample_question['choices']['text'])])
    print(f"   Choices: {choices_str}")
    print(f"   Correct Answer: {sample_question['answerKey']}")
    
    print(f"\nüîç PROMPT STRUCTURE OPTIONS:")
    print("="*50)
    
    # OPTION 1: Concept-Guided Reasoning (RECOMMENDED)
    print("1Ô∏è‚É£ CONCEPT-GUIDED REASONING (‚≠ê RECOMMENDED)")
    concept_prompt = f"""I need to solve this commonsense reasoning question by understanding the key concept and analyzing each choice.

**Key Concept:** {sample_question['question_concept']}

**Question:** {sample_question['question']}

**Available Choices:**
A) {sample_question['choices']['text'][0]}
B) {sample_question['choices']['text'][1]}
C) {sample_question['choices']['text'][2]}
D) {sample_question['choices']['text'][3]}
E) {sample_question['choices']['text'][4]}

**Step-by-step reasoning:**
1. **Understand the concept**: What does "{sample_question['question_concept']}" mean in this context?
2. **Analyze the situation**: What is the question really asking?
3. **Evaluate each choice**: How does each option relate to the concept and situation?
4. **Apply commonsense**: Which choice makes the most logical sense?
5. **Final answer**: Select the best choice with reasoning.

Let me work through this systematically:"""
    
    print("üìù STRUCTURE:")
    print("   ‚Ä¢ Highlights the key concept upfront")
    print("   ‚Ä¢ Presents choices in clear A-E format")
    print("   ‚Ä¢ 5-step systematic reasoning process")
    print("   ‚Ä¢ Encourages concept-first analysis")
    
    # OPTION 2: Question-Focused Analysis
    print(f"\n2Ô∏è‚É£ QUESTION-FOCUSED ANALYSIS")
    question_prompt = f"""Please solve this commonsense reasoning question step by step.

**Question:** {sample_question['question']}

**Choices:**
A) {sample_question['choices']['text'][0]}
B) {sample_question['choices']['text'][1]}
C) {sample_question['choices']['text'][2]}
D) {sample_question['choices']['text'][3]}
E) {sample_question['choices']['text'][4]}

**Analysis approach:**
- What is the main situation described?
- What relationship or outcome is being asked about?
- Which choice best fits the logical flow?
- What does common sense tell us?

**Step-by-step reasoning:**"""
    
    print("üìù STRUCTURE:")
    print("   ‚Ä¢ Direct question-first approach")
    print("   ‚Ä¢ General analysis framework")
    print("   ‚Ä¢ Less structured than concept-guided")
    
    # OPTION 3: Elimination Strategy
    print(f"\n3Ô∏è‚É£ ELIMINATION STRATEGY")
    elimination_prompt = f"""Let me solve this by systematically eliminating incorrect choices.

**Question:** {sample_question['question']}

**All Choices:**
A) {sample_question['choices']['text'][0]}
B) {sample_question['choices']['text'][1]}
C) {sample_question['choices']['text'][2]}
D) {sample_question['choices']['text'][3]}
E) {sample_question['choices']['text'][4]}

**Elimination process:**
1. **Identify clearly wrong choices**: Which options don't make sense at all?
2. **Remove unlikely options**: Which choices are possible but improbable?
3. **Compare remaining choices**: Among the viable options, which is best?
4. **Final verification**: Does my choice answer the question logically?

**Reasoning:**"""
    
    print("üìù STRUCTURE:")
    print("   ‚Ä¢ Process of elimination approach")
    print("   ‚Ä¢ Good for complex multiple choice")
    print("   ‚Ä¢ May miss positive reasoning")
    
    # OPTION 4: Contextual Understanding
    print(f"\n4Ô∏è‚É£ CONTEXTUAL UNDERSTANDING")
    context_prompt = f"""I need to understand the full context and relationships in this question.

**Context Analysis:**
- Situation: {sample_question['question'].split(',')[0]}
- Key relationship: What effect is being described?
- Concept focus: "{sample_question['question_concept']}"

**Question:** {sample_question['question']}

**Options to consider:**
A) {sample_question['choices']['text'][0]} - How does this relate to the situation?
B) {sample_question['choices']['text'][1]} - What would this mean in context?
C) {sample_question['choices']['text'][2]} - Does this fit the relationship?
D) {sample_question['choices']['text'][3]} - Is this a logical outcome?
E) {sample_question['choices']['text'][4]} - How does this connect to the scenario?

**Reasoning through context:**"""
    
    print("üìù STRUCTURE:")
    print("   ‚Ä¢ Deep contextual analysis")
    print("   ‚Ä¢ Individual choice examination")
    print("   ‚Ä¢ Relationship-focused reasoning")
    
    print(f"\nüèÜ RECOMMENDATION ANALYSIS:")
    print("="*50)
    
    print("‚≠ê **RECOMMENDED: CONCEPT-GUIDED REASONING** ‚≠ê")
    print()
    print("üéØ **Why this approach works best for CommonSenseQA:**")
    print("   1. **Concept-First Design**: CommonSenseQA is built around key concepts")
    print("   2. **Systematic Process**: 5-step structure ensures thorough reasoning")
    print("   3. **Commonsense Focus**: Explicitly calls for commonsense application")
    print("   4. **Clear Choice Format**: A-E presentation matches dataset structure")
    print("   5. **Proven Effective**: Similar to successful CoT approaches in literature")
    
    print(f"\nüìä **Comparative Advantages:**")
    print("   ‚Ä¢ More structured than Question-Focused")
    print("   ‚Ä¢ More positive reasoning than Elimination")
    print("   ‚Ä¢ More systematic than Contextual Understanding")
    print("   ‚Ä¢ Leverages the concept field that makes CommonSenseQA unique")
    
    print(f"\n‚öôÔ∏è **Expected Benefits:**")
    print("   ‚Ä¢ Higher accuracy due to concept grounding")
    print("   ‚Ä¢ More consistent reasoning patterns")
    print("   ‚Ä¢ Better extraction reliability (clear A-E structure)")
    print("   ‚Ä¢ Robust performance across different question types")
    
    print(f"\nüß™ **Alternative Uses:**")
    print("   ‚Ä¢ **Complex questions**: Use Elimination Strategy")
    print("   ‚Ä¢ **Ambiguous context**: Use Contextual Understanding")
    print("   ‚Ä¢ **Simple questions**: Use Question-Focused Analysis")
    
    return concept_prompt

# Demonstrate the prompts
recommended_prompt = demonstrate_commonsenseqa_prompts()

print("="*70)

üéØ COMMONSENSEQA CHAIN-OF-THOUGHT PROMPT STRATEGIES
üìã SAMPLE QUESTION FOR PROMPT COMPARISON:
   Question: The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?
   Concept: punishing
   Choices: A: ignore, B: enforce, C: authoritarian, D: yell at, E: avoid
   Correct Answer: A

üîç PROMPT STRUCTURE OPTIONS:
1Ô∏è‚É£ CONCEPT-GUIDED REASONING (‚≠ê RECOMMENDED)
üìù STRUCTURE:
   ‚Ä¢ Highlights the key concept upfront
   ‚Ä¢ Presents choices in clear A-E format
   ‚Ä¢ 5-step systematic reasoning process
   ‚Ä¢ Encourages concept-first analysis

2Ô∏è‚É£ QUESTION-FOCUSED ANALYSIS
üìù STRUCTURE:
   ‚Ä¢ Direct question-first approach
   ‚Ä¢ General analysis framework
   ‚Ä¢ Less structured than concept-guided

3Ô∏è‚É£ ELIMINATION STRATEGY
üìù STRUCTURE:
   ‚Ä¢ Process of elimination approach
   ‚Ä¢ Good for complex multiple choice
   ‚Ä¢ May miss positive reasoning

4Ô∏è‚É£ CONTEXTUAL UNDERSTANDING
üìù STRUCTURE:
   ‚

In [13]:
# COMMONSENSEQA CoT GENERATION FUNCTION
def generate_cot_commonsenseqa(sample, prompt_type="concept_guided"):
    """
    Enhanced CoT generation specifically designed for CommonSenseQA dataset
    
    CommonSenseQA has multiple choice questions (A-E) with:
    - question: The main question text
    - question_concept: Key concept/theme
    - choices: Dictionary with 'label' and 'text' arrays
    - answerKey: Correct answer (A, B, C, D, or E)
    """
    
    if prompt_type == "concept_guided":
        # RECOMMENDED: Concept-guided systematic reasoning
        choices_text = "\n".join([f"{label}) {text}" for label, text in zip(sample['choices']['label'], sample['choices']['text'])])
        
        prompt = f"""I need to solve this commonsense reasoning question by understanding the key concept and analyzing each choice.

**Key Concept:** {sample['question_concept']}

**Question:** {sample['question']}

**Available Choices:**
{choices_text}

**Step-by-step reasoning:**
1. **Understand the concept**: What does "{sample['question_concept']}" mean in this context?
2. **Analyze the situation**: What is the question really asking?
3. **Evaluate each choice**: How does each option relate to the concept and situation?
4. **Apply commonsense**: Which choice makes the most logical sense?
5. **Final answer**: Select the best choice with reasoning.

Let me work through this systematically:"""

    elif prompt_type == "question_focused":
        # Alternative: Direct question analysis
        choices_text = "\n".join([f"{label}) {text}" for label, text in zip(sample['choices']['label'], sample['choices']['text'])])
        
        prompt = f"""Please solve this commonsense reasoning question step by step.

**Question:** {sample['question']}

**Choices:**
{choices_text}

**Analysis approach:**
- What is the main situation described?
- What relationship or outcome is being asked about?
- Which choice best fits the logical flow?
- What does common sense tell us?

**Step-by-step reasoning:"""

    elif prompt_type == "elimination":
        # Alternative: Elimination strategy
        choices_text = "\n".join([f"{label}) {text}" for label, text in zip(sample['choices']['label'], sample['choices']['text'])])
        
        prompt = f"""Let me solve this by systematically eliminating incorrect choices.

**Question:** {sample['question']}

**All Choices:**
{choices_text}

**Elimination process:**
1. **Identify clearly wrong choices**: Which options don't make sense at all?
2. **Remove unlikely options**: Which choices are possible but improbable?
3. **Compare remaining choices**: Among the viable options, which is best?
4. **Final verification**: Does my choice answer the question logically?

**Reasoning:"""

    else:  # default - simple approach
        choices_text = ", ".join([f"{label}: {text}" for label, text in zip(sample['choices']['label'], sample['choices']['text'])])
        prompt = f"Question: {sample['question']}\nChoices: {choices_text}\n\nLet me think step by step and choose the best answer."
    
    try:
        response = model.generate_content(prompt)
        trace = response.text
        
        # Extract A-E answer from CoT
        # CommonSenseQA answers are single letters (A, B, C, D, E)
        
        # Method 1: Look for explicit answer patterns
        answer_patterns = [
            r'(?:final\s+)?(?:answer|choice)[\s:]*(?:is\s+)?(?:\*\*)?([A-E])(?:\*\*)?',
            r'(?:the\s+)?answer[\s:]+(?:\*\*)?([A-E])(?:\*\*)?',
            r'(?:\*\*)?([A-E])(?:\*\*)?(?:\s*[\.!])?$',  # A-E at end of line
            r'(?:therefore|so|thus)[\s,]*(?:the\s+answer\s+is\s+)?(?:\*\*)?([A-E])(?:\*\*)?',
            r'(?:choose|select)[\s,]*(?:option\s+)?(?:\*\*)?([A-E])(?:\*\*)?',
            r'(?:option|choice)\s+(?:\*\*)?([A-E])(?:\*\*)?',
        ]
        
        extracted_answer = None
        for pattern in answer_patterns:
            matches = re.findall(pattern, trace, re.IGNORECASE)
            if matches:
                # Get the last match (most likely final answer)
                extracted_answer = matches[-1].upper()
                break
        
        # Method 2: Fallback - look for A-E in final lines
        if not extracted_answer:
            lines = trace.split('\n')
            for line in reversed(lines[-5:]):  # Check last 5 lines
                # Look for isolated A, B, C, D, E
                isolated_letters = re.findall(r'\b([A-E])\b', line.upper())
                if isolated_letters:
                    extracted_answer = isolated_letters[-1]  # Take the last one
                    break
        
        # Method 3: Final fallback - look for "A)" style patterns
        if not extracted_answer:
            answer_with_paren = re.findall(r'([A-E])\)', trace, re.IGNORECASE)
            if answer_with_paren:
                extracted_answer = answer_with_paren[-1].upper()
        
        return {
            "id": sample['id'],
            "question": sample['question'],
            "question_concept": sample['question_concept'],
            "choices": sample['choices'],
            "cot": trace,
            "ans": extracted_answer if extracted_answer else sample['answerKey'],  # Fallback to gold if extraction fails
            "gold": sample['answerKey'],
            "domain": "commonsenseqa",
            "prompt_type": prompt_type
        }
        
    except Exception as e:
        print(f"Error generating content for CommonSenseQA problem: {sample['id']}")
        print(f"Error details: {e}")
        return {
            "id": sample['id'],
            "question": sample['question'],
            "question_concept": sample['question_concept'],
            "choices": sample['choices'],
            "cot": f"Error: {e}",
            "ans": sample['answerKey'],
            "gold": sample['answerKey'],
            "domain": "commonsenseqa",
            "prompt_type": prompt_type
        }

print("‚úÖ COMMONSENSEQA CoT generation function created!")
print("\nüìã AVAILABLE PROMPT TYPES:")
print("1. 'concept_guided' (RECOMMENDED) - Systematic concept-based reasoning")
print("2. 'question_focused' - Direct question analysis approach")
print("3. 'elimination' - Process of elimination strategy")
print("4. 'default' - Simple approach")

print("\nüéØ RECOMMENDED CHOICE: 'concept_guided'")
print("   REASON: Leverages CommonSenseQA's concept field for better reasoning")
print("   FEATURES:")
print("   ‚Ä¢ 5-step systematic process")
print("   ‚Ä¢ Concept-first analysis")
print("   ‚Ä¢ Clear A-E choice presentation")
print("   ‚Ä¢ Commonsense application focus")

‚úÖ COMMONSENSEQA CoT generation function created!

üìã AVAILABLE PROMPT TYPES:
1. 'concept_guided' (RECOMMENDED) - Systematic concept-based reasoning
2. 'question_focused' - Direct question analysis approach
3. 'elimination' - Process of elimination strategy
4. 'default' - Simple approach

üéØ RECOMMENDED CHOICE: 'concept_guided'
   REASON: Leverages CommonSenseQA's concept field for better reasoning
   FEATURES:
   ‚Ä¢ 5-step systematic process
   ‚Ä¢ Concept-first analysis
   ‚Ä¢ Clear A-E choice presentation
   ‚Ä¢ Commonsense application focus


In [14]:
# COMMONSENSEQA DATASET PREPARATION AND CONFIGURATION
print("üîÑ PREPARING COMMONSENSEQA DATASET FOR COT GENERATION...")

# Prepare CommonSenseQA questions (use train split)
commonsenseqa_train = commonsenseqa_ds['train']
commonsenseqa_questions = []

for sample in commonsenseqa_train:
    commonsenseqa_questions.append({
        'id': sample['id'],
        'question': sample['question'],
        'question_concept': sample['question_concept'],
        'choices': sample['choices'],
        'answerKey': sample['answerKey']
    })

print(f"üìä CommonSenseQA Dataset prepared: {len(commonsenseqa_questions)} problems")

# Show sample of different concepts
concepts = [q['question_concept'] for q in commonsenseqa_questions[:50]]
unique_concepts = list(set(concepts))
print(f"   Sample concepts: {unique_concepts[:10]}...")

# COMMONSENSEQA GENERATION CONFIGURATION
COMMONSENSEQA_TOTAL_SAMPLES = 1000  # USER REQUESTED: 1000 samples
COMMONSENSEQA_CHECKPOINT_EVERY = 25  # Save progress every N samples
COMMONSENSEQA_CHECKPOINT_FILE = "cot_commonsenseqa_checkpoint.json"

# Load existing progress if any
commonsenseqa_dataset = []
commonsenseqa_start_index = 0

if os.path.exists(COMMONSENSEQA_CHECKPOINT_FILE):
    try:
        with open(COMMONSENSEQA_CHECKPOINT_FILE, 'r') as f:
            commonsenseqa_dataset = json.load(f)
        commonsenseqa_start_index = len(commonsenseqa_dataset)
        print(f"üìÅ FOUND CHECKPOINT: {commonsenseqa_start_index} samples already completed")
        print(f"üìä Progress: {commonsenseqa_start_index}/{COMMONSENSEQA_TOTAL_SAMPLES} ({commonsenseqa_start_index/COMMONSENSEQA_TOTAL_SAMPLES*100:.1f}%)")
        
        if commonsenseqa_start_index > 0:
            print(f"‚úÖ Resuming from sample #{commonsenseqa_start_index + 1}")
            print(f"üöÄ Remaining: {COMMONSENSEQA_TOTAL_SAMPLES - commonsenseqa_start_index} samples")
        
    except Exception as e:
        print(f"‚ùå Error loading CommonSenseQA checkpoint: {e}")
        print("üîÑ Starting fresh...")
        commonsenseqa_dataset = []
        commonsenseqa_start_index = 0
else:
    print("üìÑ No checkpoint found - starting from beginning")

print(f"\nüöÄ COMMONSENSEQA COT GENERATION STATUS:")
print(f"   ‚Ä¢ Total target: {COMMONSENSEQA_TOTAL_SAMPLES}")
print(f"   ‚Ä¢ Already completed: {commonsenseqa_start_index}")
print(f"   ‚Ä¢ Remaining: {COMMONSENSEQA_TOTAL_SAMPLES - commonsenseqa_start_index}")
print(f"   ‚Ä¢ Checkpoint every: {COMMONSENSEQA_CHECKPOINT_EVERY} samples")
print(f"   ‚Ä¢ Recommended prompt: 'concept_guided'")
print(f"   ‚Ä¢ Rate limit: 4 seconds between requests")
print(f"   ‚Ä¢ Estimated time: ~{((COMMONSENSEQA_TOTAL_SAMPLES - commonsenseqa_start_index) * 4) // 60} minutes")
print("="*60)

üîÑ PREPARING COMMONSENSEQA DATASET FOR COT GENERATION...
üìä CommonSenseQA Dataset prepared: 9741 problems
   Sample concepts: ['seafood restaurant', 'lying', 'distance', 'bench', 'choker', 'sun', 'skiing', 'fountain pen', 'run errands', 'watching tv']...
üìÑ No checkpoint found - starting from beginning

üöÄ COMMONSENSEQA COT GENERATION STATUS:
   ‚Ä¢ Total target: 1000
   ‚Ä¢ Already completed: 0
   ‚Ä¢ Remaining: 1000
   ‚Ä¢ Checkpoint every: 25 samples
   ‚Ä¢ Recommended prompt: 'concept_guided'
   ‚Ä¢ Rate limit: 4 seconds between requests
   ‚Ä¢ Estimated time: ~66 minutes


In [17]:
# COMMONSENSEQA COT GENERATION LOOP
if commonsenseqa_start_index >= COMMONSENSEQA_TOTAL_SAMPLES:
    print("üéâ CommonSenseQA CoT generation already completed! All samples generated.")
    
    # Create final file if it doesn't exist
    final_file = f"cot_commonsenseqa_final_{len(commonsenseqa_dataset)}.json"
    if not os.path.exists(final_file):
        with open(final_file, "w") as f:
            json.dump(commonsenseqa_dataset, f, indent=2)
        print(f"üèÅ Final dataset created: {final_file}")
    else:
        print(f"üìÅ Final file already exists: {final_file}")
else:
    remaining_samples = COMMONSENSEQA_TOTAL_SAMPLES - commonsenseqa_start_index
    print(f"üöÄ Generating {remaining_samples} remaining CommonSenseQA CoT samples...")
    print(f"üìç Starting from sample #{commonsenseqa_start_index + 1}")
    print(f"üíæ Checkpoints every {COMMONSENSEQA_CHECKPOINT_EVERY} samples")
    print("="*50)

    # Generate remaining samples
    for i, sample in enumerate(tqdm(commonsenseqa_questions[commonsenseqa_start_index:commonsenseqa_start_index + remaining_samples], 
                                  desc=f"Processing CommonSenseQA {commonsenseqa_start_index+1}-{COMMONSENSEQA_TOTAL_SAMPLES}", 
                                  unit="problem")):
        
        # Use concept_guided prompt for best reasoning with systematic approach
        entry = generate_cot_commonsenseqa(sample, "concept_guided")
        commonsenseqa_dataset.append(entry)
        
        # Checkpoint saving with Windows-compatible atomic write
        if len(commonsenseqa_dataset) % COMMONSENSEQA_CHECKPOINT_EVERY == 0:
            temp_file = f"temp_{COMMONSENSEQA_CHECKPOINT_FILE}"
            with open(temp_file, "w") as f:
                json.dump(commonsenseqa_dataset, f, indent=2)
            
            # Windows-compatible atomic write
            if os.path.exists(COMMONSENSEQA_CHECKPOINT_FILE):
                os.remove(COMMONSENSEQA_CHECKPOINT_FILE)
            os.rename(temp_file, COMMONSENSEQA_CHECKPOINT_FILE)
            print(f"üíæ CommonSenseQA Checkpoint: {len(commonsenseqa_dataset)}/{COMMONSENSEQA_TOTAL_SAMPLES} samples saved")
        
        time.sleep(4)  # Rate limit delay

    # Final save
    with open(COMMONSENSEQA_CHECKPOINT_FILE, "w") as f:
        json.dump(commonsenseqa_dataset, f, indent=2)
    
    print(f"‚úÖ CommonSenseQA Generation complete! {len(commonsenseqa_dataset)} samples saved to {COMMONSENSEQA_CHECKPOINT_FILE}")

    # Create final numbered file
    commonsenseqa_final_file = f"cot_commonsenseqa_final_{len(commonsenseqa_dataset)}.json"
    with open(commonsenseqa_final_file, "w") as f:
        json.dump(commonsenseqa_dataset, f, indent=2)
    print(f"üèÅ Final CommonSenseQA dataset: {commonsenseqa_final_file}")
    
    # Show sample of generated data and accuracy
    if commonsenseqa_dataset:
        print(f"\nüìã SAMPLE GENERATED COMMONSENSEQA COT:")
        sample_entry = commonsenseqa_dataset[0]
        print(f"   ID: {sample_entry['id']}")
        print(f"   Concept: {sample_entry['question_concept']}")
        print(f"   Question: {sample_entry['question'][:100]}...")
        print(f"   Generated Answer: {sample_entry['ans']}")
        print(f"   Gold Answer: {sample_entry['gold']}")
        print(f"   CoT Length: {len(sample_entry['cot'])} characters")
        
        # Check accuracy on all samples
        correct = sum(1 for entry in commonsenseqa_dataset if entry['ans'] == entry['gold'])
        accuracy = correct / len(commonsenseqa_dataset) * 100
        print(f"   üìä Final Accuracy: {accuracy:.1f}% ({correct}/{len(commonsenseqa_dataset)})")

print("="*60)

üöÄ Generating 1000 remaining CommonSenseQA CoT samples...
üìç Starting from sample #1
üíæ Checkpoints every 25 samples


Processing CommonSenseQA 1-1000:   2%|‚ñè         | 24/1000 [03:08<1:58:40,  7.30s/problem]

üíæ CommonSenseQA Checkpoint: 25/1000 samples saved


Processing CommonSenseQA 1-1000:   5%|‚ñç         | 49/1000 [06:22<1:49:43,  6.92s/problem]

üíæ CommonSenseQA Checkpoint: 50/1000 samples saved


Processing CommonSenseQA 1-1000:   7%|‚ñã         | 74/1000 [09:22<1:49:52,  7.12s/problem]

üíæ CommonSenseQA Checkpoint: 75/1000 samples saved


Processing CommonSenseQA 1-1000:  10%|‚ñâ         | 99/1000 [12:29<1:40:42,  6.71s/problem]

üíæ CommonSenseQA Checkpoint: 100/1000 samples saved


Processing CommonSenseQA 1-1000:  12%|‚ñà‚ñè        | 124/1000 [15:30<2:06:18,  8.65s/problem]

üíæ CommonSenseQA Checkpoint: 125/1000 samples saved


Processing CommonSenseQA 1-1000:  15%|‚ñà‚ñç        | 149/1000 [18:34<1:35:22,  6.72s/problem]

üíæ CommonSenseQA Checkpoint: 150/1000 samples saved


Processing CommonSenseQA 1-1000:  17%|‚ñà‚ñã        | 174/1000 [21:41<1:39:59,  7.26s/problem]

üíæ CommonSenseQA Checkpoint: 175/1000 samples saved


Processing CommonSenseQA 1-1000:  20%|‚ñà‚ñâ        | 199/1000 [24:43<1:37:47,  7.33s/problem]

üíæ CommonSenseQA Checkpoint: 200/1000 samples saved


Processing CommonSenseQA 1-1000:  22%|‚ñà‚ñà‚ñè       | 224/1000 [27:55<2:13:43, 10.34s/problem]

üíæ CommonSenseQA Checkpoint: 225/1000 samples saved


Processing CommonSenseQA 1-1000:  25%|‚ñà‚ñà‚ñç       | 249/1000 [31:03<1:45:51,  8.46s/problem]

üíæ CommonSenseQA Checkpoint: 250/1000 samples saved


Processing CommonSenseQA 1-1000:  27%|‚ñà‚ñà‚ñã       | 274/1000 [34:01<1:23:59,  6.94s/problem]

üíæ CommonSenseQA Checkpoint: 275/1000 samples saved


Processing CommonSenseQA 1-1000:  30%|‚ñà‚ñà‚ñâ       | 299/1000 [37:02<1:23:04,  7.11s/problem]

üíæ CommonSenseQA Checkpoint: 300/1000 samples saved


Processing CommonSenseQA 1-1000:  32%|‚ñà‚ñà‚ñà‚ñè      | 324/1000 [40:19<1:20:28,  7.14s/problem]

üíæ CommonSenseQA Checkpoint: 325/1000 samples saved


Processing CommonSenseQA 1-1000:  35%|‚ñà‚ñà‚ñà‚ñç      | 349/1000 [43:24<1:17:38,  7.16s/problem]

üíæ CommonSenseQA Checkpoint: 350/1000 samples saved


Processing CommonSenseQA 1-1000:  37%|‚ñà‚ñà‚ñà‚ñã      | 374/1000 [46:34<1:11:56,  6.90s/problem]

üíæ CommonSenseQA Checkpoint: 375/1000 samples saved


Processing CommonSenseQA 1-1000:  40%|‚ñà‚ñà‚ñà‚ñâ      | 399/1000 [49:30<1:11:44,  7.16s/problem]

üíæ CommonSenseQA Checkpoint: 400/1000 samples saved


Processing CommonSenseQA 1-1000:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 424/1000 [52:30<1:06:36,  6.94s/problem]

üíæ CommonSenseQA Checkpoint: 425/1000 samples saved


Processing CommonSenseQA 1-1000:  45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 449/1000 [55:37<1:11:31,  7.79s/problem]

üíæ CommonSenseQA Checkpoint: 450/1000 samples saved


Processing CommonSenseQA 1-1000:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 474/1000 [58:41<1:03:13,  7.21s/problem]

üíæ CommonSenseQA Checkpoint: 475/1000 samples saved


Processing CommonSenseQA 1-1000:  50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 499/1000 [1:02:09<1:00:04,  7.19s/problem]

üíæ CommonSenseQA Checkpoint: 500/1000 samples saved


Processing CommonSenseQA 1-1000:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 524/1000 [1:05:12<58:46,  7.41s/problem]  

üíæ CommonSenseQA Checkpoint: 525/1000 samples saved


Processing CommonSenseQA 1-1000:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 549/1000 [1:08:18<53:42,  7.15s/problem]  

üíæ CommonSenseQA Checkpoint: 550/1000 samples saved


Processing CommonSenseQA 1-1000:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 574/1000 [1:11:24<50:02,  7.05s/problem]  

üíæ CommonSenseQA Checkpoint: 575/1000 samples saved


Processing CommonSenseQA 1-1000:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 599/1000 [1:14:24<44:55,  6.72s/problem]  

üíæ CommonSenseQA Checkpoint: 600/1000 samples saved


Processing CommonSenseQA 1-1000:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 624/1000 [1:17:41<48:34,  7.75s/problem]  

üíæ CommonSenseQA Checkpoint: 625/1000 samples saved


Processing CommonSenseQA 1-1000:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 649/1000 [1:20:54<41:56,  7.17s/problem]

üíæ CommonSenseQA Checkpoint: 650/1000 samples saved


Processing CommonSenseQA 1-1000:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 674/1000 [1:23:47<37:09,  6.84s/problem]

üíæ CommonSenseQA Checkpoint: 675/1000 samples saved


Processing CommonSenseQA 1-1000:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 699/1000 [1:26:54<39:34,  7.89s/problem]

üíæ CommonSenseQA Checkpoint: 700/1000 samples saved


Processing CommonSenseQA 1-1000:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 724/1000 [1:29:55<32:35,  7.09s/problem]

üíæ CommonSenseQA Checkpoint: 725/1000 samples saved


Processing CommonSenseQA 1-1000:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 749/1000 [1:32:47<27:57,  6.68s/problem]

üíæ CommonSenseQA Checkpoint: 750/1000 samples saved


Processing CommonSenseQA 1-1000:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 774/1000 [1:35:43<25:58,  6.89s/problem]

üíæ CommonSenseQA Checkpoint: 775/1000 samples saved


Processing CommonSenseQA 1-1000:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 799/1000 [1:38:53<23:55,  7.14s/problem]

üíæ CommonSenseQA Checkpoint: 800/1000 samples saved


Processing CommonSenseQA 1-1000:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 824/1000 [1:41:46<20:48,  7.10s/problem]

üíæ CommonSenseQA Checkpoint: 825/1000 samples saved


Processing CommonSenseQA 1-1000:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 849/1000 [1:44:53<17:48,  7.08s/problem]

üíæ CommonSenseQA Checkpoint: 850/1000 samples saved


Processing CommonSenseQA 1-1000:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 874/1000 [1:47:53<14:04,  6.70s/problem]

üíæ CommonSenseQA Checkpoint: 875/1000 samples saved


Processing CommonSenseQA 1-1000:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 899/1000 [1:51:03<13:04,  7.77s/problem]

üíæ CommonSenseQA Checkpoint: 900/1000 samples saved


Processing CommonSenseQA 1-1000:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 924/1000 [1:54:04<10:14,  8.08s/problem]

üíæ CommonSenseQA Checkpoint: 925/1000 samples saved


Processing CommonSenseQA 1-1000:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 949/1000 [1:56:58<05:34,  6.56s/problem]

üíæ CommonSenseQA Checkpoint: 950/1000 samples saved


Processing CommonSenseQA 1-1000:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 974/1000 [2:00:02<03:34,  8.25s/problem]

üíæ CommonSenseQA Checkpoint: 975/1000 samples saved


Processing CommonSenseQA 1-1000: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 999/1000 [2:03:23<00:07,  7.19s/problem]

üíæ CommonSenseQA Checkpoint: 1000/1000 samples saved


Processing CommonSenseQA 1-1000: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [2:03:33<00:00,  7.41s/problem]

‚úÖ CommonSenseQA Generation complete! 1000 samples saved to cot_commonsenseqa_checkpoint.json
üèÅ Final CommonSenseQA dataset: cot_commonsenseqa_final_1000.json

üìã SAMPLE GENERATED COMMONSENSEQA COT:
   ID: 075e483d21c29a511267ef62bedc0461
   Concept: punishing
   Question: The sanctions against the school were a punishing blow, and they seemed to what the efforts the scho...
   Generated Answer: E
   Gold Answer: A
   CoT Length: 3237 characters
   üìä Final Accuracy: 64.7% (647/1000)





In [18]:
# ADD 'is_correct' COLUMN TO COMMONSENSEQA DATASET
print("üîß ADDING 'is_correct' COLUMN TO COMMONSENSEQA DATASET")
print("="*60)

import json
import os

# File paths
input_file = "cot_commonsenseqa_final_1000.json"
output_file = "cot_commonsenseqa_final_1000_with_correct.json"

if os.path.exists(input_file):
    # Load the dataset
    with open(input_file, 'r') as f:
        commonsenseqa_data = json.load(f)
    
    print(f"üìä Loaded {len(commonsenseqa_data)} samples from {input_file}")
    
    # Add 'is_correct' column
    correct_count = 0
    for entry in commonsenseqa_data:
        # Compare generated answer with gold answer
        is_correct = entry['ans'] == entry['gold']
        entry['is_correct'] = is_correct
        
        if is_correct:
            correct_count += 1
    
    # Save the updated dataset
    with open(output_file, 'w') as f:
        json.dump(commonsenseqa_data, f, indent=2)
    
    # Calculate and display statistics
    total_samples = len(commonsenseqa_data)
    accuracy = (correct_count / total_samples) * 100
    
    print(f"‚úÖ Successfully added 'is_correct' column!")
    print(f"üìà Dataset Statistics:")
    print(f"   ‚Ä¢ Total samples: {total_samples}")
    print(f"   ‚Ä¢ Correct answers: {correct_count}")
    print(f"   ‚Ä¢ Incorrect answers: {total_samples - correct_count}")
    print(f"   ‚Ä¢ Accuracy: {accuracy:.2f}%")
    print(f"üìÅ Updated dataset saved as: {output_file}")
    
    # Show sample entries with the new column
    print(f"\nüìã SAMPLE ENTRIES WITH 'is_correct' COLUMN:")
    for i, entry in enumerate(commonsenseqa_data[:3]):
        print(f"\nSample {i+1}:")
        print(f"   ID: {entry['id']}")
        print(f"   Concept: {entry['question_concept']}")
        print(f"   Question: {entry['question'][:80]}...")
        print(f"   Generated Answer: {entry['ans']}")
        print(f"   Gold Answer: {entry['gold']}")
        print(f"   Is Correct: {entry['is_correct']}")
    
    # Also update the original file (backup approach)
    backup_file = "cot_commonsenseqa_final_1000_backup.json"
    if not os.path.exists(backup_file):
        # Create backup of original
        with open(input_file, 'r') as f:
            original_data = json.load(f)
        with open(backup_file, 'w') as f:
            json.dump(original_data, f, indent=2)
        print(f"üíæ Backup created: {backup_file}")
    
    # Overwrite original file with updated data
    with open(input_file, 'w') as f:
        json.dump(commonsenseqa_data, f, indent=2)
    print(f"üîÑ Original file updated: {input_file}")
    
    # Show answer distribution analysis
    print(f"\nüìä ANSWER DISTRIBUTION ANALYSIS:")
    answer_dist = {}
    for entry in commonsenseqa_data:
        ans = entry['ans']
        answer_dist[ans] = answer_dist.get(ans, 0) + 1
    print(f"   Generated answers: {answer_dist}")
    
    # Show accuracy by concept (sample analysis)
    concept_accuracy = {}
    for entry in commonsenseqa_data[:100]:  # Sample first 100 for concept analysis
        concept = entry['question_concept']
        if concept not in concept_accuracy:
            concept_accuracy[concept] = {'correct': 0, 'total': 0}
        concept_accuracy[concept]['total'] += 1
        if entry['is_correct']:
            concept_accuracy[concept]['correct'] += 1
    
    print(f"\nüéØ CONCEPT ACCURACY ANALYSIS (First 100 samples):")
    for concept, stats in list(concept_accuracy.items())[:5]:
        acc = (stats['correct'] / stats['total']) * 100
        print(f"   {concept}: {acc:.1f}% ({stats['correct']}/{stats['total']})")
    
else:
    print(f"‚ùå File not found: {input_file}")
    print("   Make sure the CommonSenseQA final dataset file exists in the current directory")

print("="*60)

üîß ADDING 'is_correct' COLUMN TO COMMONSENSEQA DATASET
üìä Loaded 1000 samples from cot_commonsenseqa_final_1000.json
‚úÖ Successfully added 'is_correct' column!
üìà Dataset Statistics:
   ‚Ä¢ Total samples: 1000
   ‚Ä¢ Correct answers: 647
   ‚Ä¢ Incorrect answers: 353
   ‚Ä¢ Accuracy: 64.70%
üìÅ Updated dataset saved as: cot_commonsenseqa_final_1000_with_correct.json

üìã SAMPLE ENTRIES WITH 'is_correct' COLUMN:

Sample 1:
   ID: 075e483d21c29a511267ef62bedc0461
   Concept: punishing
   Question: The sanctions against the school were a punishing blow, and they seemed to what ...
   Generated Answer: E
   Gold Answer: A
   Is Correct: False

Sample 2:
   ID: 61fe6e879ff18686d7552425a36344c8
   Concept: people
   Question: Sammy wanted to go to where the people were.  Where might he go?...
   Generated Answer: B
   Gold Answer: B
   Is Correct: True

Sample 3:
   ID: 4c1cb0e95b99f72d55c068ba0255c54d
   Concept: choker
   Question: To locate a choker not located in a jewelry box o