# Synthetic Data Generation with Grok API

**Instructions:**
1. Upload `scored_data.json` as a dataset
2. Get your Grok API key from console.x.ai
3. Run the cells below
4. Download `synthetic_candidates.json` when done

In [None]:
# Install OpenAI SDK (Grok is compatible)
!pip install -q openai tqdm

import os
import json
import time
from tqdm import tqdm
from openai import OpenAI

print("✅ Libraries installed")

In [None]:
# ============================
# CONFIGURATION
# ============================

# getpass hides the key while it is typed, so the secret is not echoed into
# the notebook's saved cell output the way input() would.
from getpass import getpass

# API Key Input
print("Enter your Grok API Key from console.x.ai:")
API_KEY = getpass("API Key: ").strip()

# Cheap sanity check only: catches an empty or obviously truncated paste.
if len(API_KEY) < 10:
    raise ValueError("Invalid API key!")

# Initialize Grok client (OpenAI-compatible)
client = OpenAI(
    api_key=API_KEY,
    base_url="https://api.x.ai/v1"
)

print("✅ Grok client configured")

# Find input data: the first existing path wins (Kaggle dataset mounts first,
# then a file in the current directory).
POSSIBLE_PATHS = [
    "/kaggle/input/gricebench-scored/scored_data.json",
    "/kaggle/input/scored-data/scored_data.json",
    "scored_data.json"
]
INPUT_FILE = next((p for p in POSSIBLE_PATHS if os.path.exists(p)), None)

if not INPUT_FILE:
    raise FileNotFoundError("scored_data.json not found! Upload it as a dataset.")

print(f"✅ Found: {INPUT_FILE}")

# Write to Kaggle's working directory when it exists; otherwise fall back to
# the current directory so the local "scored_data.json" input path above does
# not end in a failed save to a non-existent /kaggle/working.
KAGGLE_WORKING_DIR = "/kaggle/working"
OUTPUT_DIR = KAGGLE_WORKING_DIR if os.path.isdir(KAGGLE_WORKING_DIR) else "."
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "synthetic_candidates.json")

In [None]:
# ============================
# GENERATION LOGIC
# ============================

# Strict Gricean System Prompt
# Sent as the "system" message on every chat-completion request issued by
# run_generation(). The text is part of the model input, so any edit to it
# changes what the model is asked to produce.
SYSTEM_INSTRUCTION = """You are a Gricean Cooperative Assistant.
Your task is to generate responses that strictly adhere to all four Gricean Maxims:
1. Quantity: Be as informative as required, but no more.
2. Quality: Do not say what you believe to be false or lack evidence for.
3. Relation: Be strictly relevant to the user's prompt.
4. Manner: Be perspicuous—avoid obscurity, ambiguity, and unnecessary verbosity. Be orderly and polite.

Context: You are providing a 'chosen' response for a DPO dataset.
Your output must be significantly better than a typical chatbot response in terms of cooperation and clarity.
Do not be chatty. Do not offer unsolicited advice. Answer the prompt directly and cooperatively."""

def get_failed_prompts(data_path):
    """Return the entries that do not pass all four maxim margins.

    The JSON file at ``data_path`` is expected to hold a list of dict
    entries. An entry "passes" only when its ``'margins'`` dict has a
    strictly positive value for each of ``'quantity'``, ``'quality'``,
    ``'relation'`` and ``'manner'``. Everything else is returned as a
    candidate for regeneration.

    Args:
        data_path: Path to the scored-data JSON file.

    Returns:
        A list of the failing entries, in file order (the same dict objects
        that were loaded, not copies).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    maxims = ('quantity', 'quality', 'relation', 'manner')
    candidates = []
    for entry in data:
        # A missing or null 'margins' is treated as "no maxim passed".
        margins = entry.get('margins') or {}
        # A missing or null individual margin counts as 0, i.e. a failure.
        passes_all = all((margins.get(name) or 0) > 0 for name in maxims)
        if not passes_all:
            candidates.append(entry)
    return candidates

def _load_completed(path):
    """Return the result list previously saved at ``path``, or ``[]``.

    An unreadable or malformed file is reported and moved aside to
    ``<path>.corrupt`` instead of being silently ignored, because the next
    save would otherwise overwrite it.
    """
    if not os.path.exists(path):
        return []
    try:
        with open(path, 'r', encoding='utf-8') as f:
            saved = json.load(f)
        if not isinstance(saved, list):
            raise ValueError("expected a JSON list of results")
        return saved
    except (OSError, ValueError) as e:  # JSONDecodeError is a ValueError
        print(f"Warning: could not resume from {path}: {e}")
        backup = f"{path}.corrupt"
        try:
            os.replace(path, backup)
            print(f"Unreadable file kept as {backup}")
        except OSError as move_err:
            print(f"Warning: could not back up {path}: {move_err}")
        return []


def _save_results(path, results):
    """Write ``results`` to ``path`` as indented JSON via a temp file.

    The data is written to ``<path>.tmp`` first and then renamed over the
    target, so an interruption mid-write cannot leave a truncated file that
    would break the next resume.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)
    os.replace(tmp_path, path)


def run_generation(*, model="grok-2-1212", request_delay=4.5,
                   error_backoff=30, autosave_every=10):
    """Generate a replacement 'chosen' response for every failing entry.

    Loads the failing entries from ``INPUT_FILE``, skips prompts already
    present in ``OUTPUT_FILE`` (resume), asks the chat-completions API for a
    new response to each remaining prompt, and saves the accumulated results
    to ``OUTPUT_FILE``. Each saved entry is a shallow copy of the input entry
    with ``'synthetic_chosen'`` and ``'chosen'`` set to the generated text and
    ``'original_chosen_failed'`` set to the previous ``'chosen'`` value
    (``None`` if the entry had none).

    A failed request is reported and skipped for this run; since it is not
    written to the output, re-running picks it up again. Ctrl-C stops the
    loop, and results gathered so far are saved in all cases.

    Args:
        model: Model name passed to the API.
        request_delay: Seconds to wait between requests (rate limiting).
        error_backoff: Seconds to wait after a failed request.
        autosave_every: Save after this many new results; values <= 0
            disable periodic saves (the final save still happens).
    """
    all_candidates = get_failed_prompts(INPUT_FILE)
    print(f"Target prompts: {len(all_candidates)}")

    # Resume capability
    completed = _load_completed(OUTPUT_FILE)
    if completed:
        print(f"Resuming: {len(completed)} done")

    completed_prompts = {c.get('prompt') for c in completed}
    todos = [c for c in all_candidates if c['prompt'] not in completed_prompts]
    print(f"Remaining: {len(todos)}")
    print("Starting generation...")

    count = 0
    failed = 0
    last_index = len(todos) - 1
    try:
        for i, item in enumerate(tqdm(todos)):
            # Only the API call is guarded, so the long backoff is applied
            # to request failures and not to unrelated local errors.
            try:
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": SYSTEM_INSTRUCTION},
                        {"role": "user", "content": item['prompt']}
                    ],
                    temperature=0.7,
                    max_tokens=1024
                )
            except Exception as e:
                failed += 1
                print(f"\nError {i}: {e}")
                time.sleep(error_backoff)  # Longer backoff for quota errors
                continue

            # The API may return no choices or a null content; treat both
            # as an empty answer instead of crashing on .strip().
            choices = response.choices
            content = choices[0].message.content if choices else None
            text = (content or "").strip()

            if text:
                res_entry = item.copy()
                res_entry['synthetic_chosen'] = text
                res_entry['original_chosen_failed'] = item.get('chosen')
                res_entry['chosen'] = text

                completed.append(res_entry)
                count += 1

                if autosave_every > 0 and count % autosave_every == 0:
                    try:
                        _save_results(OUTPUT_FILE, completed)
                    except OSError as e:
                        # Best effort: the final save below will try again.
                        print(f"\nAutosave failed: {e}")
            else:
                failed += 1
                print(f"\nError {i}: empty response, skipping")

            # Rate limit between requests; no need to wait after the last.
            if i < last_index:
                time.sleep(request_delay)

    except KeyboardInterrupt:
        print("\nStopped.")
    finally:
        _save_results(OUTPUT_FILE, completed)
        print(f"\n✅ {len(completed)} items saved to {OUTPUT_FILE}")
        if failed:
            print(f"{failed} prompts failed this run; re-run to retry them.")

# Start generation as soon as this cell is executed.
run_generation()