# Synthetic Data Generation with Gemini

**Instructions:**
1. **Upload Data:** Create a new dataset with `scored_data.json` and add it to this notebook.
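2. **API Key:** Get your Gemini API key from Google AI Studio and store it as a Kaggle secret named `GEMINI_API_KEY`. If the secret cannot be read, the notebook prompts you to enter the key.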
3. **Run All:** Run the cells below. Progress is saved automatically after every 10 generated items, and again when the run finishes or is interrupted.
4. **Download:** When finished (or stopped), download `synthetic_candidates.json`.

In [None]:
# Install Gemini SDK
!pip install -q -U google-generativeai tqdm

import os
import json
import time
import tqdm
import google.generativeai as genai

# google.colab only exists on Colab-style runtimes. Import it opportunistically
# so the notebook still starts elsewhere; `userdata` is None when unavailable.
try:
    from google.colab import userdata
except ImportError:
    userdata = None

print("Libraries installed.")

In [None]:
# ============================
# CONFIGURATION
# ============================

# 1. API KEY SETUP
# Prefer the Kaggle secret store; fall back to an interactive prompt when the
# secrets client cannot be imported or the secret cannot be fetched.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    API_KEY = user_secrets.get_secret("GEMINI_API_KEY")
except Exception:
    # `Exception` (not a bare `except`) so KeyboardInterrupt / SystemExit
    # still propagate instead of being converted into a key prompt.
    from getpass import getpass

    # getpass does not echo what is typed, so the key is not displayed
    # alongside the prompt.
    API_KEY = getpass("Enter your Gemini API Key: ").strip()

genai.configure(api_key=API_KEY)

# 2. DATA LOAD
# Candidate locations for the scored dataset, checked in order; the first
# path that exists on disk is used.
POSSIBLE_PATHS = [
    "/kaggle/input/scored-data/scored_data.json",
    "/kaggle/input/gricebench-scored/scored_data.json",
    "scored_data.json"
]
INPUT_FILE = next(filter(os.path.exists, POSSIBLE_PATHS), None)

if INPUT_FILE:
    print(f"✅ Found input file: {INPUT_FILE}")
else:
    # Execution is not halted here: INPUT_FILE simply stays None and
    # downstream code is expected to check it.
    print("❌ Error: scored_data.json not found!")
    print("Please upload it as a dataset and add it to the notebook.")

# 3. OUTPUT SETUP
# Write to Kaggle's working directory when it exists. Otherwise (e.g. a local
# run that found "scored_data.json" in the current directory) fall back to the
# current directory so the final save cannot fail on a missing folder.
_KAGGLE_WORKING_DIR = "/kaggle/working"
OUTPUT_FILE = (
    os.path.join(_KAGGLE_WORKING_DIR, "synthetic_candidates.json")
    if os.path.isdir(_KAGGLE_WORKING_DIR)
    else "synthetic_candidates.json"
)

In [None]:
# ============================
# GENERATION LOGIC
# ============================

# Strict Gricean System Prompt
SYSTEM_INSTRUCTION = """You are a Gricean Cooperative Assistant.
Your task is to generate responses that strictly adhere to all four Gricean Maxims:
1. Quantity: Be as informative as required, but no more.
2. Quality: Do not say what you believe to be false or lack evidence for.
3. Relation: Be strictly relevant to the user's prompt.
4. Manner: Be perspicuous—avoid obscurity, ambiguity, and unnecessary verbosity. Be orderly and polite.

Context: You are providing a 'chosen' response for a DPO dataset.
Your output must be significantly better than a typical chatbot response in terms of cooperation and clarity.
Do not be chatty. Do not offer unsolicited advice. Answer the prompt directly and cooperatively."""

# Gemini model handle used by the generation loop. The system prompt and the
# sampling settings are fixed here, so every request shares them.
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    system_instruction=SYSTEM_INSTRUCTION,
    generation_config=dict(
        temperature=0.7,
        top_p=0.95,
        top_k=40,
        max_output_tokens=1024,
        response_mime_type="text/plain",
    ),
)

def get_failed_prompts(data_path):
    """Return the entries whose 'chosen' response failed at least one maxim.

    Args:
        data_path: Path to a JSON file containing a list of dict entries.
            Each entry may carry a 'margins' dict keyed by 'quantity',
            'quality', 'relation' and 'manner'.

    Returns:
        The entries, in file order, for which any of the four margins is
        non-positive. A missing or null 'margins' dict, or a missing or
        null individual margin, counts as 0 and therefore as a failure.
    """
    # JSON is UTF-8 by specification; don't depend on the platform default.
    with open(data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    maxims = ('quantity', 'quality', 'relation', 'manner')
    candidates = []
    for entry in data:
        # `or {}` / `or 0` treat explicit JSON nulls the same as absent keys
        # instead of raising AttributeError / TypeError.
        margins = entry.get('margins') or {}
        if not all((margins.get(name) or 0) > 0 for name in maxims):
            candidates.append(entry)
    return candidates

def _save_results(path, records):
    """Write *records* to *path* as JSON without leaving a truncated file.

    The data is written to a sibling temporary file first and then moved
    over *path* with os.replace, so an interruption mid-write cannot
    corrupt the previous save.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, indent=2)
    os.replace(tmp_path, path)


def run_generation():
    """Generate a new 'chosen' response for every failed prompt and save them.

    Reads the failed entries from INPUT_FILE, skips prompts already present
    in OUTPUT_FILE (resume support), and queries `model` for the rest. Each
    result is a copy of the input entry with:
      - 'synthetic_chosen': the generated text,
      - 'original_chosen_failed': the entry's previous 'chosen' (None if absent),
      - 'chosen': replaced by the generated text.

    Results are saved every 10 successful generations and once more on exit,
    including when the cell is interrupted. Does nothing when INPUT_FILE is
    unset. Per-item failures are printed and skipped; they are not retried
    within the same run.
    """
    if not INPUT_FILE:
        return

    # Load targets
    all_candidates = get_failed_prompts(INPUT_FILE)
    print(f"Target prompts to generate: {len(all_candidates)}")

    # Resume capability
    completed = []
    if os.path.exists(OUTPUT_FILE):
        try:
            with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both malformed JSON and undecodable bytes.
            # Say so explicitly rather than silently restarting from zero.
            print(f"Warning: could not read {OUTPUT_FILE} ({e}); starting fresh.")
        else:
            if isinstance(loaded, list):
                completed = loaded
                print(f"Resuming: {len(completed)} already done.")
            else:
                print(f"Warning: {OUTPUT_FILE} does not contain a list; starting fresh.")

    completed_prompts = {c['prompt'] for c in completed}
    todos = [c for c in all_candidates if c['prompt'] not in completed_prompts]
    print(f"Remaining: {len(todos)}")

    # Loop
    print("Starting generation... (Autosaving every 10 items)")
    print("Stop anytime by interrupting the cell. Data is saved.")

    try:
        count = 0
        for i, item in enumerate(tqdm.tqdm(todos)):
            try:
                # Generate
                response = model.generate_content(item['prompt'])
                text = response.text.strip()

                # Save result
                res_entry = item.copy()
                res_entry['synthetic_chosen'] = text
                # .get(): an entry without 'chosen' should not throw away a
                # response that was already generated.
                res_entry['original_chosen_failed'] = item.get('chosen')
                res_entry['chosen'] = text  # New chosen

                completed.append(res_entry)
                count += 1

                # Autosave
                if count % 10 == 0:
                    _save_results(OUTPUT_FILE, completed)

                # Rate limit sleep (adjust based on your tier)
                time.sleep(2.0)

            except Exception as e:
                # Best effort: report, back off, and move on to the next item.
                print(f"Error on item {i}: {e}")
                time.sleep(10)

    except KeyboardInterrupt:
        print("\nStopped by user.")
    finally:
        _save_results(OUTPUT_FILE, completed)
        print(f"\n✅ Saved {len(completed)} items to {OUTPUT_FILE}")

# Start the generation loop as soon as this cell is executed.
run_generation()
