# Synthetic Data Generation - FREE (Hugging Face)

**100% Free - No Credit Card Needed**

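1. Get a free API token from huggingface.co/settings/tokens
2. Upload scored_data.json as a Kaggle dataset named `gricebench-scored` or `scored-data` (the notebook looks for it under /kaggle/input/)
3. Run the cells in order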

In [None]:
!pip install -q huggingface_hub tqdm

import os, json, time
from tqdm import tqdm
from huggingface_hub import InferenceClient

print("✅ Ready")

In [None]:
# API token (FREE from huggingface.co/settings/tokens).
# getpass prompts without echoing, so the token does not end up in the
# cell output the way it does when read with input().
from getpass import getpass

print("Enter your Hugging Face token:")
HF_TOKEN = getpass("Token: ").strip()

client = InferenceClient(token=HF_TOKEN)

# Locate the scored dataset: the first candidate path that exists wins,
# and INPUT_FILE stays None when neither is present.
CANDIDATE_INPUTS = (
    "/kaggle/input/gricebench-scored/scored_data.json",
    "/kaggle/input/scored-data/scored_data.json",
)
INPUT_FILE = next((p for p in CANDIDATE_INPUTS if os.path.exists(p)), None)

if not INPUT_FILE:
    raise FileNotFoundError("Upload scored_data.json!")

# Generated candidates (and resume checkpoints) are written here.
OUTPUT_FILE = "/kaggle/working/synthetic_candidates.json"
print(f"✅ Input: {INPUT_FILE}")

In [None]:
# System prompt: used as the "system" message of each chat request.
# It tells the model to follow the four Gricean maxims and to answer
# directly, without chattiness.
SYSTEM = "\n".join((
    "You are a Gricean Cooperative Assistant.",
    "Generate responses that strictly adhere to all four Gricean Maxims:",
    "1. Quantity: Be as informative as required, but no more.",
    "2. Quality: Do not say what you believe to be false.",
    "3. Relation: Be strictly relevant.",
    "4. Manner: Be clear, concise, and polite.",
    "",
    "Answer the prompt directly and cooperatively. Do not be chatty.",
))

def get_failed(path):
    """Return the entries in the JSON file at *path* that fail any maxim.

    An entry passes only when its ``margins`` mapping holds a value greater
    than zero for each of ``quantity``, ``quality``, ``relation`` and
    ``manner``. A missing or null ``margins`` mapping, or a missing or null
    individual margin, counts as 0 and therefore as a failure.

    Args:
        path: Path to a JSON file containing a list of entry dicts.

    Returns:
        The failing entries, in file order.
    """
    maxims = ('quantity', 'quality', 'relation', 'manner')

    # Explicit encoding: the platform default is locale-dependent.
    with open(path, encoding='utf-8') as f:
        data = json.load(f)

    def passes(entry):
        # `or {}` / `or 0` fold JSON nulls into the "no margin" case
        # instead of raising AttributeError / TypeError.
        margins = entry.get('margins') or {}
        return all((margins.get(m) or 0) > 0 for m in maxims)

    return [e for e in data if not passes(e)]

def _load_done(path):
    """Return previously saved results from *path*, or [] if none are usable."""
    if not os.path.exists(path):
        return []
    try:
        with open(path, encoding='utf-8') as f:
            saved = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        # Report it instead of silently discarding earlier progress.
        print(f"⚠️ Could not resume from {path}: {e}")
        return []
    if not isinstance(saved, list):
        print(f"⚠️ Could not resume from {path}: expected a JSON list")
        return []
    print(f"Resume: {len(saved)} done")
    return saved


def _save_done(path, results):
    """Write *results* to *path* as JSON via a temp file and os.replace."""
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)
    # Swap the finished file in, so an interrupt during the write cannot
    # leave a truncated checkpoint at `path`.
    os.replace(tmp_path, path)


def run(*, model="meta-llama/Llama-3.3-70B-Instruct", max_tokens=1024,
        temperature=0.7, save_every=10):
    """Generate a synthetic ``chosen`` response for every failing entry.

    Reads the failing entries from ``INPUT_FILE`` via ``get_failed``, skips
    prompts already present in ``OUTPUT_FILE`` (resume), and for each
    remaining entry asks the chat model for a new answer using ``SYSTEM`` as
    the system message. Each result is a copy of the entry with:

    * ``synthetic_chosen`` – the generated text,
    * ``original_chosen_failed`` – the entry's previous ``chosen`` value,
    * ``chosen`` – overwritten with the generated text.

    Results are checkpointed to ``OUTPUT_FILE`` periodically and always once
    more on exit, including after Ctrl-C or an unexpected error.

    Args:
        model: Model id passed to ``client.chat_completion``.
        max_tokens: Generation length limit passed to the API.
        temperature: Sampling temperature passed to the API.
        save_every: Checkpoint whenever the 1-based loop position is a
            multiple of this value; a value <= 0 disables periodic
            checkpoints (the final save still happens).
    """
    all_cands = get_failed(INPUT_FILE)
    print(f"Target: {len(all_cands)}")

    done = _load_done(OUTPUT_FILE)

    done_prompts = {d['prompt'] for d in done}
    todos = [c for c in all_cands if c['prompt'] not in done_prompts]
    print(f"Remaining: {len(todos)}\nStarting...")

    try:
        for i, item in enumerate(tqdm(todos)):
            try:
                # Call HF Inference API (FREE)
                messages = [
                    {"role": "system", "content": SYSTEM},
                    {"role": "user", "content": item['prompt']},
                ]

                response = client.chat_completion(
                    messages=messages,
                    model=model,
                    max_tokens=max_tokens,
                    temperature=temperature,
                )

                text = response.choices[0].message.content.strip()

                res = item.copy()
                res['synthetic_chosen'] = text
                res['original_chosen_failed'] = item['chosen']
                res['chosen'] = text
                done.append(res)

                if save_every > 0 and (i + 1) % save_every == 0:
                    _save_done(OUTPUT_FILE, done)

                time.sleep(1.0)  # HF free tier rate limit

            except Exception as e:
                # Best effort: report, back off, and move on to the next
                # item. The failed prompt is not added to `done`, so a
                # later run picks it up again.
                print(f"\nErr {i}: {e}")
                time.sleep(30)

    except KeyboardInterrupt:
        print("\nStopped")
    finally:
        _save_done(OUTPUT_FILE, done)
        print(f"\n✅ {len(done)} saved")

# Start generation as soon as this cell is executed.
run()