<a href="https://colab.research.google.com/github/Rujjul/Synthetic_Dataset_Generator/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Synthetic_Dataset_Generator

In [None]:
# Notebook shell command: quietly (-q) install or upgrade (-U) bitsandbytes and accelerate.
!pip install -q -U bitsandbytes accelerate

In [None]:
#imports
import os
import requests
from IPython.display import Markdown, display, update_display
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import gradio as gr
import json

In [None]:
# Model identifier used for both the tokenizer and the model load.
LLAMA: str = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [None]:
# Quantization settings: 4-bit NF4 weights with double quantization and
# bfloat16 as the compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Tokenizer and quantized model

# The EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(LLAMA)
tokenizer.pad_token = tokenizer.eos_token

# Load the causal LM with the quantization settings defined in quant_config.
model = AutoModelForCausalLM.from_pretrained(
    LLAMA,
    quantization_config=quant_config,
    device_map="auto",
)

In [None]:
# System prompt used as the "system" chat message in generate_dataset.
# It instructs the model to answer with a JSON array of row objects only.
system_message = """
You are a helpful assistant that generates synthetic datasets.

IMPORTANT FORMAT RULES:
- Always return data as an array of objects (row format)
- Each object represents one complete row
- Always correlate the age with the person's occupation if given to generate.
- Example: [{"name": "John", "age": 25}, {"name": "Jane", "age": 30}]
- Never use columnar format like {"name": ["John", "Jane"], "age": [25, 30]}

If you cannot generate the requested data, state that clearly without providing alternatives.
Do NOT provide Python code - only return the JSON data directly.
"""

In [None]:
#JSON Extraction Function

def extract_json(text):
    """Parse a model reply as JSON and build the values shown in the UI.

    The reply is first parsed strictly. If that fails, a lenient pass strips a
    surrounding Markdown code fence (if any) and decodes the first JSON value
    that starts at the first ``[`` or ``{``, ignoring any trailing prose.

    Args:
        text: Raw reply text produced by the model.

    Returns:
        A ``(status, json_text, data)`` tuple:
        - on success with a JSON array: a success message with the row count,
          the pretty-printed JSON, and the parsed list;
        - on success with any other JSON value: a warning message, the
          pretty-printed JSON, and the parsed value;
        - on failure: an error message, the original ``text`` unchanged, and
          ``None``.
    """
    import re  # local import: only needed for the lenient fallback

    parse_errors = (ValueError, TypeError, RecursionError)
    parsed = False
    data = None
    error = None

    # 1) Strict parse of the whole reply (json.JSONDecodeError is a ValueError).
    try:
        data = json.loads(text)
        parsed = True
    except parse_errors as exc:
        error = exc

    # 2) Lenient parse for replies wrapped in a code fence or surrounded by prose.
    if not parsed and isinstance(text, str):
        candidate = text.strip()
        fence = re.search(r"```[A-Za-z]*\s*(.*?)```", candidate, re.DOTALL)
        if fence:
            candidate = fence.group(1).strip()

        # Only the first opening bracket is tried: scanning further could pick
        # a single row out of a truncated array and report it as a result.
        starts = [i for i in (candidate.find("["), candidate.find("{")) if i != -1]
        if starts:
            try:
                data, _ = json.JSONDecoder().raw_decode(candidate, min(starts))
                parsed = True
            except parse_errors as exc:
                error = exc

    if not parsed:
        return f"⚠ JSON parsing failed: {error}", text, None

    # ensure_ascii=False keeps non-ASCII values readable in the displayed JSON.
    pretty = json.dumps(data, indent=2, ensure_ascii=False)

    if isinstance(data, list):
        return (
            f"✓ Successfully generated {len(data)} rows of data",
            pretty,
            data,
        )

    # Valid JSON, but not the requested array of row objects.
    return (
        f"⚠ Parsed JSON, but expected an array of row objects and got {type(data).__name__}",
        pretty,
        data,
    )


In [None]:
#Dataset Generation

def generate_dataset(num_rows, columns_info, additional_constraints):
    """Ask the model for a synthetic dataset and parse its reply.

    The three arguments come from the Gradio UI and are interpolated into the
    user prompt as-is.

    Args:
        num_rows: Number of rows to request.
        columns_info: Free-text description of the columns and their constraints.
        additional_constraints: Optional free-text extra constraints.

    Returns:
        The ``(status, json_text, data)`` tuple produced by ``extract_json``
        for the model's reply.
    """
    user_prompt = f"""
Create a synthetic dataset.

Number of rows: {num_rows}

Columns and constraints:
{columns_info}

Additional constraints:
{additional_constraints}


CRITICAL: You MUST include ALL of the columns listed above in EVERY row.
Return data as an array of objects (row format), NOT columnar format.

Correct format example:
[
  {{"Name": "John Doe", "Phone Number": "1234567890", "Age": "25"}},
  {{"Name": "Jane Smith", "Phone Number": "9876543210", "Age": "31"}}
]

WRONG format (do NOT use):
{{"Name": ["John", "Jane"], "Phone Number": ["123", "456"]}}

Return ONLY the JSON array with no markdown, no code blocks, no explanations.
"""

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt}
    ]

    # add_generation_prompt=True ends the prompt with the assistant header so
    # the model starts its answer directly; return_dict=True also yields the
    # attention mask, which cannot be inferred when pad and EOS are the same token.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True
    ).to(model.device)

    prompt_length = inputs["input_ids"].shape[-1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=2500,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode only the newly generated tokens. Splitting the full transcript on
    # the word "assistant" would cut off replies that contain that word
    # (e.g. an occupation such as "Teaching Assistant").
    assistant_response = tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True
    ).strip()

    return extract_json(assistant_response)


In [None]:
# Gradio UI: inputs on the left, generation results on the right.
# The emoji in the heading, button label and examples title were mis-encoded
# (UTF-8 bytes shown as Windows-1252 text) and are restored here.
with gr.Blocks(title="Synthetic Dataset Generator") as demo:

    gr.Markdown("""
    # 🎲 Synthetic Dataset Generator
    ### Powered by Llama 3.1 (4-bit quantized)
    Generate custom synthetic datasets by specifying columns and constraints
    """)

    with gr.Row():
        # Left column: request parameters.
        with gr.Column():
            num_rows = gr.Slider(
                minimum=1,
                maximum=100,
                value=5,
                step=1,
                label="Number of Rows"
            )

            columns_info = gr.Textbox(
                label="Column Specifications",
                placeholder="Example:\n- Name: max 13 alphabets\n- Phone Number: 10 digits\n- Age: max 2 digits",
                lines=8,
                value="- Name: max 13 alphabets\n- Phone Number: 10 digits\n- Age: max 2 digits\n- Occupation: max 30 alphabets"
            )

            additional_constraints = gr.Textbox(
                label="Additional Constraints (Optional)",
                placeholder="Example: All names should be Indian names, Age between 18-65",
                lines=3
            )

            generate_btn = gr.Button("🚀 Generate Dataset", variant="primary", size="lg")

        # Right column: the three values returned by generate_dataset.
        with gr.Column():
            status_text = gr.Textbox(
                label="Status",
                lines=2
            )

            output_json = gr.Code(
                label="Generated Dataset (JSON)",
                language="json",
                lines=15
            )

            output_data = gr.JSON(
                label="Preview",
                visible=True
            )

    # Examples: each entry fills the three inputs in order
    # (row count, column specification, additional constraints).
    gr.Markdown("### 📋 Example Templates")
    gr.Examples(
        examples=[
            [5, "- Name: max 13 alphabets\n- Phone Number: 10 digits\n- Age: max 2 digits\n- Occupation: max 30 alphabets", "All Indian names"],
            [10, "- Product Name: max 20 characters\n- Price: between $10-$1000\n- Category: Electronics/Clothing/Food\n- Rating: 1-5 stars\n- Stock: 0-500 units", ""],
            [8, "- Customer Name: full name\n- Email: valid email format\n- Purchase Date: YYYY-MM-DD format\n- Amount: $50-$5000\n- Payment Method: Credit/Debit/PayPal", "Dates in 2024"],
        ],
        inputs=[num_rows, columns_info, additional_constraints]
    )

    # Connect button: the outputs follow the order of generate_dataset's
    # returned tuple (status, JSON text, parsed data).
    generate_btn.click(
        fn=generate_dataset,
        inputs=[num_rows, columns_info, additional_constraints],
        outputs=[status_text, output_json, output_data]
    )


In [None]:
# Announce start-up, then launch the interface with sharing and debug enabled.
print("\nLaunching Gradio interface...")
demo.launch(
    debug=True,
    share=True,
)