<a href="https://colab.research.google.com/github/Rujjul/Synthetic_Dataset_Generator/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Synthetic_Dataset_Generator

In [None]:
!pip install -q -U bitsandbytes accelerate

In [None]:
#imports
import os
import requests
from IPython.display import Markdown, display, update_display
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import gradio as gr
import json

In [None]:
# Hugging Face Hub identifier of the instruction-tuned Llama 3.1 8B checkpoint.
# It is passed to both AutoTokenizer and AutoModelForCausalLM `from_pretrained`
# calls further down in this notebook.

LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [None]:
# 4-bit quantization settings handed to the model loader via
# `quantization_config=`: NF4 weights, double quantization enabled, and
# bfloat16 as the compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Tokenizer for the LLAMA checkpoint; the end-of-sequence token doubles as the
# padding token.
tokenizer = AutoTokenizer.from_pretrained(LLAMA)
tokenizer.pad_token = tokenizer.eos_token

# Causal LM loaded with the 4-bit quantization config; `device_map="auto"`
# leaves device placement to the loader.
model = AutoModelForCausalLM.from_pretrained(
    LLAMA,
    quantization_config=quant_config,
    device_map="auto",
)

In [None]:
# System prompt sent as the "system" turn of every generation request.
# It instructs the model to return the data itself (JSON with arrays) rather
# than a script that would generate it.
system_message = """
You are a helpful assistant who generates synthetic data based on the user's demand and scenario. The user will provide you with the number of rows, the column names with their respective data types, and some other parameters or constraints if necessary. If you are not able to provide any data regarding something just directly mention that you are unable to do so. Then the user will either update or modify their prompt. Do not provide a python script to generate the data. Provide the data as a json with arrays.
"""

In [None]:
#JSON Extraction Function

def extract_json(text):
    """Parse the model's reply as JSON and summarise the outcome for the UI.

    The raw text is tried first. If that fails and the text contains a
    ``{``/``[`` ... ``}``/``]`` span (for example JSON wrapped in a markdown
    code fence or preceded by a sentence of prose), that span is tried as a
    fallback.

    Args:
        text: The assistant's reply.

    Returns:
        A 3-tuple ``(status, json_text, data)``:

        * on success, a status line with the row count, the pretty-printed
          JSON, and the parsed object;
        * on failure, a status line with the parse error, the original
          ``text`` unchanged, and ``None``.

    The row count is ``len(data)`` for a top-level array. For a top-level
    object it is the length of the longest array value (column-oriented
    data), or 1 for a non-empty object without array values.
    """
    candidates = [text]
    if isinstance(text, str):
        openers = [pos for pos in (text.find("{"), text.find("[")) if pos != -1]
        closer = max(text.rfind("}"), text.rfind("]"))
        if openers and closer > min(openers):
            snippet = text[min(openers):closer + 1]
            if snippet != text:
                candidates.append(snippet)

    first_error = None
    for candidate in candidates:
        try:
            data = json.loads(candidate)
        except (TypeError, ValueError) as e:
            # json.JSONDecodeError is a ValueError; TypeError covers non-str input.
            if first_error is None:
                first_error = e
            continue

        if not isinstance(data, (dict, list)):
            # A bare scalar is valid JSON but not a dataset.
            if first_error is None:
                first_error = ValueError(
                    f"expected a JSON object or array, got {type(data).__name__}"
                )
            continue

        if isinstance(data, list):
            row_count = len(data)
        else:
            column_lengths = [
                len(value) for value in data.values() if isinstance(value, list)
            ]
            if column_lengths:
                row_count = max(column_lengths)
            else:
                row_count = 1 if data else 0

        return (
            f"✓ Successfully generated {row_count} rows of data",
            json.dumps(data, indent=2),
            data,
        )

    return f"⚠ JSON parsing failed: {str(first_error)}", text, None


In [None]:
#Dataset Generation

def generate_dataset(num_rows, columns_info, additional_constraints):
    """Ask the model for a synthetic dataset and parse its reply.

    Builds a user prompt from the arguments, sends it together with the
    module-level ``system_message`` through the tokenizer's chat template,
    samples a completion from ``model``, and hands the newly generated text
    to ``extract_json``.

    Args:
        num_rows: Number of rows to request.
        columns_info: Free-text description of the columns and their limits.
        additional_constraints: Optional extra free-text constraints.

    Returns:
        Whatever ``extract_json`` returns for the assistant's reply (in this
        file: a ``(status, json_text, data)`` tuple wired to the three
        Gradio output components).
    """
    # NOTE(review): the "Notes" section below is sent with every request,
    # including ones whose columns_info describes unrelated columns.
    user_prompt = f"""
Create a synthetic dataset.

Number of rows: {num_rows}

Columns and constraints:
{columns_info}

Additional constraints:
{additional_constraints}

Notes:
- Name: max 13 alphabets
- Phone Number: exactly 10 digits
- Age: max 2 digits
- Occupation: max 30 alphabets
- PAN ID: exactly 10 alphanumeric characters (ABCDE1234F format)
Return ONLY JSON.
"""

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt},
    ]

    # add_generation_prompt appends the assistant header so the model starts
    # a fresh reply; return_dict also yields the attention mask for generate().
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=1500,
        do_sample=True,
        temperature=0.7,
    )

    # Decode only the tokens produced after the prompt. Splitting the full
    # decoded text on the word "assistant" would cut the reply short whenever
    # the generated data itself contains that word.
    prompt_length = inputs["input_ids"].shape[-1]
    assistant_response = tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True,
    ).strip()

    return extract_json(assistant_response)


In [None]:
# Gradio UI: inputs on the left (row count, column specs, optional
# constraints, generate button), outputs on the right (status line, raw JSON,
# JSON preview). The button calls generate_dataset with the three inputs.
with gr.Blocks(title="Synthetic Dataset Generator") as demo:

    gr.Markdown("""
    # 🎲 Synthetic Dataset Generator
    ### Powered by Llama 3.1 (4-bit quantized)
    Generate custom synthetic datasets by specifying columns and constraints
    """)

    with gr.Row():
        with gr.Column():
            num_rows = gr.Slider(
                minimum=1,
                maximum=100,
                value=5,
                step=1,
                label="Number of Rows"
            )

            # PAN ID length is 10 here to agree with the prompt built in
            # generate_dataset ("exactly 10 alphanumeric characters").
            columns_info = gr.Textbox(
                label="Column Specifications",
                placeholder="Example:\n- Name: max 13 alphabets\n- Phone Number: 10 digits\n- Age: max 2 digits",
                lines=8,
                value="- Name: max 13 alphabets\n- Phone Number: 10 digits\n- Age: max 2 digits\n- Occupation: max 30 alphabets\n- PAN ID: 10 alphanumeric characters"
            )

            additional_constraints = gr.Textbox(
                label="Additional Constraints (Optional)",
                placeholder="Example: All names should be Indian names, Age between 18-65",
                lines=3
            )

            generate_btn = gr.Button("🚀 Generate Dataset", variant="primary", size="lg")

        with gr.Column():
            status_text = gr.Textbox(
                label="Status",
                lines=2
            )

            output_json = gr.Code(
                label="Generated Dataset (JSON)",
                language="json",
                lines=15
            )

            output_data = gr.JSON(
                label="Preview",
                visible=True
            )

    # Examples
    gr.Markdown("### 📋 Example Templates")
    gr.Examples(
        examples=[
            [5, "- Name: max 13 alphabets\n- Phone Number: 10 digits\n- Age: max 2 digits\n- Occupation: max 30 alphabets\n- PAN ID: 10 alphanumeric characters", "All Indian names"],
            [10, "- Product Name: max 20 characters\n- Price: between $10-$1000\n- Category: Electronics/Clothing/Food\n- Rating: 1-5 stars\n- Stock: 0-500 units", ""],
            [8, "- Customer Name: full name\n- Email: valid email format\n- Purchase Date: YYYY-MM-DD format\n- Amount: $50-$5000\n- Payment Method: Credit/Debit/PayPal", "Dates in 2024"],
        ],
        inputs=[num_rows, columns_info, additional_constraints]
    )

    # Connect button: the three return values of generate_dataset map onto
    # the status box, the JSON code view and the JSON preview, in that order.
    generate_btn.click(
        fn=generate_dataset,
        inputs=[num_rows, columns_info, additional_constraints],
        outputs=[status_text, output_json, output_data]
    )


In [None]:
# Start the app with a public share link and debug mode enabled.
print("\nLaunching Gradio interface...")
demo.launch(
    debug=True,
    share=True,
)