# Synthetic Data Generator using various Hugging Face open-source models and frontier models on Google Colab to generate a business dataset. Users can optionally provide a sample schema.

Utilizes 
- Quantization to shrink model size and lower gpu memory usage
- Hugging Face Open Source models called with the tokenizer and parsed to only return the assistant response
- JSONL parsing
- Use of the pandas library for data analysis, manipulation and grid preview
- File generation for multiple types
- Error handling
- Gradio Blocks with multiple interface objects

In [None]:
# Install dependencies
!pip install -q --upgrade bitsandbytes anthropic accelerate transformers==4.57.6

In [None]:
# imports

import os
import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
import json
import anthropic
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import pandas as pd
import gradio as gr
import gc


In [None]:
# Define API Keys

def _get_optional_secret(name):
  """Return the Colab secret `name`, or None when it is missing or not shared.

  Used for providers the user may not have configured, so that a missing key
  does not stop the notebook from loading.
  """
  try:
    return userdata.get(name)
  except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    return None


hf_token = userdata.get('HF_TOKEN')
openai_api_key = userdata.get('OPENAI_API_KEY')
# call_claude and call_gemini read these module-level names, so they must
# exist even when the corresponding secret has not been set up.
anthropic_api_key = _get_optional_secret('ANTHROPIC_API_KEY')
google_api_key = _get_optional_secret('GOOGLE_API_KEY')

# Sign in to HuggingFace Hub
login(hf_token, add_to_git_credential=True)


In [None]:
# Quantization

# 4-bit "nf4" quantization settings with double quantization and bfloat16
# compute; passed as `quantization_config` when call_hf loads a model.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

In [None]:
# Constants

# Model ids that are listed in HF_MODELS and loaded through call_hf
LLAMA = "meta-llama/Llama-3.2-3B-Instruct"
PHI3 = "microsoft/Phi-3-mini-4k-instruct"
GEMMA2 = "google/gemma-2-2b-it"
# Model names routed to call_gpt, call_claude and call_gemini respectively
GPT = "gpt-4o-mini"
CLAUDE = "claude-3-haiku-20240307"
GEMINI = "gemini-2.0-flash"

In [None]:
# Models

# Display name -> model id. The keys populate the model dropdown and are the
# values choose_LLM receives as `model_name`.
MODELS = {
    'LLama 3.2' : LLAMA,
    'Phi 3 mini': PHI3,
    'Gemma 2': GEMMA2,
    'GPT 4.o mini': GPT,
    'Claude 3 Haiku': CLAUDE,
    'Gemini 2.0 Flash': GEMINI,
}

# Model ids that choose_LLM sends to call_hf instead of a hosted client
HF_MODELS = [LLAMA, PHI3, GEMMA2]

In [None]:
# File Formats

# Output formats offered in the file-format dropdown and accepted by
# generate_dataset; each entry includes the leading dot.
FILE_FORMATS = [".csv", ".tsv", ".jsonl", ".json"]

In [None]:
# Default schema shown in the UI: (column name, type, description, example).
# The text is sent to the model verbatim, so typos and malformed examples here
# end up in the prompt.
SCHEMA = [
    ("Name", "TEXT", "Name of Toy", "He-Man Fused with Skeletor"),
    ("Toy_Store", "TEXT", "Name of Toy Store", "Victory Toy Store"),
    ("Address", "TEXT", "Toy Store address", "432 Brook Ave, Brooklyn, NY 11345"),
    ("Type", "TEXT", "Toy Type", 'One of ["Action Figure","Doll","Video game","Gaming Console","Learning"] or other potential types'),
    ("Price", "TEXT", "Toy Price", "$45, or '--' if unknown"),
    ("Year", "INT", "Year toy distributed", 2015),
    ("Units_Sold", "INT", "Number of Units Sold", 300),
    ("Locations", "Array", "Other Locations Sold at", '["123 Haven Rd, Newark, NJ 28563", "321 Scottsdale Drive, Orlando, FL 51943", ...]'),
]

# One numbered line per column: "<n>. <name> (<type>) - <description>, example: <example>"
DEFAULT_SCHEMA_TEXT = "\n".join(
    f"{position}. {name} ({col_type}) - {description}, example: {example}"
    for position, (name, col_type, description, example) in enumerate(SCHEMA, start=1)
)

In [None]:
# Prompts

# System message sent with every model call. choose_LLM keeps only reply lines
# that start with "{", so the prompt asks for one JSON object per line.
system_prompt = """
You are an expert assistant in generating synthetic datasets for businesses tailored to a given business case and user requirements.
You accept a business case of the data and the use case for the dataset.
If the user does not specify the output columns, infer and create the most appropriate columns based on your expertise.
Do not repeat column names or column values across rows.
Only output valid JSONL: one complete JSON object per line, with no surrounding text or code fences.
"""

def get_user_prompt(business_case, num_records, schema):
  """Build the user prompt asking a model for a synthetic JSONL dataset.

  Args:
      business_case: Free-text description of the data to generate.
      num_records: Number of records to request. Whole-number floats such as
          25.0 are rendered as 25.
      schema: Optional text describing the fields of each record. None or a
          blank string means the model should infer the columns itself.

  Returns:
      The prompt string.
  """
  # Avoid asking the model for "25.0" records when the count arrives as a float.
  if isinstance(num_records, float) and num_records.is_integer():
    num_records = int(num_records)

  user_prompt = (
      f"Create a sample dataset for my business case described here: {business_case}. "
      f"Generate {num_records} records for the dataset in valid JSONL format based on the business case."
  )

  # A blank schema (for example an emptied text box) counts as "not provided";
  # otherwise the prompt would announce a field list and then show nothing.
  if schema is not None and str(schema).strip():
    user_prompt += f" Each line should be a JSON object with the following fields: \n{schema}\n"

  return user_prompt

# Define LLM functions

In [11]:
# GPT

def call_gpt(model, user_prompt):
  """Send the system and user prompts to an OpenAI chat model.

  Args:
      model: Model name passed to the chat completions endpoint.
      user_prompt: Content of the user message.

  Returns:
      The message content of the first choice in the response.
  """
  client = OpenAI(api_key=openai_api_key)
  completion = client.chat.completions.create(
      model=model,
      messages=[
          {"role": "system", "content": system_prompt},
          {"role": "user", "content": user_prompt},
      ],
      temperature=0.7,
  )
  return completion.choices[0].message.content


In [12]:
# Claude

def call_claude(model, user_prompt):
  """Send the user prompt to an Anthropic model with the shared system prompt.

  Args:
      model: Model name passed to the messages endpoint.
      user_prompt: Content of the user message.

  Returns:
      The text of the first content block in the response.
  """
  client = anthropic.Anthropic(api_key=anthropic_api_key)
  request = {
      "model": model,
      "messages": [{"role": "user", "content": user_prompt}],
      "temperature": 0.7,
      "max_tokens": 3000,
      "system": system_prompt,
  }
  reply = client.messages.create(**request)
  return reply.content[0].text

In [13]:
# Gemini

def call_gemini(model, user_prompt):
  """Send the system and user prompts to a Gemini model.

  The request goes through the OpenAI client pointed at the
  generativelanguage.googleapis.com endpoint given as `base_url`.

  Args:
      model: Model name passed to the chat completions endpoint.
      user_prompt: Content of the user message.

  Returns:
      The message content of the first choice in the response.
  """
  client = OpenAI(
      api_key=google_api_key,
      base_url = "https://generativelanguage.googleapis.com/v1beta/openai/",
  )
  completion = client.chat.completions.create(
      model=model,
      messages=[
          {"role": "system", "content": system_prompt},
          {"role": "user", "content": user_prompt},
      ],
      temperature=0.7,
  )
  return completion.choices[0].message.content

In [14]:
# Hugging Face

def call_hf(model, user_prompt):
  """Generate a reply with a quantized Hugging Face model and return its text.

  Args:
      model: Model id to load with AutoTokenizer / AutoModelForCausalLM.
      user_prompt: Content of the user message.

  Returns:
      The decoded text of the newly generated tokens only (prompt and special
      tokens removed), stripped of surrounding whitespace.
  """
  messages = [{"role": "system", "content": system_prompt},
              {"role": "user", "content": user_prompt}]
  template_kwargs = {
      "add_generation_prompt": True,  # end the prompt where the assistant turn starts
      "return_tensors": "pt",
      "return_dict": True,  # also yields the attention mask for generate()
  }

  tokenizer = AutoTokenizer.from_pretrained(model)
  tokenizer.pad_token = tokenizer.eos_token
  llm = AutoModelForCausalLM.from_pretrained(model, device_map="auto", quantization_config=quant_config)
  inputs = outputs = None
  try:
    try:
      inputs = tokenizer.apply_chat_template(messages, **template_kwargs)
    except Exception:
      # NOTE(review): some chat templates (Gemma 2 appears to be one) reject a
      # "system" role. Retry with the system text folded into the user turn;
      # an unrelated failure will simply raise again from this second call.
      merged = [{"role": "user", "content": f"{system_prompt}\n\n{user_prompt}"}]
      inputs = tokenizer.apply_chat_template(merged, **template_kwargs)

    # Follow the model's placement instead of assuming a fixed "cuda" device.
    inputs = inputs.to(llm.device)
    outputs = llm.generate(**inputs, max_new_tokens=2000)

    # Slice off the prompt tokens rather than searching for a model-specific
    # header string, so the same code works for every model in HF_MODELS.
    prompt_length = inputs["input_ids"].shape[-1]
    content = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
  finally:
    # Drop the model and tensors so repeated calls do not accumulate GPU memory.
    llm = inputs = outputs = None
    gc.collect()
    torch.cuda.empty_cache()

  return content

In [15]:
# Choose LLM

def _parse_jsonl_records(response):
  """Return the JSON objects found in `response`, one per line.

  Lines that do not start with "{" (prose, code fences) are ignored, and lines
  that fail to parse (for example a record cut off by the token limit) are
  skipped so the remaining records are still returned.
  """
  records = []
  for raw_line in (response or "").splitlines():
    # Tolerate a trailing comma from models that emit array-style rows.
    line = raw_line.strip().rstrip(",")
    if not line.startswith("{"):
      continue
    try:
      record = json.loads(line)
    except json.JSONDecodeError:
      continue
    if isinstance(record, dict):
      records.append(record)
  return records


def choose_LLM(model_name, user_prompt):
  """Query the model selected by display name and parse its JSONL reply.

  Args:
      model_name: A key of MODELS.
      user_prompt: Prompt forwarded to the model-specific call function.

  Returns:
      A list of dicts, one per valid JSON line in the reply (possibly empty).

  Raises:
      Exception: "Model query failed: ..." wrapping any underlying error,
          including an unknown `model_name`.
  """
  try:
    if model_name not in MODELS:
      raise ValueError(f"Model {model_name} not supported")
    model = MODELS[model_name]
    model_key = model.lower()

    if "gpt" in model_key:
      response = call_gpt(model, user_prompt)
    elif "claude" in model_key:
      response = call_claude(model, user_prompt)
    elif "gemini" in model_key:
      response = call_gemini(model, user_prompt)
    elif model in HF_MODELS:
      response = call_hf(model, user_prompt)
    else:
      raise ValueError(f"Model {model} not supported")

    return _parse_jsonl_records(response)

  except Exception as e:
    # Keep the original exception as the cause so the traceback stays useful.
    raise Exception(f"Model query failed: {str(e)}") from e

# Save File

In [16]:
def save_dataset(file_name, records, file_format ):
  """Write `records` to `file_name` in the requested format.

  Args:
      file_name: Destination path.
      records: List of dicts, one per row.
      file_format: One of ".csv", ".tsv", ".jsonl", ".json". The leading dot
          is optional and matching is case-insensitive.

  Raises:
      ValueError: If `file_format` is not one of the supported formats.
  """
  # Callers pass the dotted form from FILE_FORMATS (".csv"); normalise so the
  # bare form ("csv") is accepted as well.
  fmt = str(file_format).strip().lower()
  if not fmt.startswith("."):
    fmt = f".{fmt}"

  df = pd.DataFrame(records)
  print(df.shape)
  if fmt == ".csv":
    df.to_csv(file_name, index=False)
  elif fmt == ".tsv":
    df.to_csv(file_name, sep='\t', index=False)
  elif fmt == ".jsonl":
    with open(file_name, 'w', encoding="utf-8") as f:
      f.writelines(json.dumps(record) + '\n' for record in records)
  elif fmt == ".json":
    # orient='records' never writes the index; `index=False` is omitted
    # because older pandas releases reject it for this orient.
    df.to_json(file_name, orient='records')
  else:
    raise ValueError(f"File format {file_format} not supported")


# Generate Dataset

In [17]:
def generate_dataset(
  model_name,
  business_case,
  num_records = 100,
  schema = None,
  file_format = '.jsonl',
  file_name = 'sample_dataset.jsonl'
):
  """
  Generates a synthetic dataset using an LLM based on the given business case and optional schema.

  Args:
      model_name: Display name of the model (a key of MODELS).
      business_case: Free-text description of the data to generate.
      num_records: Number of records to request; must be between 11 and 1000.
          Floats are converted to int.
      schema: Optional schema text; None or blank lets the model infer columns.
      file_format: One of FILE_FORMATS.
      file_name: Output file name; `file_format` is appended if it is missing.

  Returns:
      Tuple[str, pd.DataFrame | None]: A status message and a preview DataFrame (first 10 rows) if successful.
      On any failure the message starts with "❌ Error:" and the preview is None.
  """
  try:
    # The count may arrive as a float (e.g. 25.0) from a numeric input widget;
    # normalise it so the prompt asks for a whole number of records.
    try:
      num_records = int(num_records)
    except (TypeError, ValueError, OverflowError):
      return "❌ Error: Number of records must be a whole number.", None

    # Validate record count
    if num_records <= 10:
      return "❌ Error: Number of records must be greater than 10.", None
    if num_records > 1000:
      return "❌ Error: Number of records must be less than or equal to 1000.", None

    # Validate business case
    if business_case is None or not str(business_case).strip():
      return "❌ Error: Please describe your business case.", None

    # Validate file format
    if file_format not in FILE_FORMATS:
      return f"❌ Error: Invalid file format '{file_format}'. Supported formats: {FILE_FORMATS}", None

    # Validate file extension
    if not file_name.endswith(file_format):
      file_name += file_format

    # A blank schema means "let the model choose the columns".
    if schema is not None and not str(schema).strip():
      schema = None

    # Create prompt and call the appropriate model
    user_prompt = get_user_prompt(business_case, num_records, schema)
    records = choose_LLM(model_name, user_prompt)

    if not records:
      return "❌ Error: No valid records were generated by the model.", None

    # Save Dataset
    save_dataset(file_name, records, file_format)

    # Prepare preview
    preview = pd.DataFrame(records).head(10)

    success_message = (
        f"✅ Generated {len(records)} records successfully!\n"
        f"📁 Saved to: {file_name}\n"
    )

    return success_message, preview

  except Exception as e:
    # UI boundary: report every failure as a status message instead of raising.
    return f"❌ Error: {str(e)}", None



# Generate Gradio Interface

In [None]:
# Gradio UI: inputs on the left, instructions and results on the right.
# The Generate button calls generate_dataset with the six inputs and shows the
# returned status message and preview DataFrame.
with gr.Blocks(title="Synthetic Data Generator") as interface:
  gr.Markdown("# Dataset Generator")
  gr.Markdown("Generate synthetic datasets using AI models")

  with gr.Row():
    with gr.Column(scale=2):
      schema_input = gr.Textbox(
          label="Schema",
          value=DEFAULT_SCHEMA_TEXT,
          lines=15,
          placeholder="Define your dataset schema here... Please follow this format: Name (TYPE) - Description, example: Example"
      )

      business_case_input = gr.Textbox(
          label="Business Case",
          value="I want to generate a toy store dataset for inventory",
          lines = 2,
          placeholder="Describe your business case here..."
      )

      with gr.Row():
        model_dropdown = gr.Dropdown(
            label="Model",
            choices=list(MODELS.keys()),
            value=list(MODELS.keys())[0],
            interactive=True
        )

        # precision=0 makes the component return an int, so the prompt asks
        # for "25" records rather than "25.0".
        num_records_input = gr.Number(
            label="Number of Records",
            value=25,
            minimum=11,
            maximum=1000,
            step=1,
            precision=0
        )

        with gr.Row():
          file_name_input = gr.Textbox(
              label="Save as",
              value="toystore_dataset",
              lines=1,
              placeholder="Name your dataset file here..."
          )

          file_format_dropdown = gr.Dropdown(
              label="File Format",
              choices=FILE_FORMATS,
              value=FILE_FORMATS[0],
              interactive=True
          )

        generate_btn = gr.Button("🚀 Generate", variant="secondary", size="lg")

    with gr.Column(scale=1):
      gr.Markdown("""
      ### 📝 Dataset Generation Instructions

      1. **🗂 Schema** – Define your dataset structure
        *(default: toy store schema provided)*
      2. **💡 Business Case** – Enter a prompt to guide the AI for generating data
      3. **🤖 Model** – Choose your AI model: GPT, Claude, Gemini, or Hugging Face
      4. **📊 Number of Records** – Specify entries to generate
        *(min: 11, max: 1000)*
      5. **📁 File Format** – Select output type: `.csv`, `.tsv`, `.jsonl`, or `.json`
      6. **💾 Save As** – Provide a filename *(extension auto-added)*
      7. **🚀 Generate** – Click **Generate** to create your dataset

      ### 🔧 Requirements

      Set API keys in Colab’s secret section:
      `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `HF_TOKEN`
      """)
      output_status = gr.Textbox(
        label="Status",
        lines=4,
        interactive=False
      )

      output_preview = gr.Dataframe(
        label="Preview (first 10 rows)",
        interactive=False,
        wrap=True
      )

  # Input order must match generate_dataset's positional parameters.
  generate_btn.click(
      fn=generate_dataset,
      inputs=[
        model_dropdown,
        business_case_input,
        num_records_input,
        schema_input,
        file_format_dropdown,
        file_name_input
      ],
      outputs=[output_status, output_preview]
  )

# Start the Gradio app.
# NOTE(review): with debug=True this call presumably blocks until the server
# is stopped, so the cleanup below would only run afterwards. Confirm.
interface.launch(debug=True)

# Run garbage collection and release cached CUDA memory held by PyTorch.
gc.collect()
torch.cuda.empty_cache()