# Synthetic Dataset Generation for Domain Name Suggestions

## Objective
Generate a synthetic dataset mapping business descriptions to domain name suggestions for LLM fine-tuning.

## Features
- Diverse business categories (Food, Tech, Education, etc.)
- Randomized geographic locations
- Multiple domain suffixes
- Supports JSON and CSV export

## Improvements Added
- Stopword removal for cleaner names
- Brand-style domain suggestions
- Edge cases: short, long, and ambiguous descriptions
- Safety filter for inappropriate content

In [17]:
import os
import re
import json
import random
import pandas as pd

In [18]:
# Catalogue of business types that seed the generated descriptions.
# Entries are grouped by sector; the flattened list below keeps the sectors,
# and the entries inside each sector, in the order they are written here.
_business_types_by_sector = {
    "Food & Beverage": [
        "organic coffee shop",
        "vegan restaurant",
        "fast-food chain",
        "food truck business",
        "fine dining restaurant",
        "ice cream parlor",
        "bakery and pastry shop",
    ],
    "Tech & Digital": [
        "AI consulting firm",
        "blockchain startup",
        "mobile app development company",
        "cybersecurity service provider",
        "web design agency",
        "cloud solutions provider",
    ],
    "Health & Fitness": [
        "fitness center",
        "yoga studio",
        "mental health counseling service",
        "nutrition coaching",
        "physiotherapy clinic",
        "telemedicine platform",
    ],
    "Retail & E-commerce": [
        "online bookstore",
        "luxury watch retailer",
        "fashion boutique",
        "electronics store",
        "home decor shop",
        "organic skincare brand",
    ],
    "Education": [
        "online learning platform",
        "language tutoring service",
        "coding bootcamp",
        "educational game developer",
        "university consultancy",
    ],
    "Entertainment & Media": [
        "gaming cafe",
        "video streaming platform",
        "podcast production company",
        "film production house",
        "music label",
        "digital art marketplace",
    ],
    "Services": [
        "digital marketing agency",
        "SEO optimization firm",
        "legal consultancy",
        "real estate agency",
        "wedding photography business",
        "event planning company",
    ],
    "Finance": [
        "cryptocurrency exchange",
        "personal finance advisory",
        "insurance brokerage",
        "micro-lending platform",
        "investment firm",
    ],
    "Nonprofit & Social": [
        "nonprofit for education",
        "animal rescue organization",
        "environmental NGO",
        "health awareness campaign",
        "youth development program",
    ],
    "Miscellaneous": [
        "car rental service",
        "luxury travel agency",
        "pet grooming service",
        "home cleaning service",
        "coworking space",
        "VR gaming center",
    ],
}

# Flat list consumed by the rest of the notebook.
business_types = [
    business
    for sector_businesses in _business_types_by_sector.values()
    for business in sector_businesses
]

In [19]:
# Unusual descriptions appended to the regular catalogue so the generator is
# also exercised on inputs that are not a typical short noun phrase.
edge_cases = [
    # very short
    "coffee",
    # very long
    "AI-powered healthcare solution for elderly in rural areas",
    # ambiguous
    "startup for pets",
    # complex
    "education platform & training program with global outreach",
]
business_types += edge_cases

In [20]:
# Domain endings a suggestion may receive; each entry includes its leading dot.
domain_suffixes = [
    ".com",
    ".net",
    ".org",
    ".co",
    ".ai",
    ".io",
    ".biz",
    ".store",
]

In [21]:
# Filler words that carry no meaning for a domain name and are dropped
# when keywords are extracted from a business description.
stopwords = {"the", "and", "in", "for", "with", "on", "of"}

def clean_keywords(text):
    """Return up to three lowercase keywords extracted from *text*.

    The text is lowercased and split on whitespace. Every character outside
    ``a-z`` / ``0-9`` is removed from each word *before* the stopword check,
    so punctuation-only tokens such as ``"&"`` disappear and words with
    attached punctuation (``"the,"``, ``"shop,"``) are handled like their
    bare form.

    Args:
        text: Free-text business description.

    Returns:
        A list with at most three keywords, in their original order. The
        list is empty when nothing usable remains.
    """
    keywords = []
    for raw_word in text.lower().split():
        word = re.sub(r"[^a-z0-9]", "", raw_word)
        # Skip tokens that were pure punctuation as well as stopwords.
        if word and word not in stopwords:
            keywords.append(word)
    return keywords[:3]  # take up to 3 keywords

def brandify(word):
    """Return *word* with one randomly picked brand-style ending appended."""
    endings = ["ify", "hub", "zone", "io", "base", "link"]
    chosen_ending = random.choice(endings)
    return word + chosen_ending

def generate_domain_suggestions(description, count=3):
    """Return up to *count* unique domain-name suggestions for *description*.

    Each suggestion is built from the keywords returned by
    ``clean_keywords``: either the first two keywords joined together, or a
    single random keyword passed through ``brandify``. A random entry of
    ``domain_suffixes`` is then appended.

    Args:
        description: Free-text business description.
        count: Number of distinct suggestions wanted. Defaults to 3.

    Returns:
        A list of unique domain strings in the order they were generated,
        so the result is reproducible under ``random.seed``. The list is
        empty when the description yields no usable keyword (or *count* is
        not positive) and may be shorter than *count* if the attempt budget
        is exhausted first.
    """
    # Strip characters that cannot appear in a domain label from each keyword
    # up front (this also removes dots, so "node.js" cannot split the name)
    # and drop keywords that end up empty.
    keywords = []
    for keyword in clean_keywords(description):
        cleaned = re.sub(r"[^a-z0-9]", "", keyword)
        if cleaned:
            keywords.append(cleaned)

    # Nothing to build a name from: return no suggestions instead of letting
    # random.choice() fail on an empty sequence.
    if not keywords or count <= 0:
        return []

    suggestions = []
    # Bounded retry budget so the loop always terminates, even if fewer than
    # `count` distinct domains can be produced.
    max_attempts = max(50, count * 50)
    for _ in range(max_attempts):
        if len(suggestions) >= count:
            break

        if random.random() > 0.5 and len(keywords) >= 2:
            # Combine two keywords
            base_name = keywords[0] + keywords[1]
        else:
            # Use brand-style name
            base_name = brandify(random.choice(keywords))

        domain = base_name + random.choice(domain_suffixes)
        # A list with a membership check (instead of a set) keeps the
        # generation order while still rejecting duplicates.
        if domain not in suggestions:
            suggestions.append(domain)

    return suggestions

In [22]:
# Terms whose presence marks a description as unsuitable for the dataset.
inappropriate_keywords = {"adult", "porn", "gambling", "violence"}

def is_safe(description):
    """Return True when no blocked term occurs in *description*.

    The check is a case-insensitive substring match against
    ``inappropriate_keywords``.
    """
    lowered = description.lower()
    for blocked_term in inappropriate_keywords:
        if blocked_term in lowered:
            return False
    return True

# Build the dataset from 1000 random "<business type> in <city>" descriptions.
# Descriptions rejected by is_safe() are dropped, so the final size can be
# smaller than the number of draws.
cities = ["New York", "Paris", "Tokyo", "London", "Berlin", "Dubai", "San Francisco", "Toronto", "Singapore"]

dataset = []
for _ in range(1000):
    business = random.choice(business_types)
    city = random.choice(cities)
    description = business + " in " + city

    # Only safe descriptions get domain suggestions and a dataset row.
    if is_safe(description):
        dataset.append({
            "business_description": description,
            "expected_domain_names": generate_domain_suggestions(description),
        })

print(f"Dataset created with {len(dataset)} examples.")

Dataset created with 1000 examples.


In [23]:
# Make sure the output directory exists; without it both writes below fail
# with FileNotFoundError on a fresh checkout.
os.makedirs("../data", exist_ok=True)

# Save as JSON (explicit encoding so the file does not depend on the
# platform's default locale encoding)
with open("../data/synthetic_dataset_v1.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=2)

# Save as CSV
# NOTE(review): each "expected_domain_names" list is written into a single
# CSV cell as its string form, so readers of the CSV have to parse that cell
# back into a list themselves; the JSON file keeps real lists.
df = pd.DataFrame(dataset)
df.to_csv("../data/synthetic_dataset_v1.csv", index=False)

print("Dataset saved in JSON and CSV formats.")

Dataset saved in JSON and CSV formats.
