# Domain Name Generator: Dataset Creation

This notebook demonstrates how to create synthetic training data for domain name generation models.

## Overview
- Generate diverse business descriptions
- Create positive and negative domain examples
- Prepare data for model training
- Generate edge cases for testing

In [None]:
import sys
import os
sys.path.append('../src')

import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from domain_generator.data.synthetic_generator import SyntheticDataGenerator
from domain_generator.utils.config import Config

## 1. Initialize Data Generator

In [None]:
# Build the configuration object and the synthetic data generator.
# NOTE(review): `config` is not referenced again in the visible cells, and no
# random seed is set here. If the generator draws from a global RNG, the
# generated data will differ between runs -- confirm against the generator.
config = Config()
generator = SyntheticDataGenerator()

# Show which business types and complexity levels the generator exposes.
available_business_types = list(generator.vocabulary['business_types'].keys())
print(f"Business types: {available_business_types}")

available_complexities = list(generator.business_templates.keys())
print(f"Complexity levels: {available_complexities}")

## 2. Generate Sample Business Descriptions

In [None]:
# Build one example description for every (business type, complexity) pairing.
demo_types = ('restaurants', 'tech_startups', 'healthcare')
demo_complexities = ('simple', 'medium', 'complex')

sample_businesses = []
for demo_type in demo_types:
    for demo_complexity in demo_complexities:
        generated = generator.generate_business_description(demo_type, demo_complexity)
        sample_businesses.append(dict(
            type=demo_type,
            complexity=demo_complexity,
            description=generated.description,
            keywords=generated.keywords,
        ))

# Print every sample, each followed by an 80-character rule.
separator = "-" * 80
for entry in sample_businesses:
    print(f"Type: {entry['type']}, Complexity: {entry['complexity']}")
    print(f"Description: {entry['description']}")
    print(f"Keywords: {entry['keywords']}")
    print(separator)

## 3. Generate Example Domain Suggestions

In [None]:
# Pick one medium-complexity tech startup to use as the running example.
sample_business = generator.generate_business_description('tech_startups', 'medium')

print(f"Business: {sample_business.description}")
print(f"Keywords: {sample_business.keywords}")
print()

# Request suggestions at each quality level, in the order good -> mediocre -> bad,
# before printing anything.
good_domains, mediocre_domains, bad_domains = (
    generator.generate_domain_suggestions(sample_business, quality)
    for quality in ('good', 'mediocre', 'bad')
)

# Print each group under its heading; later headings start with a blank line.
for heading, suggestions in (
    ("Good domains:", good_domains),
    ("\nMediocre domains:", mediocre_domains),
    ("\nBad domains:", bad_domains),
):
    print(heading)
    for domain in suggestions:
        print(f"  {domain}")

## 4. Generate Training Dataset

In [None]:
# Build the training set and report how many examples came back.
print("Generating training dataset...")
dataset = generator.generate_training_dataset(num_samples=1000)

print(f"Generated {len(dataset)} training examples")

# Inspect the first record: its metadata fields first, then the prompt/completion pair.
first_example = dataset[0]
print("\nSample training example:")
for prefix, field in (
    ("Business: ", 'business_description'),
    ("Type: ", 'business_type'),
    ("Complexity: ", 'complexity'),
    ("Keywords: ", 'keywords'),
    ("Good domains: ", 'good_domains'),
):
    print(f"{prefix}{first_example[field]}")

for heading, field in (
    ("\nTraining prompt:", 'prompt'),
    ("\nExpected completion:", 'completion'),
):
    print(heading)
    print(first_example[field])

## 5. Dataset Analysis

In [None]:
# Load the training examples into a DataFrame for summary statistics.
df = pd.DataFrame(dataset)

print("Dataset Statistics:")
print(f"Total samples: {len(df)}")
print(f"Business types: {df['business_type'].nunique()}")
print(f"Complexity levels: {df['complexity'].nunique()}")

# Two side-by-side bar charts: samples per business type and per complexity level.
fig, (type_ax, complexity_ax) = plt.subplots(1, 2, figsize=(15, 5))

panels = (
    (type_ax, 'business_type', 'Business Type Distribution', 'Business Type'),
    (complexity_ax, 'complexity', 'Complexity Level Distribution', 'Complexity'),
)
for panel_ax, column, panel_title, x_label in panels:
    df[column].value_counts().plot(kind='bar', ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Count')

# Only the business-type panel gets slanted tick labels.
type_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Analyze domain quality distribution
# Flatten the per-example domain lists. `all_good_domains` and `good_lengths`
# are read again by the statistics cell near the end of the notebook.
all_good_domains = [domain for example in dataset for domain in example['good_domains']]
all_bad_domains = [domain for example in dataset for domain in example['bad_domains']]

print(f"Total good domains generated: {len(all_good_domains)}")
print(f"Total bad domains generated: {len(all_bad_domains)}")

# Length of the text before the first dot (everything after it is excluded).
good_lengths = [len(domain.split('.')[0]) for domain in all_good_domains]
bad_lengths = [len(domain.split('.')[0]) for domain in all_bad_domains]

# Bin both histograms over the same range. Without a shared range each call
# picks its own 20 bin edges from its own data, so the overlaid bars would
# have different widths and could not be compared.
combined_lengths = good_lengths + bad_lengths
length_range = (min(combined_lengths), max(combined_lengths)) if combined_lengths else None

fig, (length_ax, tld_ax) = plt.subplots(1, 2, figsize=(12, 5))

length_ax.hist(good_lengths, bins=20, range=length_range, alpha=0.7,
               label='Good domains', color='green')
length_ax.hist(bad_lengths, bins=20, range=length_range, alpha=0.7,
               label='Bad domains', color='red')
length_ax.set_xlabel('Domain Name Length (characters)')
length_ax.set_ylabel('Frequency')
length_ax.set_title('Domain Length Distribution')
length_ax.legend()

# Analyze TLD distribution: the TLD is taken as the text after the last dot.
good_tlds = [domain.split('.')[-1] for domain in all_good_domains]
tld_counts = pd.Series(good_tlds).value_counts()

tld_counts.plot(kind='bar', ax=tld_ax)
tld_ax.set_title('TLD Distribution in Good Domains')
tld_ax.set_xlabel('TLD')
tld_ax.set_ylabel('Count')
tld_ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

## 6. Generate Edge Cases

In [None]:
# Build the edge-case set used for testing.
print("Generating edge cases...")
edge_cases = generator.generate_edge_cases(num_cases=200)

print(f"Generated {len(edge_cases)} edge cases")

# Tally cases per type; keys are inserted in first-seen order.
edge_case_types = {}
for edge_case in edge_cases:
    label = edge_case['type']
    edge_case_types.setdefault(label, 0)
    edge_case_types[label] += 1

print("\nEdge case distribution:")
for label, total in edge_case_types.items():
    print(f"  {label}: {total}")

# Show the first case found for a few selected types; types with no match are skipped.
print("\nSample edge cases:")
for wanted_type in ('very_short', 'very_long', 'technical_jargon'):
    sample_case = None
    for edge_case in edge_cases:
        if edge_case['type'] == wanted_type:
            sample_case = edge_case
            break
    if not sample_case:
        continue
    print(f"\n{wanted_type.upper()}:")
    print(f"  Description: {sample_case['business_description']}")
    print(f"  Expected issues: {sample_case['expected_issues']}")

## 7. Save Generated Data

In [None]:
# Create data directories
# Paths are relative to the notebook's working directory.
data_dir = Path('../data')
processed_dir = data_dir / 'processed'
processed_dir.mkdir(parents=True, exist_ok=True)

# Save training dataset
train_path = processed_dir / 'training_dataset.json'
generator.save_dataset(dataset, str(train_path))
print(f"Training dataset saved to: {train_path}")

# Save edge cases
edge_path = processed_dir / 'edge_cases.json'
generator.save_dataset(edge_cases, str(edge_path))
print(f"Edge cases saved to: {edge_path}")

# Save dataset statistics
# This cell reads `df`, `edge_case_types`, `all_good_domains` and `good_lengths`
# from the analysis and edge-case cells above, so those must have run first.
# Averages fall back to 0.0 when there is nothing to average, instead of
# raising ZeroDivisionError and losing the files already written above.
num_training_samples = len(dataset)
avg_good_domains_per_sample = (
    len(all_good_domains) / num_training_samples if num_training_samples else 0.0
)
avg_domain_length = sum(good_lengths) / len(good_lengths) if good_lengths else 0.0

stats = {
    'total_training_samples': num_training_samples,
    'total_edge_cases': len(edge_cases),
    # Count tables are stored as lists of (name, count) pairs, which JSON
    # writes as two-element arrays.
    'business_types': list(df['business_type'].value_counts().to_dict().items()),
    'complexity_levels': list(df['complexity'].value_counts().to_dict().items()),
    'edge_case_types': list(edge_case_types.items()),
    'avg_good_domains_per_sample': avg_good_domains_per_sample,
    'avg_domain_length': avg_domain_length
}

stats_path = processed_dir / 'dataset_stats.json'
# Explicit encoding so the file does not depend on the platform's default.
with open(stats_path, 'w', encoding='utf-8') as f:
    json.dump(stats, f, indent=2)

print(f"Dataset statistics saved to: {stats_path}")
print("\nDataset creation complete!")

## Summary

In this notebook, we:

1. **Initialized the synthetic data generator** (`SyntheticDataGenerator`) and used it to produce diverse business descriptions
2. **Generated training examples** with good, mediocre, and bad domain suggestions
3. **Analyzed the dataset** to check its distribution across business types and complexity levels
4. **Created edge cases** for testing model robustness
5. **Saved all data** for use in model training and evaluation

The generated dataset includes:
- 1,000 training examples across 10 business types
- 3 complexity levels (simple, medium, complex)
- Multiple domain quality levels for each business description
- 200 edge cases covering various failure modes

This synthetic dataset provides a solid foundation for training domain generation models while ensuring comprehensive coverage of different business scenarios and potential edge cases.