In [27]:
# ============================================================
# Import necessary Libraries
# ============================================================

import random
import uuid
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
from collections import Counter

In [29]:
# ============================================================
# Core configuration
# ============================================================

RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

N_SAMPLES = 3000

STYLES = [
    "formal_report",
    "technical_summary",
    "casual_observation",
    "speculative_analysis",
    "risk_alert"
]

TOPICS = [
    "infrastructure_load",
    "traffic_flow",
    "system_stability",
    "environmental_pressure",
    "resource_allocation"
]

SENTIMENTS = ["neutral", "cautious", "optimistic", "concerned"]

# ============================================================
# Language primitives (variation is critical for NLP)
# ============================================================

OPENINGS = {
    "formal_report": [
        "This report outlines",
        "The following assessment evaluates",
        "An analysis was conducted to determine"
    ],
    "technical_summary": [
        "Simulation results indicate",
        "Observed system metrics suggest",
        "Model outputs reveal"
    ],
    "casual_observation": [
        "It looks like",
        "We’re starting to see",
        "There seems to be"
    ],
    "speculative_analysis": [
        "One possible explanation is",
        "A plausible interpretation is",
        "It may be that"
    ],
    "risk_alert": [
        "Warning:",
        "Attention required:",
        "Potential issue detected:"
    ]
}

VERBS = [
    "increase", "decrease", "stabilize",
    "accelerate", "degrade", "fluctuate"
]

MODIFIERS = [
    "slightly", "moderately", "significantly",
    "unexpectedly", "gradually", "rapidly"
]

OUTCOMES = [
    "system performance",
    "network efficiency",
    "operational reliability",
    "resource utilization",
    "risk exposure"
]

HEDGING = [
    "may", "might", "could",
    "appears to", "is likely to", "cannot be ruled out"
]

# ============================================================
# Helper functions
# ============================================================

def random_date():
    start = datetime(2024, 1, 1)
    return start + timedelta(days=random.randint(0, 365))

def generate_numeric_context():
    return {
        "load_factor": round(np.random.beta(2, 5), 2),
        "capacity": random.randint(50, 300),
        "agents": random.randint(10, 500)
    }

def inject_noise(text):
    """Adds minor grammatical / stylistic noise"""
    if random.random() < 0.2:
        text += "."
    if random.random() < 0.15:
        text = text.replace(" is ", " was ")
    return text

# ============================================================
# Main text generation
# ============================================================

def generate_text(style, topic, sentiment, metrics):
    opening = random.choice(OPENINGS[style])
    verb = random.choice(VERBS)
    modifier = random.choice(MODIFIERS)
    outcome = random.choice(OUTCOMES)
    hedge = random.choice(HEDGING)

    base_sentence = (
        f"{opening} that {topic.replace('_', ' ')} "
        f"{hedge} {verb} {modifier}, impacting {outcome}."
    )

    numeric_sentence = (
        f" Current load is {metrics['load_factor']} "
        f"with {metrics['agents']} active agents "
        f"against a capacity of {metrics['capacity']}."
    )

    if sentiment == "concerned":
        numeric_sentence += " This raises concerns about future stability."
    elif sentiment == "optimistic":
        numeric_sentence += " Conditions remain within acceptable bounds."

    text = base_sentence + numeric_sentence
    return inject_noise(text)

# ============================================================
# Dataset generation
# ============================================================

def generate_dataset(n=N_SAMPLES):
    records = []

    for _ in range(n):
        style = random.choice(STYLES)
        topic = random.choice(TOPICS)
        sentiment = random.choice(SENTIMENTS)
        metrics = generate_numeric_context()

        record = {
            "id": str(uuid.uuid4()),
            "timestamp": random_date(),
            "style": style,
            "topic": topic,
            "sentiment": sentiment,
            "load_factor": metrics["load_factor"],
            "agents": metrics["agents"],
            "capacity": metrics["capacity"],
            "text": generate_text(style, topic, sentiment, metrics)
        }

        records.append(record)

    return pd.DataFrame(records)

# ============================================================
# Run
# ============================================================

if __name__ == "__main__":
    df = generate_dataset()
    print(df.sample(5))


                                        id  timestamp                 style  \
1297  f550fcc8-6b63-4441-99d7-98f5af8c28d5 2024-02-12            risk_alert   
2341  6d225ee9-1671-4f02-baf2-26c6b07e7631 2024-01-11  speculative_analysis   
2926  b503098e-184c-41f3-8aee-e17e828f99e8 2024-09-20    casual_observation   
2695  a14d296e-f77a-4328-83bf-e456d907f13e 2024-08-26    casual_observation   
2225  d30a7269-e59f-402f-9057-06aebe31df66 2024-08-09            risk_alert   

                       topic   sentiment  load_factor  agents  capacity  \
1297            traffic_flow   concerned         0.57     146       122   
2341        system_stability  optimistic         0.40     403       228   
2926  environmental_pressure     neutral         0.52     152       126   
2695  environmental_pressure   concerned         0.44     457        83   
2225     resource_allocation     neutral         0.33     346        85   

                                                   text  
1297  Attention 

In [31]:
# ============================================================
# Save to CSV
# ============================================================

df.to_csv("synthetic_nlp_dataset.csv", index=False)
print(f"Saved {len(df)} rows to synthetic_nlp_dataset.csv")

Saved 3000 rows to synthetic_nlp_dataset.csv


In [33]:
# ============================================================
# Validate the Data
# ============================================================

def validate_dataset(df):
    print("===== BASIC SHAPE =====")
    print(df.shape)
    print()

    print("===== MISSING VALUES =====")
    print(df.isnull().sum())
    print()

    print("===== LABEL DISTRIBUTIONS =====")
    for col in ["style", "topic", "sentiment"]:
        print(f"\n{col.upper()}")
        print(df[col].value_counts(normalize=True))
    print()

    print("===== DUPLICATE TEXT CHECK =====")
    dup_rate = df["text"].duplicated().mean()
    print(f"Duplicate text rate: {dup_rate:.3f}")
    assert dup_rate < 0.02, "Too many duplicate samples"
    print()

    print("===== TEXT LENGTH DIVERSITY =====")
    lengths = df["text"].str.split().apply(len)
    print(lengths.describe())
    print()

    print("===== NUMERIC ↔ TEXT SANITY CHECK =====")
    high_load = df[df["load_factor"] > 0.7]["text"]
    low_load = df[df["load_factor"] < 0.3]["text"]

    high_terms = Counter(" ".join(high_load).split()).most_common(10)
    low_terms = Counter(" ".join(low_load).split()).most_common(10)

    print("High-load frequent terms:", high_terms)
    print("Low-load frequent terms:", low_terms)
    print()

    print("✅ Dataset validation passed.")

# Run validation
validate_dataset(df)

===== BASIC SHAPE =====
(3000, 9)

===== MISSING VALUES =====
id             0
timestamp      0
style          0
topic          0
sentiment      0
load_factor    0
agents         0
capacity       0
text           0
dtype: int64

===== LABEL DISTRIBUTIONS =====

STYLE
style
formal_report           0.210667
speculative_analysis    0.202667
casual_observation      0.199667
risk_alert              0.196000
technical_summary       0.191000
Name: proportion, dtype: float64

TOPIC
topic
resource_allocation       0.209667
traffic_flow              0.204333
system_stability          0.199000
infrastructure_load       0.193667
environmental_pressure    0.193333
Name: proportion, dtype: float64

SENTIMENT
sentiment
cautious      0.267667
concerned     0.246667
neutral       0.243667
optimistic    0.242000
Name: proportion, dtype: float64

===== DUPLICATE TEXT CHECK =====
Duplicate text rate: 0.000

===== TEXT LENGTH DIVERSITY =====
count    3000.000000
mean       29.180667
std         3.218925
mi