In [0]:
%pip install fastavro

In [0]:
# Restart the Python process so the package installed by the preceding %pip cell
# is importable; all Python state defined before this point is discarded.
dbutils.library.restartPython()

In [0]:

import pandas as pd
import numpy as np
import os
from fastavro import writer, parse_schema

# 🔧 Change this path to the folder where the generated files should be written
output_dir = "/tmp/dataset"
os.makedirs(output_dir, exist_ok=True)  # create the folder if it does not exist yet

# Total rows appended to each CSV file (~500MB target); generate_dataset writes
# the other formats from the first chunk only.
ROWS = 20_000_000
CHUNK_SIZE = 1_000_000  # generate 1M rows at a time

# -----------------------------
# Utility writers for multiple formats
# -----------------------------
def write_json(df, filename):
    """Write ``df`` to ``filename`` as JSON Lines (one record object per line)."""
    json_options = {"orient": "records", "lines": True}
    df.to_json(filename, **json_options)

def write_parquet(df, filename):
    """Write ``df`` to ``filename`` as Parquet via pyarrow, without the index."""
    parquet_options = {"engine": "pyarrow", "index": False}
    df.to_parquet(filename, **parquet_options)

def write_orc(df, filename):
    """Write ``df`` to ``filename`` as ORC via pyarrow, without the index."""
    orc_options = {"engine": "pyarrow", "index": False}
    df.to_orc(filename, **orc_options)

def write_avro(df, filename):
    """Write ``df`` to ``filename`` as an Avro file of nullable string fields.

    Every column is declared as ``["null", "string"]``. Non-missing cells are
    converted with ``str``; missing cells (NaN/None/NaT) are written as Avro
    ``null`` instead of the literal text "nan"/"None".

    Args:
        df: DataFrame to serialise; column labels must be valid Avro field names.
        filename: Destination path; an existing file is overwritten.
    """
    columns = list(df.columns)
    schema = {
        "type": "record",
        "name": "Dataset",
        "fields": [{"name": col, "type": ["null", "string"]} for col in columns]
    }
    parsed_schema = parse_schema(schema)

    # Capture the missing-value mask before stringifying: astype(str) would
    # otherwise turn NaN into the string "nan" and the null branch of the
    # schema would never be used.
    null_mask = df.isna().to_numpy()
    cells = df.astype(str).to_numpy(dtype=object, copy=True)
    cells[null_mask] = None

    # fastavro's writer accepts any iterable, so stream the rows rather than
    # materialising one dict per row up front.
    records = (dict(zip(columns, row)) for row in cells)
    with open(filename, "wb") as out:
        writer(out, parsed_schema, records)

def write_txt(df, filename):
    """Write ``df`` to ``filename`` as tab-separated text with a header row and no index."""
    text_options = {"sep": "\t", "index": False, "header": True}
    df.to_csv(filename, **text_options)

# -----------------------------
# Main function to generate dataset in multiple formats
# -----------------------------
def generate_dataset(base_name, delimiter, schema_func):
    """Generate one synthetic dataset and write it under ``output_dir``.

    The CSV file receives all ``ROWS`` rows, appended in chunks of at most
    ``CHUNK_SIZE`` rows with ``delimiter`` as the separator. The JSON, Parquet,
    ORC, Avro and TXT files are written once, from the first chunk only, so
    they hold at most ``CHUNK_SIZE`` rows.

    Note: ``schema_func`` is called once per chunk and is given only the row
    count, so a column it builds as a sequence from a fixed start (for example
    a date range or a running ID) starts over in every chunk of the CSV.

    Args:
        base_name: File name without extension; one file per format is created.
        delimiter: Field separator used for the CSV file only.
        schema_func: Callable taking a row count and returning a DataFrame.

    Raises:
        ValueError: If rows are requested but ``CHUNK_SIZE`` is not positive.
    """
    if ROWS > 0 and CHUNK_SIZE <= 0:
        # With a zero chunk size min(CHUNK_SIZE, remaining) never advances the
        # loop below; a negative one fails later with an unrelated error.
        raise ValueError(f"CHUNK_SIZE must be a positive integer, got {CHUNK_SIZE!r}")

    csv_path = os.path.join(output_dir, base_name + ".csv")
    json_path = os.path.join(output_dir, base_name + ".json")
    parquet_path = os.path.join(output_dir, base_name + ".parquet")
    orc_path = os.path.join(output_dir, base_name + ".orc")
    avro_path = os.path.join(output_dir, base_name + ".avro")
    txt_path = os.path.join(output_dir, base_name + ".txt")

    print(f"🚀 Generating dataset: {base_name}")

    # The CSV is written in append mode, so a file left over from an earlier
    # run must be removed first or the new rows would be added to it.
    if os.path.exists(csv_path):
        os.remove(csv_path)

    written_rows = 0
    while written_rows < ROWS:
        rows_to_write = min(CHUNK_SIZE, ROWS - written_rows)
        df = schema_func(rows_to_write)
        is_first_chunk = written_rows == 0

        # Append CSV in chunks; only the first chunk carries the header row
        df.to_csv(csv_path, mode="a", index=False, sep=delimiter, header=is_first_chunk)

        # Write other formats once (first chunk)
        if is_first_chunk:
            write_json(df, json_path)
            write_parquet(df, parquet_path)
            write_orc(df, orc_path)
            write_avro(df, avro_path)
            write_txt(df, txt_path)

        written_rows += rows_to_write
        size_mb = os.path.getsize(csv_path) / (1024 * 1024)
        print(f"   ➡ {written_rows:,} rows written, current CSV size = {size_mb:.2f} MB")

    print(f"✅ {base_name} ready in all formats!\n")

# -----------------------------
# Dataset schemas (datetime as string)
# -----------------------------
def sales_schema(n):
    """Build ``n`` rows of synthetic sales data.

    Columns:
        Date: one timestamp per second starting at 2023-01-01, stored as strings.
        Product: random pick from a fixed list of five product names.
        Units_Sold: random integer in [1, 99].
        Revenue: random integer in [100, 4999].

    Args:
        n: Number of rows to generate.

    Returns:
        DataFrame with the four columns above.
    """
    products = ["Laptop", "Mouse", "Keyboard", "Monitor", "Printer"]
    df = pd.DataFrame({
        # Lowercase "s" is the second alias; uppercase "S" is deprecated in
        # recent pandas releases and emits a FutureWarning.
        "Date": pd.date_range("2023-01-01", periods=n, freq="s").astype(str),
        "Product": np.random.choice(products, size=n),
        "Units_Sold": np.random.randint(1, 100, size=n),
        "Revenue": np.random.randint(100, 5000, size=n)
    })
    return df

def web_schema(n):
    """Build ``n`` rows of synthetic web-analytics data.

    Columns:
        Date: one timestamp per second starting at 2023-01-01, stored as strings.
        Visitors: random integer in [1000, 9999].
        PageViews: random integer in [2000, 19999].
        Country: random pick from a fixed list of five countries.

    Args:
        n: Number of rows to generate.

    Returns:
        DataFrame with the four columns above.
    """
    countries = ["India", "USA", "UK", "Germany", "Canada"]
    df = pd.DataFrame({
        # Lowercase "s" is the second alias; uppercase "S" is deprecated in
        # recent pandas releases and emits a FutureWarning.
        "Date": pd.date_range("2023-01-01", periods=n, freq="s").astype(str),
        "Visitors": np.random.randint(1000, 10000, size=n),
        "PageViews": np.random.randint(2000, 20000, size=n),
        "Country": np.random.choice(countries, size=n)
    })
    return df

def employee_schema(n):
    """Return ``n`` rows of synthetic employee-performance data.

    Emp_ID is drawn from E00001..E01000; Projects_Completed is in [0, 49],
    Hours_Worked in [100, 299] and Performance_Score in [1, 9].
    """
    id_pool = ["E%05d" % number for number in range(1, 1001)]

    # Draw the columns in a fixed order so a seeded run stays reproducible.
    columns = {}
    columns["Emp_ID"] = np.random.choice(id_pool, size=n)
    columns["Projects_Completed"] = np.random.randint(0, 50, size=n)
    columns["Hours_Worked"] = np.random.randint(100, 300, size=n)
    columns["Performance_Score"] = np.random.randint(1, 10, size=n)
    return pd.DataFrame(columns)

def feedback_schema(n):
    """Return ``n`` rows of synthetic customer-feedback data.

    Feedback_ID counts 1..n; Product and Comment are random picks from fixed
    lists; Rating is a random integer in [1, 5].
    """
    product_names = ["Laptop", "Mouse", "Keyboard", "Monitor", "Printer"]
    comment_pool = ["Good", "Average", "Poor", "Excellent"]

    # Draw the random columns in a fixed order so a seeded run stays reproducible.
    columns = {}
    columns["Feedback_ID"] = np.arange(n) + 1
    columns["Product"] = np.random.choice(product_names, size=n)
    columns["Rating"] = np.random.randint(1, 6, size=n)
    columns["Comment"] = np.random.choice(comment_pool, size=n)
    return pd.DataFrame(columns)

def project_schema(n):
    """Build ``n`` rows of synthetic project-tracking data.

    Columns:
        Project_ID: random pick from P0001..P0499.
        Task_Count: random integer in [5, 99].
        Completed_Tasks: random integer in [0, Task_Count] for the same row,
            so a project never reports more completed tasks than it has.
        Status: random pick from "On Track", "Delayed", "Completed".

    Args:
        n: Number of rows to generate.

    Returns:
        DataFrame with the four columns above.
    """
    projects = [f"P{i:04}" for i in range(1, 500)]
    status = ["On Track", "Delayed", "Completed"]

    project_ids = np.random.choice(projects, size=n)
    task_count = np.random.randint(5, 100, size=n)
    # randint broadcasts an array upper bound (exclusive), giving a per-row
    # cap of Task_Count inclusive.
    completed_tasks = np.random.randint(0, task_count + 1)

    df = pd.DataFrame({
        "Project_ID": project_ids,
        "Task_Count": task_count,
        "Completed_Tasks": completed_tasks,
        "Status": np.random.choice(status, size=n)
    })
    return df

def finance_schema(n):
    """Build ``n`` rows of synthetic finance data.

    Columns:
        Month: random daily date string between 2020-01-01 and 2025-12-31
            (the column name is kept for compatibility even though the values
            are individual days).
        Revenue: random integer in [10000, 499999].
        Expenses: random integer in [5000, 299999].
        Profit: Revenue minus Expenses for the same row (may be negative), so
            the three figures are always consistent with each other.

    Args:
        n: Number of rows to generate.

    Returns:
        DataFrame with the four columns above.
    """
    dates = pd.date_range("2020-01-01", "2025-12-31", freq="D").astype(str)

    month = np.random.choice(dates, size=n)
    revenue = np.random.randint(10000, 500000, size=n)
    expenses = np.random.randint(5000, 300000, size=n)

    df = pd.DataFrame({
        "Month": month,
        "Revenue": revenue,
        "Expenses": expenses,
        # Derived rather than drawn independently, so it matches the row.
        "Profit": revenue - expenses
    })
    return df

def inventory_schema(n):
    """Return ``n`` rows of synthetic inventory data.

    Item_ID and Warehouse are random picks from fixed lists; Stock_Level is in
    [0, 999] and Reorder_Level in [50, 199].
    """
    item_pool = ["ItemA", "ItemB", "ItemC", "ItemD"]
    warehouse_pool = ["WH1", "WH2", "WH3"]

    # Draw the columns in a fixed order so a seeded run stays reproducible.
    columns = {}
    columns["Item_ID"] = np.random.choice(item_pool, size=n)
    columns["Stock_Level"] = np.random.randint(0, 1000, size=n)
    columns["Reorder_Level"] = np.random.randint(50, 200, size=n)
    columns["Warehouse"] = np.random.choice(warehouse_pool, size=n)
    return pd.DataFrame(columns)

def marketing_schema(n):
    """Build ``n`` rows of synthetic marketing-campaign data.

    Columns:
        Campaign_ID: random pick from CMP0001..CMP0199.
        Leads_Generated: random integer in [50, 4999].
        Conversions: random integer in [0, min(Leads_Generated, 499)] for the
            same row, so conversions never exceed the leads that produced them.
        Spend: random integer in [1000, 49999].

    Args:
        n: Number of rows to generate.

    Returns:
        DataFrame with the four columns above.
    """
    campaigns = [f"CMP{i:04}" for i in range(1, 200)]

    campaign_ids = np.random.choice(campaigns, size=n)
    leads = np.random.randint(50, 5000, size=n)
    # Keep the original overall ceiling of 499 but cap each row at its own
    # lead count; randint broadcasts the array upper bound (exclusive).
    conversions = np.random.randint(0, np.minimum(leads, 499) + 1)

    df = pd.DataFrame({
        "Campaign_ID": campaign_ids,
        "Leads_Generated": leads,
        "Conversions": conversions,
        "Spend": np.random.randint(1000, 50000, size=n)
    })
    return df

def healthcare_schema(n):
    """Return ``n`` rows of synthetic healthcare data.

    Patient_ID is drawn from PT000001..PT004999; Visits is in [1, 19],
    Recovery_Days in [1, 99] and Treatment_Success is 0 or 1.
    """
    patient_pool = ["PT%06d" % number for number in range(1, 5000)]

    # Draw the columns in a fixed order so a seeded run stays reproducible.
    columns = {}
    columns["Patient_ID"] = np.random.choice(patient_pool, size=n)
    columns["Visits"] = np.random.randint(1, 20, size=n)
    columns["Recovery_Days"] = np.random.randint(1, 100, size=n)
    columns["Treatment_Success"] = np.random.choice([0, 1], size=n)
    return pd.DataFrame(columns)

def education_schema(n):
    """Return ``n`` rows of synthetic education data.

    Student_ID is drawn from S000001..S004999; Subject is a random pick from
    four subjects; Score is in [0, 99] and Attendance in [50, 99].
    """
    student_pool = ["S%06d" % number for number in range(1, 5000)]
    subject_pool = ["Math", "Science", "History", "English"]

    # Draw the columns in a fixed order so a seeded run stays reproducible.
    columns = {}
    columns["Student_ID"] = np.random.choice(student_pool, size=n)
    columns["Subject"] = np.random.choice(subject_pool, size=n)
    columns["Score"] = np.random.randint(0, 100, size=n)
    columns["Attendance"] = np.random.randint(50, 100, size=n)
    return pd.DataFrame(columns)

# -----------------------------
# List of datasets
# -----------------------------
# Each entry is (base file name, CSV delimiter, schema function). The delimiter
# applies to the .csv output only; write_txt always uses a tab.
datasets = [
    ("sales", ",", sales_schema),
    ("web_analytics", ";", web_schema),
    ("employee_performance", "|", employee_schema),
    ("customer_feedback", ":", feedback_schema),
    ("project_tracking", "\t", project_schema),
    ("finance_report", "~", finance_schema),
    ("inventory", "^", inventory_schema),
    ("marketing_campaign", "@", marketing_schema),
    ("healthcare", "#", healthcare_schema),
    ("education", " ", education_schema),
]

# -----------------------------
# Generate all datasets
# -----------------------------
# Calls generate_dataset once per entry, in list order; each call writes ROWS
# rows to the CSV file in CHUNK_SIZE chunks.
for name, delim, schema in datasets:
    generate_dataset(name, delim, schema)

print("🎉 All datasets generated in CSV, JSON, Parquet, ORC, Avro, TXT!")


In [0]:
import os

# List the generated files. os.listdir returns entries in arbitrary order, so
# sort them to make the printed listing the same on every run.
files = sorted(os.listdir("/tmp/dataset"))
print(files)

In [0]:
# The generator cell writes sales.csv with plain Python I/O into /tmp/dataset on
# the driver's local disk, so the path must include the "dataset" folder. The
# explicit file: scheme makes Spark read the local filesystem instead of
# resolving the path against the default (DBFS) filesystem.
# NOTE(review): driver-local files are only visible to executors on a
# single-node cluster — confirm the cluster mode, or copy the file to shared
# storage first.
df = spark.read.csv(
    "file:/tmp/dataset/sales.csv",
    header=True,
    inferSchema=True
)
display(df)