In [3]:
%pip install dbldatagen --quiet 
%pip install jmespath --quiet
# Generate Large Sample Data In Fabric
import dbldatagen as dg
import pandas as pd
from pyspark.sql.functions import col, last_day, dayofweek, year, month, date_format
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

StatementMeta(, 69b2c919-b3ea-4dc3-b7fa-3b35855536ef, 23, Finished, Available)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.



# Parameters

In [122]:
# Country code -> relative sampling weight for the customer "country" column.
# NOTE(review): the weights look like populations in millions; if that is the
# intent, 'GE' (83) and 'SA' (58) may have been meant as 'DE' and 'ZA' — confirm.
country_weighting = {
    'CN': 1300,
    'US': 365,
    'FR': 67,
    'CA': 38,
    'IN': 1300,
    'JM': 3,
    'IE': 7,
    'PK': 212,
    'GB': 67,
    'IL': 9,
    'AU': 25,
    'SG': 6,
    'ES': 47,
    'GE': 83,
    'MX': 126,
    'ET': 109,
    'SA': 58,
    'LB': 8,
    'NL': 17,
}
# Parallel lists, in insertion order, as consumed by the generators below.
country_codes = list(country_weighting.keys())
country_weights = list(country_weighting.values())

# Value pools for the product attributes.
colors = [
    'White', 'Black', 'Grey', 'Silver',
    'Orange', 'Yellow', 'Blue', 'Brown',
    'Gold', 'Silver Grey', 'Pink', 'Red',
    'Green', 'Transparent', 'Purple', 'Azure',
]
categories = [
    'Audio',
    'TV and Video',
    'Computers',
    'Cell phones',
    'Music, Movies and Audio Books',
    'Home Appliances',
    'Cameras and camcorders',
    'Games and Toys',
]
subCategories = [
    'MP4&MP3',
    'Home Theater System',
    'Projectors & Screens',
    'Computers Accessories',
    'Home & Office Phones',
    'Movie DVD',
    'Washers & Dryers',
    'Microwaves',
    'Water Heaters',
    'Coffee Machines',
    'Air Conditioners',
    'Digital SLR Cameras',
    'Touch Screen Phones',
    'Cell phones Accessories',
    'Camcorders',
    'VCD & DVD',
    'Car Video',
    'Boxed Games',
    'Bluetooth Headphones',
    'Digital Cameras',
    'Smart phones & PDAs',
    'Televisions',
    'Laptops',
    'Desktops',
    'Monitors',
    'Printers, Scanners & Fax',
    'Refrigerators',
    'Lamps',
    'Fans',
    'Cameras & Camcorders Accessories',
    'Recording Pen',
    'Download Games',
]

# Date window shared by the sales order dates and the calendar dimension.
start_date = '2020-01-01 00:00:00'
end_date = '2024-12-31 00:00:00'

# Number of rows to generate per table.
customer_rows = 1_000
store_rows = 100
product_rows = 100_000
sales_rows = 100_000_000


StatementMeta(, 69b2c919-b3ea-4dc3-b7fa-3b35855536ef, 143, Finished, Available)

# Generate tables

In [123]:
# Registry of generated tables; each entry is {'name': <table name>, 'data': <DataFrame>}.
listOfTables = []

# Customer table: customer_rows rows with a generated name, e-mail address,
# weighted country code, birthday and gender.

birthdayRange = dg.DateRange("1942-01-01 00:00:00", "2010-10-06 11:55:00", "days=3")

dataSpec = dg.DataGenerator(spark, name="customerDataset", rows=customer_rows).withIdOutput()
dataSpec = dataSpec.withColumn("customer", template=r"\\w \\w")
dataSpec = dataSpec.withColumn("email", template=r"\\w.\\w@\\w.com")
# Country codes are drawn using the relative weights from the parameters cell.
dataSpec = dataSpec.withColumn(
    "country",
    StringType(),
    values=country_codes,
    weights=country_weights,
    random=True,
)
dataSpec = dataSpec.withColumn("birthday", "date", data_range=birthdayRange, random=True)
dataSpec = dataSpec.withColumn("gender", StringType(), values=["male", "female"], random=True)

df = dataSpec.build()
listOfTables.append({'name': 'customer', 'data': df})

# store table: store_rows rows with a generated name, an opening date and an
# open/closed status.

storeOpenDates = dg.DateRange("1942-01-01 00:00:00", "2010-10-06 11:55:00", "days=3")

dataSpec = dg.DataGenerator(spark, name="storeDataset", rows=store_rows).withIdOutput()
dataSpec = (
    dataSpec
    .withColumn("store", template=r"\\w \\w \\w")
    .withColumn("openDate", "date", data_range=storeOpenDates, random=True)
    # "open" and "closed" are drawn with relative weights 100 and 10.
    .withColumn(
        "status",
        StringType(),
        values=["open", "closed"],
        weights=[100, 10],
        random=True,
    )
)

df = dataSpec.build()
listOfTables.append({'name': 'store', 'data': df})

# product table: product_rows rows with a generated name, a colour and category
# picked from the parameter lists, a unit price and a weight.

productDecimal = 'decimal(10,2)'

dataSpec = dg.DataGenerator(spark, name="productDataset", rows=product_rows).withIdOutput()
dataSpec = dataSpec.withColumn("product", template=r"\\w \\w \\w")
dataSpec = dataSpec.withColumn("color", StringType(), values=colors, random=True)
dataSpec = dataSpec.withColumn("category", StringType(), values=categories, random=True)
dataSpec = dataSpec.withColumn(
    "unitPrice",
    productDecimal,
    minValue=1,
    maxValue=100,
    random=True,
)
dataSpec = dataSpec.withColumn(
    "weight",
    productDecimal,
    minValue=1,
    maxValue=50,
    step=0.1,
    random=True,
)

df = dataSpec.build()
listOfTables.append({'name': 'product', 'data': df})

# sales table
#
# Fact table with sales_rows rows. Each row carries randomly assigned keys into
# the customer, product and store tables, an order date inside
# [start_date, end_date], a shipping date 1-20 days after the order date, and a
# quantity, price and derived amount.

dataSpec = (
    dg.DataGenerator(spark, name="salesDataset", rows=sales_rows)
    .withIdOutput()
    # Foreign keys: the dimension tables expose the generator id, which runs
    # 0..rows-1, and maxValue is an inclusive bound, so the upper limit must be
    # rows - 1. Using rows itself would emit keys with no matching dimension row.
    .withColumn("customerId", IntegerType(), minValue=0, maxValue=customer_rows - 1, random=True)
    .withColumn("productId", IntegerType(), minValue=0, maxValue=product_rows - 1, random=True)
    .withColumn("storeId", IntegerType(), minValue=0, maxValue=store_rows - 1, random=True)
    .withColumn("orderDate", "date", data_range=dg.DateRange(start_date, end_date, "days=3"), random=True)
    # floor(rand() * 20 + 1) is an integer in 1..20, so shipping always follows the order.
    .withColumn("shippingDate", "date", expr="date_add(orderDate, cast(floor(rand() * 20 + 1) as int))", baseColumn=["orderDate"])
    .withColumn("quantity", IntegerType(), minValue=0, maxValue=500, random=True)
    .withColumn("price", 'decimal(10,2)', minValue=1, maxValue=10, random=True)
    # amount is derived from the two generated columns above.
    .withColumn("amount", "decimal(10,2)", expr="quantity*price", baseColumn=["quantity", "price"])
    )

df = dataSpec.build()

listOfTables.append({'name': 'sales', 'data': df})

# date dimension: one row per calendar day from start_date to end_date, built in
# pandas and then enriched with derived date attributes in Spark.

date_df = (
    pd.date_range(start=start_date, end=end_date)
    .to_frame(index=False, name='Date')
    .astype({'Date': str})  # hand the dates to Spark as strings; cast back to date below
)

dfCalendarData = (
    spark.createDataFrame(date_df)
    .withColumn('date', col('Date').cast('date'))
    # Integer key of the form yyyyMMdd.
    .withColumn('dateID', date_format(col('Date'), "yyyyMMdd").cast('integer'))
    # First day of the row's month.
    .withColumn('monthly', date_format(col('Date'), "yyyy-MM-01").cast('date'))
    .withColumn('month', date_format(col('Date'), "MMM"))
    .withColumn('monthYear', date_format(col('Date'), "MMM yyyy"))
    .withColumn('monthOfYear', month(col('Date')))
    .withColumn('year', year(col('Date')))
    .withColumn('dayOfWeekNum', dayofweek(col('Date')))
    .withColumn('dayOfWeek', date_format(col('Date'), "EE"))
)

listOfTables.append({'name': 'calendar', 'data': dfCalendarData})



StatementMeta(, 69b2c919-b3ea-4dc3-b7fa-3b35855536ef, 144, Finished, Available)

# Save to Lakehouse

In [124]:
# Write every generated DataFrame as a Delta table named after its registry
# entry. mode("overwrite") replaces the contents of an existing table of the
# same name, so re-running the cell does not append duplicate rows.
for table in listOfTables:

    tableName = table['name']

    print(f"Saving table '{tableName}'")

    df = table['data']

    df.write.format("delta").mode("overwrite").saveAsTable(tableName)

StatementMeta(, 69b2c919-b3ea-4dc3-b7fa-3b35855536ef, 145, Finished, Available)

Saving table 'customer'
Saving table 'store'
Saving table 'product'
Saving table 'sales'
Saving table 'calendar'


# Explore data

In [133]:
# Row-count check: query each saved table by name and show a one-row result
# whose single column is aliased to the table name.
for table in listOfTables:
    tableName = table['name']
    countQuery = f"SELECT count(*) as {tableName} FROM {tableName}"
    rowCounts = spark.sql(countQuery)
    rowCounts.show()

StatementMeta(, 69b2c919-b3ea-4dc3-b7fa-3b35855536ef, 154, Finished, Available)

+--------+
|customer|
+--------+
|    1000|
+--------+

+-----+
|store|
+-----+
|  100|
+-----+

+-------+
|product|
+-------+
| 100000|
+-------+

+---------+
|    sales|
+---------+
|100000000|
+---------+

+--------+
|calendar|
+--------+
|    1827|
+--------+

