# Create a star schema and populate it with fake data

We will use a retail scenario and create fake data for a star schema with the following dimension tables: store, customer, and product, and the following fact table: sales.

In [None]:
# install faker locally (session scoped) to generate fake data
pip install faker faker-commerce

### Set the ADLS account name and the container name where the data will be generated in the data lake

In [None]:
# Prerequisite: the storage account and the container named below must already
# exist before you point these variables at them.

# ADLS account that receives the generated data; your default ADLS account is
# fine. You need Storage Blob Data Contributor rights on this account.
adls_account_name = "saanalyticstftest"
# Container (which you also have to create) where the generated data is stored.
adls_container_name = "dataset"

In [3]:
# Root abfss:// URI of the target container; every table is written to its own
# folder directly underneath it.
adls_root = f"abfss://{adls_container_name}@{adls_account_name}.dfs.core.windows.net"

salesfact_path = f"{adls_root}/salesfact"
customerdim_path = f"{adls_root}/customerdim"
productdim_path = f"{adls_root}/productdim"
storedim_path = f"{adls_root}/storedim"

In [5]:
import datetime
import random

import faker_commerce
from faker import Faker
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType

# Faker instance extended with the commerce provider (used for product names).
fake = Faker()
fake.add_provider(faker_commerce.Provider)

# One row buffer per table; each is a separate list of tuples that is turned
# into a DataFrame once generation has finished.
salesfact_data, customerdim_data, productdim_data, storedim_data = [], [], [], []

# Build the product dimension: 100 products. Each product's base price is also
# remembered by key so that sales can be priced from it later.
limit = 100
product_key_price = {}
for _ in range(limit):
    prod_key = fake.hexify(text='^^^^^^^^^^^^')  # 12 random hex digits
    prod_name = fake.ecommerce_name()
    prod_price = round(random.uniform(1000.00, 100000.00), 2)

    product_key_price[prod_key] = prod_price
    productdim_data.append((prod_key, prod_name, prod_price))

# Create 100 stores. Every store gets 10-100 customers who live in the store's
# city/country, and every customer gets 1-10 sales rows in the fact table.
limit = 100

# The product set is fixed by now, so materialize the keys once instead of
# handing a dict view to random_element for every single sale.
product_keys = list(product_key_price)

# Sales dates are drawn as one real calendar date within 2010-2022. Drawing
# month, day and year independently could yield impossible dates such as
# February 31.
sales_start = datetime.date(2010, 1, 1)
sales_end = datetime.date(2022, 12, 31)

for _ in range(limit):
    store_key = fake.hexify(text='^^^^^^^^^^^^')
    store_city = fake.city()
    store_country = fake.country()

    storedim_data.append((store_key, store_city, store_country))

    # generate between 10 and 100 customers for each store
    for _ in range(fake.random_int(min=10, max=100)):
        customer_key = fake.hexify(text='^^^^^^^^^^^^')
        # customers are placed in the same city/country as their store
        customer_city = store_city
        customer_country = store_country
        customer_name = fake.name()

        customerdim_data.append((customer_key, customer_city, customer_country, customer_name))

        # generate between 1 and 10 sales for each customer
        for _ in range(fake.random_int(min=1, max=10)):
            salesfact_key = fake.hexify(text='^^^^^^^^^^^^')
            product_key = fake.random_element(elements=product_keys)
            # Base price plus a random markup; rounded after the addition so
            # the stored price keeps two decimals. Indexing (rather than
            # .get) fails loudly with a KeyError if a key were ever missing.
            sales_price = round(product_key_price[product_key] + random.uniform(100.00, 1000.00), 2)
            sales_units = fake.random_int(min=1, max=7)

            sales_date = fake.date_between(start_date=sales_start, end_date=sales_end)
            sales_month = sales_date.strftime("%B")  # full month name
            sales_day = sales_date.strftime("%d")    # zero-padded day, kept as a string
            sales_year = sales_date.year

            salesfact_data.append((salesfact_key, store_key, customer_key, product_key, sales_price, sales_units, sales_month, sales_day, sales_year))
            

# Explicit schemas for the four tables. Every column is nullable; each schema
# is built from (column name, Spark type) pairs in column order.
product_schema = StructType([
    StructField(column, spark_type, True)
    for column, spark_type in (
        ('product_key', StringType()),
        ('product_name', StringType()),
        ('product_base_price', DoubleType()),
    )
])

store_schema = StructType([
    StructField(column, spark_type, True)
    for column, spark_type in (
        ('store_key', StringType()),
        ('store_city', StringType()),
        ('store_country', StringType()),
    )
])

customer_schema = StructType([
    StructField(column, spark_type, True)
    for column, spark_type in (
        ('customer_key', StringType()),
        ('customer_city', StringType()),
        ('customer_country', StringType()),
        ('customer_name', StringType()),
    )
])

# The fact table references the three dimensions through their *_key columns.
salesfact_schema = StructType([
    StructField(column, spark_type, True)
    for column, spark_type in (
        ('salesfact_key', StringType()),
        ('store_key', StringType()),
        ('customer_key', StringType()),
        ('product_key', StringType()),
        ('sales_price', DoubleType()),
        ('sales_units', IntegerType()),
        ('sales_month', StringType()),
        ('sales_day', StringType()),
        ('sales_year', IntegerType()),
    )
])


# Turn each row buffer into a Spark DataFrame using its explicit schema.
productDF = spark.createDataFrame(productdim_data, product_schema)
storeDF = spark.createDataFrame(storedim_data, store_schema)
customerDF = spark.createDataFrame(customerdim_data, customer_schema)
salesDF = spark.createDataFrame(salesfact_data, salesfact_schema)

### Write the generated data out to the data lake

In [6]:
# Persist every table as parquet in its own folder, replacing whatever an
# earlier run left there.
for table_df, table_path in (
    (productDF, productdim_path),
    (storeDF, storedim_path),
    (customerDF, customerdim_path),
    (salesDF, salesfact_path),
):
    table_df.write.format("parquet").mode("overwrite").save(table_path)