In [0]:
%pip install dbldatagen

Python interpreter will be restarted.
Collecting dbldatagen
  Downloading dbldatagen-0.3.5-py3-none-any.whl (86 kB)
Installing collected packages: dbldatagen
Successfully installed dbldatagen-0.3.5
Python interpreter will be restarted.


In [0]:
%pip install Faker

Python interpreter will be restarted.
Collecting Faker
  Downloading Faker-19.6.2-py3-none-any.whl (1.7 MB)
Installing collected packages: Faker
Successfully installed Faker-19.6.2
Python interpreter will be restarted.


In [0]:
from dbldatagen import DataGenerator, PyfuncText, DateRange
from faker import Faker
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType

# Create a SparkSession
spark = SparkSession.builder.appName("DataGeneration").getOrCreate()

brand_values = ["Brand1", "Brand2", "Brand3", "Brand4", "Brand5"]
courier_company_values = ["CourierCo1", "CourierCo2", "CourierCo3", "CourierCo4", "CourierCo5"]
channel_name_values = ["Channel1", "Channel2", "Channel3", "Channel4", "Channel5"]

# Define the number of rows and partitions for data generation
partitions_requested = 2
data_rows = 10

# Define the schema for the synthetic data
schema = StructType([
    StructField("tracking_id", IntegerType(), False),
    StructField("awb_assigned_date", TimestampType(), True),
    StructField("created_at", TimestampType(), True),
    StructField("channel_name", StringType(), True),
    StructField("sku_code", StringType(), True),
    StructField("address_pincode", StringType(), True),
    StructField("status", StringType(), True),
    StructField("courier_company", StringType(), True),
    StructField("pickup_scheduled_date", TimestampType(), True),
    StructField("order_picked_up_date", TimestampType(), True),
    StructField("order_shipped_date", TimestampType(), True),
    StructField("edd", TimestampType(), True),
    StructField("order_delivered_date", TimestampType(), True),
    StructField("rto_initiated_date", TimestampType(), True),
    StructField("rto_delivered_date", TimestampType(), True),
    StructField("freight_total_amount", IntegerType(), True),  # Use IntegerType() for integer columns
    StructField("first_out_for_delivery_date", TimestampType(), True),
    StructField("first_pickup_scheduled_date", TimestampType(), True),
    StructField("1st_attempt_date", TimestampType(), True),
    StructField("2nd_attempt_date", TimestampType(), True),
    StructField("3rd_attempt_date", TimestampType(), True),
    StructField("brand", StringType(), True),
    StructField("dt", StringType(), True)
])

# Create a DataGenerator for synthetic data generation
shipping_data_generator = (
    DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
    .withSchema(schema)
    .withColumnSpec("tracking_id", minValue=1, maxValue=2000, step=1)
    .withColumnSpec("awb_assigned_date", "timestamp", data_range=DateRange("2010-01-01 00:00:00", "2023-12-31 23:59:59", "days=1"), random=True)
    .withColumnSpec("created_at", "timestamp", data_range=DateRange("2010-01-01 00:00:00", "2023-12-31 23:59:59", "days=1"), random=True)
    .withColumnSpec("channel_name", text=PyfuncText(lambda context, v: Faker().word()))
    .withColumnSpec("sku_code", text=PyfuncText(lambda context, v: Faker().ean8()))
    .withColumnSpec("address_pincode", text=PyfuncText(lambda context, v: Faker().zipcode()))
    .withColumnSpec("status", text=PyfuncText(lambda context, v: Faker().word()))
    .withColumnSpec("courier_company", text=PyfuncText(lambda context, v: Faker().company()))
    .withColumnSpec("pickup_scheduled_date", "timestamp", data_range=DateRange("2010-01-01 00:00:00", "2023-12-31 23:59:59", "days=1"), random=True)
    .withColumnSpec("order_picked_up_date", "timestamp", data_range=DateRange("2010-01-01 00:00:00", "2023-12-31 23:59:59", "days=1"), random=True)
    .withColumnSpec("order_shipped_date", "timestamp", data_range=DateRange("2010-01-01 00:00:00", "2023-12-31 23:59:59", "days=1"), random=True)
    .withColumnSpec("edd", "timestamp", data_range=DateRange("2010-01-01 00:00:00", "2023-12-31 23:59:59", "days=1"), random=True)
    .withColumnSpec("order_delivered_date", "timestamp", data_range=DateRange("2010-01-01 00:00:00", "2023-12-31 23:59:59", "days=1"), random=True)
    .withColumnSpec("rto_initiated_date", "timestamp", data_range=DateRange("2010-01-01 00:00:00", "2023-12-31 23:59:59", "days=1"), random=True)
    .withColumnSpec("rto_delivered_date", "timestamp", data_range=DateRange("2010-01-01 00:00:00", "2023-12-31 23:59:59", "days=1"), random=True)
    .withColumnSpec("freight_total_amount", text=PyfuncText(lambda context, v: str(Faker().random_int(min=1, max=1000))))
    .withColumnSpec("first_out_for_delivery_date", "timestamp", data_range=DateRange("2010-01-01 00:00:00", "2023-12-31 23:59:59", "days=1"), random=True)
    .withColumnSpec("first_pickup_scheduled_date", "timestamp", data_range=DateRange("2010-01-01 00:00:00", "2023-12-31 23:59:59", "days=1"), random=True)
    .withColumnSpec("1st_attempt_date", "timestamp", data_range=DateRange("2010-01-01 00:00:00", "2023-12-31 23:59:59", "days=1"), random=True)
    .withColumnSpec("2nd_attempt_date", "timestamp", data_range=DateRange("2010-01-01 00:00:00", "2023-12-31 23:59:59", "days=1"), random=True)
    .withColumnSpec("3rd_attempt_date", "timestamp", data_range=DateRange("2010-01-01 00:00:00", "2023-12-31 23:59:59", "days=1"), random=True)
    .withColumnSpec("brand", text=PyfuncText(lambda context, v: Faker().company_suffix()))
    .withColumnSpec("dt", text=PyfuncText(lambda context, v: Faker().word()))
)

# Build the synthetic data DataFrame
shipping_data = shipping_data_generator.build()

# Display the first 10 rows of the generated data
display(shipping_data.limit(10))


tracking_id,awb_assigned_date,created_at,channel_name,sku_code,address_pincode,status,courier_company,pickup_scheduled_date,order_picked_up_date,order_shipped_date,edd,order_delivered_date,rto_initiated_date,rto_delivered_date,freight_total_amount,first_out_for_delivery_date,first_pickup_scheduled_date,1st_attempt_date,2nd_attempt_date,3rd_attempt_date,brand,dt
1,2015-11-27T00:00:00.000+0000,2012-03-20T00:00:00.000+0000,full,39678683,39920,teach,Davis-Suarez,2014-10-12T00:00:00.000+0000,2014-07-09T00:00:00.000+0000,2012-06-29T00:00:00.000+0000,2011-05-17T00:00:00.000+0000,2023-05-02T00:00:00.000+0000,2012-03-06T00:00:00.000+0000,2012-02-05T00:00:00.000+0000,0,2016-01-23T00:00:00.000+0000,2015-07-19T00:00:00.000+0000,2011-08-07T00:00:00.000+0000,2023-10-07T00:00:00.000+0000,2022-06-18T00:00:00.000+0000,and Sons,raise
2,2020-07-09T00:00:00.000+0000,2012-12-10T00:00:00.000+0000,you,28022718,5737,season,Williamson and Sons,2017-03-17T00:00:00.000+0000,2018-08-27T00:00:00.000+0000,2022-08-02T00:00:00.000+0000,2018-04-10T00:00:00.000+0000,2015-02-10T00:00:00.000+0000,2015-06-26T00:00:00.000+0000,2021-02-27T00:00:00.000+0000,1,2013-11-08T00:00:00.000+0000,2021-10-17T00:00:00.000+0000,2015-04-07T00:00:00.000+0000,2023-11-27T00:00:00.000+0000,2013-04-20T00:00:00.000+0000,Group,authority
3,2023-07-19T00:00:00.000+0000,2021-10-09T00:00:00.000+0000,newspaper,65229941,13008,yourself,Griffin and Sons,2020-03-30T00:00:00.000+0000,2015-04-29T00:00:00.000+0000,2018-07-13T00:00:00.000+0000,2013-09-04T00:00:00.000+0000,2020-01-10T00:00:00.000+0000,2021-09-27T00:00:00.000+0000,2014-11-26T00:00:00.000+0000,2,2020-03-07T00:00:00.000+0000,2022-01-27T00:00:00.000+0000,2013-04-30T00:00:00.000+0000,2022-04-03T00:00:00.000+0000,2022-05-03T00:00:00.000+0000,Ltd,media
4,2022-08-20T00:00:00.000+0000,2021-02-12T00:00:00.000+0000,adult,63667158,37928,wind,"Bender, Ashley and Bryant",2015-07-26T00:00:00.000+0000,2012-02-12T00:00:00.000+0000,2019-10-09T00:00:00.000+0000,2018-02-26T00:00:00.000+0000,2022-02-04T00:00:00.000+0000,2023-07-27T00:00:00.000+0000,2012-03-17T00:00:00.000+0000,3,2013-09-23T00:00:00.000+0000,2013-03-01T00:00:00.000+0000,2013-08-16T00:00:00.000+0000,2016-06-08T00:00:00.000+0000,2016-07-05T00:00:00.000+0000,and Sons,trouble
5,2020-10-28T00:00:00.000+0000,2010-02-09T00:00:00.000+0000,keep,62599207,7949,work,Harrington-Edwards,2011-12-14T00:00:00.000+0000,2010-09-23T00:00:00.000+0000,2016-10-23T00:00:00.000+0000,2016-06-21T00:00:00.000+0000,2016-08-21T00:00:00.000+0000,2011-04-05T00:00:00.000+0000,2011-07-25T00:00:00.000+0000,4,2014-06-25T00:00:00.000+0000,2017-07-25T00:00:00.000+0000,2013-07-01T00:00:00.000+0000,2013-01-13T00:00:00.000+0000,2022-01-15T00:00:00.000+0000,and Sons,citizen
6,2018-06-22T00:00:00.000+0000,2019-10-02T00:00:00.000+0000,behavior,97471516,60696,only,Gross-Perry,2013-08-30T00:00:00.000+0000,2012-11-01T00:00:00.000+0000,2020-03-09T00:00:00.000+0000,2010-08-09T00:00:00.000+0000,2022-09-09T00:00:00.000+0000,2020-11-17T00:00:00.000+0000,2021-08-24T00:00:00.000+0000,5,2013-08-15T00:00:00.000+0000,2023-06-10T00:00:00.000+0000,2019-09-09T00:00:00.000+0000,2014-04-13T00:00:00.000+0000,2014-10-05T00:00:00.000+0000,Inc,member
7,2011-03-26T00:00:00.000+0000,2017-02-27T00:00:00.000+0000,watch,31445078,27902,forward,Stafford Group,2014-05-03T00:00:00.000+0000,2018-05-02T00:00:00.000+0000,2012-04-09T00:00:00.000+0000,2021-01-21T00:00:00.000+0000,2022-05-02T00:00:00.000+0000,2020-06-04T00:00:00.000+0000,2021-04-29T00:00:00.000+0000,6,2015-11-24T00:00:00.000+0000,2019-06-17T00:00:00.000+0000,2012-09-12T00:00:00.000+0000,2015-03-09T00:00:00.000+0000,2018-05-01T00:00:00.000+0000,Inc,test
8,2015-09-05T00:00:00.000+0000,2019-11-16T00:00:00.000+0000,magazine,25755640,74437,carry,"Strickland, Smith and Nielsen",2021-05-14T00:00:00.000+0000,2017-04-12T00:00:00.000+0000,2010-09-10T00:00:00.000+0000,2017-07-02T00:00:00.000+0000,2019-09-26T00:00:00.000+0000,2021-11-25T00:00:00.000+0000,2013-02-15T00:00:00.000+0000,7,2021-09-25T00:00:00.000+0000,2020-01-20T00:00:00.000+0000,2014-04-22T00:00:00.000+0000,2023-04-15T00:00:00.000+0000,2022-06-05T00:00:00.000+0000,and Sons,wonder
9,2017-03-24T00:00:00.000+0000,2012-06-15T00:00:00.000+0000,certainly,52192388,18865,first,"Colon, Hunter and Morgan",2014-12-04T00:00:00.000+0000,2011-03-04T00:00:00.000+0000,2012-07-04T00:00:00.000+0000,2012-10-09T00:00:00.000+0000,2013-07-17T00:00:00.000+0000,2019-04-19T00:00:00.000+0000,2013-08-13T00:00:00.000+0000,8,2020-08-06T00:00:00.000+0000,2015-08-14T00:00:00.000+0000,2016-01-30T00:00:00.000+0000,2013-12-06T00:00:00.000+0000,2017-08-25T00:00:00.000+0000,Group,end
10,2011-06-17T00:00:00.000+0000,2011-12-22T00:00:00.000+0000,financial,7250583,18826,because,Thomas-Arnold,2018-11-19T00:00:00.000+0000,2015-06-03T00:00:00.000+0000,2017-01-30T00:00:00.000+0000,2022-04-14T00:00:00.000+0000,2023-09-02T00:00:00.000+0000,2011-07-21T00:00:00.000+0000,2022-03-19T00:00:00.000+0000,9,2013-01-28T00:00:00.000+0000,2018-10-01T00:00:00.000+0000,2019-04-14T00:00:00.000+0000,2021-03-11T00:00:00.000+0000,2018-06-11T00:00:00.000+0000,Ltd,room
