In [0]:
%pip install dbldatagen

Python interpreter will be restarted.
Collecting dbldatagen
  Using cached dbldatagen-0.3.5-py3-none-any.whl (86 kB)
Installing collected packages: dbldatagen
Successfully installed dbldatagen-0.3.5
Python interpreter will be restarted.


In [0]:
%pip install Faker

Python interpreter will be restarted.
Collecting Faker
  Using cached Faker-19.6.2-py3-none-any.whl (1.7 MB)
Installing collected packages: Faker
Successfully installed Faker-19.6.2
Python interpreter will be restarted.


In [0]:
from dbldatagen import DataGenerator, PyfuncText, DateRange
from faker import Faker
import random
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType

# Create a SparkSession
spark = SparkSession.builder.appName("DataGeneration").getOrCreate()

# Define lists for certain columns
brand_values = ["Brand1", "Brand2", "Brand3", "Brand4", "Brand5"]
courier_company_values = ["CourierCo1", "CourierCo2", "CourierCo3", "CourierCo4", "CourierCo5"]
channel_name_values = ["Channel1", "Channel2", "Channel3", "Channel4", "Channel5"]

# Define the number of rows and partitions for data generation
partitions_requested = 2
data_rows = 10

# Define the schema for the synthetic data
schema = StructType([
    StructField("_id", IntegerType(), False),
    StructField("Customer_id", IntegerType(), True),
    StructField("location_id", StringType(), True),
    StructField("shopity_user_rd", StringType(), True),
    StructField("Channel_id", IntegerType(), True),
    StructField("abandoned_url", StringType(), True),
    StructField("Company", StringType(), True),
    StructField("Brand", StringType(), True),
    StructField("Sub_brand", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("Sub_Category", StringType(), True),
    StructField("Cart_token", StringType(), True),
    StructField("Closed_at", TimestampType(), True),
    StructField("device_id", StringType(), True),
    StructField("email_id", StringType(), True),
    StructField("gateway", StringType(), True),
    StructField("launching_site_url", StringType(), True),
    StructField("Source", StringType(), True),
    StructField("Shipping_rate_id", IntegerType(), True),
    StructField("Shipping_rate_Price", IntegerType(), True),
    StructField("Shipping_rate_title", StringType(), True),
    StructField("source_identifier", StringType(), True),
    StructField("source_name", StringType(), True),
    StructField("Source_url", StringType(), True),
    StructField("Shipping_line", StringType(), True),
    StructField("Taxes_included", IntegerType(), True),
    StructField("Token", StringType(), True),
    StructField("Total_discount", IntegerType(), True),
    StructField("total_line_item_price", IntegerType(), True),
    StructField("total_price", IntegerType(), True),
    StructField("Updated_at", TimestampType(), True),
    StructField("applied_discount", StringType(), True),
    StructField("Shipping_address", StringType(), True),
])

# Create a DataGenerator for synthetic data generation
shipping_data_generator = (
    DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
    .withSchema(schema)
    .withColumnSpec("_id", minValue=1, maxValue=5000, step=1)
    .withColumnSpec("Customer_id", random=True, text=PyfuncText(lambda context, v: str(random.randint(1, 5))))
    .withColumnSpec("location_id", random=True, text=PyfuncText(lambda context, v: str(random.randint(10, 120))))
    .withColumnSpec("shopity_user_rd", text=PyfuncText(lambda context, v: Faker().word()))
    .withColumnSpec("Channel_id", random=True, text=PyfuncText(lambda context, v: str(random.randint(1, 5))))
    .withColumnSpec("abandoned_url", text=PyfuncText(lambda context, v: Faker().url()))
    .withColumnSpec("Company", text=PyfuncText(lambda context, v: Faker().company()))
    .withColumnSpec("Brand", text=PyfuncText(lambda context, v: random.choice(brand_values)))
    .withColumnSpec("Sub_brand", text=PyfuncText(lambda context, v: Faker().company_suffix()))
    .withColumnSpec("Category", text=PyfuncText(lambda context, v: Faker().word()))
    .withColumnSpec("Sub_Category", text=PyfuncText(lambda context, v: Faker().word()))
    .withColumnSpec("Cart_token", text=PyfuncText(lambda context, v: Faker().uuid4()))
    .withColumnSpec("Closed_at", "timestamp", data_range=DateRange("2010-01-01 00:00:00", "2023-12-31 23:59:59", "days=1"), random=True)
    .withColumnSpec("device_id", text=PyfuncText(lambda context, v: Faker().uuid4()))
    .withColumnSpec("email_id", text=PyfuncText(lambda context, v: Faker().email()))
    .withColumnSpec("gateway", text=PyfuncText(lambda context, v: Faker().word()))
    .withColumnSpec("launching_site_url", text=PyfuncText(lambda context, v: Faker().url()))
    .withColumnSpec("Source", text=PyfuncText(lambda context, v: random.choice(channel_name_values)))
    .withColumnSpec("Shipping_rate_id", random=True, text=PyfuncText(lambda context, v: str(random.randint(1, 100))))
    .withColumnSpec("Shipping_rate_Price", random=True, text=PyfuncText(lambda context, v: str(random.randint(1, 1000))))
    .withColumnSpec("Shipping_rate_title", text=PyfuncText(lambda context, v: Faker().sentence()))
    .withColumnSpec("source_identifier", text=PyfuncText(lambda context, v: Faker().uuid4()))
    .withColumnSpec("source_name", text=PyfuncText(lambda context, v: random.choice(channel_name_values)))
    .withColumnSpec("Source_url", text=PyfuncText(lambda context, v: Faker().url()))
    .withColumnSpec("Shipping_line", text=PyfuncText(lambda context, v: Faker().sentence()))
    .withColumnSpec("Taxes_included", random=True, text=PyfuncText(lambda context, v: str(random.randint(0, 1))))
    .withColumnSpec("Token", text=PyfuncText(lambda context, v: Faker().uuid4()))
    .withColumnSpec("Total_discount", random=True, text=PyfuncText(lambda context, v: str(random.randint(0, 100))))
    .withColumnSpec("total_line_item_price", random=True, text=PyfuncText(lambda context, v: str(random.randint(10, 1000))))
    .withColumnSpec("total_price", random=True, text=PyfuncText(lambda context, v: str(random.randint(20, 10000))))
    .withColumnSpec("Updated_at", "timestamp", data_range=DateRange("2010-01-01 00:00:00", "2023-12-31 23:59:59", "days=1"), random=True)
    .withColumnSpec("applied_discount", random=True, text=PyfuncText(lambda context, v: str(random.randint(10, 10000))))
    .withColumnSpec("Shipping_address", text=PyfuncText(lambda context, v: Faker().address()))
)

# Build the synthetic data DataFrame
shipping_data = shipping_data_generator.build()

# Display the first 10 rows of the generated data
display(shipping_data.limit(10))


_id,Customer_id,location_id,shopity_user_rd,Channel_id,abandoned_url,Company,Brand,Sub_brand,Category,Sub_Category,Cart_token,Closed_at,device_id,email_id,gateway,launching_site_url,Source,Shipping_rate_id,Shipping_rate_Price,Shipping_rate_title,source_identifier,source_name,Source_url,Shipping_line,Taxes_included,Token,Total_discount,total_line_item_price,total_price,Updated_at,applied_discount,Shipping_address
1,0,103,history,0,http://robinson.com/,Gibson-Mason,Brand5,Inc,real,senior,1771ce68-c372-4a5a-8e22-fbff97f59293,2010-09-14T00:00:00.000+0000,41ce5cdf-53ae-45c5-a874-ffc09c6c8473,markkennedy@example.net,support,http://martinez.com/,Channel3,0,0,Suggest trade series.,ee2a895b-dfd8-4a76-8b83-00fbd4843455,Channel2,https://neal.com/,Local shoulder great just focus identify into.,0,3bea2d8a-67a0-4a7a-be84-6b667c5bb3e9,0,0,0,2020-04-30T00:00:00.000+0000,7970,"88134 Pearson Street Port Lisa, HI 65471"
2,1,19,late,1,http://hawkins.biz/,Jones Ltd,Brand4,Ltd,with,or,3c34d0e5-eca6-4619-b570-40b69725064f,2012-01-12T00:00:00.000+0000,f36000d4-8e86-4eb7-addc-4e0ae5f760df,riddleglen@example.com,administration,http://www.mcgee.com/,Channel5,1,1,Difficult Mr director national woman office.,97da0324-6e54-43b7-84fa-af5f9c6306b3,Channel5,http://mccullough.com/,Campaign only situation enough might leave personal.,1,96d586a0-65db-4d13-aa8c-2843edb12d3b,1,1,1,2016-08-15T00:00:00.000+0000,476,"4931 Shelby Drive Suite 085 North Jamesmouth, NE 68206"
3,2,33,outside,2,https://pruitt.com/,"Smith, Gonzalez and Bishop",Brand1,PLC,cultural,side,95576a98-a37c-4ea8-9af9-b368c1c01b8d,2016-02-26T00:00:00.000+0000,efe7d5be-e773-4161-bbd0-626f7050890d,kclark@example.com,image,http://valdez.net/,Channel3,2,2,Nature relate free evening effort.,4b021c0f-2ca0-4c33-b818-1a7f5afe73a5,Channel5,https://stevens.net/,Girl present gun reality.,2,3f6720d5-7fbd-491e-8fb5-b85a4f68f793,2,2,2,2013-03-15T00:00:00.000+0000,4972,USNS Murphy FPO AE 10455
4,3,48,day,3,https://www.gomez.com/,Carr Group,Brand3,LLC,series,step,b3147328-add2-49dc-a862-e296e0cb1078,2013-09-01T00:00:00.000+0000,9fb05342-6adc-44c1-a43c-071df3418fa2,smiller@example.org,price,https://www.white.org/,Channel3,3,3,Culture to join authority remember news produce different.,4c6ad389-f4e3-4dd6-ac70-0dc09ea47fbd,Channel4,https://gonzalez.com/,Event trip magazine central.,3,054cbf4f-4eda-43d2-aa45-038bfcec4b7d,3,3,3,2011-11-26T00:00:00.000+0000,9356,"86796 Huynh Parks Apt. 524 South Ian, AL 82794"
5,4,70,today,4,https://www.morris.com/,Barry Inc,Brand4,Inc,any,almost,f3da3d71-2b5d-48fd-91dc-8db42bbe3b9d,2018-05-29T00:00:00.000+0000,ee1230ec-7008-41fe-a9af-5d233ff285b9,sball@example.com,community,http://fernandez-drake.com/,Channel5,4,4,Item buy capital toward like idea laugh.,8394197f-ffbd-4603-ad7d-728e6dc3bf37,Channel1,http://williams.com/,Able such include activity cut when business.,4,e80a0ba3-ac4c-4131-afe5-6f9214ea6b84,4,4,4,2021-09-27T00:00:00.000+0000,8973,"508 Samuel Knoll Suite 826 Christianhaven, FM 49449"
6,5,26,drug,5,http://conrad.com/,Bishop Group,Brand1,Ltd,nation,will,448f8d92-7a9c-42be-ac28-a8229078df35,2013-02-20T00:00:00.000+0000,b5baabf3-4692-4640-89b9-5c76de994a8f,jcollins@example.org,name,http://www.henry.com/,Channel1,5,5,According on course country blood.,04e1c331-1784-4b50-b0ce-d18b522b2e5f,Channel3,https://manning-bennett.com/,By move yet movie best scene.,5,41a6f2d6-399d-4ebf-bfce-3f5d80f31af2,5,5,5,2018-05-01T00:00:00.000+0000,7046,"11371 Schmidt Avenue East Zacharyborough, WA 60625"
7,6,83,daughter,6,http://www.ellis.com/,"Campos, Gordon and Lee",Brand2,Inc,network,special,1e49b44e-64d6-484d-984e-ad4633c5a28b,2011-08-05T00:00:00.000+0000,421704cd-73b0-4711-96ff-f55e993b84ef,cjackson@example.com,even,https://thomas-williams.com/,Channel3,6,6,Allow sound possible remain despite argue.,b0dfe874-8587-4528-ab6b-95c8f03f90a0,Channel2,https://www.carr.net/,Fight represent main really interesting resource.,6,4531a741-07ba-437c-a25e-c671d4805956,6,6,6,2022-06-12T00:00:00.000+0000,4331,"48313 Thompson Point Suite 939 South Steven, PA 52150"
8,7,86,pretty,7,http://carrillo.com/,Saunders Inc,Brand5,PLC,imagine,everything,30d93e6e-e855-4b42-abdb-52b0bfa4c6ae,2011-01-24T00:00:00.000+0000,9a580044-b082-45e4-b366-c929f792ce95,sandra66@example.com,difficult,http://www.henry.info/,Channel4,7,7,Spend I gas little peace medical happen.,42e995ea-291e-4139-8fbc-ac298647cc11,Channel2,https://www.hodge.com/,Power standard style leader home century.,7,e120f065-8934-47c3-87d7-1447e2af87fd,7,7,7,2018-07-07T00:00:00.000+0000,8593,"5213 Morgan Grove Apt. 304 Alexisberg, MI 44729"
9,8,93,stop,8,https://alvarado-brown.com/,Kennedy PLC,Brand5,PLC,choose,early,7d220c46-a918-4e48-8396-ea853dfd94c7,2013-01-23T00:00:00.000+0000,ef3327ef-7ecc-4cd9-a252-9e00bd683d30,brownwilliam@example.net,push,https://www.anthony.com/,Channel4,8,8,Science career small in spring will mention.,5857430e-7028-4a38-9efa-c9011cf0198b,Channel3,http://mcclain.info/,Class establish real professional play action.,8,8b0ebbc6-adaa-4fea-92c0-76a8c842564e,8,8,8,2021-09-30T00:00:00.000+0000,3964,"624 Rodriguez Terrace Jenniferfurt, IL 81138"
10,9,107,small,9,http://rangel-brown.org/,Mathews PLC,Brand4,Group,shoulder,east,43a8de8d-f7e4-461d-bba7-0d77420da760,2023-04-03T00:00:00.000+0000,17f8d11d-71c1-4d7a-9e83-fc83740f488a,ncastaneda@example.net,soldier,http://www.collier.com/,Channel4,9,9,Power protect later experience several capital.,4d71991c-d729-4106-87c9-bcaadae18ec3,Channel3,https://www.martin-cain.org/,Enter save speak career.,9,1910a525-3444-4351-a784-447df2e89a65,9,9,9,2023-03-11T00:00:00.000+0000,5019,USNS Mcclain FPO AE 04824
