In [0]:
%pip install dbldatagen

Python interpreter will be restarted.
Collecting dbldatagen
  Using cached dbldatagen-0.3.5-py3-none-any.whl (86 kB)
Installing collected packages: dbldatagen
Successfully installed dbldatagen-0.3.5
Python interpreter will be restarted.


In [0]:
%pip install Faker

Python interpreter will be restarted.
Collecting Faker
  Using cached Faker-19.6.2-py3-none-any.whl (1.7 MB)
Installing collected packages: Faker
Successfully installed Faker-19.6.2
Python interpreter will be restarted.


In [0]:
from dbldatagen import DataGenerator, PyfuncText, DateRange
from faker import Faker
import random
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, DoubleType
from datetime import datetime, timedelta
from pyspark.sql.functions import expr

# Create a SparkSession
spark = SparkSession.builder.appName("DataGeneration").getOrCreate()

# Define the number of rows and partitions for data generation
partitions_requested = 2
data_rows = 10

# Define the schema for the synthetic data
schema = StructType([    
    StructField("Index", IntegerType(), True),  # New columns
    StructField("amount", StringType(), True),
    StructField("Code", StringType(), True),
    StructField("Created_At", TimestampType(), True),
    StructField("type", StringType(), True),
    StructField("updated_at", TimestampType(), True),
    StructField("Checkout_id", IntegerType(), True)
])

# Custom function to generate updated_at >= created_at
def generate_updated_at(created_at):
    max_days_difference = (datetime(2023, 12, 31) - created_at).days
    updated_at = created_at + timedelta(days=random.randint(0, max_days_difference))
    return updated_at

# Create a DataGenerator for synthetic data generation
item_data_generator = (
    DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
    .withSchema(schema)
    .withColumnSpec("Index", minValue=1, maxValue=100, step=1)  # Example values, adjust as needed
    .withColumnSpec("amount", random=True, text=PyfuncText(lambda context, v: str(round(random.uniform(100, 1000), 2))))  # Example values, adjust as needed
    .withColumnSpec("Code", text=PyfuncText(lambda context, v: Faker().word()))  # Example values, adjust as needed
    .withColumnSpec("Created_At", "timestamp", data_range=DateRange("2010-01-01 00:00:00", "2023-12-31 23:59:59", "days=1"), random=True)
    .withColumnSpec("type", text=PyfuncText(lambda context, v: random.choice(["Type1", "Type2", "Type3"])))  # Example values, adjust as needed
    .withColumnSpec("Checkout_id", minValue=1000, maxValue=2000, step=1)  # Example values, adjust as needed
)

# Build the synthetic data DataFrame
item_data = item_data_generator.build()

# Generate updated_at values
item_data = item_data.withColumn("updated_at", expr("date_add(created_at, 1)"))

# Display the first 10 rows of the generated data
display(item_data)


Index,amount,Code,Created_At,type,updated_at,Checkout_id
1,872.87,most,2010-11-09T00:00:00.000+0000,Type1,2010-11-10,1000
2,670.96,range,2018-10-27T00:00:00.000+0000,Type1,2018-10-28,1001
3,807.63,drug,2011-01-06T00:00:00.000+0000,Type3,2011-01-07,1002
4,559.03,evidence,2010-02-25T00:00:00.000+0000,Type1,2010-02-26,1003
5,353.29,me,2016-09-27T00:00:00.000+0000,Type1,2016-09-28,1004
6,848.56,middle,2013-09-19T00:00:00.000+0000,Type3,2013-09-20,1005
7,341.79,positive,2016-02-03T00:00:00.000+0000,Type3,2016-02-04,1006
8,991.97,if,2018-09-10T00:00:00.000+0000,Type1,2018-09-11,1007
9,355.04,mother,2010-05-15T00:00:00.000+0000,Type2,2010-05-16,1008
10,381.69,fear,2011-08-18T00:00:00.000+0000,Type2,2011-08-19,1009
