In [3]:
import os
import pandas as pd
from IPython.display import display
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, when # pyspark sql functions
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType, TimestampType, DecimalType # pyspark sql types
import random
from faker import Faker
import uuid
from dotenv import load_dotenv
import boto3
import io

In [4]:
# Load environment variables from .env file
load_dotenv()

# Fixed seed so the random draws behind the synthetic data are repeatable across runs
SEED = 42
random.seed(SEED)

# SPARK SETUP for AWS GLUE
#   - spark.hadoop.fs.s3.impl: binds the s3:// scheme to Hadoop's S3A connector (S3AFileSystem)
#   - spark.serializer: use Kryo instead of the default Java serializer (faster, more compact)
#   - spark.sql.catalogImplementation=hive + enableHiveSupport(): use the Hive catalog to track table metadata
spark = (
    SparkSession.builder
    .appName("AWS Glue PySpark Synthetic")
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.sql.catalogImplementation", "hive")
    .enableHiveSupport()
    .getOrCreate()  # SparkSession is the entry point to any Spark functionality
)

# Initialize Faker and seed the random generator shared by all Faker instances
fake = Faker()
Faker.seed(SEED)

# Generate synthetic product data. This plays the role of the raw data in the extract/ingestion phase
# and can be replaced with a connection to a real data warehouse or data lake (using its credentials).
def generate_synthetic_product_data(num_records=1000):
    """Build a list of fake product records.

    Args:
        num_records: Number of product rows to create (default 1000).

    Returns:
        A ``(rows, product_ids)`` tuple. ``rows`` is a list of dicts with the keys
        ``product_id``, ``product_name``, ``product_category``, ``product_price``
        and ``product_description``; ``product_ids`` holds the generated IDs in
        the same order as ``rows``.
    """
    categories = ['Electronics', 'Clothing', 'Books', 'Home & Kitchen']
    # All IDs are created up front so they can also be returned to the caller
    product_ids = [fake.uuid4() for _ in range(num_records)]
    rows = [
        {
            'product_id': pid,
            'product_name': fake.word(),
            'product_category': random.choice(categories),
            'product_price': round(random.uniform(10.0, 1000.0), 2),
            'product_description': fake.sentence(),
        }
        for pid in product_ids
    ]
    return rows, product_ids

# Generate synthetic payment data
def generate_synthetic_payment_data(num_records=1000, product_ids=None):
    """Build a list of fake payment transactions that reference existing products.

    Each transaction's ``product_id`` is drawn at random, with replacement, from
    ``product_ids``, so any non-empty pool of IDs is sufficient regardless of
    how many transactions are requested.

    Args:
        num_records: Number of payment rows to create (default 1000).
        product_ids: Sequence of product IDs to sample from.

    Returns:
        A list of dicts with the keys ``transaction_id``, ``customer_id``,
        ``customer_name``, ``payment_amount``, ``payment_method``,
        ``transaction_date``, ``country`` and ``product_id``.

    Raises:
        ValueError: If ``product_ids`` is None, or is empty while
            ``num_records`` is greater than zero.
    """
    # Sampling is with replacement, so only an empty pool makes generation impossible
    if product_ids is None or (num_records > 0 and len(product_ids) == 0):
        raise ValueError("Not enough product IDs provided")

    payment_methods = ['Credit Card', 'Debit Card', 'PayPal', 'Bank Transfer']
    return [
        {
            'transaction_id': fake.uuid4(),
            'customer_id': fake.uuid4(),
            'customer_name': fake.name(),
            'payment_amount': round(random.uniform(10.0, 1000.0), 2),
            'payment_method': random.choice(payment_methods),
            'transaction_date': fake.date_this_year(),
            'country': fake.country(),
            'product_id': random.choice(product_ids),
        }
        for _ in range(num_records)
    ]

# Number of synthetic rows generated for each dataset (products and payments)
NUM_RECORDS = 1000

# Generate the product data first so the payments can reference its IDs
product_data, product_ids = generate_synthetic_product_data(NUM_RECORDS)

# Convert product data to Pandas DataFrame before converting to Spark DataFrame
product_df = pd.DataFrame(product_data)
# Convert to Spark DataFrame
spark_product_df = spark.createDataFrame(product_df)

# Write product data to S3
# spark_product_df.write.mode('overwrite').parquet('s3://your-bucket-name/path/to/output/product/')

# Generate the payment data using the product IDs
payment_data = generate_synthetic_payment_data(NUM_RECORDS, product_ids)

# Convert payment data to Pandas DataFrame before converting to Spark DataFrame
payment_df = pd.DataFrame(payment_data)
# Convert to Spark DataFrame
spark_payment_df = spark.createDataFrame(payment_df)

# Write payment data to S3 in Parquet format
# spark_payment_df.write.mode('overwrite').parquet('s3://your-bucket-name/path/to/output/payment/')


In [5]:
# The Pandas and Spark DataFrames were already built from product_data / payment_data
# in the previous cell, so they are reused here instead of being created a second time.

# Number of partitions each Spark DataFrame is split into
NUM_PARTITIONS = 4

# Repartition the DataFrames so that Spark can handle them efficiently
spark_product_df = spark_product_df.repartition(NUM_PARTITIONS)
spark_payment_df = spark_payment_df.repartition(NUM_PARTITIONS)

In [6]:
# Check the column data types Spark inferred for the payment DataFrame
spark_payment_df.dtypes
# Note: Spark's 'double' is its 64-bit floating-point type; Python float values such as payment_amount are inferred as 'double'

[('transaction_id', 'string'),
 ('customer_id', 'string'),
 ('customer_name', 'string'),
 ('payment_amount', 'double'),
 ('payment_method', 'string'),
 ('transaction_date', 'date'),
 ('country', 'string'),
 ('product_id', 'string')]