In [1]:
import os
import pandas as pd
from IPython.display import display
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,avg, when # pyspark sql functions
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType, TimestampType, DecimalType # pyspark sql types
import random
from faker import Faker
import uuid
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
 

# SPARK SETUP
spark = SparkSession.builder.appName("Jupyter PySpark Example").getOrCreate() # Create a SparkSession, which is the entry point to any Spark functionality

# Initialize Faker
fake = Faker()

# Generate synthetic payment data, this would be the raw data, extract phase, ingestion phase.
# can be replaced with data warehouse or data lake creation credentials which would means we skip the extract phase
def generate_synthetic_data(num_records=1000):
    data = []
    for _ in range(num_records): 
        data.append({
            'transaction_id': fake.uuid4(),
            'customer_id': fake.uuid4(),
            'customer_name': fake.name(),
            'payment_amount': round(random.uniform(10.0, 1000.0), 2),
            'payment_method': random.choice(['Credit Card', 'Debit Card', 'PayPal', 'Bank Transfer']),
            'transaction_date': fake.date_this_year(),
            'country': fake.country()
        })
    return data

In [2]:
# Generate 1000 records of payment data
payment_data = generate_synthetic_data(1000)

# Convert to Pandas DataFrame before converting to spark dataframe,
#  repartition the dataframe so that spark can handle it
payment_df = pd.DataFrame(payment_data)
# Convert to spark dataframe
spark_payment_df = spark.createDataFrame(payment_df)

# the dataframe is divided into 2 partitions, spark is a distributed system
spark_payment_df = spark_payment_df.repartition(2) 

In [5]:
# Display the schema
spark_payment_df.printSchema() # print the schema of the dataframe

# Display the summary
spark_payment_df.describe().show()


root
 |-- transaction_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- payment_amount: double (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- transaction_date: date (nullable = true)
 |-- country: string (nullable = true)

+-------+--------------------+--------------------+----------------+------------------+--------------+-----------+
|summary|      transaction_id|         customer_id|   customer_name|    payment_amount|payment_method|    country|
+-------+--------------------+--------------------+----------------+------------------+--------------+-----------+
|  count|                1000|                1000|            1000|              1000|          1000|       1000|
|   mean|                NULL|                NULL|            NULL|499.65305000000006|          NULL|       NULL|
| stddev|                NULL|                NULL|            NULL|284.52596935083534|          NULL|      

In [4]:
# Display the first 10 records
display(spark_payment_df.limit(10))


DataFrame[transaction_id: string, customer_id: string, customer_name: string, payment_amount: double, payment_method: string, transaction_date: date, country: string]

[Row(transaction_id='ac3a043b-1f57-4577-970e-f85e747b26f0', customer_id='1d63e865-46a6-4b29-97cb-5e0cbcf01175', customer_name='Justin Rice', payment_amount=707.18, payment_method='Credit Card', transaction_date=datetime.date(2024, 5, 28), country='Italy'),
 Row(transaction_id='d3c75bd4-1bb2-4e53-8c08-56c36b1f20aa', customer_id='2f45f2be-94d4-4ef6-b78e-567b28902f26', customer_name='Denise Robles', payment_amount=348.13, payment_method='Debit Card', transaction_date=datetime.date(2024, 5, 24), country='Ireland'),
 Row(transaction_id='47a3ed21-5174-40a4-b216-6ee5ff691529', customer_id='c59be281-efd3-4c6e-98d8-8020e724c28c', customer_name='Christopher Santana', payment_amount=743.98, payment_method='Credit Card', transaction_date=datetime.date(2024, 6, 28), country='Burkina Faso'),
 Row(transaction_id='ad68da94-72a0-4a82-be89-3d0d058a8ccb', customer_id='b9c05b5c-e99e-4f01-bab6-73e6126bd4dc', customer_name='Jordan Bennett', payment_amount=455.01, payment_method='Debit Card', transaction_dat

In [7]:
# check data types
spark_payment_df.dtypes


[('transaction_id', 'string'),
 ('customer_id', 'string'),
 ('customer_name', 'string'),
 ('payment_amount', 'double'),
 ('payment_method', 'string'),
 ('transaction_date', 'date'),
 ('country', 'string')]

In [9]:
# Strip time from the transaction_date, keeping only the date
payment_df['transaction_date'] = pd.to_datetime(payment_df['transaction_date']).dt.normalize()

# Convert the pandas DataFrame back to a Spark DataFrame
spark_payment_df = spark.createDataFrame(payment_df)



In [13]:
display(spark_payment_df.limit(10))

DataFrame[transaction_id: string, customer_id: string, customer_name: string, payment_amount: double, payment_method: string, transaction_date: timestamp, country: string]

DataFrame[transaction_id: string, customer_id: string, customer_name: string, payment_amount: double, payment_method: string, transaction_date: timestamp, country: string]