### Transform Customers Data using PySpark


In [0]:
%sql
-- Preview the raw customer rows held in the bronze layer.
SELECT *
FROM gizmobox.bronze.py_customers;

In [0]:
# Load the same bronze customers rows into a DataFrame by running a SQL query.
df = spark.sql("SELECT * FROM gizmobox.bronze.py_customers")
display(df)

In [0]:
### Using Spark
# Read the bronze customers table directly by name instead of through a SQL string.
# spark.read.table('table_name') is an equivalent form; reader options are set on
# the reader before .table(), e.g. spark.read.option(...).table('table_name').
df = spark.table('gizmobox.bronze.py_customers')
display(df)

In [0]:
### Remove records with null customer_id
# Equivalent spellings: df.filter('customer_id is not null') or
# df.where(df.customer_id.isNotNull()).
df_filtered = df.filter(df.customer_id.isNotNull())

# Display the filtered frame (not the unfiltered input `df`) so the rendered
# rows reflect the result of this step.
display(df_filtered)

In [0]:
# Drop rows that are identical across every column.
df_distinct = (
    df_filtered
    .distinct()
)
display(df_distinct)

In [0]:
# Alternative to distinct(): dropDuplicates() is called here without a column
# subset, so whole rows are compared. This rebinds df_distinct.
df_distinct = df_filtered.dropDuplicates()
display(df_distinct)

In [0]:
### Find the most recent created_timestamp for each customer_id
# The result is joined back to df_distinct in the next step to keep only the
# latest record per customer.
from pyspark.sql import functions as F

df_max_timestamp = (
    df_distinct
    .groupBy('customer_id')
    .agg(F.max('created_timestamp').alias('max_created_timestamp'))
)

display(df_max_timestamp)



In [0]:
### Keep only each customer's latest record
# Inner-join the de-duplicated rows to the per-customer maximum timestamp, then
# keep just the columns that originate from df_distinct.
df_distinct_customer = df_distinct.join(
    df_max_timestamp,
    on=(
        (df_distinct.customer_id == df_max_timestamp.customer_id)
        & (df_distinct.created_timestamp == df_max_timestamp.max_created_timestamp)
    ),
    how='inner',
).select(df_distinct['*'])

display(df_distinct_customer)

In [0]:
### Cast columns to their target data types
# customer_name and email are passed through unchanged; every other column is
# cast explicitly.
df_casted_customer = df_distinct_customer.select(
    df_distinct_customer["created_timestamp"].cast("timestamp"),
    df_distinct_customer["customer_id"].cast("int"),
    df_distinct_customer["customer_name"],
    df_distinct_customer["date_of_birth"].cast("date"),
    df_distinct_customer["email"],
    df_distinct_customer["member_since"].cast("date"),
    df_distinct_customer["telephone"].cast("string"),
)
display(df_casted_customer)
    

In [0]:
#### 5. Write the cleaned customers data to the silver table
# createOrReplace() creates gizmobox.silver.py_customers, or replaces it if it
# already exists.

df_casted_customer.writeTo("gizmobox.silver.py_customers").createOrReplace()

In [0]:
%sql
-- Check the customers rows written to the silver layer.
SELECT *
FROM gizmobox.silver.py_customers;

### Transforming Payments Data

In [0]:
# Load the bronze payments table.
# NOTE: this rebinds `df`, which the customers section above also used.
df = spark.table("gizmobox.bronze.py_payments")
display(df)

In [0]:
### 1. Extract date and time from the payment_timestamp column
from pyspark.sql import functions as F

# payment_date: the timestamp formatted as yyyy-MM-dd, then cast to a DATE.
# payment_time: the time-of-day formatted as HH:mm:ss. The separator is a colon
# (the previous 'HH-mm-ss' pattern emitted hyphen-separated values such as
# '14-05-30', which is not a conventional time representation).
df_extract_payments = (
    df.select(
        'order_id',
        'payment_id',
        F.date_format('payment_timestamp', 'yyyy-MM-dd').cast('date').alias('payment_date'),
        F.date_format('payment_timestamp', 'HH:mm:ss').alias('payment_time'),
        'payment_status',
        'payment_method'
    )
)

display(df_extract_payments)

In [0]:
### Map numeric payment_status codes to labels (PySpark equivalent of SQL CASE)
# Codes: 1 -> Success, 2 -> Pending, 3 -> Cancelled, 4 -> Failed.
# There is no otherwise() branch, so any other code yields NULL.
status_code = F.col('payment_status')

df_payment_status = df_extract_payments.select(
    'payment_id',
    'order_id',
    'payment_date',
    'payment_time',
    (
        F.when(status_code == 1, 'Success')
        .when(status_code == 2, 'Pending')
        .when(status_code == 3, 'Cancelled')
        .when(status_code == 4, 'Failed')
        .alias('payment_status')
    ),
    'payment_method',
)

display(df_payment_status)



In [0]:
### Write to silver Table
# createOrReplace() creates gizmobox.silver.py_payments, or replaces it if it
# already exists.

df_payment_status.writeTo("gizmobox.silver.py_payments").createOrReplace()

In [0]:
%sql
-- Check the payments rows written to the silver layer.
SELECT *
FROM gizmobox.silver.py_payments;

### Transform Memberships Data (PNG files)

In [0]:
# Load the bronze memberships table; the next cell reads its `path` and
# `content` columns.
df_memberships = spark.table("gizmobox.bronze.py_memberships")
display(df_memberships)

In [0]:
#### Derive customer_id from the file path
# Capture group 1 of the pattern (the digits immediately before ".png" in the
# last path segment) becomes customer_id; `content` is exposed as membership_card.
df_membership_cust_id = df_memberships.select(
    F.regexp_extract(F.col("path"), r".*/([0-9]+)\.png$", 1).alias("customer_id"),
    df_memberships["content"].alias("membership_card"),
)

display(df_membership_cust_id)


In [0]:
# Persist the membership cards to silver: creates gizmobox.silver.py_membership,
# or replaces it if it already exists.
df_membership_cust_id.writeTo("gizmobox.silver.py_membership").createOrReplace()

In [0]:
%sql
-- Check the membership rows written to the silver layer.
SELECT *
FROM gizmobox.silver.py_membership;

### Transform Addresses Data

In [0]:
#### Denormalize the billing and shipping addresses in the addresses data with a pivot
# Load the bronze addresses table; the pivot itself happens in the next cell.

df_address = spark.table("gizmobox.bronze.py_addresses")
display(df_address)

In [0]:
from pyspark.sql import functions as F

# Produce one row per customer_id, pivoting address_type over the two listed
# values ('billing', 'shipping'). max() is used as the aggregate for each
# address field. address_line_2 is currently excluded from the pivot.
df_pivoted_address = (
    df_address
    .groupBy('customer_id')
    .pivot('address_type', ['billing', 'shipping'])
    .agg(
        F.max('address_line_1').alias('address_line_1'),
        F.max('city').alias('city'),
        F.max('state').alias('state'),
        F.max('postcode').alias('postcode'),
    )
)

display(df_pivoted_address)

In [0]:
### Write the pivoted addresses to the silver table
# createOrReplace() creates gizmobox.silver.py_address, or replaces it if it
# already exists.
df_pivoted_address.writeTo("gizmobox.silver.py_address").createOrReplace()



#### Transform Orders Data

In [0]:
#### JSON strings
# Load the bronze orders table; later cells treat its `value` column as a JSON
# string.
df_orders = spark.table("gizmobox.bronze.py_orders")
display(df_orders)


In [0]:
### Pre-process the raw order strings before JSON parsing
from pyspark.sql import functions as F

# Rewrite every match of the order_date pattern in `value` so that the key and
# the captured yyyy-MM-dd date become two separately single-quoted tokens.
df_fixed_orders = df_orders.select(
    F.regexp_replace(
        F.col("value"),
        '"order_date: (\\d{4}-\\d{2}-\\d{2})"',
        "'order_date': '$1'",
    ).alias("value")
)

display(df_fixed_orders)


In [0]:
### Infer the JSON schema (schema_of_json) ahead of parsing with from_json
# NOTE(review): the PySpark documentation describes schema_of_json's argument
# as a JSON string or a *foldable* string column — confirm that passing the
# `value` column is accepted on the target runtime.
df_with_schema = df_fixed_orders.select(
    F.schema_of_json(F.col("value")).alias("schema")
)

# Only one row is shown.
display(df_with_schema.limit(1))

In [0]:
# DDL-style schema string handed to from_json when the order JSON is parsed.
# Presumably taken from the schema_of_json output above — TODO confirm.
orders_schema = """STRUCT<customer_id: BIGINT, items: ARRAY<STRUCT<category: STRING, details: STRUCT<brand: STRING, color: STRING>, item_id: BIGINT, name: STRING, price: BIGINT, quantity: BIGINT>>, order_date: STRING, order_id: BIGINT, order_status: STRING, payment_method: STRING, total_amount: BIGINT, transaction_timestamp: STRING>"""

In [0]:
# Parse each JSON string in `value` into a struct column named json_value,
# using the explicit orders_schema.
df_json_orders = df_fixed_orders.select(
    F.from_json(F.col("value"), orders_schema).alias("json_value")
)
display(df_json_orders)

In [0]:
# Persist the parsed orders to silver: creates gizmobox.silver.py_orders_json,
# or replaces it if it already exists.
df_json_orders.writeTo("gizmobox.silver.py_orders_json").createOrReplace()

In [0]:
# Re-read the parsed orders from silver.
# NOTE: this rebinds df_orders, which previously held the bronze orders rows.
df_orders = spark.table("gizmobox.silver.py_orders_json")
display(df_orders)

In [0]:
### Access fields of the parsed JSON struct
from pyspark.sql import functions as F

# Promote the fields of json_value to top-level columns; `items` stays an array.
# NOTE: the variable name is misspelled ("normlaized"); it is kept unchanged
# here. The following cell builds the correctly spelled df_orders_normalized.
df_orders_normlaized = df_orders.select(
    "json_value.customer_id",
    "json_value.order_id",
    "json_value.order_status",
    "json_value.payment_method",
    "json_value.total_amount",
    "json_value.order_date",
    "json_value.transaction_timestamp",
    "json_value.items",
)
display(df_orders_normlaized)
# Writing this frame to silver is currently disabled:
#df_orders_normlaized.writeTo("gizmobox.silver.py_orders").createOrReplace()

In [0]:
### Deduplicate array elements
# Same projection as the previous cell, except that repeated elements inside
# the `items` array are removed with array_distinct.
df_orders_normalized = (
    df_orders.select(
        "json_value.customer_id",
        "json_value.order_id",
        "json_value.order_status",
        "json_value.payment_method",
        "json_value.total_amount",
        "json_value.order_date",
        "json_value.transaction_timestamp",
        F.array_distinct("json_value.items").alias("items"),
    )
)

display(df_orders_normalized)

In [0]:
#### Explode the items array
# Emit one output row per element of `items`, carrying the order-level columns
# along; the element is exposed as the struct column `item`.
df_orders_exploded = df_orders_normalized.select(
    "customer_id",
    "order_id",
    "order_status",
    "payment_method",
    "total_amount",
    "order_date",
    "transaction_timestamp",
    F.explode("items").alias("item"),
)
display(df_orders_exploded)

In [0]:
# Flatten the `item` struct (including the nested `details` struct) into
# top-level columns alongside the order-level columns.
df_order_items = (
    df_orders_exploded.select(
        "customer_id",
        "order_id",
        "order_status",
        "payment_method",
        "total_amount",
        "order_date",
        "transaction_timestamp",
        "item.item_id",
        "item.name",
        "item.price",
        "item.quantity",
        "item.category",
        "item.details.brand",
        "item.details.color",
    )
)
display(df_order_items)


In [0]:
# Persist the flattened order items to silver: creates
# gizmobox.silver.py_orders_items, or replaces it if it already exists.
df_order_items.writeTo("gizmobox.silver.py_orders_items").createOrReplace()

#### Join Customers and Addresses for the Gold Layer

In [0]:
# Load the silver customers and silver (pivoted) address tables for the join.
# NOTE: df_address is rebound here; earlier it held the bronze addresses rows.
df_customers = spark.table('gizmobox.silver.py_customers')

df_address = spark.table('gizmobox.silver.py_address')

In [0]:
# Combine each customer with their billing and shipping address.
# The join is an inner join on customer_id, so customers without an address row
# (and address rows without a customer) are dropped.
# shipping_postcode is selected alongside billing_postcode so both address
# types expose the same set of fields (address_line_1, city, state, postcode).
df_customer_address = (
    df_customers.join(df_address, "customer_id", "inner")
    .select("customer_id",
            "customer_name",
            "date_of_birth",
            "email",
            "member_since",
            "telephone",
            "billing_address_line_1",
            "billing_city",
            "billing_state",
            "billing_postcode",
            "shipping_address_line_1",
            "shipping_city",
            "shipping_state",
            "shipping_postcode")
)

display(df_customer_address)

In [0]:
# Persist the joined result to gold: creates gizmobox.gold.py_customer_address,
# or replaces it if it already exists.
df_customer_address.writeTo("gizmobox.gold.py_customer_address").createOrReplace()

#### Monthly Order Summary Aggregations

In [0]:
# Load the silver order items and silver customers tables.
# NOTE: df_orders is rebound here to the flattened order-items table.
df_orders = spark.table('gizmobox.silver.py_orders_items')
df_customers = spark.table('gizmobox.silver.py_customers')

In [0]:
# Summarise orders per customer and month (order_month is transaction_timestamp
# formatted as yyyy-MM):
#   total_orders       - number of distinct order_id values
#   total_items_bought - sum of quantity
#   total_revenue      - sum of price * quantity
line_revenue = F.col('price') * F.col('quantity')

df_order_summary = (
    df_orders
    .withColumn("order_month", F.date_format('transaction_timestamp', "yyyy-MM"))
    .groupBy('order_month', 'customer_id')
    .agg(
        F.countDistinct('order_id').alias('total_orders'),
        F.sum('quantity').alias('total_items_bought'),
        F.sum(line_revenue).alias('total_revenue'),
    )
)

display(df_order_summary)


In [0]:
# Persist the monthly summary to gold: creates gizmobox.gold.py_order_summary,
# or replaces it if it already exists.
df_order_summary.writeTo("gizmobox.gold.py_order_summary").createOrReplace()