### ETL with PySpark

#### Extract Data from the customers JSON file

In [0]:
# Load only the customer files whose names match customers_2024_*.json.
df = (
    spark.read
    .format('json')
    .load("/Volumes/gizmobox/Landing/operational_data/customers/customers_2024_*.json")
)
display(df)

In [0]:
# Load the whole customers folder with the generic format(...).load(...) reader.
df = (
    spark.read
    .format('json')
    .load("/Volumes/gizmobox/Landing/operational_data/customers")
)
display(df)

# Alternative: the json() shortcut on the reader, pointed at the same folder.
df_simple = spark.read.json(
    "/Volumes/gizmobox/Landing/operational_data/customers"
)
display(df_simple)

In [0]:
# Put the `_metadata.file_path` field first, followed by every column of df_simple.
df_metadata = df_simple.select(
    "_metadata.file_path",
    "*",
)
display(df_metadata)

In [0]:
# Create the customers table in the bronze schema.
#
# Version-1 writer API (df.write): overwrite the Delta table. This notebook
# treats this form as not recommended.
(
    df_metadata.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("gizmobox.bronze.py_customers")
)


In [0]:
# Version-2 writer API (df.writeTo): create or replace the same bronze table.
# This notebook treats this form as the recommended one.
(
    df_metadata
    .writeTo("gizmobox.bronze.py_customers")
    .createOrReplace()
)

In [0]:
%sql
-- Show every column and row of the bronze customers table.
SELECT *
FROM gizmobox.bronze.py_customers;

### Query Orders File Using JSON format

#### Reading the data from the Orders file


In [0]:

# Attempt to parse the orders folder as JSON.
df = spark.read.json(
    "/Volumes/gizmobox/Landing/operational_data/orders"
)
display(df)


In [0]:
# The orders files contain corrupt records, so load them as plain text
# instead of parsing them as JSON.
df = spark.read.text(
    "/Volumes/gizmobox/Landing/operational_data/orders"
)
display(df)

In [0]:
# Save the current orders DataFrame as the table gizmobox.bronze.py_orders
# (created if missing, replaced otherwise).
(
    df
    .writeTo("gizmobox.bronze.py_orders")
    .createOrReplace()
)


In [0]:
%sql
-- Show every column and row of the bronze orders table.
SELECT *
FROM gizmobox.bronze.py_orders;

### Reading and Querying Memberships Folder - Binary data

In [0]:
%fs ls '/Volumes/gizmobox/Landing/operational_data/memberships'

In [0]:
# Read the PNG files one folder level below memberships/ with the binaryFile source.
df = (
    spark.read
    .format('binaryFile')
    .load("/Volumes/gizmobox/Landing/operational_data/memberships/*/*.png")
)
display(df)

In [0]:
# Save the current memberships DataFrame as gizmobox.bronze.py_memberships
# (created if missing, replaced otherwise).
(
    df
    .writeTo("gizmobox.bronze.py_memberships")
    .createOrReplace()
)

In [0]:
%sql
-- Show every column and row of the bronze memberships table.
SELECT *
FROM gizmobox.bronze.py_memberships;

### Read Data from Addresses File

In [0]:
# Read the addresses folder as CSV with no reader options set.
df = spark.read.csv(
    "/Volumes/gizmobox/Landing/operational_data/addresses"
)
display(df)

In [0]:
# CSV files can use a different delimiter and may carry a header row.
# In SQL the read_files function is used to address this; in PySpark the
# same is done with reader options.
df = (
    spark.read
    .format('csv')
    .option("delimiter", "\t")  # fields are separated by tabs
    .option("header", True)     # treat the first line as column names
    .load("/Volumes/gizmobox/Landing/operational_data/addresses")
)
display(df)


In [0]:
# Save the current addresses DataFrame as gizmobox.bronze.py_addresses
# (created if missing, replaced otherwise).
(
    df
    .writeTo("gizmobox.bronze.py_addresses")
    .createOrReplace()
)

In [0]:
%sql
-- Show every column and row of the bronze addresses table.
SELECT *
FROM gizmobox.bronze.py_addresses;

### Reading Payments file

In [0]:
# Payments schema as a DDL string: one "name TYPE" entry per column, comma-separated.
payment_schema = ", ".join((
    'payment_id INTEGER',
    'order_id INTEGER',
    'payment_timestamp TIMESTAMP',
    'payment_status INTEGER',
    'payment_method STRING',
))

In [0]:
# Other way to declare the payments schema: a StructType built from StructField entries.

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType

py_payment_schema = StructType([
    StructField(column_name, column_type, True)  # third argument True = nullable
    for column_name, column_type in [
        ('payment_id', IntegerType()),
        ('order_id', IntegerType()),
        ('payment_timestamp', TimestampType()),
        ('payment_status', IntegerType()),
        ('payment_method', StringType()),
    ]
])

In [0]:
# The payments file has no header row, so the column names and types are
# supplied explicitly through payment_schema.
df = (
    spark.read
    .format('csv')
    .schema(payment_schema)
    .option('delimiter', ",")
    .load("abfss://gizmobox@deacourseextdld.dfs.core.windows.net/Landing/external_data/payments")
)
display(df)


In [0]:
# Save the current payments DataFrame as gizmobox.bronze.py_payments
# (created if missing, replaced otherwise).
(
    df
    .writeTo("gizmobox.bronze.py_payments")
    .createOrReplace()
)

In [0]:
%sql
-- Show every column and row of the bronze payments table.
SELECT *
FROM gizmobox.bronze.py_payments;