In [None]:
import os
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col


In [None]:
# Start (or reuse) a Spark session that runs entirely on this machine.
# "local[*]" is the local master URL; getOrCreate() returns an already
# running session instead of building a second one when the cell is re-run.
spark = (
    SparkSession.builder
    .appName("Local PySpark ETL")
    .master("local[*]")
    .getOrCreate()
)

# Last expression of the cell: display the session summary.
spark

### Extracting data from landing_zone

In [None]:
def read_landing_csv(path):
    """Load one landing-zone CSV file into a Spark DataFrame.

    The first line of the file is used as the header and the file is decoded
    as UTF-8. No schema is supplied and schema inference is not enabled, so
    the columns are typed explicitly later in the notebook.

    Parameters
    ----------
    path : str
        Location of the CSV file, relative to the notebook's working directory.

    Returns
    -------
    pyspark.sql.DataFrame
        The file contents, one column per header field.
    """
    return (
        spark.read.format("csv")
        .option("header", "true")
        .option("encoding", "UTF-8")
        .load(path)
    )


# One DataFrame per source table; all of them share the same reader settings.
df_orders = read_landing_csv("data/landing_zone/orders/19980505/orders_initial.csv")
df_categories = read_landing_csv("data/landing_zone/categories/categories.csv")
df_customers = read_landing_csv("data/landing_zone/customers/customers.csv")
df_orders_details = read_landing_csv("data/landing_zone/orders_details/orders_details.csv")
df_products = read_landing_csv("data/landing_zone/products/products.csv")
df_suppliers = read_landing_csv("data/landing_zone/suppliers/suppliers.csv")

# Quick sanity check of one of the loaded tables.
df_products.show(5)
df_products.printSchema()

### RAW ZONE

In [None]:
# Give the key columns snake_case names before joining.
# Orders and order details are renamed identically so that they share the
# "order_id" join key used further down.
order_key_rename = ("orderid", "order_id")
df_orders_raw = df_orders.withColumnRenamed(*order_key_rename)
df_orders_details_raw = df_orders_details.withColumnRenamed(*order_key_rename)

# Reference tables: same convention for their own keys.
df_products_raw = df_products.withColumnRenamed("productid", "product_id")
df_categories_raw = df_categories.withColumnRenamed("categoryid", "category_id")
df_customers_raw = df_customers.withColumnRenamed("customerid", "customer_id")

In [None]:
from pyspark.sql.functions import col, to_date
from pyspark.sql.types import IntegerType, DoubleType

# The landing-zone CSVs are loaded without a schema, so the columns that are
# used as keys, dates or numbers are typed explicitly here.
df_orders_raw = (
    df_orders_raw
    .withColumn("order_id", col("order_id").cast(IntegerType()))
    .withColumn("employeeid", col("employeeid").cast(IntegerType()))
    # Date columns are parsed with an explicit yyyy-MM-dd pattern.
    .withColumn("orderdate", to_date("orderdate", "yyyy-MM-dd"))
    .withColumn("requireddate", to_date("requireddate", "yyyy-MM-dd"))
    .withColumn("shippeddate", to_date("shippeddate", "yyyy-MM-dd"))
)

df_orders_details_raw = (
    df_orders_details_raw
    # Cast the join key as well, so it has the same type as
    # df_orders_raw.order_id and the join on "order_id" compares integers
    # on both sides instead of relying on implicit string/int coercion.
    .withColumn("order_id", col("order_id").cast(IntegerType()))
    .withColumn("productid", col("productid").cast(IntegerType()))
    .withColumn("unitprice", col("unitprice").cast(DoubleType()))
    .withColumn("quantity", col("quantity").cast(IntegerType()))
    .withColumn("discount", col("discount").cast(DoubleType()))
)


In [None]:

# Output folders are built relative to the working directory, the same way the
# landing-zone inputs are read ("data/landing_zone/..."), so the notebook is
# not tied to one machine's drive layout.
# NOTE(review): assumes the notebook runs from the project root, as the
# relative input paths already require - confirm.
raw_zone_dir = os.path.join("data", "raw_zone")
raw_orders_path = os.path.join(raw_zone_dir, "orders_raw")
raw_order_details_path = os.path.join(raw_zone_dir, "orders_details_raw")

os.makedirs(raw_orders_path, exist_ok=True)
os.makedirs(raw_order_details_path, exist_ok=True)

# Spark -> Pandas: collects the full tables to the driver so that each one can
# be written as a single CSV file.
orders_pdf = df_orders_raw.toPandas()
order_details_pdf = df_orders_details_raw.toPandas()

# utf-8-sig writes a byte-order mark at the start of each file.
orders_pdf.to_csv(
    os.path.join(raw_orders_path, "orders_raw.csv"),
    index=False,
    encoding="utf-8-sig"
)
order_details_pdf.to_csv(
    os.path.join(raw_order_details_path, "orders_details_raw.csv"),
    index=False,
    encoding="utf-8-sig"
)


### SERVING ZONE

In [None]:
# Build the fact table: one row per order line, carrying the order header
# columns alongside it. The inner join keeps only order_id values that are
# present in both tables.
df_fact_orders_items = df_orders_raw.join(df_orders_details_raw, "order_id", "inner")

# Preview the joined rows.
df_fact_orders_items.show()

In [None]:
from pyspark.sql.functions import year, month, dayofmonth

# Break the order date into calendar parts.
# year_be is the calendar year shifted by 543
# (presumably the Buddhist Era year - confirm).
df_fact_orders_items = (
    df_fact_orders_items
    .withColumn("year", year("orderdate"))
    .withColumn("month", month("orderdate"))
    .withColumn("day", dayofmonth("orderdate"))
    .withColumn("year_be", year("orderdate") + 543)
)

# Preview the new columns.
df_fact_orders_items.show()


In [None]:
# Print the column names and types of the joined fact table.
df_fact_orders_items.printSchema()


In [None]:
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, DoubleType, DateType

# Target type of every numeric column in the serving table, applied in order.
serving_casts = [
    ("order_id", IntegerType()),
    ("employeeid", IntegerType()),
    ("shipvia", IntegerType()),
    ("freight", DoubleType()),
    ("productid", IntegerType()),
    ("unitprice", DoubleType()),
    ("quantity", IntegerType()),
    ("discount", DoubleType()),
]

# Start from the fact table and replace each listed column with its typed
# version; withColumn on an existing name overwrites that column.
df_serving = df_fact_orders_items
for column_name, target_type in serving_casts:
    df_serving = df_serving.withColumn(column_name, col(column_name).cast(target_type))



In [None]:
# Sales metrics per order line.
# line_total is the undiscounted amount; the other two metrics split it into
# the part that is kept (net_sales) and the part given away (discount_amount).
line_total = col("unitprice") * col("quantity")

df_serving = (
    df_serving
    .withColumn("gross_sales", line_total)
    .withColumn("net_sales", line_total * (1 - col("discount")))
    .withColumn("discount_amount", line_total * col("discount"))
)

# Preview the new columns.
df_serving.show()


In [None]:
from pyspark.sql.functions import when

# Flag order lines that carry a discount: "Y" when discount is above zero,
# "N" for everything else.
discount_flag = when(col("discount") > 0, "Y").otherwise("N")
df_serving = df_serving.withColumn("has_discount", discount_flag)

# Preview the new column.
df_serving.show()


In [None]:
# Shipping intelligence: bucket each row by its freight cost.
# The branches are checked top-down, so MEDIUM_COST covers freight above 20
# and up to 50; anything that matches neither branch becomes LOW_COST.
shipping_bucket = (
    when(col("freight") > 50, "HIGH_COST")
    .when(col("freight") > 20, "MEDIUM_COST")
    .otherwise("LOW_COST")
)
df_serving = df_serving.withColumn("shipping_type", shipping_bucket)

# Preview the new column.
df_serving.show()


In [None]:
# Bucket each order line by quantity: below 10 is SMALL, 10 to 29 is MEDIUM,
# and anything that matches neither branch is LARGE.
size_bucket = (
    when(col("quantity") < 10, "SMALL")
    .when(col("quantity") < 30, "MEDIUM")
    .otherwise("LARGE")
)
df_serving = df_serving.withColumn("order_size", size_bucket)

# Preview the new column.
df_serving.show()

In [None]:
from pyspark.sql.functions import col, date_format, to_date

# Add a snake_case date column derived from orderdate.
df_serving = df_serving.withColumn("order_date", to_date(col("orderdate")))

# Format that date with the "MMM" pattern (abbreviated month name).
df_serving = df_serving.withColumn("month_name", date_format(col("order_date"), "MMM"))

# Preview the new columns.
df_serving.show()

### Export file

In [None]:

# order_size is already derived earlier in the notebook with the same rules,
# so it is not recomputed here.

# The output folder is built relative to the working directory, the same way
# the landing-zone inputs are read ("data/landing_zone/..."), so the notebook
# is not tied to one machine's drive layout.
# NOTE(review): assumes the notebook runs from the project root, as the
# relative input paths already require - confirm.
serving_folder = os.path.join("data", "serving_zone")
os.makedirs(serving_folder, exist_ok=True)

# Spark -> Pandas: collects the full serving table to the driver so that it
# can be written as a single CSV file.
pdf = df_serving.toPandas()

# utf-8-sig writes a byte-order mark at the start of the file.
pdf.to_csv(
    os.path.join(serving_folder, "fact_sale_orders_items.csv"),
    index=False,
    encoding="utf-8-sig"
)
