In [1]:
from pyspark.sql import SparkSession
from datetime import datetime


In [2]:
spark = SparkSession.builder.appName("myAPP").getOrCreate()

# Raw Data

In [1]:
import pandas as pd

csv_df = pd.read_csv("orders_large_bad.csv", low_memory=False)
json_df = pd.read_json("orders_large_bad.json", lines=True)

df = pd.concat([csv_df, json_df], ignore_index=True)

In [3]:

df.columns = df.columns.str.strip().str.lower()

df["order_id"] = pd.to_numeric(df["order_id"], errors="coerce")
df["customer_id"] = pd.to_numeric(df["customer_id"], errors="coerce")
df["amount"] = pd.to_numeric(df["amount"], errors="coerce")

df["order_date"] = pd.to_datetime(df["order_date"], errors="coerce")

df = df.dropna(subset=["order_id", "customer_id", "order_date"])

df.to_csv("orders_clean.csv", index=False)

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType

spark = SparkSession.builder \
    .appName("OrdersDataPipeline") \
    .getOrCreate()

schema = StructType([
    StructField("order_id", IntegerType(), True),
    StructField("customer_id", IntegerType(), True),
    StructField("city", StringType(), True),
    StructField("category", StringType(), True),
    StructField("product", StringType(), True),
    StructField("amount", DoubleType(), True),
    StructField("order_date", DateType(), True),
    StructField("status", StringType(), True)
])

orders_df = spark.read.csv("orders_clean.csv", header=True, schema=schema)

orders_df.printSchema()
orders_df.show(5)

root
 |-- order_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- order_date: date (nullable = true)
 |-- status: string (nullable = true)

+--------+-----------+----+--------+-------+------+----------+------+
|order_id|customer_id|city|category|product|amount|order_date|status|
+--------+-----------+----+--------+-------+------+----------+------+
+--------+-----------+----+--------+-------+------+----------+------+



In [5]:
orders_df = orders_df.dropDuplicates(["order_id"])

from pyspark.sql.functions import trim, lower
orders_df = orders_df.withColumn("city", trim(lower(orders_df["city"]))) \
                     .withColumn("category", trim(lower(orders_df["category"]))) \
                     .withColumn("status", trim(lower(orders_df["status"])))


orders_df.write.mode("overwrite").parquet("orders_parquet")

In [12]:
orders_df = orders_df.withColumn(
    "amount_int",
    when(col("amount").rlike("^[0-9]+$"), col("amount").cast(IntegerType())).otherwise(None)
)


In [13]:
from pyspark.sql.functions import sum as _sum, avg, max, min, col, when

# Revenue summary by category
revenue_by_category = orders_df.groupBy("category").agg(
    _sum(col("amount_int")).alias("total_revenue"),
    avg(col("amount_int")).alias("avg_revenue"),
    max(col("amount_int")).alias("max_revenue"),
    min(col("amount_int")).alias("min_revenue")
)

revenue_by_category.show(truncate=False)


+--------+-------------+-----------+-----------+-----------+
|category|total_revenue|avg_revenue|max_revenue|min_revenue|
+--------+-------------+-----------+-----------+-----------+
+--------+-------------+-----------+-----------+-----------+



In [14]:
revenue_by_category.explain(True)


== Parsed Logical Plan ==
'Aggregate ['category], ['category, 'sum('amount_int) AS total_revenue#157, 'avg('amount_int) AS avg_revenue#158, 'max('amount_int) AS max_revenue#159, 'min('amount_int) AS min_revenue#160]
+- Project [order_id#0, customer_id#1, city#42, category#43, product#4, amount#5, order_date#6, status#44, CASE WHEN RLIKE(cast(amount#5 as string), ^[0-9]+$) THEN cast(amount#5 as int) ELSE cast(null as int) END AS amount_int#156]
   +- Project [order_id#0, customer_id#1, city#42, category#43, product#4, amount#5, order_date#6, trim(lower(status#7), None) AS status#44]
      +- Project [order_id#0, customer_id#1, city#42, trim(lower(category#3), None) AS category#43, product#4, amount#5, order_date#6, status#7]
         +- Project [order_id#0, customer_id#1, trim(lower(city#2), None) AS city#42, category#3, product#4, amount#5, order_date#6, status#7]
            +- Deduplicate [order_id#0]
               +- Relation [order_id#0,customer_id#1,city#2,category#3,product#4,am