**Initialize the SparkSession**

In [0]:
from pyspark.sql import SparkSession
# Build the session for this notebook; getOrCreate() reuses an already-active
# session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("SalesDataProcessing")
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

**Create DataFrame**

In [0]:
from pyspark.sql import Row
def _item(product, qty):
    """Return one order line as a dict with the keys "Product" and "Qty"."""
    return {"Product": product, "Qty": qty}


# Four sample orders; Items holds a list of order-line dicts per order.
data = [
    Row(
        OrderID=101,
        Customer="Ali",
        Items=[_item("Laptop", 1), _item("Mouse", 2)],
        Region="Asia",
        Amount=1200.0,
    ),
    Row(
        OrderID=102,
        Customer="Zara",
        Items=[_item("Tablet", 1)],
        Region="Europe",
        Amount=650.0,
    ),
    Row(
        OrderID=103,
        Customer="Mohan",
        Items=[_item("Phone", 2), _item("Charger", 1)],
        Region="Asia",
        Amount=890.0,
    ),
    Row(
        OrderID=104,
        Customer="Sara",
        Items=[_item("Desk", 1)],
        Region="US",
        Amount=450.0,
    ),
]

# No explicit schema is passed, so Spark infers one from the rows. As the
# printed schema shows, Items is inferred as array<map<string,string>>, i.e.
# the Qty values are carried as strings.
df_sales = spark.createDataFrame(data)
df_sales.printSchema()
df_sales.show(truncate=False)

root
 |-- OrderID: long (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Items: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)
 |-- Region: string (nullable = true)
 |-- Amount: double (nullable = true)

+-------+--------+--------------------------------------------------------------+------+------+
|OrderID|Customer|Items                                                         |Region|Amount|
+-------+--------+--------------------------------------------------------------+------+------+
|101    |Ali     |[{Product -> Laptop, Qty -> 1}, {Product -> Mouse, Qty -> 2}] |Asia  |1200.0|
|102    |Zara    |[{Product -> Tablet, Qty -> 1}]                               |Europe|650.0 |
|103    |Mohan   |[{Product -> Phone, Qty -> 2}, {Product -> Charger, Qty -> 1}]|Asia  |890.0 |
|104    |Sara    |[{Product -> Desk, Qty -> 1}]                                 |US    |450.0 |
+-------+--------+--------------------------------------------------------------+------+------+

**Working with JSON & Nested Fields**

In [0]:
#1. Flatten Items using explode()
from pyspark.sql.functions import explode, col
# explode() emits one row per element of the Items array. Each element is a
# map, so "Item.Product" / "Item.Qty" look up the map keys. The inferred map
# values are strings, hence the explicit cast of Qty to double.
df_exploded = (
    df_sales
    .withColumn("Item", explode("Items"))
    .withColumn("Product", col("Item.Product"))
    .withColumn("Qty", col("Item.Qty").cast("double"))
    .drop("Items", "Item")
)
print("Schema after flattening:")
df_exploded.show(truncate=False)
df_exploded.printSchema()

#2. Count total quantity sold per product
# Spark's sum is imported under an alias so that Python's builtin sum() is not
# shadowed for the rest of the kernel session.
from pyspark.sql.functions import sum as spark_sum
print("Total quantity sold per product:")
df_exploded.groupBy("Product").agg(spark_sum("Qty").alias("TotalQty")).show()

#3. Count number of orders per region
# Counted on df_sales (one row per order), not on df_exploded (one row per
# item), so multi-item orders are not counted more than once.
print("Number of orders per region:")
df_sales.groupBy("Region").count().withColumnRenamed("count", "OrderCount").show()

Schema after flattening:
+-------+--------+------+------+-------+---+
|OrderID|Customer|Region|Amount|Product|Qty|
+-------+--------+------+------+-------+---+
|101    |Ali     |Asia  |1200.0|Laptop |1.0|
|101    |Ali     |Asia  |1200.0|Mouse  |2.0|
|102    |Zara    |Europe|650.0 |Tablet |1.0|
|103    |Mohan   |Asia  |890.0 |Phone  |2.0|
|103    |Mohan   |Asia  |890.0 |Charger|1.0|
|104    |Sara    |US    |450.0 |Desk   |1.0|
+-------+--------+------+------+-------+---+

root
 |-- OrderID: long (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Amount: double (nullable = true)
 |-- Product: string (nullable = true)
 |-- Qty: double (nullable = true)

Total quantity sold per product:
+-------+--------+
|Product|TotalQty|
+-------+--------+
| Laptop|     1.0|
|  Mouse|     2.0|
| Tablet|     1.0|
|  Phone|     2.0|
|Charger|     1.0|
|   Desk|     1.0|
+-------+--------+

Number of orders per region:
+------+----------+
|Region|OrderCount|
+------+----------+
|  Asia|         2|
|Europe|         1|
|    US|         1|
+------+----------+

**Using when and otherwise**

In [0]:
#4. Create a new column HighValueOrder
from pyspark.sql.functions import when
# "Yes" only when Amount is strictly greater than 1000, otherwise "No".
high_value_flag = when(col("Amount") > 1000, "Yes").otherwise("No")
print("HighValueOrder column:")
df_sales = df_sales.withColumn("HighValueOrder", high_value_flag)
df_sales.select("OrderID", "Amount", "HighValueOrder").show()

#5. Add a column ShippingZone
# Region -> zone lookup; any region not listed here falls through to "Unknown".
region_col = col("Region")
shipping_zone = (
    when(region_col == "Asia", "Zone A")
    .when(region_col == "Europe", "Zone B")
    .when(region_col == "US", "Zone C")
    .otherwise("Unknown")
)
df_sales = df_sales.withColumn("ShippingZone", shipping_zone)
print("ShippingZone column:")
df_sales.select("Region", "ShippingZone").distinct().show()

HighValueOrder column:
+-------+------+--------------+
|OrderID|Amount|HighValueOrder|
+-------+------+--------------+
|    101|1200.0|           Yes|
|    102| 650.0|            No|
|    103| 890.0|            No|
|    104| 450.0|            No|
+-------+------+--------------+

ShippingZone column:
+------+------------+
|Region|ShippingZone|
+------+------------+
|  Asia|      Zone A|
|Europe|      Zone B|
|    US|      Zone C|
+------+------------+



**Temporary & Permanent Views**

In [0]:
#6. Register df_sales as a temporary view named sales_view
df_sales.createOrReplaceTempView("sales_view")

#7. SQL queries against the temporary view
orders_by_region_sql = "SELECT Region, COUNT(*) as OrderCount FROM sales_view GROUP BY Region"
avg_amount_by_region_sql = "SELECT Region, AVG(Amount) as AvgAmount FROM sales_view GROUP BY Region"

# Count orders by region
print("Orders by region:")
spark.sql(orders_by_region_sql).show()

# Average amount per region
print("Average amount per region:")
spark.sql(avg_amount_by_region_sql).show()

#8. Persist the data with saveAsTable()
# Note: saveAsTable() writes a table (not a view), despite the "_view" suffix
# in the chosen name. mode("overwrite") replaces the table if it already exists.
sales_writer = df_sales.write.mode("overwrite")
sales_writer.saveAsTable("sales_permanent_view")

Orders by region:
+------+----------+
|Region|OrderCount|
+------+----------+
|  Asia|         2|
|Europe|         1|
|    US|         1|
+------+----------+

Average amount per region:
+------+---------+
|Region|AvgAmount|
+------+---------+
|  Asia|   1045.0|
|Europe|    650.0|
|    US|    450.0|
+------+---------+



**SQL Queries via Spark**

In [0]:
#9. Filter orders with more than 1 item
# size(Items) is the number of elements in the Items array of each order.
multi_item_sql = "SELECT * FROM sales_view WHERE size(Items) > 1"
print("Orders with more than 1 item:")
spark.sql(multi_item_sql).show()

#10. Customers with Amount > 800
high_amount_sql = "SELECT Customer, Amount FROM sales_view WHERE Amount > 800"
print("Customers with Amount > 800:")
spark.sql(high_amount_sql).show()

Orders with more than 1 item:
+-------+--------+--------------------+------+------+--------------+------------+
|OrderID|Customer|               Items|Region|Amount|HighValueOrder|ShippingZone|
+-------+--------+--------------------+------+------+--------------+------------+
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|           Yes|      Zone A|
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|            No|      Zone A|
+-------+--------+--------------------+------+------+--------------+------------+

Customers with Amount > 800:
+--------+------+
|Customer|Amount|
+--------+------+
|     Ali|1200.0|
|   Mohan| 890.0|
+--------+------+



**Saving as Parquet and Reading Again**

In [0]:
#11. Save exploded DataFrame as Parquet partitioned by Region
# "overwrite" replaces any existing output at the target path, so this cell
# can be re-run safely.
(
    df_exploded.write
    .mode("overwrite")
    .partitionBy("Region")
    .parquet("/mnt/parquet/sales_by_region")
)


In [0]:
#12. Read Parquet & group by Product
# Reads back the dataset written in step 11 and totals Qty per product.
parquet = spark.read.parquet("/mnt/parquet/sales_by_region")
qty_by_product = parquet.groupBy("Product").sum("Qty")
qty_by_product.withColumnRenamed("sum(Qty)", "TotalQty").show()

+-------+--------+
|Product|TotalQty|
+-------+--------+
|  Phone|     2.0|
|Charger|     1.0|
| Laptop|     1.0|
|  Mouse|     2.0|
| Tablet|     1.0|
|   Desk|     1.0|
+-------+--------+

