In [1]:
from pyspark.sql import SparkSession
from datetime import datetime


In [2]:
spark = SparkSession.builder.appName("myAPP").getOrCreate()

# Raw Data

In [3]:
orders_data = [
    ("ORD001","C001","Delhi ","Electronics","Laptop","45000","2024-01-05","Completed"),
    ("ORD002","C002","Mumbai","Electronics","Mobile ","32000","05/01/2024","Completed"),
    ("ORD003","C003","Bangalore","Electronics","Tablet","30000","2024/01/06","Completed"),
    ("ORD004","C004","Delhi","Electronics","Laptop","","2024-01-07","Cancelled"),
    ("ORD005","C005","Chennai","Electronics","Mobile","invalid","2024-01-08","Completed"),
    ("ORD006","C006","Mumbai","Home","Mixer",None,"2024-01-08","Completed"),
    ("ORD007","C001","Delhi","Electronics","Laptop","47000","09-01-2024","Completed"),
    ("ORD008","C007","Bangalore","Home","Vacuum","28000","2024-01-09","Completed"),
    ("ORD009","C002","Mumbai","Electronics","Laptop","55000","2024-01-10","Completed"),
    ("ORD010","C008","Delhi","Home","AirPurifier","38000","2024-01-10","Completed"),
    ("ORD011","C009","Mumbai","Home","Vacuum","29000","2024-01-11","Completed"),
    ("ORD012","C010","Bangalore","Electronics","Mobile","33000","2024-01-11","Completed"),
    ("ORD013","C003","Bangalore","Home","Mixer","21000","2024-01-12","Completed"),
    ("ORD014","C004","Delhi","Electronics","Tablet","26000","2024-01-12","Completed"),
    ("ORD015","C005","Chennai","Electronics","Laptop","62000","2024-01-13","Completed"),
    ("ORD016","C006","Mumbai","Home","AirPurifier","40000","2024-01-13","Completed"),
    ("ORD017","C007","Bangalore","Electronics","Laptop","51000","2024-01-14","Completed"),
    ("ORD018","C008","Delhi","Home","Vacuum","31000","2024-01-14","Completed"),
    ("ORD019","C009","Mumbai","Electronics","Tablet","29000","2024-01-15","Completed"),
    ("ORD020","C010","Bangalore","Electronics","Laptop","54000","2024-01-15","Completed"),
    ("ORD020","C010","Bangalore","Electronics","Laptop","54000","2024-01-15","Completed")
]


# Clean Data

In [4]:
import pandas as pd
import numpy as np
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

In [5]:
df = pd.DataFrame(orders_data, columns=[
    "OrderID","CustomerID","City","Category","Product","Amount","OrderDate","Status"
])

In [6]:
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [7]:
df["Amount"] = pd.to_numeric(df["Amount"], errors="coerce")

In [8]:
df["Amount"].fillna(df["Amount"].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Amount"].fillna(df["Amount"].median(), inplace=True)


In [13]:

df["Amount"] = df["Amount"].astype(int)

In [14]:
df["OrderDate"] = pd.to_datetime(df["OrderDate"], errors="coerce", dayfirst=True)

In [15]:
df = df.drop_duplicates()

In [16]:
schema = StructType([
    StructField("OrderID", StringType(), True),
    StructField("CustomerID", StringType(), True),
    StructField("City", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("Product", StringType(), True),
    StructField("Amount", IntegerType(), True),
    StructField("OrderDate", DateType(), True),
    StructField("Status", StringType(), True)
])

In [17]:
spark_df = spark.createDataFrame(df, schema=schema)

In [18]:
spark_df.show()

+-------+----------+---------+-----------+-----------+------+----------+---------+
|OrderID|CustomerID|     City|   Category|    Product|Amount| OrderDate|   Status|
+-------+----------+---------+-----------+-----------+------+----------+---------+
| ORD001|      C001|    Delhi|Electronics|     Laptop| 45000|2024-05-01|Completed|
| ORD002|      C002|   Mumbai|Electronics|     Mobile| 32000|      NULL|Completed|
| ORD003|      C003|Bangalore|Electronics|     Tablet| 30000|      NULL|Completed|
| ORD004|      C004|    Delhi|Electronics|     Laptop| 35500|2024-07-01|Cancelled|
| ORD005|      C005|  Chennai|Electronics|     Mobile| 35500|2024-08-01|Completed|
| ORD006|      C006|   Mumbai|       Home|      Mixer| 35500|2024-08-01|Completed|
| ORD007|      C001|    Delhi|Electronics|     Laptop| 47000|      NULL|Completed|
| ORD008|      C007|Bangalore|       Home|     Vacuum| 28000|2024-09-01|Completed|
| ORD009|      C002|   Mumbai|Electronics|     Laptop| 55000|2024-10-01|Completed|
| OR

# Keep Only Completed Orders

In [19]:
df = df[df["Status"] == "Completed"]

# Verify No Nulls in Key Columns

In [20]:
null_check = df[["OrderID", "Amount", "OrderDate"]].isnull().sum()
print("Null check:\n", null_check)

Null check:
 OrderID      0
Amount       0
OrderDate    9
dtype: int64


In [23]:
spark_df = spark_df.dropna(subset=["OrderDate"])

In [22]:
null_check = df[["OrderID", "Amount", "OrderDate"]].isnull().sum()
print("Null check:\n", null_check)

Null check:
 OrderID      0
Amount       0
OrderDate    0
dtype: int64


# Total Revenue per City




In [24]:
from pyspark.sql import functions as F

revenue_city = spark_df.groupBy("City").agg(F.sum("Amount").alias("TotalRevenue"))
revenue_city.show()

+---------+------------+
|     City|TotalRevenue|
+---------+------------+
|Bangalore|       82000|
|  Chennai|       35500|
|   Mumbai|      119500|
|    Delhi|      144500|
+---------+------------+



# Total revenue per category

In [25]:
revenue_category = spark_df.groupBy("Category").agg(F.sum("Amount").alias("TotalRevenue"))
revenue_category.show()


+-----------+------------+
|   Category|TotalRevenue|
+-----------+------------+
|       Home|      151500|
|Electronics|      230000|
+-----------+------------+



# Total Revenue per Product


In [26]:
revenue_product = spark_df.groupBy("Product").agg(F.sum("Amount").alias("TotalRevenue"))
revenue_product.show()

+-----------+------------+
|    Product|TotalRevenue|
+-----------+------------+
|     Vacuum|       57000|
|AirPurifier|       38000|
|     Laptop|      135500|
|      Mixer|       56500|
|     Mobile|       68500|
|     Tablet|       26000|
+-----------+------------+



# Average Order Value per City

In [27]:

avg_order_city = spark_df.groupBy("City").agg(F.avg("Amount").alias("AvgOrderValue"))
avg_order_city.show()

+---------+------------------+
|     City|     AvgOrderValue|
+---------+------------------+
|Bangalore|27333.333333333332|
|  Chennai|           35500.0|
|   Mumbai|39833.333333333336|
|    Delhi|           36125.0|
+---------+------------------+



# Identify Top 3 Products by Revenue


In [28]:
top_products = (spark_df.groupBy("Product")
                .agg(F.sum("Amount").alias("TotalRevenue"))
                .orderBy(F.desc("TotalRevenue"))
                .limit(3))
top_products.show()


+-------+------------+
|Product|TotalRevenue|
+-------+------------+
| Laptop|      135500|
| Mobile|       68500|
| Vacuum|       57000|
+-------+------------+



# Windows

In [29]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Rank Cities by Total Revenue

In [30]:
city_revenue = spark_df.groupBy("City").agg(F.sum("Amount").alias("TotalRevenue"))

city_window = Window.orderBy(F.desc("TotalRevenue"))

ranked_cities = city_revenue.withColumn("Rank", F.rank().over(city_window))
ranked_cities.show()


+---------+------------+----+
|     City|TotalRevenue|Rank|
+---------+------------+----+
|    Delhi|      144500|   1|
|   Mumbai|      119500|   2|
|Bangalore|       82000|   3|
|  Chennai|       35500|   4|
+---------+------------+----+



# Rank Products Within Each Category by Revenue

In [31]:
product_revenue = spark_df.groupBy("Category","Product").agg(F.sum("Amount").alias("TotalRevenue"))

product_window = Window.partitionBy("Category").orderBy(F.desc("TotalRevenue"))

ranked_products = product_revenue.withColumn("Rank", F.dense_rank().over(product_window))
ranked_products.show()

+-----------+-----------+------------+----+
|   Category|    Product|TotalRevenue|Rank|
+-----------+-----------+------------+----+
|Electronics|     Laptop|      135500|   1|
|Electronics|     Mobile|       68500|   2|
|Electronics|     Tablet|       26000|   3|
|       Home|     Vacuum|       57000|   1|
|       Home|      Mixer|       56500|   2|
|       Home|AirPurifier|       38000|   3|
+-----------+-----------+------------+----+



# cache

In [32]:
spark_df.cache()
spark_df.count()

11

# Run Multiple Aggregations and Observe Behavior

In [33]:
revenue_city = spark_df.groupBy("City").agg(F.sum("Amount").alias("TotalRevenue"))
revenue_category = spark_df.groupBy("Category").agg(F.sum("Amount").alias("TotalRevenue"))
avg_order_city = spark_df.groupBy("City").agg(F.avg("Amount").alias("AvgOrderValue"))

revenue_city.show()
revenue_category.show()
avg_order_city.show()


+---------+------------+
|     City|TotalRevenue|
+---------+------------+
|Bangalore|       82000|
|  Chennai|       35500|
|   Mumbai|      119500|
|    Delhi|      144500|
+---------+------------+

+-----------+------------+
|   Category|TotalRevenue|
+-----------+------------+
|       Home|      151500|
|Electronics|      230000|
+-----------+------------+

+---------+------------------+
|     City|     AvgOrderValue|
+---------+------------------+
|Bangalore|27333.333333333332|
|  Chennai|           35500.0|
|   Mumbai|39833.333333333336|
|    Delhi|           36125.0|
+---------+------------------+



# Explain

In [34]:
revenue_city.explain(True)

== Parsed Logical Plan ==
'Aggregate ['City], ['City, 'sum('Amount) AS TotalRevenue#457]
+- Filter atleastnnonnulls(1, OrderDate#6)
   +- LogicalRDD [OrderID#0, CustomerID#1, City#2, Category#3, Product#4, Amount#5, OrderDate#6, Status#7], false

== Analyzed Logical Plan ==
City: string, TotalRevenue: bigint
Aggregate [City#2], [City#2, sum(Amount#5) AS TotalRevenue#457L]
+- Filter atleastnnonnulls(1, OrderDate#6)
   +- LogicalRDD [OrderID#0, CustomerID#1, City#2, Category#3, Product#4, Amount#5, OrderDate#6, Status#7], false

== Optimized Logical Plan ==
Aggregate [City#2], [City#2, sum(Amount#5) AS TotalRevenue#457L]
+- Project [City#2, Amount#5]
   +- InMemoryRelation [OrderID#0, CustomerID#1, City#2, Category#3, Product#4, Amount#5, OrderDate#6, Status#7], StorageLevel(disk, memory, deserialized, 1 replicas)
         +- *(1) Filter atleastnnonnulls(1, OrderDate#6)
            +- *(1) Scan ExistingRDD[OrderID#0,CustomerID#1,City#2,Category#3,Product#4,Amount#5,OrderDate#6,Status#7]


# Write cleaned order-level data to Parquet

In [35]:
spark_df.write.mode("overwrite").parquet("output/cleaned_orders_parquet")

# ORC

In [36]:
from pyspark.sql import functions as F

revenue_city = spark_df.groupBy("City").agg(F.sum("Amount").alias("TotalRevenue"))
revenue_city.write.mode("overwrite").orc("output/revenue_city_orc")

#