In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Row
from datetime import datetime
from pyspark.sql.window import Window as W
import matplotlib.pyplot as plt

In [0]:
# Display the active SparkSession as this cell's output.
# NOTE(review): `spark` is not created in any visible cell of this notebook --
# presumably it is injected by the notebook runtime; confirm before running elsewhere.
spark

# **Spark Tasks**

In [0]:
# 1. Ingest the CSV files into two PySpark DataFrames
# Both files are read with the same options: the first row supplies the column
# names and Spark infers the column types from the data.
csv_opts = {"header": True, "inferSchema": True}

df_cus = spark.read.csv(r"file:/Workspace/Shared/jun-13/customers.csv", **csv_opts)
df_ord = spark.read.csv(r"file:/Workspace/Shared/jun-13/orders.csv", **csv_opts)

In [0]:
# 2. Infer schema and print the schema for both
# The schemas were inferred when the CSVs were read (inferSchema=True).
# printSchema() writes the schema tree to stdout itself and returns None, so it
# is called directly -- wrapping it in print() emits a stray "None" line after
# each schema.
df_cus.printSchema()
df_ord.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Age: integer (nullable = true)

None
root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: date (nullable = true)

None


In [0]:
# 3. Add a column TotalAmount = Quantity * Price to orders
# withColumn replaces a column of the same name, so re-running this cell
# simply recomputes TotalAmount.
line_total = F.col("Quantity") * F.col("Price")
df_ord = df_ord.withColumn("TotalAmount", line_total)
df_ord.show()

+-------+----------+-------+--------+-----+----------+-----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|TotalAmount|
+-------+----------+-------+--------+-----+----------+-----------+
|   1001|       101| Laptop|       1|70000|2024-01-05|      70000|
|   1002|       102| Mobile|       2|25000|2024-02-10|      50000|
|   1003|       103|   Desk|       1|10000|2024-03-15|      10000|
|   1004|       101|  Mouse|       3| 1000|2024-04-01|       3000|
|   1005|       104|Monitor|       1|12000|2024-04-25|      12000|
+-------+----------+-------+--------+-----+----------+-----------+



In [0]:
# 4. Join both DataFrames on CustomerID
# Joining on the column name (rather than an equality expression) keeps a
# single CustomerID column in the result.
df_joined = df_cus.join(other=df_ord, on=["CustomerID"], how="inner")
df_joined.show()

+----------+-----+---------+---+-------+-------+--------+-----+----------+-----------+
|CustomerID| Name|     City|Age|OrderID|Product|Quantity|Price| OrderDate|TotalAmount|
+----------+-----+---------+---+-------+-------+--------+-----+----------+-----------+
|       101|Aditi|   Mumbai| 28|   1001| Laptop|       1|70000|2024-01-05|      70000|
|       102|Rohan|    Delhi| 35|   1002| Mobile|       2|25000|2024-02-10|      50000|
|       103|Meena|Bangalore| 41|   1003|   Desk|       1|10000|2024-03-15|      10000|
|       101|Aditi|   Mumbai| 28|   1004|  Mouse|       3| 1000|2024-04-01|       3000|
|       104|Kabir|Hyderabad| 30|   1005|Monitor|       1|12000|2024-04-25|      12000|
+----------+-----+---------+---+-------+-------+--------+-----+----------+-----------+



In [0]:
# 5. Filter orders where TotalAmount > 20000
# Strictly greater than: an order of exactly 20000 is excluded.
is_large_order = F.col("TotalAmount") > 20_000
df_ord.where(is_large_order).show()

+-------+----------+-------+--------+-----+----------+-----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|TotalAmount|
+-------+----------+-------+--------+-----+----------+-----------+
|   1001|       101| Laptop|       1|70000|2024-01-05|      70000|
|   1002|       102| Mobile|       2|25000|2024-02-10|      50000|
+-------+----------+-------+--------+-----+----------+-----------+



In [0]:
# 6. Show customers who placed more than 1 order
# Count orders per customer first, then keep only customers with 2+ orders.
orders_per_customer = df_joined.groupBy("CustomerID").agg(
    F.count("OrderID").alias("OrderCount")
)
orders_per_customer.where(F.col("OrderCount") > 1).show()

+----------+----------+
|CustomerID|OrderCount|
+----------+----------+
|       101|         2|
+----------+----------+



In [0]:
# 7. Group orders by City and get average order value
# The average is taken over the TotalAmount of every order placed from a city.
avg_order_value = F.avg("TotalAmount").alias("AverageOrderValue")
df_joined.groupBy("City").agg(avg_order_value).show()

+---------+-----------------+
|     City|AverageOrderValue|
+---------+-----------------+
|Bangalore|          10000.0|
|   Mumbai|          36500.0|
|    Delhi|          50000.0|
|Hyderabad|          12000.0|
+---------+-----------------+



In [0]:
# 8. Sort orders by OrderDate in descending order
# Most recent order first.
df_ord.orderBy(F.col("OrderDate").desc()).show()

+-------+----------+-------+--------+-----+----------+-----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|TotalAmount|
+-------+----------+-------+--------+-----+----------+-----------+
|   1005|       104|Monitor|       1|12000|2024-04-25|      12000|
|   1004|       101|  Mouse|       3| 1000|2024-04-01|       3000|
|   1003|       103|   Desk|       1|10000|2024-03-15|      10000|
|   1002|       102| Mobile|       2|25000|2024-02-10|      50000|
|   1001|       101| Laptop|       1|70000|2024-01-05|      70000|
+-------+----------+-------+--------+-----+----------+-----------+



In [0]:
# 9. Write the final result as a Parquet file partitioned by City
# "overwrite" replaces any existing output at the target path, so the cell can
# be re-run safely; each City value lands in its own sub-directory.
(
    df_joined.write
    .partitionBy("City")
    .mode("overwrite")
    .parquet(r"file:/Workspace/Shared/jun-13/df_joined")
)

In [0]:
# 10. Create a temporary view and run Spark SQL:
# Registers the joined DataFrame under the name the SQL cells query;
# an existing view with the same name is replaced.
df_joined.createOrReplaceTempView(name="df_joined")

In [0]:
# 10.1 Total sales by customer
# Grouping includes CustomerID so two customers sharing a Name are not merged;
# only Name and the summed TotalAmount are projected.
sales_by_customer_sql = """
          SELECT Name, SUM(TotalAmount) AS Sales FROM df_joined
          GROUP BY CustomerID, Name
          """
spark.sql(sales_by_customer_sql).show()

+-----+-----+
| Name|Sales|
+-----+-----+
|Aditi|73000|
|Rohan|50000|
|Meena|10000|
|Kabir|12000|
+-----+-----+



In [0]:
# 10.2 Count of products per city
# DISTINCT means a product ordered several times in a city is counted once.
products_per_city_sql = """
          SELECT City, COUNT(DISTINCT(Product)) AS ProductCount FROM df_joined
          GROUP BY City
          """
spark.sql(products_per_city_sql).show()

+---------+------------+
|     City|ProductCount|
+---------+------------+
|Bangalore|           1|
|   Mumbai|           2|
|    Delhi|           1|
|Hyderabad|           1|
+---------+------------+



In [0]:
# 10.3 Top 2 cities by revenue
# Revenue is the summed TotalAmount per city; rows are ranked highest first
# and only the first two are kept.
top_cities_sql = """
          SELECT City, SUM(TotalAmount) AS Revenue FROM df_joined
          GROUP BY City
          ORDER BY SUM(TotalAmount) DESC
          LIMIT 2
          """
spark.sql(top_cities_sql).show()

+------+-------+
|  City|Revenue|
+------+-------+
|Mumbai|  73000|
| Delhi|  50000|
+------+-------+

