In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, count

# Reuse an already-active Spark session if there is one, otherwise start one.
spark = (
    SparkSession.builder
    .appName("Customer Orders Analysis")
    .getOrCreate()
)

# Both CSV files are read with a header row and with column types inferred
# from the data.
csv_reader_opts = {"header": True, "inferSchema": True}
df_customers = spark.read.csv("customers(1).csv", **csv_reader_opts)
df_orders = spark.read.csv("orders(1).csv", **csv_reader_opts)

In [3]:
# Print the schema and a preview of each input table: customers first,
# then orders.
for frame in (df_customers, df_orders):
    frame.printSchema()
    frame.show()

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Age: integer (nullable = true)

+----------+-----+---------+---+
|CustomerID| Name|     City|Age|
+----------+-----+---------+---+
|       101|Aditi|   Mumbai| 28|
|       102|Rohan|    Delhi| 35|
|       103|Meena|Bangalore| 41|
|       104|Kabir|Hyderabad| 30|
|       105| Zoya|  Chennai| 25|
+----------+-----+---------+---+

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: date (nullable = true)

+-------+----------+-------+--------+-----+----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|
+-------+----------+-------+--------+-----+----------+
|   1001|       101| Laptop|       1|70000|2024-01-05|
|   1002|       102| Mobile|       2|25000|2024-02-10|
|   1003|       103|   D

In [6]:
from pyspark.sql.functions import col
# Add a per-order total: Quantity multiplied by Price.
df_orders = df_orders.withColumn(
    "total_amount",
    col("Quantity") * col("Price"),
)

In [9]:
# Attach customer attributes to each order. This is an inner join, so rows
# without a matching CustomerID on the other side are dropped.
df_joined = df_orders.join(df_customers, on="CustomerID", how="inner")

In [13]:
# Keep only the orders whose total_amount exceeds 20000, then preview them.
df_high_value = df_joined.where(col("total_amount") > 20000)
df_high_value.show()


+----------+-------+-------+--------+-----+----------+------------+-----+------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|total_amount| Name|  City|Age|
+----------+-------+-------+--------+-----+----------+------------+-----+------+---+
|       101|   1001| Laptop|       1|70000|2024-01-05|       70000|Aditi|Mumbai| 28|
|       102|   1002| Mobile|       2|25000|2024-02-10|       50000|Rohan| Delhi| 35|
+----------+-------+-------+--------+-----+----------+------------+-----+------+---+



In [14]:
# Customers with more than one order: count rows per CustomerID and keep
# the groups whose count is above 1.
(
    df_orders
    .groupBy("CustomerID")
    .count()
    .where("count > 1")
    .show()
)

+----------+-----+
|CustomerID|count|
+----------+-----+
|       101|    2|
+----------+-----+



In [16]:
# Mean total_amount of the orders in each city.
avg_order_value = avg("total_amount").alias("AvgOrderValue")
df_joined.groupBy("City").agg(avg_order_value).show()

+---------+-------------+
|     City|AvgOrderValue|
+---------+-------------+
|Bangalore|      10000.0|
|   Mumbai|      36500.0|
|    Delhi|      50000.0|
|Hyderabad|      12000.0|
+---------+-------------+



In [17]:
# Show the joined orders from newest to oldest OrderDate.
df_joined.sort(col("OrderDate").desc()).show()

+----------+-------+-------+--------+-----+----------+------------+-----+---------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|total_amount| Name|     City|Age|
+----------+-------+-------+--------+-----+----------+------------+-----+---------+---+
|       104|   1005|Monitor|       1|12000|2024-04-25|       12000|Kabir|Hyderabad| 30|
|       101|   1004|  Mouse|       3| 1000|2024-04-01|        3000|Aditi|   Mumbai| 28|
|       103|   1003|   Desk|       1|10000|2024-03-15|       10000|Meena|Bangalore| 41|
|       102|   1002| Mobile|       2|25000|2024-02-10|       50000|Rohan|    Delhi| 35|
|       101|   1001| Laptop|       1|70000|2024-01-05|       70000|Aditi|   Mumbai| 28|
+----------+-------+-------+--------+-----+----------+------------+-----+---------+---+



In [18]:
# Persist the joined data as Parquet, partitioned by City. "overwrite"
# replaces any existing output at the target path.
(
    df_joined.write
    .mode("overwrite")
    .partitionBy("City")
    .parquet("/content/customer_orders_parquet")
)


In [19]:
# Expose the joined DataFrame to Spark SQL under the name "customer_orders".
df_joined.createOrReplaceTempView(name="customer_orders")

In [23]:
from pyspark.sql.functions import col, expr, count, avg, sum, desc
from pyspark.sql.types import IntegerType
from pyspark.sql import SparkSession

# Total sales per customer, highest first.
# Group on CustomerID as well as Name: Name is only a display label, and
# grouping on it alone would merge distinct customers who share a name
# into a single row with a combined total.
spark.sql("""
SELECT Name, SUM(total_amount) as TotalSales
FROM customer_orders
GROUP BY CustomerID, Name
ORDER BY TotalSales DESC
""").show()

+-----+----------+
| Name|TotalSales|
+-----+----------+
|Aditi|     73000|
|Rohan|     50000|
|Kabir|     12000|
|Meena|     10000|
+-----+----------+



In [24]:
# Per city, count the rows that have a non-null Product.
orders_per_city_sql = """
SELECT City, COUNT(Product) as ProductCount
FROM customer_orders
GROUP BY City
"""
spark.sql(orders_per_city_sql).show()


+---------+------------+
|     City|ProductCount|
+---------+------------+
|Bangalore|           1|
|   Mumbai|           2|
|    Delhi|           1|
|Hyderabad|           1|
+---------+------------+



In [26]:
# Top two cities by total revenue.
# City is a secondary sort key so that LIMIT 2 returns the same rows on
# every run even when several cities tie on Revenue; without a tie-breaker
# the choice among tied rows is left to the engine.
spark.sql("""
SELECT City, SUM(total_amount) as Revenue
FROM customer_orders
GROUP BY City
ORDER BY Revenue DESC, City
LIMIT 2
""").show()

+------+-------+
|  City|Revenue|
+------+-------+
|Mumbai|  73000|
| Delhi|  50000|
+------+-------+

