In [88]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import time

In [2]:
spark = SparkSession.builder.appName("deo").enableHiveSupport().getOrCreate()

In [3]:
spark

# **Data Ingestion & Exploration**

In [4]:
# 1.1 Load both CSV files with schema inference.
df_customers = spark.read.csv(r"/content/customers.csv", header=True, inferSchema=True)
df_orders = spark.read.csv(r"/content/orders.csv", header=True, inferSchema=True)

In [11]:
# 1.2 List all columns and data types.
spark.createDataFrame(df_customers.dtypes, ["columns", "datatypes"],).show()

spark.createDataFrame(df_orders.dtypes, ["columns", "datatypes"],).show()

+----------+---------+
|   columns|datatypes|
+----------+---------+
|CustomerID|      int|
|      Name|   string|
|     Email|   string|
|      City|   string|
|SignupDate|     date|
+----------+---------+

+----------+---------+
|   columns|datatypes|
+----------+---------+
|   OrderID|      int|
|CustomerID|      int|
|   Product|   string|
|  Category|   string|
|  Quantity|      int|
|     Price|   double|
| OrderDate|     date|
+----------+---------+



In [13]:
# 1.3 Count the total number of customers and orders.
print(f"Total number of customer - {df_customers.count()}")
print(f"Total number of orders - {df_orders.count()}")

Total number of customer - 5
Total number of orders - 7


In [14]:
# 1.4 Show distinct cities.
df_customers.select("City").distinct().show()

+---------+
|     City|
+---------+
|Bangalore|
|  Chennai|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



# **DataFrame Transformations**

In [17]:
# 2.1 Add a column TotalAmount = Price * Quantity .
df_orders = df_orders.withColumn("TotalAmount", df_orders.Price * df_orders.Quantity)
df_orders.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+---------+-----------+--------+-------+----------+-----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+---------+-----------+--------+-------+----------+-----------+



In [16]:
# 2.2 Create a new column OrderYear from OrderDate .
df_orders.withColumn("OrderYear", F.year("OrderDate")).show()

+-------+----------+---------+-----------+--------+-------+----------+---------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|OrderYear|
+-------+----------+---------+-----------+--------+-------+----------+---------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|     2024|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     2024|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|     2024|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     2024|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     2024|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2024|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|     2024|
+-------+----------+---------+-----------+--------+-------+----------+---------+



In [18]:
# 2.3 Filter orders with TotalAmount > 10,000 .
df_orders.filter(df_orders.TotalAmount > 10_000).show()

+-------+----------+-------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+-------+-----------+--------+-------+----------+-----------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+-------+-----------+--------+-------+----------+-----------+



In [19]:
# 2.4 Drop the Email column from customers .
df_customers.drop("Email").show()

+----------+-----+---------+----------+
|CustomerID| Name|     City|SignupDate|
+----------+-----+---------+----------+
|       101|  Ali|   Mumbai|2022-05-10|
|       102| Neha|    Delhi|2023-01-15|
|       103| Ravi|Bangalore|2021-11-01|
|       104|Sneha|Hyderabad|2020-07-22|
|       105| Amit|  Chennai|2023-03-10|
+----------+-----+---------+----------+



# **Handling Nulls & Conditionals**

In [22]:
# 3.1 Simulate a null in City and fill it with “Unknown”.
df_customers_null = df_customers.withColumn("City", F.when(df_customers.City == "Mumbai", None).otherwise(df_customers.City))

df_customers_null.fillna("Unknown", subset="City").show()


+----------+-----+-----------------+---------+----------+
|CustomerID| Name|            Email|     City|SignupDate|
+----------+-----+-----------------+---------+----------+
|       101|  Ali|    ali@gmail.com|  Unknown|2022-05-10|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|
+----------+-----+-----------------+---------+----------+



In [25]:
# 3.2 Label customers as “Loyal” if SignupDate is before 2022, else “New”.
df_customers.withColumn(
    "CustomerLabel", F.when((F.year("SignupDate") <= 2022), "Loyal").otherwise("New")
    ).show()

+----------+-----+-----------------+---------+----------+-------------+
|CustomerID| Name|            Email|     City|SignupDate|CustomerLabel|
+----------+-----+-----------------+---------+----------+-------------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|        Loyal|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|          New|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|        Loyal|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|        Loyal|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|          New|
+----------+-----+-----------------+---------+----------+-------------+



In [26]:
# 3.3 Create OrderType column: "Low" if < 5,000, "High" if ≥ 5,000.
df_orders.withColumn(
    "OrderType",
    F.when(df_orders.TotalAmount < 5_000, "Low").otherwise("High")
).show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderType|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     High|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|      Low|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     High|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|      Low|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|     High|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|      Low|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     High|
+-------+----------+---------+-----------+--------+-------+----------+

# **Joins & Aggregations**

In [29]:
# 4.1 Join customers and orders on CustomerID .
df_joined = df_customers.join(df_orders, on="CustomerID", how="inner")
df_joined.show()

+----------+-----+-----------------+---------+----------+-------+---------+-----------+--------+-------+----------+-----------+
|CustomerID| Name|            Email|     City|SignupDate|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+----------+-----+-----------------+---------+----------+-------+---------+-----------+--------+-------+----------+-----------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|      1|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|      2|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|      3|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|      4|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|      5|    Mixer| Appliances|       1| 5000.0|

In [35]:
# 4.2 Get total orders and revenue per city.
df_joined.groupBy("City").agg(
      F.count("OrderID").alias("TotalOrders"),
      F.sum("TotalAmount").alias("RevenuePerCity")
).show()

+---------+-----------+--------------+
|     City|TotalOrders|RevenuePerCity|
+---------+-----------+--------------+
|Bangalore|          1|        3500.0|
|  Chennai|          1|        2500.0|
|   Mumbai|          2|      101200.0|
|    Delhi|          2|       50000.0|
|Hyderabad|          1|        5000.0|
+---------+-----------+--------------+



In [40]:
# 4.3 Show top 3 customers by total spend.
df_spend = df_joined.groupBy("CustomerID").agg(
    F.sum("TotalAmount").alias("Spend")
)
df_spend.sort(df_spend.Spend.desc()).show(3)

+----------+--------+
|CustomerID|   Spend|
+----------+--------+
|       101|101200.0|
|       102| 50000.0|
|       104|  5000.0|
+----------+--------+
only showing top 3 rows



In [42]:
# 4.4 Count how many products each category has sold.
df_joined.groupBy("Category").agg(
    F.sum("Quantity").alias("ProductsSold")
).show()

+-----------+------------+
|   Category|ProductsSold|
+-----------+------------+
| Stationery|           5|
|Electronics|           5|
|  Furniture|           1|
| Appliances|           1|
+-----------+------------+



# **Spark SQL Tasks**

In [44]:
# 5.1 Create database sales and switch to it.
spark.sql("CREATE DATABASE IF NOT EXISTS sales")
spark.sql("USE sales")

DataFrame[]

In [45]:
# 5.2 Save both datasets as tables in the sales database.
df_customers.write.mode("overwrite").saveAsTable("sales.customers")
df_orders.write.mode("overwrite").saveAsTable("sales.orders")

In [48]:
# 5.3.1 List all orders by customers from “Delhi”.
spark.sql("""
  SELECT * FROM customers
  WHERE City = 'Delhi'""").show()

+----------+----+--------------+-----+----------+
|CustomerID|Name|         Email| City|SignupDate|
+----------+----+--------------+-----+----------+
|       102|Neha|neha@yahoo.com|Delhi|2023-01-15|
+----------+----+--------------+-----+----------+



In [50]:
# 5.3.2 Find average order value in each category.
spark.sql("""
  SELECT Category, AVG(TotalAmount) AS AverageOrderValue FROM orders
  GROUP BY Category
""").show()

+-----------+-----------------+
|   Category|AverageOrderValue|
+-----------+-----------------+
| Stationery|           2500.0|
|Electronics|          37800.0|
|  Furniture|           3500.0|
| Appliances|           5000.0|
+-----------+-----------------+



In [57]:
# 5.3.3 Create a view monthly_orders with month-wise total amount.
spark.sql("""
  SELECT MONTH(OrderDate) AS Month, SUM(TotalAmount) AS MonthWiseTotal FROM orders
  GROUP BY Month
  """).createOrReplaceTempView("month_wise")

spark.sql("SELECT * FROM month_wise").show()

+-----+--------------+
|Month|MonthWiseTotal|
+-----+--------------+
|    1|      101200.0|
|    3|       32500.0|
|    2|       28500.0|
+-----+--------------+



# **String & Date Functions**

In [63]:
# 6.1 Mask emails using regex (e.g., a***@gmail.com ).
spark.sql("""
  SELECT Email, CONCAT(
    SUBSTR(Email, 1, 1),
    REPEAT('*', LENGTH(SPLIT(Email, '@')[0])-1),
    '@',
    SPLIT(Email, '@')[1]
    ) AS Masked FROM customers

""").show()

+-----------------+-----------------+
|            Email|           Masked|
+-----------------+-----------------+
|    ali@gmail.com|    a**@gmail.com|
|   neha@yahoo.com|   n***@yahoo.com|
| ravi@hotmail.com| r***@hotmail.com|
|sneha@outlook.com|s****@outlook.com|
|   amit@gmail.com|   a***@gmail.com|
+-----------------+-----------------+



In [65]:
# 6.2 Concatenate Name and City as “Name from City”.
spark.sql("""
  SELECT CONCAT(Name, ' from ', City) AS Bio FROM customers
""").show()

+--------------------+
|                 Bio|
+--------------------+
|     Ali from Mumbai|
|     Neha from Delhi|
| Ravi from Bangalore|
|Sneha from Hyderabad|
|   Amit from Chennai|
+--------------------+



In [69]:
# 6.3 Use datediff() to calculate customer age in days.
spark.sql("""
  SELECT Name, DATEDIFF(CURRENT_DATE(), SignupDate) AS CustomerAge FROM customers
""").show()

+-----+-----------+
| Name|CustomerAge|
+-----+-----------+
|  Ali|       1126|
| Neha|        876|
| Ravi|       1316|
|Sneha|       1783|
| Amit|        822|
+-----+-----------+



In [71]:
# 6.4 Extract month name from OrderDate .
spark.sql("""
  SELECT OrderDate, MONTH(OrderDate) AS OrderMonth FROM orders
""").show()

+----------+----------+
| OrderDate|OrderMonth|
+----------+----------+
|2024-01-10|         1|
|2024-01-15|         1|
|2024-02-01|         2|
|2024-02-10|         2|
|2024-02-15|         2|
|2024-03-01|         3|
|2024-03-02|         3|
+----------+----------+



# **UDFs and Complex Logic**

In [76]:
# 7.1 Write a UDF to tag customers: “Gold” if spend > 50K, “Silver” if 10K–50K, “Bronze” if <10K.
def priceTagger(spend):
  if spend > 50_000:
    return "Gold"
  elif spend < 10_000:
    return "Bronze"
  else:
    return "Silver"

spark.udf.register("priceTagger", priceTagger)

spark.sql("""
   SELECT Name, priceTagger(Tag) AS Tag FROM
          (SELECT c.Name, SUM(o.TotalAmount) AS Tag FROM customers c
          INNER JOIN orders o
          ON c.CustomerID = o.CustomerID
          GROUP BY c.Name) e
""").show()

+-----+------+
| Name|   Tag|
+-----+------+
| Ravi|Bronze|
|Sneha|Bronze|
| Amit|Bronze|
| Neha|Silver|
|  Ali|  Gold|
+-----+------+



In [82]:
# 7.2 Write a UDF to shorten product names (first 3 letters + ...).
def shortner(name):
  nameLen = len(name) - 3
  return name[:3] + ("." * nameLen)

spark.udf.register("shortner", shortner)

spark.sql("""
  SELECT Product, shortner(Product) AS Shortened FROM orders
""").show()

+---------+---------+
|  Product|Shortened|
+---------+---------+
|   Laptop|   Lap...|
|    Mouse|    Mou..|
|   Tablet|   Tab...|
|Bookshelf|Boo......|
|    Mixer|    Mix..|
| Notebook| Not.....|
|    Phone|    Pho..|
+---------+---------+



# **Parquet & Views**

In [83]:
# 8.1 Save the joined result as a Parquet file.
df_joined.write.mode("overwrite").parquet("joined")

In [84]:
# 8.2 Read it back and verify schema.
df_parquet = spark.read.parquet("/content/joined")
df_parquet.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- TotalAmount: double (nullable = true)



In [87]:
# 8.3 Create and query a global temp view.
df_parquet.createOrReplaceGlobalTempView("joined")
spark.sql("SELECT * FROM global_temp.joined").show()

+----------+-----+-----------------+---------+----------+-------+---------+-----------+--------+-------+----------+-----------+
|CustomerID| Name|            Email|     City|SignupDate|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+----------+-----+-----------------+---------+----------+-------+---------+-----------+--------+-------+----------+-----------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|      1|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|      2|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|      3|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|      4|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|      5|    Mixer| Appliances|       1| 5000.0|

In [91]:
df_customers.write.mode("overwrite").parquet("customers")

In [97]:
# 8.4 Compare performance between CSV read and Parquet read.
csv_start = time.time()
df_time = spark.read.csv("/content/customers.csv", header=True, inferSchema=True)
df_time.printSchema()
csv_end = time.time()
csvTime = csv_end - csv_start
print(f"Csv read time taken: {csvTime}")

parquet_start = time.time()
df_parquet_time = spark.read.option("header", "true").parquet(r"/content/customers")
df_parquet_time.printSchema()
parquet_end = time.time()
parquetTime = parquet_end - parquet_start
print(f"Parquet read time taken: {parquetTime}")

percentageDiff = (abs(csvTime - parquetTime) / ((csvTime + parquetTime) / 2)) * 100
print(f"Parquet is {percentageDiff:.2f} % faster than CSV")

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)

Csv read time taken: 0.4918828010559082
root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)

Parquet read time taken: 0.37914228439331055
Parquet is 25.89 % faster than CSV
