In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [6]:
spark = SparkSession.builder.appName("deo").enableHiveSupport().getOrCreate()

In [7]:
spark

# **Data Preparation**

In [64]:
customers_data = [
(101, 'Ali', 'ali@gmail.com', 'Mumbai', '2022-05-10'),
(102, 'Neha', 'neha@yahoo.com', 'Delhi', '2023-01-15'),
(103, 'Ravi', 'ravi@hotmail.com', 'Bangalore', '2021-11-01'),
(104, 'Sneha', 'sneha@outlook.com', 'Hyderabad', '2020-07-22'),
(105, 'Amit', 'amit@gmail.com', 'Chennai', '2023-03-10'),
]

orders_data = [
(1, 101, 'Laptop', 'Electronics', 2, 50000.0, '2024-01-10'),
(2, 101, 'Mouse', 'Electronics', 1, 1200.0, '2024-01-15'),
(3, 102, 'Tablet', 'Electronics', 1, 20000.0, '2024-02-01'),
(4, 103, 'Bookshelf', 'Furniture', 1, 3500.0, '2024-02-10'),
(5, 104, 'Mixer', 'Appliances', 1, 5000.0, '2024-02-15'),
(6, 105, 'Notebook', 'Stationery', 5, 500.0, '2024-03-01'),
(7, 102, 'Phone', 'Electronics', 1, 30000.0, '2024-03-02'),
]

customers_df = spark.createDataFrame(customers_data, ["CustomerID", "Name", "Email",
"City", "SignupDate"])
orders_df = spark.createDataFrame(orders_data, ["OrderID", "CustomerID", "Product",
"Category", "Quantity", "Price", "OrderDate"])

spark.sql("CREATE DATABASE IF NOT EXISTS sales")

customers_df.write.mode("overwrite").saveAsTable("sales.customers")
orders_df.write.mode("overwrite").saveAsTable("sales.orders")

# **PySpark DataFrame Tasks**

In [65]:
# 1. Add a column TotalAmount = Price * Quantity to the orders_df .
orders_df = orders_df.withColumn("TotalAmount", orders_df.Price * orders_df.Quantity)
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+---------+-----------+--------+-------+----------+-----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+---------+-----------+--------+-------+----------+-----------+



In [66]:
# 2. Filter all orders with TotalAmount > 10000 .
orders_df.filter(orders_df.TotalAmount > 10000).show()

+-------+----------+-------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+-------+-----------+--------+-------+----------+-----------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+-------+-----------+--------+-------+----------+-----------+



In [67]:
# 3. Standardize the City field in customers_df (e.g., lowercase).
customers_df = customers_df.withColumn("City", F.lower(customers_df.City))
customers_df.show()

+----------+-----+-----------------+---------+----------+
|CustomerID| Name|            Email|     City|SignupDate|
+----------+-----+-----------------+---------+----------+
|       101|  Ali|    ali@gmail.com|   mumbai|2022-05-10|
|       102| Neha|   neha@yahoo.com|    delhi|2023-01-15|
|       103| Ravi| ravi@hotmail.com|bangalore|2021-11-01|
|       104|Sneha|sneha@outlook.com|hyderabad|2020-07-22|
|       105| Amit|   amit@gmail.com|  chennai|2023-03-10|
+----------+-----+-----------------+---------+----------+



In [68]:
# 4. Extract year from OrderDate and add a new column OrderYear .
orders_df = orders_df.withColumn("OrderYear", F.year(orders_df.OrderDate))
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|     2024|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|     2024|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+---------+-----------+--------+-------+----------+

In [69]:
# 5. Fill null values in any column of your choice with defaults.
orders_df.fillna(-1).show()
customers_df.fillna(-1).show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|     2024|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|     2024|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+---------+-----------+--------+-------+----------+

In [70]:
# 6. Use when/otherwise to categorize
orders_df = orders_df.withColumn("Cateogry", F.when(orders_df.TotalAmount < 5000, "Low").when((orders_df.TotalAmount > 5000) &  (orders_df.TotalAmount <= 20000), "Medium").otherwise("High"))
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+--------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|Cateogry|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+--------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|    High|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|     Low|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|  Medium|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|     Low|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|     2024|    High|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|     2024|     Low|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|

# **Spark SQL Tasks**

In [33]:
# 7. Run a SQL query to list all orders made by “Ali”.
spark.sql("USE sales ")
spark.sql("""
  SELECT * FROM orders
  WHERE CustomerID = (SELECT CustomerID FROM customers
  WHERE Name = 'Ali')
""").show()

+-------+----------+-------+-----------+--------+-------+----------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|
+-------+----------+-------+-----------+--------+-------+----------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|
|      2|       101|  Mouse|Electronics|       1| 1200.0|2024-01-15|
+-------+----------+-------+-----------+--------+-------+----------+



In [35]:
# 8. Get total spending by each customer using SQL.
spark.sql("""
  SELECT c.Name, SUM(o.Quantity * o.Price) AS Spending FROM customers c
  INNER JOIN orders o
  ON c.CustomerID = o.CustomerID
  GROUP BY c.Name
""").show()

+-----+--------+
| Name|Spending|
+-----+--------+
| Neha| 50000.0|
|  Ali|101200.0|
| Ravi|  3500.0|
|Sneha|  5000.0|
| Amit|  2500.0|
+-----+--------+



In [38]:
# 9. Find out which category made the highest total revenue.
spark.sql("""
  SELECT Product, SUM(Price * Quantity) AS Revenue FROM orders
  GROUP BY Product
  ORDER BY SUM(Price * Quantity) DESC
  LIMIT 1
""").show()

+-------+--------+
|Product| Revenue|
+-------+--------+
| Laptop|100000.0|
+-------+--------+



In [47]:
# 10. Create a view customer_orders showing CustomerName, Product, TotalAmount .
df = spark.sql("""
  SELECT o.OrderID, c.Name, o.Product, (o.Quantity * o.Price) AS `Total Amount` FROM orders o
  INNER JOIN customers c
  ON c.CustomerID = o.CustomerID
""")

df.createOrReplaceTempView("customer_orders")

spark.sql("SELECT * FROM customer_orders").show()

+-------+-----+---------+------------+
|OrderID| Name|  Product|Total Amount|
+-------+-----+---------+------------+
|      1|  Ali|   Laptop|    100000.0|
|      2|  Ali|    Mouse|      1200.0|
|      3| Neha|   Tablet|     20000.0|
|      4| Ravi|Bookshelf|      3500.0|
|      5|Sneha|    Mixer|      5000.0|
|      6| Amit| Notebook|      2500.0|
|      7| Neha|    Phone|     30000.0|
+-------+-----+---------+------------+



In [49]:
# 11. Query the view for products ordered after Feb 2024.
spark.sql("""
  SELECT * FROM customer_orders
  WHERE OrderID IN (SELECT OrderID FROM orders
  WHERE OrderDate > DATE('2024-02-28'))
""").show()

+-------+----+--------+------------+
|OrderID|Name| Product|Total Amount|
+-------+----+--------+------------+
|      6|Amit|Notebook|      2500.0|
|      7|Neha|   Phone|     30000.0|
+-------+----+--------+------------+



# **Advanced Practice**

In [51]:
# 12. Create a Global Temp View from customers_df , then query it using:
customers_df.createOrReplaceGlobalTempView("customer_global")

spark.sql("""
  SELECT * FROM global_temp.customer_global
  WHERE City = 'mumbai'
""").show()

+----------+----+-------------+------+----------+
|CustomerID|Name|        Email|  City|SignupDate|
+----------+----+-------------+------+----------+
|       101| Ali|ali@gmail.com|mumbai|2022-05-10|
+----------+----+-------------+------+----------+



In [71]:
# 13. Save the transformed orders_df (with TotalAmount) to a Parquet file.
orders_df.write.mode("overwrite").parquet("orders_parquet")

In [75]:
# 14. Read back the Parquet file and count how many orders are in it.
orders_parquet = spark.read.parquet("/content/orders_parquet")
print(f"Orders -> {orders_parquet.count()}")

Orders -> 7


# **UDF + Build-in Function Tasks**

In [84]:
# 15. Write a UDF that masks emails like: ali@gmail.com → a***@gmail.com .
def email_masker(email):
  emailLen = len(email)
  astricks = emailLen - 11
  return email[0] + ("*" * astricks) + "@gmail.com"

spark.udf.register("email_masker", email_masker)

spark.sql("SELECT Name, email_masker(Email) AS `EMail`, City, SignupDate FROM customers").show()

+-----+-----------------+---------+----------+
| Name|            EMail|     City|SignupDate|
+-----+-----------------+---------+----------+
| Ravi| r*****@gmail.com|Bangalore|2021-11-01|
|Sneha|s******@gmail.com|Hyderabad|2020-07-22|
| Amit|   a***@gmail.com|  Chennai|2023-03-10|
|  Ali|    a**@gmail.com|   Mumbai|2022-05-10|
| Neha|   n***@gmail.com|    Delhi|2023-01-15|
+-----+-----------------+---------+----------+



In [88]:
# 16. Use concat_ws() to create a full label like: 'Ali from Mumbai' .
spark.sql("SELECT CONCAT_WS(' ', Name, 'from', City) AS `CustomerInfo` FROM customers").show()

+--------------------+
|        CustomerInfo|
+--------------------+
| Ravi from Bangalore|
|Sneha from Hyderabad|
|   Amit from Chennai|
|     Ali from Mumbai|
|     Neha from Delhi|
+--------------------+



In [91]:
# 17. Use regexp_replace() to remove special characters from product names.
spark.sql("SELECT REGEXP_REPLACE(Product, '[^a-zA-Z0-9 ]', '') AS `Product` FROM orders").show()

+---------+
|  Product|
+---------+
|   Laptop|
|    Mouse|
|   Tablet|
|Bookshelf|
|    Mixer|
| Notebook|
|    Phone|
+---------+



In [97]:
# 18. Use to_date() and datediff() to calculate customer age in days (from SignupDate to today).
spark.sql("""
  SELECT Name, Email, City, SignupDate, INT((DATEDIFF(CURRENT_DATE, TO_DATE(SignupDate, 'yyyy-MM-dd'))) / 365) AS CustomerAge FROM customers
""").show()

+-----+-----------------+---------+----------+-----------+
| Name|            Email|     City|SignupDate|CustomerAge|
+-----+-----------------+---------+----------+-----------+
| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|          3|
|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|          4|
| Amit|   amit@gmail.com|  Chennai|2023-03-10|          2|
|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|          3|
| Neha|   neha@yahoo.com|    Delhi|2023-01-15|          2|
+-----+-----------------+---------+----------+-----------+

