In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    concat_ws,
    current_date,
    datediff,
    expr,
    lit,
    regexp_replace,
    to_date,
    udf,
    when,
    year,
)
from pyspark.sql.types import StringType

# Build (or reuse) the notebook's Spark session with Hive support enabled
spark = (
    SparkSession.builder
    .appName("PracticeProject")
    .enableHiveSupport()
    .getOrCreate()
)

# Make sure the `sales` database is present before any table is written to it
spark.sql("CREATE DATABASE IF NOT EXISTS sales")

DataFrame[]

In [None]:
# Column names for the two sample datasets, listed in tuple order
customer_columns = ["CustomerID", "Name", "Email", "City", "SignupDate"]
order_columns = ["OrderID", "CustomerID", "Product", "Category", "Quantity", "Price", "OrderDate"]

# Sample customers: one tuple per customer (dates are kept as ISO-formatted strings)
customers_data = [
    (101, 'nithya', 'nithya@gmail.com', 'Chennai', '2022-05-10'),
    (102, 'mani', 'mani@yahoo.com', 'Delhi', '2023-01-15'),
    (103, 'gopal', 'gopal@hotmail.com', 'Bengaluru', '2021-11-01'),
    (104, 'shakthi', 'shakthi@outlook.com', 'Hyderabad', '2020-07-22'),
    (105, 'Jeevan', 'jeevan@gmail.com', 'Coimbatore', '2023-03-10'),
]

# Sample orders: one tuple per order line, linked to a customer by CustomerID
orders_data = [
    (1, 101, 'Laptop', 'Electronics', 2, 50000.0, '2024-01-10'),
    (2, 101, 'Mouse', 'Electronics', 1, 1200.0, '2024-01-15'),
    (3, 102, 'Tablet', 'Electronics', 1, 20000.0, '2024-02-01'),
    (4, 103, 'Bookshelf', 'Furniture', 1, 3500.0, '2024-02-10'),
    (5, 104, 'Mixer', 'Appliances', 1, 5000.0, '2024-02-15'),
    (6, 105, 'Notebook', 'Stationery', 5, 500.0, '2024-03-01'),
    (7, 102, 'Phone', 'Electronics', 1, 30000.0, '2024-03-02'),
]

# Column types are inferred from the Python values; only the names are supplied
customers_df = spark.createDataFrame(customers_data, schema=customer_columns)
orders_df = spark.createDataFrame(orders_data, schema=order_columns)

# Persist both frames as tables in the `sales` database, replacing any earlier copy
customers_df.write.saveAsTable("sales.customers", mode="overwrite")
orders_df.write.saveAsTable("sales.orders", mode="overwrite")

# **SECTION A: PySpark DataFrame Tasks**


In [None]:
# 1. Add TotalAmount column to orders_df
# TotalAmount is the value of each order line: Quantity multiplied by Price.
line_total = orders_df["Quantity"] * orders_df["Price"]
orders_df = orders_df.withColumn("TotalAmount", line_total)
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+---------+-----------+--------+-------+----------+-----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+---------+-----------+--------+-------+----------+-----------+



In [None]:
# 2. Filter orders with TotalAmount > 10000
# Strictly greater than: an order of exactly 10000 is not included.
high_value_orders = orders_df.where(col("TotalAmount") > 10000)
high_value_orders.show()

+-------+----------+-------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+-------+-----------+--------+-------+----------+-----------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+-------+-----------+--------+-------+----------+-----------+



In [None]:
# 3. Standardize City to lowercase in customers_df
# The existing City column is overwritten with its lower-cased value.
city_lowercase = expr("lower(City)")
customers_df = customers_df.withColumn("City", city_lowercase)
customers_df.show()

+----------+-------+-------------------+----------+----------+
|CustomerID|   Name|              Email|      City|SignupDate|
+----------+-------+-------------------+----------+----------+
|       101| nithya|   nithya@gmail.com|   chennai|2022-05-10|
|       102|   mani|     mani@yahoo.com|     delhi|2023-01-15|
|       103|  gopal|  gopal@hotmail.com| bengaluru|2021-11-01|
|       104|shakthi|shakthi@outlook.com| hyderabad|2020-07-22|
|       105| Jeevan|   jeevan@gmail.com|coimbatore|2023-03-10|
+----------+-------+-------------------+----------+----------+



In [None]:
# 4. Extract year from OrderDate as OrderYear
# OrderDate is stored as a string, so it is parsed to a date before taking the year.
order_date = to_date(col("OrderDate"))
orders_df = orders_df.withColumn("OrderYear", year(order_date))
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|     2024|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|     2024|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+

In [None]:
# 5. Fill nulls in Email with default email 'unknown@example.com' (example)
# Only the Email column is touched; nulls in other columns are left as they are.
customers_df = customers_df.fillna("unknown@example.com", subset=["Email"])
customers_df.show()

+----------+-------+-------------------+----------+----------+
|CustomerID|   Name|              Email|      City|SignupDate|
+----------+-------+-------------------+----------+----------+
|       101| nithya|   nithya@gmail.com|   chennai|2022-05-10|
|       102|   mani|     mani@yahoo.com|     delhi|2023-01-15|
|       103|  gopal|  gopal@hotmail.com| bengaluru|2021-11-01|
|       104|shakthi|shakthi@outlook.com| hyderabad|2020-07-22|
|       105| Jeevan|   jeevan@gmail.com|coimbatore|2023-03-10|
+----------+-------+-------------------+----------+----------+



In [None]:
# 6. Categorize orders using when/otherwise on TotalAmount
#    Low    : TotalAmount below 5000
#    Medium : TotalAmount from 5000 to 20000, both ends inclusive
#    High   : everything else
total = col("TotalAmount")
amount_category = (
    when(total < 5000, "Low")
    .when(total.between(5000, 20000), "Medium")
    .otherwise("High")
)
orders_df = orders_df.withColumn("AmountCategory", amount_category)
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+--------------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|AmountCategory|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+--------------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|          High|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|           Low|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|        Medium|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|           Low|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|     2024|        Medium|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|     2024|           Low|
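|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|          High|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+--------------+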

# **SECTION B: Spark SQL Tasks**

In [None]:
# 7. List all orders made by "Mani"
# Expose the current DataFrames to Spark SQL under temporary view names.
customers_df.createOrReplaceTempView("customers_temp")
orders_df.createOrReplaceTempView("orders_temp")

# Names are stored with inconsistent capitalisation ('mani', 'Jeevan', ...) and
# string equality in Spark SQL is case-sensitive, so the comparison is done on
# the lower-cased name; comparing against 'Mani' directly matches no rows.
spark.sql("""
SELECT o.*
FROM customers_temp c
JOIN orders_temp o ON c.CustomerID = o.CustomerID
WHERE lower(c.Name) = 'mani'
""").show()

+-------+----------+-------+--------+--------+-----+---------+-----------+---------+--------------+
|OrderID|CustomerID|Product|Category|Quantity|Price|OrderDate|TotalAmount|OrderYear|AmountCategory|
+-------+----------+-------+--------+--------+-----+---------+-----------+---------+--------------+
+-------+----------+-------+--------+--------+-----+---------+-----------+---------+--------------+



In [None]:
# 8. Total spending by each customer (compute TotalAmount in SQL)
# Grouping includes CustomerID so that two different customers who happen to
# share a name are not merged into a single total. The result still exposes
# only Name and TotalSpending.
spark.sql("""
SELECT c.Name, SUM(o.Quantity * o.Price) AS TotalSpending
FROM sales.customers c
JOIN sales.orders o ON c.CustomerID = o.CustomerID
GROUP BY c.CustomerID, c.Name
""").show()

+-------+-------------+
|   Name|TotalSpending|
+-------+-------------+
|   mani|      50000.0|
| nithya|     101200.0|
| Jeevan|       2500.0|
|  gopal|       3500.0|
|shakthi|       5000.0|
+-------+-------------+



In [None]:
# 9. Category with highest total revenue (compute in SQL)
# Revenue per category is summed, sorted from largest to smallest, and only
# the first row is kept.
top_category_query = """
SELECT Category, SUM(Quantity * Price) AS TotalRevenue
FROM sales.orders
GROUP BY Category
ORDER BY TotalRevenue DESC
LIMIT 1
"""
top_category = spark.sql(top_category_query)
top_category.show()

+-----------+------------+
|   Category|TotalRevenue|
+-----------+------------+
|Electronics|    151200.0|
+-----------+------------+



In [None]:
# 10. Create view customer_orders (CustomerName, Product, TotalAmount)
# The view joins the saved customers and orders tables on CustomerID and
# computes TotalAmount per order line as Quantity * Price.
customer_orders_ddl = """
CREATE OR REPLACE VIEW sales.customer_orders AS
SELECT c.Name AS CustomerName, o.Product, (o.Quantity * o.Price) AS TotalAmount
FROM sales.customers c
JOIN sales.orders o ON c.CustomerID = o.CustomerID
"""
spark.sql(customer_orders_ddl)

DataFrame[]

In [None]:
# 11. Query the view for products ordered after 1 Feb 2024 (OrderDate > '2024-02-01')
# The view itself carries no date, so it is joined back to sales.orders on
# Product to obtain OrderDate. Columns are listed explicitly: SELECT * over
# this join would return two columns named Product and every raw order column.
# NOTE(review): Product is not guaranteed to be unique per order; if the same
# product is ordered more than once this join multiplies rows - confirm
# whether the view should expose OrderID/OrderDate instead.
spark.sql("""
SELECT co.CustomerName, co.Product, co.TotalAmount, o.OrderDate
FROM sales.customer_orders co
JOIN sales.orders o ON co.Product = o.Product
WHERE o.OrderDate > '2024-02-01'
""").show()

+------------+---------+-----------+-------+----------+---------+-----------+--------+-------+----------+
|CustomerName|  Product|TotalAmount|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|
+------------+---------+-----------+-------+----------+---------+-----------+--------+-------+----------+
|       gopal|Bookshelf|     3500.0|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|
|     shakthi|    Mixer|     5000.0|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|
|      Jeevan| Notebook|     2500.0|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|
|        mani|    Phone|    30000.0|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|
+------------+---------+-----------+-------+----------+---------+-----------+--------+-------+----------+



# **SECTION C: Advanced Practice**

In [None]:
# 12. Global temp view from customers_df and query Mumbai customers
# createOrReplaceGlobalTempView is used instead of createGlobalTempView so the
# cell can be re-run: a global temp view lives for the whole Spark application
# and creating it a second time under the same name raises an error.
customers_df.createOrReplaceGlobalTempView("customers")

# Global temp views are addressed through the `global_temp` database.
# City was lower-cased earlier in the notebook, hence the lowercase literal.
spark.sql("""
SELECT * FROM global_temp.customers WHERE City = 'mumbai'
""").show()

+----------+----+-----+----+----------+
|CustomerID|Name|Email|City|SignupDate|
+----------+----+-----+----+----------+
+----------+----+-----+----+----------+



In [None]:
# 13. Save transformed orders_df with TotalAmount as Parquet
# Any existing output at the target path is replaced.
orders_df.write.parquet("/tmp/orders_with_total.parquet", mode="overwrite")

In [None]:
# 14. Read back Parquet and count orders
# Load the Parquet output written in the previous step and report its row count.
parquet_df = spark.read.format("parquet").load("/tmp/orders_with_total.parquet")
order_count = parquet_df.count()
print(f"Total orders in parquet file: {order_count}")

Total orders in parquet file: 7


# **SECTION D: UDF + Built-in Function Tasks**

In [None]:
# 15. UDF to mask emails like a***@gmail.com
def mask_email(email):
    """Mask the local part of an e-mail address, keeping only its first character.

    Example: ``"nithya@gmail.com"`` becomes ``"n***@gmail.com"``.

    Args:
        email: The address to mask. May be ``None`` or any non-string value.

    Returns:
        The masked address. The input is returned unchanged when it is not a
        string, has no ``@``, has more than one ``@``, or has an empty local
        part.
    """
    # Explicit validation replaces a bare `except:`, which would also swallow
    # unrelated errors such as KeyboardInterrupt.
    if not isinstance(email, str):
        return email

    local, separator, domain = email.partition("@")
    if not separator or not local or "@" in domain:
        return email

    return local[0] + "***@" + domain

# Wrap the Python function as a Spark UDF that returns a string column
mask_email_udf = udf(mask_email, returnType=StringType())

# Derive MaskedEmail from Email and show the two columns side by side
masked_email = mask_email_udf(customers_df["Email"])
customers_df = customers_df.withColumn("MaskedEmail", masked_email)
customers_df.select(["Email", "MaskedEmail"]).show()

+-------------------+----------------+
|              Email|     MaskedEmail|
+-------------------+----------------+
|   nithya@gmail.com|  n***@gmail.com|
|     mani@yahoo.com|  m***@yahoo.com|
|  gopal@hotmail.com|g***@hotmail.com|
|shakthi@outlook.com|s***@outlook.com|
|   jeevan@gmail.com|  j***@gmail.com|
+-------------------+----------------+



In [None]:
# 16. concat_ws() full label 'Name from City'
# Name and City are joined with the separator " from " into a new Label column.
name_from_city = concat_ws(" from ", customers_df["Name"], customers_df["City"])
customers_df = customers_df.withColumn("Label", name_from_city)
customers_df.select("Label").show()

+--------------------+
|               Label|
+--------------------+
| nithya from chennai|
|     mani from delhi|
|gopal from bengaluru|
|shakthi from hyde...|
|Jeevan from coimb...|
+--------------------+



In [None]:
# 17. Remove special characters from Product names
# Every character that is not an ASCII letter, a digit or a space is dropped.
non_alphanumeric = "[^a-zA-Z0-9 ]"
clean_product = regexp_replace(col("Product"), non_alphanumeric, "")
orders_df = orders_df.withColumn("CleanProduct", clean_product)
orders_df.select(["Product", "CleanProduct"]).show()

+---------+------------+
|  Product|CleanProduct|
+---------+------------+
|   Laptop|      Laptop|
|    Mouse|       Mouse|
|   Tablet|      Tablet|
|Bookshelf|   Bookshelf|
|    Mixer|       Mixer|
| Notebook|    Notebook|
|    Phone|       Phone|
+---------+------------+



In [None]:
# 18. Calculate customer age in days from SignupDate to today
# SignupDate is first converted to a date column; DaysSinceSignup is then the
# number of days between the current date and SignupDate, so the values shown
# depend on the day the cell is run.
signup_date = to_date(col("SignupDate"))
customers_df = customers_df.withColumn("SignupDate", signup_date)
customers_df = customers_df.withColumn("DaysSinceSignup", datediff(current_date(), col("SignupDate")))
customers_df.select("Name", "SignupDate", "DaysSinceSignup").show()

+-------+----------+---------------+
|   Name|SignupDate|DaysSinceSignup|
+-------+----------+---------------+
| nithya|2022-05-10|           1121|
|   mani|2023-01-15|            871|
|  gopal|2021-11-01|           1311|
|shakthi|2020-07-22|           1778|
| Jeevan|2023-03-10|            817|
+-------+----------+---------------+

