In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, when, col, concat_ws, regexp_replace, to_date, datediff, udf
from pyspark.sql.types import StringType
spark = SparkSession.builder.appName("PracticeProject_FriendVersion").enableHiveSupport().getOrCreate()

# Customers data with slightly changed names and cities
customers_data = [
    (101, 'Manikandan', 'manikandan@gmail.com', 'Chennai', '2022-05-10'),
    (102, 'Saravanan', 'saravanan@yahoo.com', 'Delhi', '2023-01-15'),
    (103, 'Motesh', 'motesh@hotmail.com', 'Bengaluru', '2021-11-01'),
    (104, 'Manoj', 'manoj@outlook.com', 'Hyderabad', '2020-07-22'),
    (105, 'Jeevan', 'jeevan@gmail.com', 'Coimbatore', '2023-03-10'),
]

orders_data = [
    (1, 101, 'Laptop', 'Electronics', 2, 50000.0, '2024-01-10'),
    (2, 101, 'Mouse', 'Electronics', 1, 1200.0, '2024-01-15'),
    (3, 102, 'Tablet', 'Electronics', 1, 20000.0, '2024-02-01'),
    (4, 103, 'Bookshelf', 'Furniture', 1, 3500.0, '2024-02-10'),
    (5, 104, 'Mixer', 'Appliances', 1, 5000.0, '2024-02-15'),
    (6, 105, 'Notebook', 'Stationery', 5, 500.0, '2024-03-01'),
    (7, 102, 'Phone', 'Electronics', 1, 30000.0, '2024-03-02'),
]

cust_df = spark.createDataFrame(customers_data, ["CustomerID", "Name", "Email", "City", "SignupDate"])
ord_df = spark.createDataFrame(orders_data, ["OrderID", "CustomerID", "Product", "Category", "Quantity", "Price", "OrderDate"])

# Save as tables
cust_df.write.mode("overwrite").saveAsTable("practice.customers")
ord_df.write.mode("overwrite").saveAsTable("practice.orders")


In [None]:
# 1. Add TotalAmount column
ord_df = ord_df.withColumn("TotalAmount", col("Price") * col("Quantity"))
ord_df.show()


In [None]:
# 2. Filter orders with TotalAmount > 10000
ord_df.filter(col("TotalAmount") > 10000).show()


In [None]:
# 3. Lowercase City in customers
cust_df = cust_df.withColumn("City", expr("lower(City)"))
cust_df.show()


In [None]:
# 4. Extract OrderYear from OrderDate
ord_df = ord_df.withColumn("OrderYear", expr("year(to_date(OrderDate))"))
ord_df.show()


In [None]:
# 5. Fill null Email with default
cust_df = cust_df.na.fill({"Email": "unknown@example.com"})
cust_df.show()


In [None]:
# 6. Categorize orders by TotalAmount
ord_df = ord_df.withColumn("CategoryLevel", when(col("TotalAmount") < 5000, "Low")                           .when((col("TotalAmount") >= 5000) & (col("TotalAmount") <= 20000), "Medium")                           .otherwise("High"))
ord_df.select("OrderID", "TotalAmount", "CategoryLevel").show()


In [None]:
# 7. SQL Query: Orders by Manikandan
spark.sql("""
SELECT o.* FROM practice.customers c
JOIN practice.orders o ON c.CustomerID = o.CustomerID
WHERE c.Name = 'Manikandan'
""").show()


In [None]:
# 8. SQL: Total spending per customer
spark.sql("""
SELECT c.Name, SUM(o.TotalAmount) AS TotalSpent
FROM practice.customers c
JOIN practice.orders o ON c.CustomerID = o.CustomerID
GROUP BY c.Name
ORDER BY TotalSpent DESC
""").show()


In [None]:
# 9. SQL: Category with highest total revenue
spark.sql("""
SELECT Category, SUM(TotalAmount) AS TotalRevenue
FROM practice.orders
GROUP BY Category
ORDER BY TotalRevenue DESC
LIMIT 1
""").show()


In [None]:
# 10. Create view customer_orders
spark.sql("""
CREATE OR REPLACE VIEW practice.customer_orders AS
SELECT c.Name AS CustomerName, o.Product, o.TotalAmount
FROM practice.customers c
JOIN practice.orders o ON c.CustomerID = o.CustomerID
""")


In [None]:
# 11. Query customer_orders for products ordered after Feb 2024
spark.sql("""
SELECT * FROM practice.customer_orders co
JOIN practice.orders o ON co.Product = o.Product
WHERE o.OrderDate > '2024-02-28'
""").show()


In [None]:
# 12. Global temp view and query
cust_df.createGlobalTempView("customers")
spark.sql("""
SELECT * FROM global_temp.customers WHERE City = 'chennai'
""").show()


In [None]:
# 13. Save orders with TotalAmount to Parquet
ord_df.write.mode("overwrite").parquet("/tmp/practice_orders.parquet")


In [None]:
# 14. Read Parquet and count
parquet_df = spark.read.parquet("/tmp/practice_orders.parquet")
print(f"Orders count in parquet: {parquet_df.count()}")


In [None]:
# 15. UDF to mask email
def mask_email(email):
    if email and '@' in email:
        parts = email.split('@')
        return parts[0][0] + '***@' + parts[1]
    return email

mask_email_udf = udf(mask_email, StringType())
cust_df = cust_df.withColumn("MaskedEmail", mask_email_udf(col("Email")))
cust_df.select("Email", "MaskedEmail").show()


In [None]:
# 16. Create full label
cust_df = cust_df.withColumn("FullLabel", concat_ws(" ", col("Name"), lit("from"), col("City")))
cust_df.select("FullLabel").show()


In [None]:
# 17. Clean Product names
ord_df = ord_df.withColumn("CleanProduct", regexp_replace(col("Product"), "[^a-zA-Z0-9 ]", ""))
ord_df.select("Product", "CleanProduct").show()


In [None]:
# 18. Customer age in days
from pyspark.sql.functions import current_date
cust_df = cust_df.withColumn("SignupDate_dt", to_date(col("SignupDate")))
cust_df = cust_df.withColumn("AgeInDays", datediff(current_date(), col("SignupDate_dt")))
cust_df.select("Name", "SignupDate", "AgeInDays").show()
