<a href="https://colab.research.google.com/github/Subramaniya-pillai/data_engineering/blob/main/PySpark_Master_Task_Set.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PySpark Master Task Set


## Task 1: Data Ingestion & Exploration

In [None]:

from pyspark.sql import SparkSession

# Start the notebook's Spark session, or attach to one that already exists.
spark = (
    SparkSession.builder
    .appName("PySparkMasterTaskSet")
    .getOrCreate()
)


In [None]:

# Read both CSV sources. The first row of each file supplies the column names
# and Spark infers every column's type from the data.
customers_df = spark.read.options(header=True, inferSchema=True).csv("customers.csv")
orders_df = spark.read.options(header=True, inferSchema=True).csv("/content/orders (2).csv")

# Print the inferred schema of each dataset, customers first.
for frame in (customers_df, orders_df):
    frame.printSchema()


root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)



In [None]:

# Row counts of the two datasets (each count triggers a Spark job).
num_customers, num_orders = customers_df.count(), orders_df.count()

print(f"Total number of customers: {num_customers}")
print(f"Total number of orders: {num_orders}")


Total number of customers: 5
Total number of orders: 7


In [None]:

# List every city that appears in the customer data, one row per city.
distinct_cities_df = customers_df.select("City").distinct()
distinct_cities_df.show()


+---------+
|     City|
+---------+
|Bangalore|
|  Chennai|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



## Task 2: DataFrame Transformations

In [None]:

from pyspark.sql.functions import col, year

# Derive the order-level columns in a single chain:
#   TotalAmount = Price * Quantity
#   OrderYear   = calendar year of OrderDate
orders_df = (
    orders_df
    .withColumn("TotalAmount", col("Price") * col("Quantity"))
    .withColumn("OrderYear", year(col("OrderDate")))
)

# Display only the orders whose total is above 10000.
orders_df.where(col("TotalAmount") > 10000).show()

# Remove the Email column from the customers data and display what is left.
customers_df = customers_df.drop("Email")
customers_df.show()


+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+

+----------+-----+---------+----------+
|CustomerID| Name|     City|SignupDate|
+----------+-----+---------+----------+
|       101|  Ali|   Mumbai|2022-05-10|
|       102| Neha|    Delhi|2023-01-15|
|       103| Ravi|Bangalore|2021-11-01|
|       104|Sneha|Hyderabad|2020-07-22|
|       105| Amit|  Chennai|2023-03-10|
+----------+-----+---------+----------+



## Task 3: Handling Nulls & Conditionals

In [None]:

from pyspark.sql.functions import when, lit, to_date

# Simulate a null in City: blank out the City of customer 105, keep the rest.
customers_df = customers_df.withColumn(
    "City",
    when(col("CustomerID") == 105, None).otherwise(col("City")),
)

# Fill null City with "Unknown"
customers_df = customers_df.fillna({"City": "Unknown"})

# Label customers as "Loyal" if SignupDate is before 2022, else "New".
# The cutoff is built as a real date so the comparison is date-to-date instead
# of relying on Spark's implicit string/date coercion.
signup_cutoff = to_date(lit("2022-01-01"))
customers_df = customers_df.withColumn(
    "CustomerType",
    when(col("SignupDate") < signup_cutoff, "Loyal").otherwise("New"),
)

# Create OrderType column: "Low" if < 5000, "High" if >= 5000.
# Both ranges are spelled out on purpose: with a bare otherwise("High"), an
# order whose TotalAmount is null would be labelled "High". Here it stays null.
orders_df = orders_df.withColumn(
    "OrderType",
    when(col("TotalAmount") < 5000, "Low")
    .when(col("TotalAmount") >= 5000, "High"),
)


## Task 4: Joins & Aggregations

In [None]:

from pyspark.sql.functions import sum as _sum, count as _count, desc

# Combine each customer with their orders via the shared CustomerID column.
joined_df = customers_df.join(orders_df, on="CustomerID")

# Number of orders and total revenue for every city.
city_summary_df = joined_df.groupBy("City").agg(
    _count("*").alias("TotalOrders"),
    _sum("TotalAmount").alias("TotalRevenue"),
)
city_summary_df.show()

# The three customers with the highest total spend.
customer_spend_df = joined_df.groupBy("CustomerID", "Name").agg(
    _sum("TotalAmount").alias("TotalSpent")
)
customer_spend_df.orderBy(desc("TotalSpent")).show(3)

# Units sold per category, computed as the sum of Quantity.
category_units_df = joined_df.groupBy("Category").agg(
    _sum("Quantity").alias("TotalSold")
)
category_units_df.show()


+---------+-----------+------------+
|     City|TotalOrders|TotalRevenue|
+---------+-----------+------------+
|Bangalore|          1|      3500.0|
|   Mumbai|          2|    101200.0|
|  Unknown|          1|      2500.0|
|    Delhi|          2|     50000.0|
|Hyderabad|          1|      5000.0|
+---------+-----------+------------+

+----------+-----+----------+
|CustomerID| Name|TotalSpent|
+----------+-----+----------+
|       101|  Ali|  101200.0|
|       102| Neha|   50000.0|
|       104|Sneha|    5000.0|
+----------+-----+----------+
only showing top 3 rows

+-----------+---------+
|   Category|TotalSold|
+-----------+---------+
| Stationery|        5|
|Electronics|        5|
|  Furniture|        1|
| Appliances|        1|
+-----------+---------+



## Task 5: Spark SQL Tasks

In [None]:

# Make sure the "sales" database exists, then make it the current database so
# the unqualified table names below are created inside it.
spark.sql("CREATE DATABASE IF NOT EXISTS sales")
spark.catalog.setCurrentDatabase("sales")

# Persist both DataFrames as tables, replacing any existing table of that name.
for table_name, frame in (("customers", customers_df), ("orders", orders_df)):
    frame.write.saveAsTable(table_name, mode="overwrite")


In [None]:

# SQL Queries

# All orders placed by customers whose city is Delhi.
spark.sql("""
SELECT o.* FROM orders o
JOIN customers c ON o.CustomerID = c.CustomerID
WHERE c.City = 'Delhi'
""" ).show()

# Average order value per product category.
spark.sql("""
SELECT Category, AVG(TotalAmount) AS AvgOrderValue
FROM orders
GROUP BY Category
""" ).show()

# View with the summed TotalAmount per calendar month number of OrderDate.
spark.sql("""
CREATE OR REPLACE VIEW monthly_orders AS
SELECT MONTH(OrderDate) AS Month, SUM(TotalAmount) AS MonthlyTotal
FROM orders
GROUP BY MONTH(OrderDate)
""" )

# A GROUP BY result has no guaranteed row order, so sort explicitly to get the
# months displayed chronologically and identically on every run.
spark.sql("SELECT * FROM monthly_orders ORDER BY Month").show()


+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|     High|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|     High|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+

+-----------+-------------+
|   Category|AvgOrderValue|
+-----------+-------------+
| Stationery|       2500.0|
|Electronics|      37800.0|
|  Furniture|       3500.0|
| Appliances|       5000.0|
+-----------+-------------+

+-----+------------+
|Month|MonthlyTotal|
+-----+------------+
|    1|    101200.0|
|    3|     32500.0|
|    2|     28500.0|
+-----+------------+



## Task 6: String & Date Functions

In [None]:

from pyspark.sql.functions import regexp_replace, concat_ws, datediff, current_date, month

# Mask emails using regex. Email was dropped from customers_df earlier, so the
# raw CSV is reloaded to get the original column back.
customers_raw = spark.read.csv("customers.csv", header=True, inferSchema=True)

# Keep the first character and the "@domain" part, replace the rest with "***".
# Spark evaluates the replacement with Java regex rules, where capture groups
# are referenced as $1/$2. A Python-style \1 is read as an escaped literal "1",
# which would turn every email into the constant text "1***2".
customers_masked = customers_raw.withColumn(
    "MaskedEmail",
    regexp_replace("Email", r"(^.).*(@.*)", "$1***$2"),
)

# Concatenate Name and City as "Name from City"
customers_df = customers_df.withColumn("NameCity", concat_ws(" from ", "Name", "City"))

# Use datediff() to calculate customer age in days since SignupDate
customers_df = customers_df.withColumn("AgeInDays", datediff(current_date(), col("SignupDate")))

# Extract month name from OrderDate ("MMMM" is the full month name pattern)
from pyspark.sql.functions import date_format
orders_df = orders_df.withColumn("MonthName", date_format("OrderDate", "MMMM"))


## Task 7: UDFs and Complex Logic

In [None]:

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# UDF to tag customers
def customer_tag(spend):
    """Classify a customer's total spend into a tier label.

    Args:
        spend: Total amount spent by the customer, or None when the value
            is missing (Spark passes SQL nulls to Python UDFs as None).

    Returns:
        "Gold" when spend is above 50000, "Silver" when it is between 10000
        and 50000 inclusive, "Bronze" when it is below 10000, and None when
        spend is None.
    """
    # Comparing None with a number raises TypeError, which would fail the
    # whole Spark job; propagate the missing value instead.
    if spend is None:
        return None
    if spend > 50000:
        return "Gold"
    if spend >= 10000:
        return "Silver"
    return "Bronze"

# Wrap the Python tagging function as a Spark UDF that returns strings.
tag_udf = udf(customer_tag, returnType=StringType())

# Total spend per customer, followed by the tag the UDF assigns to that total.
spend_df = (
    joined_df
    .groupBy("CustomerID")
    .agg(_sum("TotalAmount").alias("TotalSpent"))
    .withColumn("CustomerTag", tag_udf(col("TotalSpent")))
)
spend_df.show()

# UDF to shorten product names: a name longer than three characters becomes its
# first three characters plus "..."; shorter names are returned unchanged.
# A null Product reaches Python as None and len(None) raises TypeError, so
# None is passed through as null instead of failing the job.
shorten_udf = udf(
    lambda name: None if name is None else (name[:3] + "..." if len(name) > 3 else name),
    StringType(),
)
orders_df = orders_df.withColumn("ShortProduct", shorten_udf("Product"))


+----------+----------+-----------+
|CustomerID|TotalSpent|CustomerTag|
+----------+----------+-----------+
|       101|  101200.0|       Gold|
|       103|    3500.0|     Bronze|
|       102|   50000.0|     Silver|
|       105|    2500.0|     Bronze|
|       104|    5000.0|     Bronze|
+----------+----------+-----------+



## Task 8: Parquet & Views

In [None]:

# Save joined result as Parquet (replacing any previous output at this path)
joined_df.write.mode("overwrite").parquet("joined_data.parquet")

# Read back Parquet file and verify schema
parquet_df = spark.read.parquet("joined_data.parquet")
parquet_df.printSchema()

# Create and query a global temp view (global temp views are addressed through
# the global_temp database)
parquet_df.createOrReplaceGlobalTempView("global_orders")
spark.sql("SELECT * FROM global_temp.global_orders LIMIT 5").show()

# Compare performance: CSV read vs Parquet read (basic timing comparison).
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() follows the system clock, which can be
# adjusted while the measurement is running.
import time

start_csv = time.perf_counter()
spark.read.csv("/content/orders (2).csv", header=True, inferSchema=True).count()
end_csv = time.perf_counter()

start_parquet = time.perf_counter()
spark.read.parquet("joined_data.parquet").count()
end_parquet = time.perf_counter()

print(f"CSV Read Time: {end_csv - start_csv:.2f}s")
print(f"Parquet Read Time: {end_parquet - start_parquet:.2f}s")


root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- OrderYear: integer (nullable = true)
 |-- OrderType: string (nullable = true)

+----------+-----+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|CustomerID| Name|     City|SignupDate|CustomerType|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+----------+-----+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|   

In [None]:

# Rebind shorten_udf: every non-empty name becomes its first three characters
# followed by "...", and a null or empty name becomes the empty string.
shorten_udf = udf(
    lambda product: (product[:3] + "...") if product else "",
    returnType=StringType(),
)
orders_df = orders_df.withColumn("ShortProduct", shorten_udf(col("Product")))

# Show the original and shortened names side by side.
orders_df.select(col("Product"), col("ShortProduct")).show()


+---------+------------+
|  Product|ShortProduct|
+---------+------------+
|   Laptop|      Lap...|
|    Mouse|      Mou...|
|   Tablet|      Tab...|
|Bookshelf|      Boo...|
|    Mixer|      Mix...|
| Notebook|      Not...|
|    Phone|      Pho...|
+---------+------------+



## Task 8 (revisited): Parquet & Views

In [None]:

# Write the joined data to Parquet, replacing whatever is already at the path.
parquet_path = "/tmp/joined_data.parquet"
joined_df.write.parquet(parquet_path, mode="overwrite")

# Load the file again, then print its schema and rows to check the round trip.
parquet_df = spark.read.parquet(parquet_path)
parquet_df.printSchema()
parquet_df.show()


root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- OrderYear: integer (nullable = true)
 |-- OrderType: string (nullable = true)

+----------+-----+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|CustomerID| Name|     City|SignupDate|CustomerType|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+----------+-----+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|   

In [None]:

# Register the Parquet-backed data as a global temp view; such views are
# queried through the global_temp database.
parquet_df.createOrReplaceGlobalTempView("global_joined_view")

# Number of orders per city, read from the view.
orders_per_city_df = spark.sql("SELECT City, COUNT(*) as Orders FROM global_temp.global_joined_view GROUP BY City")
orders_per_city_df.show()


+---------+------+
|     City|Orders|
+---------+------+
|Bangalore|     1|
|   Mumbai|     2|
|  Unknown|     1|
|    Delhi|     2|
|Hyderabad|     1|
+---------+------+



In [None]:

# Compare performance between CSV read and Parquet read (example via timing).
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() follows the system clock, which can be
# adjusted while the measurement is running.
import time

# count() forces the read to run, so each interval covers load plus count.
start_csv = time.perf_counter()
csv_test = spark.read.csv("/content/orders (2).csv", header=True, inferSchema=True).count()
end_csv = time.perf_counter()

start_parquet = time.perf_counter()
parquet_test = spark.read.parquet(parquet_path).count()
end_parquet = time.perf_counter()

print(f"CSV read time: {end_csv - start_csv:.4f} seconds")
print(f"Parquet read time: {end_parquet - start_parquet:.4f} seconds")


CSV read time: 0.8908 seconds
Parquet read time: 0.4739 seconds
