In [1]:
!pip install pyspark -q

from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("PySparkMasterTask") \
    .getOrCreate()

In [3]:
from google.colab import files
uploaded = files.upload()

Saving orders.csv to orders (1).csv
Saving customers.csv to customers.csv


In [4]:
# Load customers.csv
customers_df = spark.read.option("header", "true") \
                         .option("inferSchema", "true") \
                         .csv("customers.csv")

# Load orders.csv
orders_df = spark.read.option("header", "true") \
                      .option("inferSchema", "true") \
                      .csv("orders.csv")

# Show a preview
customers_df.show()
orders_df.show()

+----------+-----+-----------------+---------+----------+
|CustomerID| Name|            Email|     City|SignupDate|
+----------+-----+-----------------+---------+----------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|
+----------+-----+-----------------+---------+----------+

+-------+----------+---------+-----------+--------+-------+----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|
+-------+----------+---------+-----------+--------+-------+----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|
|      4|       103|Bookshelf|  Furniture|       1|

**TASK 1: Data Ingestion &** **Exploration**

In [6]:
# Columns and DataTypes
customers_df.printSchema()
orders_df.printSchema()

# Total Count
customers_df.count()
orders_df.count()

# Distinct Cities
customers_df.select("City").distinct().show()

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)

+---------+
|     City|
+---------+
|Bangalore|
|  Chennai|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



**Task 2: DataFrame Transformations**

In [7]:
from pyspark.sql.functions import col, year

# Add TotalAmount = Price * Quantity
orders_df = orders_df.withColumn("TotalAmount", col("Price") * col("Quantity"))

# Create OrderYear from OrderDate
orders_df = orders_df.withColumn("OrderYear", year(col("OrderDate")))

# Filter orders with TotalAmount > 10,000
high_value_orders_df = orders_df.filter(col("TotalAmount") > 10000)

# Drop Email column from customers
customers_no_email_df = customers_df.drop("Email")

# Show results
print("Orders with TotalAmount > 10,000:")
high_value_orders_df.show()

print("Customers DataFrame without Email column:")
customers_no_email_df.show()

Orders with TotalAmount > 10,000:
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+

Customers DataFrame without Email column:
+----------+-----+---------+----------+
|CustomerID| Name|     City|SignupDate|
+----------+-----+---------+----------+
|       101|  Ali|   Mumbai|2022-05-10|
|       102| Neha|    Delhi|2023-01-15|
|       103| Ravi|Bangalore|2021-11-01|
|       104|Sneha|Hyderabad|2020-07-22|
|     

**Task 3: Handling Nulls &** **Conditionals**

In [8]:
from pyspark.sql.functions import when, lit, to_date

# Simulate a null in City for one customer (e.g., CustomerID=103)
customers_null_city_df = customers_df.withColumn(
    "City",
    when(col("CustomerID") == 103, None).otherwise(col("City"))
)

# Fill nulls in City with "Unknown"
customers_filled_city_df = customers_null_city_df.fillna({"City": "Unknown"})

# Label customers as "Loyal" if SignupDate before 2022, else "New"
customers_labeled_df = customers_filled_city_df.withColumn(
    "CustomerLabel",
    when(to_date(col("SignupDate")) < lit("2022-01-01"), "Loyal").otherwise("New")
)

# Create OrderType: "Low" if TotalAmount < 5000, "High" if >= 5000
orders_labeled_df = orders_df.withColumn(
    "OrderType",
    when(col("TotalAmount") < 5000, "Low").otherwise("High")
)

# Show results
print("Customers with City null simulated and filled, and labeled:")
customers_labeled_df.show()

print("Orders with OrderType column:")
orders_labeled_df.show()


Customers with City null simulated and filled, and labeled:
+----------+-----+-----------------+---------+----------+-------------+
|CustomerID| Name|            Email|     City|SignupDate|CustomerLabel|
+----------+-----+-----------------+---------+----------+-------------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|          New|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|          New|
|       103| Ravi| ravi@hotmail.com|  Unknown|2021-11-01|        Loyal|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|        Loyal|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|          New|
+----------+-----+-----------------+---------+----------+-------------+

Orders with OrderType column:
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+-------+----------+---------+-----------+--------+----

**Task 4: Joins & Aggregations**

In [9]:
from pyspark.sql.functions import sum as spark_sum, desc, count

# Join customers and orders on CustomerID
joined_df = customers_df.join(orders_df, on="CustomerID", how="inner")

# Total orders and revenue per city
orders_per_city = joined_df.groupBy("City") \
                          .agg(
                              count("OrderID").alias("TotalOrders"),
                              spark_sum("TotalAmount").alias("TotalRevenue")
                          ) \
                          .orderBy(desc("TotalRevenue"))

# Top 3 customers by total spend
top_customers = joined_df.groupBy("CustomerID", "Name") \
                         .agg(spark_sum("TotalAmount").alias("TotalSpend")) \
                         .orderBy(desc("TotalSpend")) \
                         .limit(3)

# Count how many products each category has sold
products_sold = orders_df.groupBy("Category") \
                        .agg(spark_sum("Quantity").alias("TotalQuantity")) \
                        .orderBy(desc("TotalQuantity"))

# Show results
print("Total Orders and Revenue per City:")
orders_per_city.show()

print("Top 3 Customers by Total Spend:")
top_customers.show()

print("Total Products Sold per Category:")
products_sold.show()

Total Orders and Revenue per City:
+---------+-----------+------------+
|     City|TotalOrders|TotalRevenue|
+---------+-----------+------------+
|   Mumbai|          2|    101200.0|
|    Delhi|          2|     50000.0|
|Hyderabad|          1|      5000.0|
|Bangalore|          1|      3500.0|
|  Chennai|          1|      2500.0|
+---------+-----------+------------+

Top 3 Customers by Total Spend:
+----------+-----+----------+
|CustomerID| Name|TotalSpend|
+----------+-----+----------+
|       101|  Ali|  101200.0|
|       102| Neha|   50000.0|
|       104|Sneha|    5000.0|
+----------+-----+----------+

Total Products Sold per Category:
+-----------+-------------+
|   Category|TotalQuantity|
+-----------+-------------+
| Stationery|            5|
|Electronics|            5|
|  Furniture|            1|
| Appliances|            1|
+-----------+-------------+



**Task 5: Spark SQL Tasks**

In [10]:
# Create database sales and switch to it
spark.sql("CREATE DATABASE IF NOT EXISTS sales")
spark.sql("USE sales")

# Save customers and orders as tables
customers_df.write.mode("overwrite").saveAsTable("customers")
orders_df.write.mode("overwrite").saveAsTable("orders")

# 1. List all orders by customers from "Delhi"
delhi_orders = spark.sql("""
    SELECT o.*
    FROM orders o
    JOIN customers c ON o.CustomerID = c.CustomerID
    WHERE c.City = 'Delhi'
""")
print("Orders by customers from Delhi:")
delhi_orders.show()

# 2. Find average order value in each category
avg_order_value = spark.sql("""
    SELECT Category, AVG(Price * Quantity) as AvgOrderValue
    FROM orders
    GROUP BY Category
""")
print("Average order value per category:")
avg_order_value.show()

# 3. Create view monthly_orders with month-wise total amount
spark.sql("""
    CREATE OR REPLACE VIEW monthly_orders AS
    SELECT
        MONTH(OrderDate) as OrderMonth,
        SUM(Price * Quantity) as TotalAmount
    FROM orders
    GROUP BY MONTH(OrderDate)
""")

# Query the view
print("Monthly Orders Summary:")
spark.sql("SELECT * FROM monthly_orders ORDER BY OrderMonth").show()

Orders by customers from Delhi:
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+

Average order value per category:
+-----------+-------------+
|   Category|AvgOrderValue|
+-----------+-------------+
| Stationery|       2500.0|
|Electronics|      37800.0|
|  Furniture|       3500.0|
| Appliances|       5000.0|
+-----------+-------------+

Monthly Orders Summary:
+----------+-----------+
|OrderMonth|TotalAmount|
+----------+-----------+
|         1|   101200.0|
|         2|    28500.0|
|         3| 

**Task 6: String & Date Functions**

In [11]:
from pyspark.sql.functions import regexp_replace, concat_ws, datediff, current_date, month, date_format

# Mask emails (e.g., a***@gmail.com)
def mask_email(email):
    import re
    if email:
        return re.sub(r'^(.)(.*)(.@.*)$', lambda m: m.group(1) + '***' + m.group(3), email)
    return email

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

mask_email_udf = udf(mask_email, StringType())
customers_masked_email_df = customers_df.withColumn("MaskedEmail", mask_email_udf("Email"))

# Concatenate Name and City as "Name from City"
customers_concat_df = customers_masked_email_df.withColumn(
    "NameFromCity",
    concat_ws(" from ", col("Name"), col("City"))
)

# Calculate customer age in days (difference between current_date and SignupDate)
customers_age_df = customers_concat_df.withColumn(
    "CustomerAgeDays",
    datediff(current_date(), col("SignupDate"))
)

# Extract month name from OrderDate
orders_monthname_df = orders_df.withColumn(
    "OrderMonthName",
    date_format(col("OrderDate"), "MMMM")
)

# Show results
print("Customers with masked email, concatenated name and city, and customer age:")
customers_age_df.select("Name", "Email", "MaskedEmail", "NameFromCity", "SignupDate", "CustomerAgeDays").show()

print("Orders with extracted month name:")
orders_monthname_df.select("OrderID", "OrderDate", "OrderMonthName").show()


Customers with masked email, concatenated name and city, and customer age:
+-----+-----------------+-----------------+--------------------+----------+---------------+
| Name|            Email|      MaskedEmail|        NameFromCity|SignupDate|CustomerAgeDays|
+-----+-----------------+-----------------+--------------------+----------+---------------+
|  Ali|    ali@gmail.com|  a***i@gmail.com|     Ali from Mumbai|2022-05-10|           1126|
| Neha|   neha@yahoo.com|  n***a@yahoo.com|     Neha from Delhi|2023-01-15|            876|
| Ravi| ravi@hotmail.com|r***i@hotmail.com| Ravi from Bangalore|2021-11-01|           1316|
|Sneha|sneha@outlook.com|s***a@outlook.com|Sneha from Hyderabad|2020-07-22|           1783|
| Amit|   amit@gmail.com|  a***t@gmail.com|   Amit from Chennai|2023-03-10|            822|
+-----+-----------------+-----------------+--------------------+----------+---------------+

Orders with extracted month name:
+-------+----------+--------------+
|OrderID| OrderDate|OrderM

**Task 7: UDFs and Complex Logic**

In [12]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

# UDF to tag customers based on total spend
def tag_customer(spend):
    if spend > 50000:
        return "Gold"
    elif spend >= 10000:
        return "Silver"
    else:
        return "Bronze"

tag_customer_udf = udf(tag_customer, StringType())

# Calculate total spend per customer first
total_spend_df = orders_df.groupBy("CustomerID").agg({"TotalAmount": "sum"}) \
    .withColumnRenamed("sum(TotalAmount)", "TotalSpend")

# Join total spend with customers
customers_spend_df = customers_df.join(total_spend_df, on="CustomerID", how="left") \
    .fillna({"TotalSpend": 0})

# Apply tagging UDF
customers_tagged_df = customers_spend_df.withColumn("CustomerTag", tag_customer_udf("TotalSpend"))

# UDF to shorten product names (first 3 letters + "...")
def shorten_product(name):
    if name and len(name) > 3:
        return name[:3] + "..."
    return name

shorten_product_udf = udf(shorten_product, StringType())

# Apply product name shortening to orders
orders_shortened_df = orders_df.withColumn("ShortProduct", shorten_product_udf("Product"))

# Show results
print("Customers with Tags:")
customers_tagged_df.select("CustomerID", "Name", "TotalSpend", "CustomerTag").show()

print("Orders with Shortened Product Names:")
orders_shortened_df.select("OrderID", "Product", "ShortProduct").show()


Customers with Tags:
+----------+-----+----------+-----------+
|CustomerID| Name|TotalSpend|CustomerTag|
+----------+-----+----------+-----------+
|       101|  Ali|  101200.0|       Gold|
|       102| Neha|   50000.0|     Silver|
|       103| Ravi|    3500.0|     Bronze|
|       104|Sneha|    5000.0|     Bronze|
|       105| Amit|    2500.0|     Bronze|
+----------+-----+----------+-----------+

Orders with Shortened Product Names:
+-------+---------+------------+
|OrderID|  Product|ShortProduct|
+-------+---------+------------+
|      1|   Laptop|      Lap...|
|      2|    Mouse|      Mou...|
|      3|   Tablet|      Tab...|
|      4|Bookshelf|      Boo...|
|      5|    Mixer|      Mix...|
|      6| Notebook|      Not...|
|      7|    Phone|      Pho...|
+-------+---------+------------+



**Task 8: Parquet & Views**

In [13]:
# Save the joined DataFrame (customers + orders) as Parquet
joined_df.write.mode("overwrite").parquet("joined_customers_orders.parquet")

# Read it back and verify schema
parquet_df = spark.read.parquet("joined_customers_orders.parquet")
print("Schema of Parquet file:")
parquet_df.printSchema()

# Create a global temp view
parquet_df.createGlobalTempView("global_joined_view")

# Query the global temp view
print("Data from global temp view:")
spark.sql("SELECT * FROM global_temp.global_joined_view").show(5)

# Performance comparison: CSV vs Parquet
import time

# CSV read timing
start_csv = time.time()
csv_read_df = spark.read.option("header", "true").csv("customers.csv")
csv_read_df.count()  # Trigger action
end_csv = time.time()

# Parquet read timing
start_parquet = time.time()
parquet_read_df = spark.read.parquet("joined_customers_orders.parquet")
parquet_read_df.count()  # Trigger action
end_parquet = time.time()

print(f"CSV read time: {end_csv - start_csv:.4f} seconds")
print(f"Parquet read time: {end_parquet - start_parquet:.4f} seconds")


Schema of Parquet file:
root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- OrderYear: integer (nullable = true)

Data from global temp view:
+----------+-----+-----------------+---------+----------+-------+---------+-----------+--------+-------+----------+-----------+---------+
|CustomerID| Name|            Email|     City|SignupDate|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+----------+-----+-----------------+---------+----------+-------+---------+-----------+--------+-------+----------+-----------+---------+
|       101|  A