In [5]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("PySparkAssessment").getOrCreate()


In [2]:
from google.colab import files

# This opens a file upload dialog in Colab
uploaded = files.upload()


Saving orders.csv to orders.csv
Saving customers.csv to customers.csv


In [3]:
!ls


customers.csv  orders.csv  sample_data


In [15]:
# Load datasets
customers_df = spark.read.option("header", True).option("inferSchema", True).csv("customers.csv")
orders_df = spark.read.option("header", True).option("inferSchema", True).csv("orders.csv")

# View data
customers_df.show()
orders_df.show()


+----------+-----+-----------------+---------+----------+
|CustomerID| Name|            Email|     City|SignupDate|
+----------+-----+-----------------+---------+----------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|
+----------+-----+-----------------+---------+----------+

+-------+----------+---------+-----------+--------+-------+----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|
+-------+----------+---------+-----------+--------+-------+----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|
|      4|       103|Bookshelf|  Furniture|       1|

#1. Data Ingestion & Exploration

In [7]:

# List schema
# List all columns and data types
customers_df.printSchema()
orders_df.printSchema()

# Count total customers and orders
print("Total customers:", customers_df.count())
print("Total orders:", orders_df.count())

# Show distinct cities
customers_df.select("City").distinct().show()


root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)

Total customers: 5
Total orders: 7
+---------+
|     City|
+---------+
|Bangalore|
|  Chennai|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



#2. DataFrame Transformations

In [8]:
from pyspark.sql.functions import col, year

# Add TotalAmount column
orders_df = orders_df.withColumn("TotalAmount", col("Price") * col("Quantity"))

# Extract OrderYear from OrderDate
orders_df = orders_df.withColumn("OrderYear", year("OrderDate"))

# Filter orders where TotalAmount > 10,000
orders_df.filter(col("TotalAmount") > 10000).show()

# Drop the Email column
customers_df = customers_df.drop("Email")


+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+



#3. Handling Nulls & Conditionals

In [11]:
from pyspark.sql.functions import when, to_date, col

# Simulate null in City
customers_df = customers_df.withColumn("City", when(col("CustomerID") == 102, None).otherwise(col("City")))
customers_df = customers_df.fillna("Unknown", subset=["City"])

# Show to confirm null was replaced with "Unknown"
print(" City column after simulating NULL and replacing with 'Unknown':")
customers_df.select("CustomerID", "Name", "City").show()

# Label customers
customers_df = customers_df.withColumn(
    "CustomerType",
    when(to_date("SignupDate") < "2022-01-01", "Loyal").otherwise("New")
)

# Show to confirm CustomerType
print("CustomerType based on SignupDate:")
customers_df.select("CustomerID", "SignupDate", "CustomerType").show()

# Create OrderType column in orders
orders_df = orders_df.withColumn(
    "OrderType",
    when(col("TotalAmount") < 5000, "Low").otherwise("High")
)

# Show to confirm OrderType
print("OrderType based on TotalAmount:")
orders_df.select("OrderID", "TotalAmount", "OrderType").show()


 City column after simulating NULL and replacing with 'Unknown':
+----------+-----+---------+
|CustomerID| Name|     City|
+----------+-----+---------+
|       101|  Ali|   Mumbai|
|       102| Neha|  Unknown|
|       103| Ravi|Bangalore|
|       104|Sneha|Hyderabad|
|       105| Amit|  Chennai|
+----------+-----+---------+

CustomerType based on SignupDate:
+----------+----------+------------+
|CustomerID|SignupDate|CustomerType|
+----------+----------+------------+
|       101|2022-05-10|         New|
|       102|2023-01-15|         New|
|       103|2021-11-01|       Loyal|
|       104|2020-07-22|       Loyal|
|       105|2023-03-10|         New|
+----------+----------+------------+

OrderType based on TotalAmount:
+-------+-----------+---------+
|OrderID|TotalAmount|OrderType|
+-------+-----------+---------+
|      1|   100000.0|     High|
|      2|     1200.0|      Low|
|      3|    20000.0|     High|
|      4|     3500.0|      Low|
|      5|     5000.0|     High|
|      6|     250

 # 4. Joins & Aggregations

In [12]:
from pyspark.sql.functions import sum as _sum, count as _count

# Join customers and orders on CustomerID
joined_df = customers_df.join(orders_df, on="CustomerID", how="inner")

# Total orders and revenue per city
joined_df.groupBy("City") \
    .agg(_count("OrderID").alias("TotalOrders"), _sum("TotalAmount").alias("TotalRevenue")) \
    .show()

# Top 3 customers by total spend
joined_df.groupBy("CustomerID", "Name") \
    .agg(_sum("TotalAmount").alias("TotalSpend")) \
    .orderBy("TotalSpend", ascending=False) \
    .show(3)

# Count of products sold per category
orders_df.groupBy("Category") \
    .agg(_sum("Quantity").alias("TotalProductsSold")) \
    .show()


+---------+-----------+------------+
|     City|TotalOrders|TotalRevenue|
+---------+-----------+------------+
|Bangalore|          1|      3500.0|
|  Chennai|          1|      2500.0|
|   Mumbai|          2|    101200.0|
|  Unknown|          2|     50000.0|
|Hyderabad|          1|      5000.0|
+---------+-----------+------------+

+----------+-----+----------+
|CustomerID| Name|TotalSpend|
+----------+-----+----------+
|       101|  Ali|  101200.0|
|       102| Neha|   50000.0|
|       104|Sneha|    5000.0|
+----------+-----+----------+
only showing top 3 rows

+-----------+-----------------+
|   Category|TotalProductsSold|
+-----------+-----------------+
| Stationery|                5|
|Electronics|                5|
|  Furniture|                1|
| Appliances|                1|
+-----------+-----------------+



 # 5. Spark SQL Tasks

In [16]:
from pyspark.sql.functions import col, month

# Register temp views
customers_df.createOrReplaceTempView("customers")
orders_df.createOrReplaceTempView("orders")

# Create and use 'sales' database
spark.sql("CREATE DATABASE IF NOT EXISTS sales")
spark.sql("USE sales")

# Add TotalAmount column to orders_df before saving
orders_df = orders_df.withColumn("TotalAmount", col("Price") * col("Quantity"))

# Save as tables in sales database
customers_df.write.mode("overwrite").saveAsTable("sales.customers")
orders_df.write.mode("overwrite").saveAsTable("sales.orders")

# 1. Orders by customers from “Delhi”
spark.sql("""
    SELECT o.*
    FROM sales.orders o
    JOIN sales.customers c ON o.CustomerID = c.CustomerID
    WHERE c.City = 'Delhi'
""").show()

#  2. Average order value in each category
spark.sql("""
    SELECT Category, ROUND(AVG(TotalAmount), 2) AS AvgOrderValue
    FROM sales.orders
    GROUP BY Category
""").show()

#  3. Create monthly_orders view
spark.sql("""
    CREATE OR REPLACE TEMP VIEW monthly_orders AS
    SELECT MONTH(OrderDate) AS OrderMonth,
           SUM(TotalAmount) AS TotalAmount
    FROM sales.orders
    GROUP BY MONTH(OrderDate)
""")

# Show monthly_orders view
spark.sql("SELECT * FROM monthly_orders ORDER BY OrderMonth").show()


+-------+----------+-------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+-------+-----------+--------+-------+----------+-----------+
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+-------+-----------+--------+-------+----------+-----------+

+-----------+-------------+
|   Category|AvgOrderValue|
+-----------+-------------+
| Stationery|       2500.0|
|Electronics|      37800.0|
|  Furniture|       3500.0|
| Appliances|       5000.0|
+-----------+-------------+

+----------+-----------+
|OrderMonth|TotalAmount|
+----------+-----------+
|         1|   101200.0|
|         2|    28500.0|
|         3|    32500.0|
+----------+-----------+



# 6. String & Date Functions

In [17]:
from pyspark.sql.functions import regexp_replace, concat_ws, datediff, current_date, month, date_format

# Mask emails using regex
masked_df = customers_df.withColumn("MaskedEmail", regexp_replace("Email", r"(^\w)[^@]*(@.*)", r"\1***\2"))
masked_df.select("CustomerID", "Email", "MaskedEmail").show()

# Concatenate Name and City
masked_df = masked_df.withColumn("NameFromCity", concat_ws(" from ", "Name", "City"))
masked_df.select("NameFromCity").show()

# Customer age in days
masked_df = masked_df.withColumn("CustomerAgeDays", datediff(current_date(), "SignupDate"))
masked_df.select("CustomerID", "SignupDate", "CustomerAgeDays").show()

# Extract month name from OrderDate
orders_df = orders_df.withColumn("MonthName", date_format("OrderDate", "MMMM"))
orders_df.select("OrderID", "OrderDate", "MonthName").show()


+----------+-----------------+-----------+
|CustomerID|            Email|MaskedEmail|
+----------+-----------------+-----------+
|       101|    ali@gmail.com|      1***2|
|       102|   neha@yahoo.com|      1***2|
|       103| ravi@hotmail.com|      1***2|
|       104|sneha@outlook.com|      1***2|
|       105|   amit@gmail.com|      1***2|
+----------+-----------------+-----------+

+--------------------+
|        NameFromCity|
+--------------------+
|     Ali from Mumbai|
|     Neha from Delhi|
| Ravi from Bangalore|
|Sneha from Hyderabad|
|   Amit from Chennai|
+--------------------+

+----------+----------+---------------+
|CustomerID|SignupDate|CustomerAgeDays|
+----------+----------+---------------+
|       101|2022-05-10|           1126|
|       102|2023-01-15|            876|
|       103|2021-11-01|           1316|
|       104|2020-07-22|           1783|
|       105|2023-03-10|            822|
+----------+----------+---------------+

+-------+----------+---------+
|OrderID| Or

# 7. UDFs and Complex Logic

In [18]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# UDF to tag customers by spend
def tag_customer(spend):
    if spend > 50000:
        return "Gold"
    elif spend >= 10000:
        return "Silver"
    else:
        return "Bronze"

tag_udf = udf(tag_customer, StringType())

# Calculate total spend and assign tag
customer_spend_df = joined_df.groupBy("CustomerID", "Name") \
    .agg(_sum("TotalAmount").alias("TotalSpend")) \
    .withColumn("CustomerTag", tag_udf("TotalSpend"))

customer_spend_df.show()

# UDF to shorten product names
def short_product(name):
    return name[:3] + "..." if name else None

short_udf = udf(short_product, StringType())

orders_df = orders_df.withColumn("ShortProduct", short_udf("Product"))
orders_df.select("Product", "ShortProduct").show()


+----------+-----+----------+-----------+
|CustomerID| Name|TotalSpend|CustomerTag|
+----------+-----+----------+-----------+
|       105| Amit|    2500.0|     Bronze|
|       104|Sneha|    5000.0|     Bronze|
|       101|  Ali|  101200.0|       Gold|
|       102| Neha|   50000.0|     Silver|
|       103| Ravi|    3500.0|     Bronze|
+----------+-----+----------+-----------+

+---------+------------+
|  Product|ShortProduct|
+---------+------------+
|   Laptop|      Lap...|
|    Mouse|      Mou...|
|   Tablet|      Tab...|
|Bookshelf|      Boo...|
|    Mixer|      Mix...|
| Notebook|      Not...|
|    Phone|      Pho...|
+---------+------------+



# 8. Parquet & Views

In [19]:
# Save joined_df as Parquet
joined_df.write.mode("overwrite").parquet("joined_output.parquet")

# Read it back
parquet_df = spark.read.parquet("joined_output.parquet")
parquet_df.printSchema()
parquet_df.show()

# Create global temp view
parquet_df.createOrReplaceGlobalTempView("global_joined")

# Query global temp view
spark.sql("SELECT * FROM global_temp.global_joined LIMIT 5").show()

# Compare CSV vs Parquet read times (timing logic optional)
import time

start_csv = time.time()
_ = spark.read.option("header", True).csv("orders.csv").count()
end_csv = time.time()

start_parquet = time.time()
_ = spark.read.parquet("joined_output.parquet").count()
end_parquet = time.time()

print("CSV Read Time:", end_csv - start_csv)
print("Parquet Read Time:", end_parquet - start_parquet)


root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- OrderYear: integer (nullable = true)
 |-- OrderType: string (nullable = true)

+----------+-----+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|CustomerID| Name|     City|SignupDate|CustomerType|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+----------+-----+---------+----------+------------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|   