In [None]:
with open("customers.csv", "w") as f:
    f.write("""CustomerID,Name,Email,City,SignupDate
101,Ali,ali@gmail.com,Mumbai,2022-05-10
102,Neha,neha@yahoo.com,Delhi,2023-01-15
103,Ravi,ravi@hotmail.com,Bangalore,2021-11-01
104,Sneha,sneha@outlook.com,Hyderabad,2020-07-22
105,Amit,amit@gmail.com,Chennai,2023-03-10""")

with open("orders.csv", "w") as f:
    f.write("""OrderID,CustomerID,Product,Category,Quantity,Price,OrderDate
1,101,Laptop,Electronics,2,50000.0,2024-01-10
2,101,Mouse,Electronics,1,1200.0,2024-01-15
3,102,Tablet,Electronics,1,20000.0,2024-02-01
4,103,Bookshelf,Furniture,1,3500.0,2024-02-10
5,104,Mixer,Appliances,1,5000.0,2024-02-15
6,105,Notebook,Stationery,5,500.0,2024-03-01
7,102,Phone,Electronics,1,30000.0,2024-03-02""")


In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col


spark = SparkSession.builder.appName("CustomerOrders").getOrCreate()

# Load both CSV files with schema inference
customers_df = spark.read.csv("customers.csv", header=True, inferSchema=True)
orders_df = spark.read.csv("orders.csv", header=True, inferSchema=True)

# List all columns and data types
print("Customers Schema:")
customers_df.printSchema()

print("Orders Schema:")
orders_df.printSchema()

# Count total number of customers and orders

print("\nTotal Customers:", customers_df.count())
print("Total Orders:", orders_df.count())

# Show distinct cities
print("\nDistinct Cities:")
customers_df.select("City").distinct().show()


Customers Schema:
root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)

Orders Schema:
root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)


Total Customers: 5
Total Orders: 7

Distinct Cities:
+---------+
|     City|
+---------+
|Bangalore|
|  Chennai|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



In [7]:
#2. DataFrame Transformations
from pyspark.sql.functions import year

# Add a column TotalAmount = Price * Quantity
orders_df = orders_df.withColumn("TotalAmount", col("Price") * col("Quantity"))

# Create a new column OrderYear from OrderDate
orders_df = orders_df.withColumn("OrderYear", year(col("OrderDate")))
orders_df.show()

# Filter orders with TotalAmount > 10,000
filter_orders_df = orders_df.filter(col("TotalAmount") > 10000)
print("High-Value Orders:")
filter_orders_df.show()

# Drop the Email column from customers
customersDf = customers_df.drop("Email")
print("Customers without Email:")
customersDf.show()


+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|     2024|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|     2024|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+---------+-----------+--------+-------+----------+

In [41]:
#3. Handling Nulls & Conditionals
from pyspark.sql.functions import when, col, lit

# Simulate a null in City and fill with "Unknown"
customers_city = customers_df.withColumn(
    "City",when(col("SignupDate") < "2022-05-10", lit(None)).otherwise(col("City"))
)
customers = customers_city.fillna({"City": "Unknown"})
customers.show()

# Label customers as “Loyal” if SignupDate < 2022, else “New”
customers_label = customers.withColumn(
    "CustomerType", when(col("SignupDate") < "2022-01-01", "Loyal").otherwise("New")
)
customers_label.show()

# Create OrderType column: "Low" if TotalAmount < 5000, else "High"
# orders_df = orders_df.withColumn("TotalAmount", col("Price") * col("Quantity"))

orders_type = orders_df.withColumn(
    "OrderType",when(col("TotalAmount") < 5000, "Low").otherwise("High")
)
orders_type.show()


+----------+-----+-----------------+-------+----------+
|CustomerID| Name|            Email|   City|SignupDate|
+----------+-----+-----------------+-------+----------+
|       101|  Ali|    ali@gmail.com| Mumbai|2022-05-10|
|       102| Neha|   neha@yahoo.com|  Delhi|2023-01-15|
|       103| Ravi| ravi@hotmail.com|Unknown|2021-11-01|
|       104|Sneha|sneha@outlook.com|Unknown|2020-07-22|
|       105| Amit|   amit@gmail.com|Chennai|2023-03-10|
+----------+-----+-----------------+-------+----------+

+----------+-----+-----------------+-------+----------+------------+
|CustomerID| Name|            Email|   City|SignupDate|CustomerType|
+----------+-----+-----------------+-------+----------+------------+
|       101|  Ali|    ali@gmail.com| Mumbai|2022-05-10|         New|
|       102| Neha|   neha@yahoo.com|  Delhi|2023-01-15|         New|
|       103| Ravi| ravi@hotmail.com|Unknown|2021-11-01|       Loyal|
|       104|Sneha|sneha@outlook.com|Unknown|2020-07-22|       Loyal|
|       105|

In [16]:
#4.Joins & Aggregations

# Join customers and orders on CustomerID
joined_df = customers_df.join(orders_df, on="CustomerID", how="inner")

# Total orders and revenue per city

order = joined_df.groupBy("City").agg(
    {"OrderID": "count", "TotalAmount": "sum"}
).withColumnRenamed("count(OrderID)", "TotalOrders") \
 .withColumnRenamed("sum(TotalAmount)", "TotalRevenue")
order.show()

# Show top 3 customers by total spend

top_customers = joined_df.groupBy("CustomerID", "Name").agg(
    {"TotalAmount": "sum"}
).withColumnRenamed("sum(TotalAmount)", "TotalSpend") \
 .orderBy(col("TotalSpend").desc()) \
 .limit(3)
top_customers.show()

# Count how many products each category has sold

category_sales = orders_df.groupBy("Category").agg(
    {"Quantity": "sum"}
).withColumnRenamed("sum(Quantity)", "ProductsSold")
category_sales.show()

+---------+------------+-----------+
|     City|TotalRevenue|TotalOrders|
+---------+------------+-----------+
|Bangalore|      3500.0|          1|
|  Chennai|      2500.0|          1|
|   Mumbai|    101200.0|          2|
|    Delhi|     50000.0|          2|
|Hyderabad|      5000.0|          1|
+---------+------------+-----------+

+----------+-----+----------+
|CustomerID| Name|TotalSpend|
+----------+-----+----------+
|       101|  Ali|  101200.0|
|       102| Neha|   50000.0|
|       104|Sneha|    5000.0|
+----------+-----+----------+

+-----------+------------+
|   Category|ProductsSold|
+-----------+------------+
| Stationery|           5|
|Electronics|           5|
|  Furniture|           1|
| Appliances|           1|
+-----------+------------+



In [22]:
#5. Spark SQL Tasks
# Create database and switch to it
spark.sql("CREATE DATABASE IF NOT EXISTS sales")
spark.sql("USE sales")

# Save both datasets as tables in the sales database
customers_df.write.mode("overwrite").saveAsTable("sales.customers")
orders_df.write.mode("overwrite").saveAsTable("sales.orders")


+----------+-----+-----------------+---------+----------+
|CustomerID| Name|            Email|     City|SignupDate|
+----------+-----+-----------------+---------+----------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|
+----------+-----+-----------------+---------+----------+



In [24]:
# List all orders by customers from “Delhi”
spark.sql("""
SELECT o.*
FROM orders o
JOIN customers c ON o.CustomerID = c.CustomerID
WHERE c.City = 'Delhi'
""").show()

# Find average order value in each category
spark.sql("""
SELECT Category, ROUND(AVG(TotalAmount), 2) AS AvgOrderValue
FROM orders
GROUP BY Category
""").show()

# Create a view `monthly_orders` with month-wise total amount
# Create a TEMPORARY view instead of a persistent one
spark.sql("""
CREATE OR REPLACE TEMP VIEW monthly_orders AS
SELECT
    DATE_FORMAT(OrderDate, 'yyyy-MM') AS Month,
    SUM(TotalAmount) AS TotalMonthlyAmount
FROM orders
GROUP BY DATE_FORMAT(OrderDate, 'yyyy-MM')
""")

spark.sql("SELECT * FROM monthly_orders").show()


+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+

+-----------+-------------+
|   Category|AvgOrderValue|
+-----------+-------------+
| Stationery|       2500.0|
|Electronics|      37800.0|
|  Furniture|       3500.0|
| Appliances|       5000.0|
+-----------+-------------+

+-------+------------------+
|  Month|TotalMonthlyAmount|
+-------+------------------+
|2024-02|           28500.0|
|2024-03|           32500.0|
|2024-01|          101200.0|
+-------+------------------+



In [31]:
#6. String & Date Functions

from pyspark.sql.functions import regexp_replace

#Mask emails using regex (e.g., a***@gmail.com ).
customers_email = customers_df.withColumn(
    "MaskedEmail",regexp_replace("Email", r"(^.).*(@.*)", r"\1***\2")
)
customers_email.show()

# Concatenate Name and City as “Name from City”

from pyspark.sql.functions import concat_ws, col
customers = customers.withColumn(
    "NameFromCity",concat_ws(" from ", col("Name"), col("City"))
)
customers.show()

# Use datediff() to calculate customer age in days.

from pyspark.sql.functions import datediff, current_date, to_date

customers = customers.withColumn(
    "CustomerAgeDays",datediff(current_date(), to_date(col("SignupDate")))
)
customers.show()

# Extract month name from OrderDate
from pyspark.sql.functions import date_format, to_date

orders_with_month = orders_df.withColumn(
    "OrderMonthName",
    date_format(to_date(col("OrderDate")), "MMMM")
)

orders_with_month.select("OrderId","OrderDate", "OrderMonthName").show()

+----------+-----+-----------------+---------+----------+-----------+
|CustomerID| Name|            Email|     City|SignupDate|MaskedEmail|
+----------+-----+-----------------+---------+----------+-----------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|      1***2|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|      1***2|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|      1***2|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|      1***2|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|      1***2|
+----------+-----+-----------------+---------+----------+-----------+

+----------+-----+-----------------+-------+----------+------------------+---------------+
|CustomerID| Name|            Email|   City|SignupDate|      NameFromCity|CustomerAgeDays|
+----------+-----+-----------------+-------+----------+------------------+---------------+
|       101|  Ali|    ali@gmail.com| Mumbai|2022-05-10|   Ali from Mumbai|           1126|
|    

In [34]:
# 7.UDFs and Complex Logic
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Write a UDF to tag customers:
# “Gold” if spend >
# 50K, “Silver” if 10K–50K, “Bronze” if <10K.

def tag_customer(spend):
    if spend > 50000:
        return "Gold"
    elif spend >= 10000:
        return "Silver"
    else:
        return "Bronze"

tag_udf = udf(tag_customer, StringType())

customer_spend_df = orders_df.groupBy("CustomerID").sum("TotalAmount") \
    .withColumnRenamed("sum(TotalAmount)", "TotalSpend")

tag_customers_df = customer_spend_df.withColumn("Tag", tag_udf(col("TotalSpend")))

tag_customers_df.show()


+----------+----------+------+
|CustomerID|TotalSpend|   Tag|
+----------+----------+------+
|       101|  101200.0|  Gold|
|       103|    3500.0|Bronze|
|       102|   50000.0|Silver|
|       105|    2500.0|Bronze|
|       104|    5000.0|Bronze|
+----------+----------+------+



In [35]:
def shorten_name(name):
    return name[:3] + "..." if len(name) > 3 else name

shorten_udf = udf(shorten_name, StringType())

orders_shorten= orders_df.withColumn("ShortProduct", shorten_udf(col("Product")))

orders_shorten.select("Product", "ShortProduct").show()


+---------+------------+
|  Product|ShortProduct|
+---------+------------+
|   Laptop|      Lap...|
|    Mouse|      Mou...|
|   Tablet|      Tab...|
|Bookshelf|      Boo...|
|    Mixer|      Mix...|
| Notebook|      Not...|
|    Phone|      Pho...|
+---------+------------+



In [42]:
# 8.Parquet & Views
# Save the joined result as a Parquet file.
joined_df = customers_df.join(orders_df, on="CustomerID", how="inner")

joined_df.write.mode("overwrite").parquet("joined_orders.parquet")


In [43]:
# Read it back and verify schema.
parquet_df = spark.read.parquet("joined_orders.parquet")
parquet_df.printSchema()
parquet_df.show()


root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- OrderYear: integer (nullable = true)

+----------+-----+-----------------+---------+----------+-------+---------+-----------+--------+-------+----------+-----------+---------+
|CustomerID| Name|            Email|     City|SignupDate|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+----------+-----+-----------------+---------+----------+-------+---------+-----------+--------+-------+----------+-----------+---------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|      1|  

In [44]:
# Create and query a global temp view.
parquet_df.createOrReplaceGlobalTempView("global_joined_orders")

spark.sql("select * from global_temp.global_joined_orders where TotalAmount > 10000").show()


+----------+----+--------------+------+----------+-------+-------+-----------+--------+-------+----------+-----------+---------+
|CustomerID|Name|         Email|  City|SignupDate|OrderID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+----------+----+--------------+------+----------+-------+-------+-----------+--------+-------+----------+-----------+---------+
|       101| Ali| ali@gmail.com|Mumbai|2022-05-10|      1| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|       102|Neha|neha@yahoo.com| Delhi|2023-01-15|      3| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|       102|Neha|neha@yahoo.com| Delhi|2023-01-15|      7|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+----------+----+--------------+------+----------+-------+-------+-----------+--------+-------+----------+-----------+---------+



In [45]:
import time

# Compare performance between CSV read and Parquet read.
start_csv = time.time()
csv_df = spark.read.csv("orders.csv", header=True, inferSchema=True)
csv_df.count()
end_csv = time.time()

start_parquet = time.time()
parquet_df = spark.read.parquet("joined_orders.parquet")
parquet_df.count()
end_parquet = time.time()

print(f"CSV Read Time: {end_csv - start_csv:.4f} sec")
print(f"Parquet Read Time: {end_parquet - start_parquet:.4f} sec")


CSV Read Time: 1.5067 sec
Parquet Read Time: 0.7294 sec
