<a href="https://colab.research.google.com/github/Rohitnik2266/Data_Warehouse_Training/blob/main/June%209/PySpark_Assessment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SparkMasterTask").getOrCreate()


In [9]:
#1. Data Ingestion & Exploration
# Load CSVs with schema inference
customers_df = spark.read.option("header", True).option("inferSchema", True).csv("customers.csv")
orders_df = spark.read.option("header", True).option("inferSchema", True).csv("orders.csv")


In [10]:
customers_df.printSchema()
orders_df.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: string (nullable = true)

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: string (nullable = true)



In [11]:
print("Total Customers:", customers_df.count())
print("Total Orders:", orders_df.count())

Total Customers: 5
Total Orders: 7


In [12]:
customers_df.select("City").distinct().show()

+---------+
|     City|
+---------+
|Bangalore|
|  Chennai|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



In [17]:
#2. DataFrame Transformations
from pyspark.sql.functions import col, year

orders_df = orders_df.withColumn("TotalAmount", col("Quantity") * col("Price"))

In [18]:
orders_df = orders_df.withColumn("OrderYear", year(col("OrderDate")))

In [19]:
orders_df.filter(col("TotalAmount") > 10000).show()

+-------+----------+-------+-----------+--------+-----+---------+-----------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|Price|OrderDate|TotalAmount|OrderYear|
+-------+----------+-------+-----------+--------+-----+---------+-----------+---------+
|      1|       101| Laptop|Electronics|       2|50000|1/10/2024|     100000|     NULL|
|      3|       102| Tablet|Electronics|       1|20000| 2/1/2024|      20000|     NULL|
|      7|       102|  Phone|Electronics|       1|30000| 3/2/2024|      30000|     NULL|
+-------+----------+-------+-----------+--------+-----+---------+-----------+---------+



In [20]:
customers_df = customers_df.drop("Email")

In [21]:
#3. Handling Nulls & Conditionals
from pyspark.sql.functions import when, lit, to_date

customers_df = customers_df.withColumn("City", when(col("CustomerID") == 105, None).otherwise(col("City")))


In [22]:
customers_df = customers_df.fillna({"City": "Unknown"})

In [23]:
customers_df = customers_df.withColumn("CustomerType", when(to_date("SignupDate") < "2022-01-01", "Loyal").otherwise("New"))

In [24]:
orders_df = orders_df.withColumn(
    "OrderType",
    when(col("TotalAmount") < 5000, "Low").otherwise("High")
)

In [25]:
#4. Joins & Aggregations

joined_df = orders_df.join(customers_df, on="CustomerID", how="inner")


In [26]:
from pyspark.sql.functions import sum as _sum, count as _count

joined_df.groupBy("City").agg(
    _count("OrderID").alias("TotalOrders"),
    _sum("TotalAmount").alias("TotalRevenue")
).show()

+---------+-----------+------------+
|     City|TotalOrders|TotalRevenue|
+---------+-----------+------------+
|Bangalore|          1|        3500|
|   Mumbai|          2|      101200|
|  Unknown|          1|        2500|
|    Delhi|          2|       50000|
|Hyderabad|          1|        5000|
+---------+-----------+------------+



In [27]:
joined_df.groupBy("CustomerID", "Name").agg(_sum("TotalAmount").alias("TotalSpend"))\
    .orderBy(col("TotalSpend").desc()).limit(3).show()

+----------+-----+----------+
|CustomerID| Name|TotalSpend|
+----------+-----+----------+
|       101|  Ali|    101200|
|       102| Neha|     50000|
|       104|Sneha|      5000|
+----------+-----+----------+



In [28]:
orders_df.groupBy("Category").agg(_sum("Quantity").alias("ProductsSold")).show()

+-----------+------------+
|   Category|ProductsSold|
+-----------+------------+
| Stationery|           5|
|Electronics|           5|
|  Furniture|           1|
| Appliances|           1|
+-----------+------------+



In [30]:
#5. Spark SQL Tasks

spark.sql("CREATE DATABASE IF NOT EXISTS sales")
spark.sql("USE sales")

DataFrame[]

In [31]:
customers_df.write.mode("overwrite").saveAsTable("sales.customers")
orders_df.write.mode("overwrite").saveAsTable("sales.orders")

In [32]:
spark.sql("""
    SELECT * FROM orders o JOIN customers c ON o.CustomerID = c.CustomerID
    WHERE c.City = 'Delhi'
""").show()

spark.sql("""
    SELECT Category, AVG(TotalAmount) AS AvgOrderValue
    FROM orders
    GROUP BY Category
""").show()

spark.sql("""
    CREATE OR REPLACE VIEW monthly_orders AS
    SELECT MONTH(OrderDate) AS Month, SUM(TotalAmount) AS MonthlyTotal
    FROM orders
    GROUP BY MONTH(OrderDate)
""")
spark.sql("SELECT * FROM monthly_orders").show()

+-------+----------+-------+-----------+--------+-----+---------+-----------+---------+---------+----------+----+-----+----------+------------+
|OrderID|CustomerID|Product|   Category|Quantity|Price|OrderDate|TotalAmount|OrderYear|OrderType|CustomerID|Name| City|SignupDate|CustomerType|
+-------+----------+-------+-----------+--------+-----+---------+-----------+---------+---------+----------+----+-----+----------+------------+
|      3|       102| Tablet|Electronics|       1|20000| 2/1/2024|      20000|     NULL|     High|       102|Neha|Delhi| 1/15/2023|         New|
|      7|       102|  Phone|Electronics|       1|30000| 3/2/2024|      30000|     NULL|     High|       102|Neha|Delhi| 1/15/2023|         New|
+-------+----------+-------+-----------+--------+-----+---------+-----------+---------+---------+----------+----+-----+----------+------------+

+-----------+-------------+
|   Category|AvgOrderValue|
+-----------+-------------+
| Stationery|       2500.0|
|Electronics|      3780

In [40]:
#String & Date Functions

from pyspark.sql import Row

# Recreate customers_df with Email column for masking
customers_with_email = spark.createDataFrame([
    Row(CustomerID=101, Name="Ali", Email="ali@gmail.com", City="Mumbai", SignupDate="2022-05-10"),
    Row(CustomerID=102, Name="Neha", Email="neha@yahoo.com", City="Delhi", SignupDate="2023-01-15"),
    Row(CustomerID=103, Name="Ravi", Email="ravi@hotmail.com", City="Bangalore", SignupDate="2021-11-01"),
    Row(CustomerID=104, Name="Sneha", Email="sneha@outlook.com", City="Hyderabad", SignupDate="2020-07-22"),
    Row(CustomerID=105, Name="Amit", Email="amit@gmail.com", City="Chennai", SignupDate="2023-03-10")
])


In [42]:

from pyspark.sql.functions import regexp_replace, concat_ws, datediff, current_date, month, date_format

customers_masked = customers_with_email.withColumn("MaskedEmail", regexp_replace("Email", r'(^.)(.*)(@.*)', r'\1***\3'))


In [43]:
customers_df = customers_df.withColumn("Description", concat_ws(" from ", "Name", "City"))


In [44]:
customers_df = customers_df.withColumn("AgeInDays", datediff(current_date(), to_date("SignupDate")))


In [45]:
orders_df = orders_df.withColumn("MonthName", date_format("OrderDate", "MMMM"))

In [46]:
#7. UDFs and Complex Logic
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def tag_customer(spend):
    if spend > 50000:
        return "Gold"
    elif spend >= 10000:
        return "Silver"
    else:
        return "Bronze"

tag_udf = udf(tag_customer, StringType())

customer_spend = joined_df.groupBy("CustomerID").agg(_sum("TotalAmount").alias("Spend"))
customer_spend = customer_spend.withColumn("Tag", tag_udf(col("Spend")))
customer_spend.show()


+----------+------+------+
|CustomerID| Spend|   Tag|
+----------+------+------+
|       101|101200|  Gold|
|       103|  3500|Bronze|
|       102| 50000|Silver|
|       105|  2500|Bronze|
|       104|  5000|Bronze|
+----------+------+------+



In [47]:
def shorten_name(name):
    return name[:3] + "..." if name else None

shorten_udf = udf(shorten_name, StringType())

orders_df = orders_df.withColumn("ShortProduct", shorten_udf(col("Product")))
orders_df.select("Product", "ShortProduct").show()

+---------+------------+
|  Product|ShortProduct|
+---------+------------+
|   Laptop|      Lap...|
|    Mouse|      Mou...|
|   Tablet|      Tab...|
|Bookshelf|      Boo...|
|    Mixer|      Mix...|
| Notebook|      Not...|
|    Phone|      Pho...|
+---------+------------+



In [48]:
#Parquet & Views

joined_df.write.mode("overwrite").parquet("joined_orders.parquet")


In [49]:
parquet_df = spark.read.parquet("joined_orders.parquet")
parquet_df.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: string (nullable = true)
 |-- TotalAmount: integer (nullable = true)
 |-- OrderYear: integer (nullable = true)
 |-- OrderType: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: string (nullable = true)
 |-- CustomerType: string (nullable = true)



In [50]:
parquet_df.createOrReplaceGlobalTempView("global_joined")


In [51]:
spark.sql("SELECT * FROM global_temp.global_joined LIMIT 5").show()


+----------+-------+---------+-----------+--------+-----+---------+-----------+---------+---------+-----+---------+----------+------------+
|CustomerID|OrderID|  Product|   Category|Quantity|Price|OrderDate|TotalAmount|OrderYear|OrderType| Name|     City|SignupDate|CustomerType|
+----------+-------+---------+-----------+--------+-----+---------+-----------+---------+---------+-----+---------+----------+------------+
|       101|      1|   Laptop|Electronics|       2|50000|1/10/2024|     100000|     NULL|     High|  Ali|   Mumbai| 5/10/2022|         New|
|       101|      2|    Mouse|Electronics|       1| 1200|1/15/2024|       1200|     NULL|      Low|  Ali|   Mumbai| 5/10/2022|         New|
|       102|      3|   Tablet|Electronics|       1|20000| 2/1/2024|      20000|     NULL|     High| Neha|    Delhi| 1/15/2023|         New|
|       103|      4|Bookshelf|  Furniture|       1| 3500|2/10/2024|       3500|     NULL|      Low| Ravi|Bangalore| 11/1/2021|         New|
|       104|      5|

In [52]:
import time

start = time.time()
spark.read.option("inferSchema", True).csv("orders.csv").count()
print("CSV Read Time:", time.time() - start)

start = time.time()
spark.read.parquet("joined_orders.parquet").count()
print("Parquet Read Time:", time.time() - start)

CSV Read Time: 1.013702630996704
Parquet Read Time: 0.5349557399749756
