#**Assignment-2**

In [0]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import Window as W
from pyspark.sql.types import *

In [0]:
spark = SparkSession.builder.appName("dbshell-01").getOrCreate()

# **Basics**

In [0]:
# 1. Load retail_data.csv into a PySpark DataFrame and display schema.
dfRet = spark.read.csv("/FileStore/tables/retail_data.csv", header=True, inferSchema=True)
dfRet.printSchema()

root
 |-- TransactionID: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- TotalPrice: integer (nullable = true)
 |-- TransactionData: date (nullable = true)



In [0]:
# 2. Infer schema as False — then manually cast columns.
schema = StructType(
    [
        StructField("TransactionID", StringType()),
        StructField("Customer", StringType()),
        StructField("City", StringType()),
        StructField("Product", StringType()),
        StructField("Category", StringType()),
        StructField("Quantity", IntegerType()),
        StructField("UnitPrice", IntegerType()),
        StructField("TotalPrice", IntegerType()),
        StructField("TransactionData", DateType())
    ]
    )
dfschema = spark.read.csv("/FileStore/tables/retail_data.csv", header=True, schema=schema)
dfschema.printSchema()

root
 |-- TransactionID: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- TotalPrice: integer (nullable = true)
 |-- TransactionData: date (nullable = true)



# **Data Exploration & Filtering**

In [0]:
# 3. Filter transactions where TotalPrice > 40000 .
dfRet.filter(dfRet.TotalPrice > 40_000).show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionData|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+



In [0]:
# 4. Get unique cities from the dataset.
dfRet.select("City").distinct().show()

+---------+
|     City|
+---------+
|Bangalore|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



In [0]:
# 5. Find all transactions from "Delhi" using .filter() and .where() .
dfRet.filter(dfRet.City == "Delhi").show()
dfRet.where(dfRet.City == "Delhi").show()


+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+
|TransactionID|Customer| City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionData|
+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+
|        T1004|    Zoya|Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|
|        T1006|   Farah|Delhi|  Mouse|Electronics|       3|     1000|      3000|     2024-02-18|
+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+

+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+
|TransactionID|Customer| City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionData|
+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+
|        T1004|    Zoya|Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|
|        T1006|   Farah|Delhi

# **Data Manipulation**

In [0]:
# 6. Add a column DiscountedPrice = TotalPrice - 10%.
dfRet.withColumn("DiscountedPrice", dfRet.TotalPrice - (dfRet.TotalPrice * 0.10)).show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+---------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionData|DiscountedPrice|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+---------------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|        63000.0|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        54000.0|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|        13500.0|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|        18000.0|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|        45000.0|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3| 

In [0]:
# 7. Rename TransactionDate to TxnDate .
dfRet = dfRet.withColumnRenamed("TransactionData", "TxnDate")
dfRet.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|2024-01-15|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|2024-01-20|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|2024-02-10|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|2024-02-12|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|2024-02-15|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|2024-02-18|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+



In [0]:
# 8. Drop the column UnitPrice .
dfRet.drop("UnitPrice").show()

+-------------+--------+---------+-------+-----------+--------+----------+----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|TotalPrice|   TxnDate|
+-------------+--------+---------+-------+-----------+--------+----------+----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|     70000|2024-01-15|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|     60000|2024-01-20|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|     15000|2024-02-10|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     20000|2024-02-12|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|     50000|2024-02-15|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|      3000|2024-02-18|
+-------------+--------+---------+-------+-----------+--------+----------+----------+



# **Aggregations**

In [0]:
# 9. Get total sales by city.
dfRet.groupBy("City").agg(
    F.sum("TotalPrice").alias("TotalSales")
).show()

+---------+----------+
|     City|TotalSales|
+---------+----------+
|Bangalore|     60000|
|   Mumbai|    120000|
|    Delhi|     23000|
|Hyderabad|     15000|
+---------+----------+



In [0]:
# 10. Get average unit price by category.
dfRet.groupBy("Category").agg(
    F.round(F.mean("UnitPrice"), 2).alias("AverageUnitPrice")
).show()

+-----------+----------------+
|   Category|AverageUnitPrice|
+-----------+----------------+
|Electronics|         37750.0|
|  Furniture|         10000.0|
+-----------+----------------+



In [0]:
# 11. Count of transactions grouped by PaymentMode.
dfRet.groupBy("PaymentMode").agg(
    F.count("*").alias("TransactionCount")
).show()

+----------+----------------+
|   TxnDate|TransactionCount|
+----------+----------------+
|2024-02-15|               1|
|2024-02-10|               1|
|2024-02-12|               1|
|2024-01-15|               1|
|2024-02-18|               1|
|2024-01-20|               1|
+----------+----------------+



# **Window Functions**

In [0]:
# 12. Use a window partitioned by City to rank transactions by TotalPrice .
win = W.partitionBy("City").orderBy(F.desc("TotalPrice"))
dfRet.withColumn("Rank", F.rank().over(win)).show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+----+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|Rank|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+----+
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|2024-01-20|   1|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|2024-02-12|   1|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|2024-02-18|   2|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|2024-02-10|   1|
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|2024-01-15|   1|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|2024-02-15|   2|
+-------------+--------+---------+-------+-----------+--------+---------+----------+-------

In [0]:
# 13. Use lag function to get previous transaction amount per city.
win2 = W.partitionBy("City").orderBy("TotalPrice")

dfRet.withColumn("PreviousTransaction", F.lag("TotalPrice", 1, 0).over(win2)).show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-------------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|PreviousTransaction|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-------------------+
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|2024-01-20|                  0|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|2024-02-18|                  0|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|2024-02-12|               3000|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|2024-02-10|                  0|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|2024-02-15|                  0|
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000

# **Joins**

In [0]:
# 14. Create a second DataFrame city_region :
data = [("Mumbai", "West"), ("Delhi", "North"), ("Banglore", "South"), ("Hyderabad", "South")]
columns = ["City", "Region"]
city_region = spark.createDataFrame(data, columns)
city_region.show()

+---------+------+
|     City|Region|
+---------+------+
|   Mumbai|  West|
|    Delhi| North|
| Banglore| South|
|Hyderabad| South|
+---------+------+



In [0]:
# 15. Join with main DataFrame and group total sales by Region.
dfJoined = dfRet.join(city_region, on="City", how="inner")
dfJoined.groupBy("Region").agg(
    F.sum("TotalPrice").alias("TotalSales")
).show()

+------+----------+
|Region|TotalSales|
+------+----------+
|  West|    120000|
| North|     23000|
| South|     15000|
+------+----------+



# **Nulls and Data Cleaning**

In [0]:
# 16. Introduce some nulls and replace them with default values.
dfNulls = dfRet.withColumn("Category", F.when(F.col("Category") == "Furniture", None).otherwise(F.col("Category")))

dfNulls.fillna("UnknownCategory", subset="Category").show()

+-------------+--------+---------+-------+---------------+--------+---------+----------+----------+
|TransactionID|Customer|     City|Product|       Category|Quantity|UnitPrice|TotalPrice|   TxnDate|
+-------------+--------+---------+-------+---------------+--------+---------+----------+----------+
|        T1001|     Ali|   Mumbai| Laptop|    Electronics|       1|    70000|     70000|2024-01-15|
|        T1002|    Neha|Bangalore| Tablet|    Electronics|       2|    30000|     60000|2024-01-20|
|        T1003|    Ravi|Hyderabad|   Desk|UnknownCategory|       1|    15000|     15000|2024-02-10|
|        T1004|    Zoya|    Delhi|  Chair|UnknownCategory|       4|     5000|     20000|2024-02-12|
|        T1005|   Karan|   Mumbai|  Phone|    Electronics|       1|    50000|     50000|2024-02-15|
|        T1006|   Farah|    Delhi|  Mouse|    Electronics|       3|     1000|      3000|2024-02-18|
+-------------+--------+---------+-------+---------------+--------+---------+----------+----------+


In [0]:
# 17. Drop rows where Quantity is null.
dfNulls.dropna(subset="Quantity")

In [0]:
# 18. Fill null PaymentMode with "Unknown".
dfNulls.fillna("UnKnown", subset="PaymentMode")

# **Custom Functions**

In [0]:
# 19. Write a UDF to label orders:
def labelOrder(amount):
    if amount> 50_000:
        return "High"
    if amount >= 30_000:
        return "Medium"
    return "Low"

labeler = F.udf(labelOrder, StringType())
dfRet.withColumn("Label", labeler(dfRet.TotalPrice)).show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate| Label|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|2024-01-15|  High|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|2024-01-20|  High|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|2024-02-10|   Low|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|2024-02-12|   Low|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|2024-02-15|Medium|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|2024-02-18|   Low|
+-------------+--------+---------+-------+-----------+--------+---------+

# **Date & Time**

In [0]:
# 20. Extract year, month, and day from TxnDate .
dfRet.select("TxnDate") \
  .withColumn("Year", F.year(F.col("TxnDate"))) \
  .withColumn("Month", F.month(F.col("TxnDate"))) \
  .withColumn("Day", F.dayofmonth(F.col("TxnDate"))) \
  .show()

+----------+----+-----+---+
|   TxnDate|Year|Month|Day|
+----------+----+-----+---+
|2024-01-15|2024|    1| 15|
|2024-01-20|2024|    1| 20|
|2024-02-10|2024|    2| 10|
|2024-02-12|2024|    2| 12|
|2024-02-15|2024|    2| 15|
|2024-02-18|2024|    2| 18|
+----------+----+-----+---+



In [0]:
# 21. Filter transactions that happened in February.
dfRet.filter(F.month(F.col("TxnDate")) == 2).show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|2024-02-10|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|2024-02-12|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|2024-02-15|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|2024-02-18|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+



# **Union & Duplicate Handling**

In [0]:
# 22. Duplicate the DataFrame using union() and remove duplicates.
dfNew = dfRet.union(dfRet)
dfNew = dfNew.dropDuplicates()
dfNew.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|2024-01-15|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|2024-02-10|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|2024-01-20|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|2024-02-18|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|2024-02-12|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|2024-02-15|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+

