In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
# Start (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("RetailTransactions")
    .getOrCreate()
)

# First pass: read the CSV with a header row and let Spark infer column types,
# then print the inferred schema for inspection.
df = spark.read.csv("retail_data.csv", header=True, inferSchema=True)
df.printSchema()

root
 |-- TransactionID: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- TotalPrice: integer (nullable = true)
 |-- TransactionDate: date (nullable = true)
 |-- PaymentMode: string (nullable = true)



In [3]:
# Second pass: read every column as a string (no inference) and cast explicitly,
# so the resulting schema is fixed by this cell rather than by the file contents.
df_raw = spark.read.csv("retail_data.csv", header=True, inferSchema=False)

df_casted = df_raw.select(
    "TransactionID",
    "Customer",
    "City",
    "Product",
    "Category",
    # The three numeric columns are cast to int; column names are preserved.
    *[col(numeric_name).cast("int") for numeric_name in ("Quantity", "UnitPrice", "TotalPrice")],
    to_date(col("TransactionDate"), "yyyy-MM-dd").alias("TransactionDate"),
    "PaymentMode",
)

Data Exploration & Filtering

In [4]:
# Show only transactions whose TotalPrice is strictly above 40000.
df_casted.where(df_casted["TotalPrice"] > 40000).show()


+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|       Card|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+



In [5]:
# List each city that appears in the data exactly once.
df_casted.select(df_casted["City"]).distinct().show()

+---------+
|     City|
+---------+
|Bangalore|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



In [6]:
# filter() and where() are aliases; each call below prints the Delhi rows,
# so the same table is displayed twice.
df_casted.filter(df_casted["City"] == "Delhi").show()
df_casted.where(df_casted["City"] == "Delhi").show()

+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer| City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1004|    Zoya|Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card|
|        T1006|   Farah|Delhi|  Mouse|Electronics|       3|     1000|      3000|     2024-02-18|       Cash|
+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+-----------+

+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer| City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1004|   

Data Manipulation

In [7]:
# Keep every existing column and append DiscountedPrice = 90% of TotalPrice.
df_discounted = df_casted.select(
    "*",
    (col("TotalPrice") * 0.9).alias("DiscountedPrice"),
)
df_discounted.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+---------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|DiscountedPrice|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+---------------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card|        63000.0|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI|        54000.0|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|Net Banking|        13500.0|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card|        18000.0|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2024-02-

In [8]:
# Shorten the date column's name; all other columns are unchanged.
df_renamed = df_discounted.withColumnRenamed(
    existing="TransactionDate",
    new="TxnDate",
)


In [9]:
# Keep every column except UnitPrice, preserving the existing column order.
df_dropped = df_renamed.select(
    [kept for kept in df_renamed.columns if kept != "UnitPrice"]
)

Aggregations

In [10]:
# Total sales per city. `sum` here is pyspark.sql.functions.sum (brought in by
# the star import at the top), not Python's built-in sum.
(
    df_dropped
    .groupBy(col("City"))
    .agg(sum(col("TotalPrice")).alias("TotalSales"))
    .show()
)


+---------+----------+
|     City|TotalSales|
+---------+----------+
|Bangalore|     60000|
|   Mumbai|    120000|
|    Delhi|     23000|
|Hyderabad|     15000|
+---------+----------+



In [11]:
# Average unit price within each product category.
(
    df_casted
    .groupBy(col("Category"))
    .agg(avg(col("UnitPrice")).alias("AvgUnitPrice"))
    .show()
)


+-----------+------------+
|   Category|AvgUnitPrice|
+-----------+------------+
|Electronics|     37750.0|
|  Furniture|     10000.0|
+-----------+------------+



In [12]:
# Number of transactions for each payment mode.
(
    df_casted
    .groupBy(col("PaymentMode"))
    .count()
    .show()
)


+-----------+-----+
|PaymentMode|count|
+-----------+-----+
|Net Banking|    1|
|       Card|    3|
|       Cash|    1|
|        UPI|    1|
+-----------+-----+



Window Functions

In [13]:
# Window over each city's transactions, highest TotalPrice first.
windowSpec = Window.partitionBy(col("City")).orderBy(desc("TotalPrice"))

# Rank 1 is the largest transaction in its city.
df_ranked = df_casted.withColumn("Rank", rank().over(windowSpec))
df_ranked.select(
    "TransactionID",
    "City",
    "TotalPrice",
    "Rank",
).show()

+-------------+---------+----------+----+
|TransactionID|     City|TotalPrice|Rank|
+-------------+---------+----------+----+
|        T1002|Bangalore|     60000|   1|
|        T1004|    Delhi|     20000|   1|
|        T1006|    Delhi|      3000|   2|
|        T1003|Hyderabad|     15000|   1|
|        T1001|   Mumbai|     70000|   1|
|        T1005|   Mumbai|     50000|   2|
+-------------+---------+----------+----+



In [14]:
# PrevPrice is the TotalPrice of the preceding row in windowSpec (same city,
# ordered by TotalPrice descending); the first row of each city gets NULL.
df_lagged = df_casted.withColumn(
    "PrevPrice",
    lag(col("TotalPrice"), 1).over(windowSpec),
)
df_lagged.select(
    "TransactionID",
    "City",
    "TotalPrice",
    "PrevPrice",
).show()


+-------------+---------+----------+---------+
|TransactionID|     City|TotalPrice|PrevPrice|
+-------------+---------+----------+---------+
|        T1002|Bangalore|     60000|     NULL|
|        T1004|    Delhi|     20000|     NULL|
|        T1006|    Delhi|      3000|    20000|
|        T1003|Hyderabad|     15000|     NULL|
|        T1001|   Mumbai|     70000|     NULL|
|        T1005|   Mumbai|     50000|    70000|
+-------------+---------+----------+---------+



Joins

In [15]:
# City -> region lookup, materialised as (City, Region) tuples in insertion order.
region_data = list(
    {
        "Mumbai": "West",
        "Delhi": "North",
        "Bangalore": "South",
        "Hyderabad": "South",
    }.items()
)
region_df = spark.createDataFrame(region_data, schema=["City", "Region"])


In [16]:
# Left join keeps every transaction even if its city has no region entry.
joined_df = df_casted.join(region_df, "City", "left")

# Total sales per region (`sum` is the pyspark.sql.functions aggregate).
(
    joined_df
    .groupBy(col("Region"))
    .agg(sum(col("TotalPrice")).alias("RegionSales"))
    .show()
)


+------+-----------+
|Region|RegionSales|
+------+-----------+
| South|      75000|
|  West|     120000|
| North|      23000|
+------+-----------+



Nulls and Data Cleaning

In [17]:
# Blank out Quantity for transaction T1003 to simulate a missing value.
df_null = df_casted.withColumn(
    "Quantity",
    when(col("TransactionID") == "T1003", None).otherwise(col("Quantity")),
)

# Replace the missing Quantity with a default of 1.
df_filled = df_null.na.fill(1, subset=["Quantity"])
df_filled.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|Net Banking|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|       Card|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|     2024-02

In [18]:
# Alternative to filling: discard any row whose Quantity is null.
df_dropped_nulls = df_null.na.drop(subset=["Quantity"])
df_dropped_nulls.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|       Card|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|     2024-02-18|       Cash|
+-------------+--------+---------+-------+-----------+--------+---------+----------+------------

In [19]:
# Blank out PaymentMode for transaction T1005 to simulate a missing value.
df_null_payment = df_casted.withColumn(
    "PaymentMode",
    when(col("TransactionID") == "T1005", None).otherwise(col("PaymentMode")),
)

# Replace the missing PaymentMode with the placeholder "Unknown".
df_filled_payment = df_null_payment.na.fill("Unknown", subset=["PaymentMode"])
df_filled_payment.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|Net Banking|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|    Unknown|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|     2024-02

Custom Functions

In [20]:
from pyspark.sql.functions import udf

def label_order(amount):
    """Classify an order total as "High", "Medium", or "Low".

    Args:
        amount: Order total to classify. May be None when the source value
            is null (e.g. a TotalPrice that could not be cast to int).

    Returns:
        "High" when amount > 50000, "Medium" when 30000 <= amount <= 50000,
        "Low" for anything smaller, and None when amount is None.
    """
    # Comparing None with an int raises TypeError in Python 3, so a null
    # input is passed through as a null label instead of failing the job.
    if amount is None:
        return None
    if amount > 50000:
        return "High"
    if amount >= 30000:
        return "Medium"
    return "Low"

# Register label_order as a Spark UDF that produces a string column.
label_udf = udf(label_order, returnType=StringType())

# Label every transaction by its TotalPrice and show the result.
df_labeled = df_casted.withColumn("OrderLabel", label_udf("TotalPrice"))
df_labeled.select(
    "TransactionID",
    "TotalPrice",
    "OrderLabel",
).show()

+-------------+----------+----------+
|TransactionID|TotalPrice|OrderLabel|
+-------------+----------+----------+
|        T1001|     70000|      High|
|        T1002|     60000|      High|
|        T1003|     15000|       Low|
|        T1004|     20000|       Low|
|        T1005|     50000|    Medium|
|        T1006|      3000|       Low|
+-------------+----------+----------+



Date & Time

In [21]:
# Split TransactionDate into separate Year / Month / Day columns.
df_date_parts = (
    df_casted
    .withColumn("Year", year(col("TransactionDate")))
    .withColumn("Month", month(col("TransactionDate")))
    .withColumn("Day", dayofmonth(col("TransactionDate")))
)
df_date_parts.select(
    "TransactionID",
    "Year",
    "Month",
    "Day",
).show()


+-------------+----+-----+---+
|TransactionID|Year|Month|Day|
+-------------+----+-----+---+
|        T1001|2024|    1| 15|
|        T1002|2024|    1| 20|
|        T1003|2024|    2| 10|
|        T1004|2024|    2| 12|
|        T1005|2024|    2| 15|
|        T1006|2024|    2| 18|
+-------------+----+-----+---+



In [22]:
# Keep only transactions dated in February (month number 2).
df_feb = df_date_parts.where(month(col("TransactionDate")) == 2)
df_feb.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+----+-----+---+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|Year|Month|Day|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+----+-----+---+
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|Net Banking|2024|    2| 10|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card|2024|    2| 12|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|       Card|2024|    2| 15|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|     2024-02-18|       Cash|2024|    2| 18|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+----

Union & Duplicate Handling

In [23]:
# Union the frame with itself to create exact duplicate rows, then remove them.
df_union = df_casted.union(df_casted)
df_deduped = df_union.dropDuplicates()

# Row counts before the union, after it, and after deduplication.
print(f"Original count: {df_casted.count()}")
print(f"After union: {df_union.count()}")
print(f"After deduplication: {df_deduped.count()}")

Original count: 6
After union: 12
After deduplication: 6
