In [1]:
from pyspark.sql import SparkSession

# Configure a builder for an application named "Retail-sales_analysis" on the
# local[*] master, then reuse an existing session or create a new one.
session_builder = SparkSession.builder.appName("Retail-sales_analysis").master("local[*]")
spark = session_builder.getOrCreate()


In [2]:
# Echo the session object so the notebook renders it as the cell output.
spark

In [33]:
# Read the retail sales CSV: the first row supplies the column names and the
# column types are inferred from the data.
# NOTE(review): the path is an absolute, machine-specific location; consider
# moving it to a configurable constant.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file:///D:/Apache_spark/datasets/retail_sales_dataset.csv")
)

In [4]:
# Print the inferred column names, types and nullability of the raw load.
df.printSchema()

root
 |-- Transaction ID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Customer ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Product Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price per Unit: integer (nullable = true)
 |-- Total Amount: integer (nullable = true)



In [5]:
# Number of partitions in the DataFrame's underlying RDD.
df.rdd.getNumPartitions()

1

In [8]:
# Preview the first 10 rows with the original (space-containing) column names.
df.show(10)

+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+
|Transaction ID|      Date|Customer ID|Gender|Age|Product Category|Quantity|Price per Unit|Total Amount|
+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+
|             1|2023-11-24|    CUST001|  Male| 34|          Beauty|       3|            50|         150|
|             2|2023-02-27|    CUST002|Female| 26|        Clothing|       2|           500|        1000|
|             3|2023-01-13|    CUST003|  Male| 50|     Electronics|       1|            30|          30|
|             4|2023-05-21|    CUST004|  Male| 37|        Clothing|       1|           500|         500|
|             5|2023-05-06|    CUST005|  Male| 30|          Beauty|       2|            50|         100|
|             6|2023-04-25|    CUST006|Female| 45|          Beauty|       1|            30|          30|
|             7|2023-03-13|    CUST007|  Male| 46|     

In [34]:
# Replace the header names with underscore-separated identifiers. toDF assigns
# names positionally, so this list must stay in the same order as the columns.
renamed_columns = [
    "Transaction_Id",
    "Date",
    "Customer_Id",
    "Gender",
    "Age",
    "Product_Category",
    "Quantity",
    "Price_Per_unit",
    "Total_Amount",
]
df = df.toDF(*renamed_columns)

In [35]:
# Confirm the renamed columns kept their inferred types.
df.printSchema()

root
 |-- Transaction_Id: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Customer_Id: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Product_Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price_Per_unit: integer (nullable = true)
 |-- Total_Amount: integer (nullable = true)



In [11]:
# Same schema as a StructType object (the programmatic form of printSchema).
df.schema

StructType([StructField('Transaction_Id', IntegerType(), True), StructField('Date', DateType(), True), StructField('Customer_Id', StringType(), True), StructField('Gender', StringType(), True), StructField('Age', IntegerType(), True), StructField('Product_Category', StringType(), True), StructField('Quantity', IntegerType(), True), StructField('Price_Per_unit', IntegerType(), True), StructField('Total_Amount', IntegerType(), True)])

In [36]:
# 1. Date transformations: derive calendar parts from the Date column.
from pyspark.sql.functions import year, month, dayofmonth

# Each derived column is an integer extracted from "Date"; the three
# additions are independent of one another, so they are chained.
df = (
    df
    .withColumn("Year", year("Date"))
    .withColumn("Month", month("Date"))
    .withColumn("Day", dayofmonth("Date"))
)


In [13]:
# Fetch the first 10 rows to the driver as a list of Row objects.
df.take(10)

[Row(Transaction_Id=1, Date=datetime.date(2023, 11, 24), Customer_Id='CUST001', Gender='Male', Age=34, Product_Category='Beauty', Quantity=3, Price_Per_unit=50, Total_Amount=150, Year=2023, Month=11, Day=24),
 Row(Transaction_Id=2, Date=datetime.date(2023, 2, 27), Customer_Id='CUST002', Gender='Female', Age=26, Product_Category='Clothing', Quantity=2, Price_Per_unit=500, Total_Amount=1000, Year=2023, Month=2, Day=27),
 Row(Transaction_Id=3, Date=datetime.date(2023, 1, 13), Customer_Id='CUST003', Gender='Male', Age=50, Product_Category='Electronics', Quantity=1, Price_Per_unit=30, Total_Amount=30, Year=2023, Month=1, Day=13),
 Row(Transaction_Id=4, Date=datetime.date(2023, 5, 21), Customer_Id='CUST004', Gender='Male', Age=37, Product_Category='Clothing', Quantity=1, Price_Per_unit=500, Total_Amount=500, Year=2023, Month=5, Day=21),
 Row(Transaction_Id=5, Date=datetime.date(2023, 5, 6), Customer_Id='CUST005', Gender='Male', Age=30, Product_Category='Beauty', Quantity=2, Price_Per_unit=50

In [33]:
# Alternative spelling of the 10-row preview: restrict the DataFrame to 10 rows,
# then collect that result to the driver as a list of Row objects.
df.limit(10).collect()

[Row(Transaction_Id=1, Date=datetime.date(2023, 11, 24), Customer_Id='CUST001', Gender='Male', Age=34, Product_Category='Beauty', Quantity=3, Price_Per_unit=50, Total_Amount=150, Year=2023, Month=11, Day=24),
 Row(Transaction_Id=2, Date=datetime.date(2023, 2, 27), Customer_Id='CUST002', Gender='Female', Age=26, Product_Category='Clothing', Quantity=2, Price_Per_unit=500, Total_Amount=1000, Year=2023, Month=2, Day=27),
 Row(Transaction_Id=3, Date=datetime.date(2023, 1, 13), Customer_Id='CUST003', Gender='Male', Age=50, Product_Category='Electronics', Quantity=1, Price_Per_unit=30, Total_Amount=30, Year=2023, Month=1, Day=13),
 Row(Transaction_Id=4, Date=datetime.date(2023, 5, 21), Customer_Id='CUST004', Gender='Male', Age=37, Product_Category='Clothing', Quantity=1, Price_Per_unit=500, Total_Amount=500, Year=2023, Month=5, Day=21),
 Row(Transaction_Id=5, Date=datetime.date(2023, 5, 6), Customer_Id='CUST005', Gender='Male', Age=30, Product_Category='Beauty', Quantity=2, Price_Per_unit=50

In [37]:
from pyspark.sql.functions import when, col

# Bucket each customer by age. Branches are tested top to bottom and the first
# match wins, so each branch only needs the upper bound of its bucket:
#   Age < 20        -> "Teenage"
#   20 <= Age <= 40 -> "Yougnger_Adult"
#   40 <  Age <  60 -> "Middle_Aged"
#   anything else   -> "Senior" (this includes rows where Age is null)
# The column is referenced with its exact name ("Age") everywhere, so the
# expression does not depend on case-insensitive column resolution.
# NOTE(review): the label "Yougnger_Adult" is misspelled; it is kept unchanged
# here because it is data that appears in the saved outputs and aggregations.
age = col("Age")
df = df.withColumn(
    "Age_Group",
    when(age < 20, "Teenage")
    .when(age <= 40, "Yougnger_Adult")
    .when(age < 60, "Middle_Aged")
    .otherwise("Senior"),
)


In [38]:
# Show the default number of rows to check the new Age_Group column.
df.show()

+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+
|Transaction_Id|      Date|Customer_Id|Gender|Age|Product_Category|Quantity|Price_Per_unit|Total_Amount|Year|Month|Day|     Age_Group|
+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+
|             1|2023-11-24|    CUST001|  Male| 34|          Beauty|       3|            50|         150|2023|   11| 24|Yougnger_Adult|
|             2|2023-02-27|    CUST002|Female| 26|        Clothing|       2|           500|        1000|2023|    2| 27|Yougnger_Adult|
|             3|2023-01-13|    CUST003|  Male| 50|     Electronics|       1|            30|          30|2023|    1| 13|   Middle_Aged|
|             4|2023-05-21|    CUST004|  Male| 37|        Clothing|       1|           500|         500|2023|    5| 21|Yougnger_Adult|
|             5|2023-05-06|    CUST005|  Male| 30|     

In [39]:
### **6. Price Normalization**
from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Min-max scale Price_Per_unit into [0, 1]:
#   (price - min_price) / (max_price - min_price)
# Both bounds come from a single aggregation instead of two separate collects,
# and Spark's min/max are reached through the F namespace so that Python's
# builtin min() and max() are not shadowed by this cell.
price_bounds = df.agg(
    F.min("Price_Per_unit").alias("min_price"),
    F.max("Price_Per_unit").alias("max_price"),
).first()
min_price = price_bounds["min_price"]
max_price = price_bounds["max_price"]

if min_price is None or max_price == min_price:
    # No non-null prices, or every price identical: the scale is undefined, so
    # store nulls rather than failing on None arithmetic or dividing by zero.
    price_normalized = F.lit(None).cast("double")
else:
    price_normalized = (col("Price_Per_unit") - min_price) / (max_price - min_price)

df = df.withColumn("Price_Normalized", price_normalized)

df.show(10, truncate=False)

+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+--------------------+
|Transaction_Id|Date      |Customer_Id|Gender|Age|Product_Category|Quantity|Price_Per_unit|Total_Amount|Year|Month|Day|Age_Group     |Price_Normalized    |
+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+--------------------+
|1             |2023-11-24|CUST001    |Male  |34 |Beauty          |3       |50            |150         |2023|11   |24 |Yougnger_Adult|0.05263157894736842 |
|2             |2023-02-27|CUST002    |Female|26 |Clothing        |2       |500           |1000        |2023|2    |27 |Yougnger_Adult|1.0                 |
|3             |2023-01-13|CUST003    |Male  |50 |Electronics     |1       |30            |30          |2023|1    |13 |Middle_Aged   |0.010526315789473684|
|4             |2023-05-21|CUST004    |Male  |37 |Clothing      

In [41]:
# Preview 10 rows without truncating long cell values.
df.show(10, truncate=False)

+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+--------------------+
|Transaction_Id|Date      |Customer_Id|Gender|Age|Product_Category|Quantity|Price_Per_unit|Total_Amount|Year|Month|Day|Age_Group     |Price_Normalized    |
+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+--------------------+
|1             |2023-11-24|CUST001    |Male  |34 |Beauty          |3       |50            |150         |2023|11   |24 |Yougnger_Adult|0.05263157894736842 |
|2             |2023-02-27|CUST002    |Female|26 |Clothing        |2       |500           |1000        |2023|2    |27 |Yougnger_Adult|1.0                 |
|3             |2023-01-13|CUST003    |Male  |50 |Electronics     |1       |30            |30          |2023|1    |13 |Middle_Aged   |0.010526315789473684|
|4             |2023-05-21|CUST004    |Male  |37 |Clothing      

In [40]:
from pyspark.sql.functions import round, col

# Overwrite Price_Normalized with its value rounded to two decimal places.
# Note that `round` here is Spark's column function, not the Python builtin.
price_normalized_2dp = round(col("Price_Normalized"), 2)
df = df.withColumn("Price_Normalized", price_normalized_2dp)

In [41]:
# Preview the first 10 rows after rounding Price_Normalized.
df.show(10)

+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+----------------+
|Transaction_Id|      Date|Customer_Id|Gender|Age|Product_Category|Quantity|Price_Per_unit|Total_Amount|Year|Month|Day|     Age_Group|Price_Normalized|
+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+----------------+
|             1|2023-11-24|    CUST001|  Male| 34|          Beauty|       3|            50|         150|2023|   11| 24|Yougnger_Adult|            0.05|
|             2|2023-02-27|    CUST002|Female| 26|        Clothing|       2|           500|        1000|2023|    2| 27|Yougnger_Adult|             1.0|
|             3|2023-01-13|    CUST003|  Male| 50|     Electronics|       1|            30|          30|2023|    1| 13|   Middle_Aged|            0.01|
|             4|2023-05-21|    CUST004|  Male| 37|        Clothing|       1|           5

In [20]:
# Drop a column named "Rounded_Value". No cell visible in this notebook creates
# such a column, and DataFrame.drop ignores names that are not in the schema,
# so as written this leaves df unchanged.
# NOTE(review): looks like a leftover from an earlier draft — consider removing.
df = df.drop("Rounded_Value")

In [21]:
# Preview the first 10 rows after the drop call.
df.show(10)

+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+----------------+
|Transaction_Id|      Date|Customer_Id|Gender|Age|Product_Category|Quantity|Price_Per_unit|Total_Amount|Year|Month|Day|     Age_Group|Price_Normalized|
+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+----------------+
|             1|2023-11-24|    CUST001|  Male| 34|          Beauty|       3|            50|         150|2023|   11| 24|Yougnger_Adult|            0.05|
|             2|2023-02-27|    CUST002|Female| 26|        Clothing|       2|           500|        1000|2023|    2| 27|Yougnger_Adult|             1.0|
|             3|2023-01-13|    CUST003|  Male| 50|     Electronics|       1|            30|          30|2023|    1| 13|   Middle_Aged|            0.01|
|             4|2023-05-21|    CUST004|  Male| 37|        Clothing|       1|           5

In [42]:
from pyspark.sql.functions import col

# Recompute Total_Amount as Quantity * Price_Per_unit, replacing the value that
# was loaded from the CSV. The price column is referenced by its exact name
# ("Price_Per_unit") so the lookup does not rely on case-insensitive resolution.
df = df.withColumn("Total_Amount", col("Quantity") * col("Price_Per_unit"))

df.show(10)


+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+----------------+
|Transaction_Id|      Date|Customer_Id|Gender|Age|Product_Category|Quantity|Price_Per_unit|Total_Amount|Year|Month|Day|     Age_Group|Price_Normalized|
+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+----------------+
|             1|2023-11-24|    CUST001|  Male| 34|          Beauty|       3|            50|         150|2023|   11| 24|Yougnger_Adult|            0.05|
|             2|2023-02-27|    CUST002|Female| 26|        Clothing|       2|           500|        1000|2023|    2| 27|Yougnger_Adult|             1.0|
|             3|2023-01-13|    CUST003|  Male| 50|     Electronics|       1|            30|          30|2023|    1| 13|   Middle_Aged|            0.01|
|             4|2023-05-21|    CUST004|  Male| 37|        Clothing|       1|           5

In [27]:
from pyspark.sql.functions import col

# Count how many different customer ids occur in the data.
customer_ids = df.select(col("Customer_Id")).distinct()
customer_ids.count()

1000

In [43]:
# Window functions: 3-row moving average of sales per customer

from pyspark.sql.window import Window
from pyspark.sql.functions import avg

# Frame = the current row plus the two rows before it, within one customer's
# transactions ordered by Date.
window_spec = (
    Window
    .partitionBy("Customer_Id")
    .orderBy("Date")
    .rowsBetween(-2, Window.currentRow)
)
moving_avg_sales = avg("Total_Amount").over(window_spec)
df = df.withColumn("Moving_Avg_Sales", moving_avg_sales)

df.show(10)


+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+----------------+----------------+
|Transaction_Id|      Date|Customer_Id|Gender|Age|Product_Category|Quantity|Price_Per_unit|Total_Amount|Year|Month|Day|     Age_Group|Price_Normalized|Moving_Avg_Sales|
+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+----------------+----------------+
|             1|2023-11-24|    CUST001|  Male| 34|          Beauty|       3|            50|         150|2023|   11| 24|Yougnger_Adult|            0.05|           150.0|
|             2|2023-02-27|    CUST002|Female| 26|        Clothing|       2|           500|        1000|2023|    2| 27|Yougnger_Adult|             1.0|          1000.0|
|             3|2023-01-13|    CUST003|  Male| 50|     Electronics|       1|            30|          30|2023|    1| 13|   Middle_Aged|            0.01|    

In [16]:
# Preview 10 rows without truncating long cell values.
df.show(10, truncate=False)

+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+----------------+----------------+
|Transaction_Id|Date      |Customer_Id|Gender|Age|Product_Category|Quantity|Price_Per_unit|Total_Amount|Year|Month|Day|Age_Group     |Price_Normalized|Moving_Avg_Sales|
+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+----------------+----------------+
|1             |2023-11-24|CUST001    |Male  |34 |Beauty          |3       |50            |150         |2023|11   |24 |Yougnger_Adult|0.05            |150.0           |
|2             |2023-02-27|CUST002    |Female|26 |Clothing        |2       |500           |1000        |2023|2    |27 |Yougnger_Adult|1.0             |1000.0          |
|3             |2023-01-13|CUST003    |Male  |50 |Electronics     |1       |30            |30          |2023|1    |13 |Middle_Aged   |0.01            |30.0

In [44]:
# Rank each customer's transactions by amount, largest first

from pyspark.sql.functions import dense_rank

# dense_rank gives tied amounts the same rank and leaves no gaps after ties.
amount_descending = df["Total_Amount"].desc()
window_spec = Window.partitionBy("Customer_Id").orderBy(amount_descending)
transaction_rank = dense_rank().over(window_spec)
df = df.withColumn("Transaction_Rank", transaction_rank)

df.show(10)

+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+----------------+----------------+----------------+
|Transaction_Id|      Date|Customer_Id|Gender|Age|Product_Category|Quantity|Price_Per_unit|Total_Amount|Year|Month|Day|     Age_Group|Price_Normalized|Moving_Avg_Sales|Transaction_Rank|
+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+----------------+----------------+----------------+
|             1|2023-11-24|    CUST001|  Male| 34|          Beauty|       3|            50|         150|2023|   11| 24|Yougnger_Adult|            0.05|           150.0|               1|
|             2|2023-02-27|    CUST002|Female| 26|        Clothing|       2|           500|        1000|2023|    2| 27|Yougnger_Adult|             1.0|          1000.0|               1|
|             3|2023-01-13|    CUST003|  Male| 50|     Electronics|   

In [45]:
### **Window Functions: rolling 30-day average of sales per customer**

from pyspark.sql.window import Window
from pyspark.sql.functions import avg, col, datediff, lit, to_date

# rangeBetween offsets are measured in units of the ORDER BY expression, so a
# "last 30 days" frame has to be ordered by a day count, not by Quantity.
# day_number is the whole number of days from 1970-01-01 to the transaction date.
day_number = datediff(col("Date"), to_date(lit("1970-01-01")))

# Frame = this customer's transactions dated from 30 days before the current
# row's date up to and including the current row's date.
window_spec = (
    Window
    .partitionBy("Customer_Id")
    .orderBy(day_number)
    .rangeBetween(-30, Window.currentRow)
)

# withColumn replaces any existing column that is already named Moving_Avg_Sales.
df = df.withColumn("Moving_Avg_Sales", avg("Total_Amount").over(window_spec))

df.show(10)

+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+----------------+----------------+----------------+
|Transaction_Id|      Date|Customer_Id|Gender|Age|Product_Category|Quantity|Price_Per_unit|Total_Amount|Year|Month|Day|     Age_Group|Price_Normalized|Moving_Avg_Sales|Transaction_Rank|
+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+----+-----+---+--------------+----------------+----------------+----------------+
|             1|2023-11-24|    CUST001|  Male| 34|          Beauty|       3|            50|         150|2023|   11| 24|Yougnger_Adult|            0.05|           150.0|               1|
|             2|2023-02-27|    CUST002|Female| 26|        Clothing|       2|           500|        1000|2023|    2| 27|Yougnger_Adult|             1.0|          1000.0|               1|
|             3|2023-01-13|    CUST003|  Male| 50|     Electronics|   

In [47]:
# Top-Selling Product Categories (broken down by age group)

from pyspark.sql import functions as F

# One row per (Product_Category, Age_Group) pair, highest revenue first.
# Aggregates are reached through the F namespace so Python's builtin sum() is
# not shadowed, and revenue is summed as double rather than single-precision
# float so large totals are not rounded.
df_grouped = (
    df.groupBy("Product_Category", "Age_Group")
    .agg(
        F.sum(F.col("Total_Amount").cast("double")).alias("Total_Revenue"),
        F.avg(F.col("Price_Per_unit")).alias("Avg_Price"),
        F.countDistinct(F.col("Customer_Id")).alias("Unique_Customers"),
        F.count(F.col("Transaction_Id")).alias("Transaction_Count"),
    )
    .orderBy(F.col("Total_Revenue").desc())
)

df_grouped.show()

+----------------+--------------+-------------+------------------+----------------+-----------------+
|Product_Category|     Age_Group|Total_Revenue|         Avg_Price|Unique_Customers|Transaction_Count|
+----------------+--------------+-------------+------------------+----------------+-----------------+
|        Clothing|Yougnger_Adult|      73105.0|184.18367346938774|             147|              147|
|          Beauty|Yougnger_Adult|      69210.0|198.64963503649636|             137|              137|
|     Electronics|   Middle_Aged|      62400.0| 172.5174825174825|             143|              143|
|        Clothing|   Middle_Aged|      60530.0|175.68627450980392|             153|              153|
|     Electronics|Yougnger_Adult|      60495.0|             172.0|             140|              140|
|          Beauty|   Middle_Aged|      59360.0| 182.8861788617886|             123|              123|
|     Electronics|        Senior|      24065.0| 221.3953488372093|              43

In [48]:
# Print the schema of df, including the columns added by the cells above.
df.printSchema()

root
 |-- Transaction_Id: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Customer_Id: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Product_Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price_Per_unit: integer (nullable = true)
 |-- Total_Amount: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Day: integer (nullable = true)
 |-- Age_Group: string (nullable = false)
 |-- Price_Normalized: double (nullable = true)
 |-- Moving_Avg_Sales: double (nullable = true)
 |-- Transaction_Rank: integer (nullable = false)



In [52]:
from pyspark.sql.functions import col , round

# Overwrite Avg_Price with its value rounded to two decimal places
# (`round` is Spark's column function, not the Python builtin).
avg_price_2dp = round(col("Avg_Price"), 2)
df_grouped = df_grouped.withColumn("Avg_Price", avg_price_2dp)

df_grouped.show()


+----------------+--------------+-------------+---------+----------------+-----------------+
|Product_Category|     Age_Group|Total_Revenue|Avg_Price|Unique_Customers|Transaction_Count|
+----------------+--------------+-------------+---------+----------------+-----------------+
|        Clothing|Yougnger_Adult|      73105.0|   184.18|             147|              147|
|          Beauty|Yougnger_Adult|      69210.0|   198.65|             137|              137|
|     Electronics|   Middle_Aged|      62400.0|   172.52|             143|              143|
|        Clothing|   Middle_Aged|      60530.0|   175.69|             153|              153|
|     Electronics|Yougnger_Adult|      60495.0|    172.0|             140|              140|
|          Beauty|   Middle_Aged|      59360.0|   182.89|             123|              123|
|     Electronics|        Senior|      24065.0|    221.4|              43|               43|
|        Clothing|        Senior|      15130.0|   120.37|             

In [53]:
# Shut down the SparkSession; cells run after this need a new session.
spark.stop()