In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql import functions as F
from pyspark import StorageLevel

In [3]:
# Start (or reuse) the Spark session that every later cell works through.
spark = (
    SparkSession.builder
    .appName('retailRadar')
    .getOrCreate()
)

# Run datetime parsing under the LEGACY parser policy; the to_date calls used
# when cleaning the Date column execute with this setting in effect.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

In [4]:
# Explicit schema for retail_data.csv, listed in file order as
# (column name, Spark type, nullable). Only the two ID columns are declared
# non-nullable. Date/Month/Time are read as plain strings here.
_columnSpecs = [
    ("Transaction_ID", StringType(), False),
    ("Customer_ID", StringType(), False),
    ("Name", StringType(), True),
    ("Email", StringType(), True),
    ("Phone", StringType(), True),
    ("Address", StringType(), True),
    ("City", StringType(), True),
    ("State", StringType(), True),
    ("Zipcode", StringType(), True),
    ("Country", StringType(), True),
    ("Age", IntegerType(), True),
    ("Gender", StringType(), True),
    ("Income", StringType(), True),
    ("Customer_Segment", StringType(), True),
    ("Date", StringType(), True),
    ("Year", IntegerType(), True),
    ("Month", StringType(), True),
    ("Time", StringType(), True),
    ("Total_Purchases", IntegerType(), True),
    ("Amount", FloatType(), True),
    ("Total_Amount", FloatType(), True),
    ("Product_Category", StringType(), True),
    ("Product_Brand", StringType(), True),
    ("Product_Type", StringType(), True),
    ("Feedback", StringType(), True),
    ("Shipping_Method", StringType(), True),
    ("Payment_Method", StringType(), True),
    ("Order_Status", StringType(), True),
    ("Ratings", IntegerType(), True),
    ("products", StringType(), True),
]

schema = StructType([
    StructField(columnName, columnType, isNullable)
    for columnName, columnType, isNullable in _columnSpecs
])

In [5]:
# Load the raw retail CSV using the explicit schema (no inference).
# mode=FAILFAST makes the read fail on malformed records, and the literal
# string "NA" is read as null.
rawDf = (
    spark.read
    .format("csv")
    .schema(schema)
    .options(header=True, inferSchema=False, mode="FAILFAST", nullValue="NA")
    .load('../../data/raw/retail_data.csv')
)

# Keep the raw frame around (memory first, spilling to disk) for reuse below.
rawDf.persist(StorageLevel.MEMORY_AND_DISK)

DataFrame[Transaction_ID: string, Customer_ID: string, Name: string, Email: string, Phone: string, Address: string, City: string, State: string, Zipcode: string, Country: string, Age: int, Gender: string, Income: string, Customer_Segment: string, Date: string, Year: int, Month: string, Time: string, Total_Purchases: int, Amount: float, Total_Amount: float, Product_Category: string, Product_Brand: string, Product_Type: string, Feedback: string, Shipping_Method: string, Payment_Method: string, Order_Status: string, Ratings: int, products: string]

In [6]:
# Cleaning stage: start from the raw frame, drop every row that has a null in
# any column, then keep a single row per Transaction_ID.
df = rawDf.dropna()
df = df.dropDuplicates(['Transaction_ID'])

# The Date column arrives as text in several layouts; coalesce keeps the result
# of the first pattern that parses.
#
# Pattern ORDER matters under the LEGACY parser policy configured for this
# session: a four-letter "yyyy" takes a short year literally, so a two-digit-year
# string such as "01-05-23" would be accepted by "yyyy-MM-dd" as 0001-05-23 and
# never reach "MM-dd-yy". The two-digit-year pattern is therefore tried BEFORE
# the ISO pattern. Real ISO strings ("2023-01-05") are not valid under
# "MM-dd-yy" (month 2023) and still fall through to "yyyy-MM-dd".
rawDate = F.col("Date")
df = df.withColumn(
    "Date",
    F.coalesce(
        F.to_date(rawDate, "M/d/yyyy"),
        F.to_date(rawDate, "MM/dd/yyyy"),
        F.to_date(rawDate, "MM-dd-yy"),
        F.to_date(rawDate, "yyyy-MM-dd"),
    )
)

# Safety net for dates whose parsed year is still below 1000 (presumably
# slash-separated dates written with a two-digit year, which "M/d/yyyy" reads
# literally as year 00xx): shift them forward by 2000 years.
df = df.withColumn(
    "Date",
    F.when(F.year(F.col("Date")) < 1000,
           F.expr("add_months(Date, 12 * 2000)"))
    .otherwise(F.col("Date"))
)

# Cache the cleaned frame; every analysis cell below reads from it.
df.cache()

DataFrame[Transaction_ID: string, Customer_ID: string, Name: string, Email: string, Phone: string, Address: string, City: string, State: string, Zipcode: string, Country: string, Age: int, Gender: string, Income: string, Customer_Segment: string, Date: date, Year: int, Month: string, Time: string, Total_Purchases: int, Amount: float, Total_Amount: float, Product_Category: string, Product_Brand: string, Product_Type: string, Feedback: string, Shipping_Method: string, Payment_Method: string, Order_Status: string, Ratings: int, products: string]

In [None]:
# Customer demographics: collapse the transaction-level frame to one row per
# Customer_ID, then count customers by gender, age bracket and income band.
# NOTE(review): dropDuplicates keeps one row per key without a defined choice of
# which; if a customer's attributes differ between transactions the counts may
# vary between runs - confirm whether that matters for this data.
uniqueCustomers = df.dropDuplicates(['Customer_ID'])
uniqueCustomers.cache()

genderDistribution = uniqueCustomers.groupBy("Gender").count()
genderDistribution.show()

# Resolve the age column by name against the frame being transformed
# (uniqueCustomers), using the schema's exact spelling "Age", rather than via a
# column handle borrowed from `df`.
ageCol = F.col("Age")
ageGroupDf = uniqueCustomers.withColumn(
    'age_group',
    F.when(ageCol < 20, 'Under 20')
    .when((ageCol >= 20) & (ageCol < 30), '20-29')
    .when((ageCol >= 30) & (ageCol < 40), '30-39')
    .when((ageCol >= 40) & (ageCol < 50), '40-49')
    .when((ageCol >= 50) & (ageCol < 60), '50-59')
    .when(ageCol >= 60, '60+')
    # Reached only when no comparison above is true (e.g. a null age).
    .otherwise('Unknown')
)
ageGrouped = ageGroupDf.groupBy('age_group').count()
ageGrouped.show()

incomeDistribution = uniqueCustomers.groupBy("Income").count()
incomeDistribution.show()

+------+-----+
|Gender|count|
+------+-----+
|Female|32591|
|  Male|53689|
+------+-----+

+---------+-----+
|age_group|count|
+---------+-----+
|    30-39|12442|
|    20-29|35918|
|      60+| 8025|
|    40-49|16025|
| Under 20| 5698|
|    50-59| 8172|
+---------+-----+

+------+-----+
|Income|count|
+------+-----+
|  High|21412|
|   Low|27529|
|Medium|37339|
+------+-----+



In [10]:
# Revenue per customer segment: sum Total_Amount over all cleaned transactions
# and list the segments from highest to lowest total.
segmentRevenue = F.sum("Total_Amount").alias("Total_Revenue")
topSegmentsByRevenue = (
    df.groupBy("Customer_Segment")
    .agg(segmentRevenue)
    .orderBy(F.col("Total_Revenue").desc())
)
topSegmentsByRevenue.show()

+----------------+--------------------+
|Customer_Segment|       Total_Revenue|
+----------------+--------------------+
|         Regular|1.9152795218494225E8|
|             New|1.1851570415899372E8|
|         Premium| 8.265496153085995E7|
+----------------+--------------------+

