In [62]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql import functions as F
from pyspark import StorageLevel

In [63]:
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('retailRadar')
    .getOrCreate()
)

# Switch datetime parsing to the legacy (pre-Spark-3) parser; the date
# patterns passed to to_date later in this notebook are evaluated under it.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

In [64]:
# Explicit schema for the raw retail CSV, listed in file column order as
# (column name, Spark type, nullable).
# Date, Time, Month, Income and Zipcode are deliberately read as plain strings.
# NOTE(review): Amount / Total_Amount use single-precision FloatType; consider
# DoubleType if exact monetary values matter downstream.
_schema_columns = [
    ("Transaction_ID", StringType(), False),
    ("Customer_ID", StringType(), False),
    ("Name", StringType(), True),
    ("Email", StringType(), True),
    ("Phone", StringType(), True),
    ("Address", StringType(), True),
    ("City", StringType(), True),
    ("State", StringType(), True),
    ("Zipcode", StringType(), True),
    ("Country", StringType(), True),
    ("Age", IntegerType(), True),
    ("Gender", StringType(), True),
    ("Income", StringType(), True),
    ("Customer_Segment", StringType(), True),
    ("Date", StringType(), True),
    ("Year", IntegerType(), True),
    ("Month", StringType(), True),
    ("Time", StringType(), True),
    ("Total_Purchases", IntegerType(), True),
    ("Amount", FloatType(), True),
    ("Total_Amount", FloatType(), True),
    ("Product_Category", StringType(), True),
    ("Product_Brand", StringType(), True),
    ("Product_Type", StringType(), True),
    ("Feedback", StringType(), True),
    ("Shipping_Method", StringType(), True),
    ("Payment_Method", StringType(), True),
    ("Order_Status", StringType(), True),
    ("Ratings", IntegerType(), True),
    ("products", StringType(), True),
]

schema = StructType(
    [
        StructField(col_name, col_type, is_nullable)
        for col_name, col_type, is_nullable in _schema_columns
    ]
)

In [65]:
# Load the raw retail CSV using the explicit schema defined above.
csv_options = {
    "header": True,        # first line holds the column names
    "inferSchema": False,  # types come from `schema`, not from sampling the file
    "mode": "FAILFAST",    # raise on malformed records instead of nulling them
    "nullValue": "NA",     # the literal string NA is read as null
}

rawDf = (
    spark.read
    .format("csv")
    .schema(schema)
    .options(**csv_options)
    .load('../../data/raw/retail_data.csv')
)

# Keep the raw frame cached (memory first, spilling to disk) because several
# later cells read from it; the frame is also the cell's displayed value.
rawDf.persist(StorageLevel.MEMORY_AND_DISK)

DataFrame[Transaction_ID: string, Customer_ID: string, Name: string, Email: string, Phone: string, Address: string, City: string, State: string, Zipcode: string, Country: string, Age: int, Gender: string, Income: string, Customer_Segment: string, Date: string, Year: int, Month: string, Time: string, Total_Purchases: int, Amount: float, Total_Amount: float, Product_Category: string, Product_Brand: string, Product_Type: string, Feedback: string, Shipping_Method: string, Payment_Method: string, Order_Status: string, Ratings: int, products: string]

In [66]:
# Number of rows read from the raw CSV. This is the first action on rawDf after
# persist(), so it also fills the cache.
rawDf.count()

302010

In [67]:
# Clean the raw frame:
#   1. drop every row that has a null in any column,
#   2. keep a single row per Transaction_ID,
#   3. parse the string "Date" column into a real date.
#
# Pattern order is significant. Under the LEGACY parser policy set earlier in
# this notebook, to_date is backed by java.text.SimpleDateFormat, where a
# "yyyy" field accepts a 1-2 digit number as a literal year. A value such as
# "05-08-23" therefore used to match "yyyy-MM-dd" as 0005-08-23 (then shifted
# to 2005-08-23 by the correction below) and never reached "MM-dd-yy".
# Trying "MM-dd-yy" first reads it as 2023-05-08. ISO values like "2023-05-08"
# are expected to fail the month field of "MM-dd-yy" and fall through to
# "yyyy-MM-dd" as before.
# NOTE(review): assumes dash-separated two-digit-year values are month-first,
# as the original pattern list implies -- confirm against the "Year" column.
date_patterns = [
    "M/d/yyyy",
    "MM/dd/yyyy",
    "MM-dd-yy",
    "yyyy-MM-dd",
]

# First pattern that parses wins; the result is null if none of them match.
parsed_date = F.coalesce(
    *[F.to_date(F.col("Date"), pattern) for pattern in date_patterns]
)

# Safety net for two-digit years read through a "yyyy" pattern
# (e.g. "1/5/23" -> year 0023): shift them forward by 2000 years.
century_fixed_date = (
    F.when(
        F.year(F.col("Date")) < 1000,
        F.expr("add_months(Date, 12 * 2000)"),
    )
    .otherwise(F.col("Date"))
)

df = (
    rawDf
    .dropna()
    .dropDuplicates(['Transaction_ID'])
    .withColumn("Date", parsed_date)
    .withColumn("Date", century_fixed_date)
)

In [68]:
# Number of rows left in df after the null-row and duplicate-Transaction_ID removal.
df.count()

287005