In [80]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window

In [81]:
spark = SparkSession.builder.appName('sparkApp').config("spark.python.worker.timeout", "120") .getOrCreate()

In [82]:
df = spark.read.parquet("../../data/processed/cleanedData.parquet")
df.show()

+--------------+-----------+-------------------+---------+---+------+------+----------------+----------+----+---------+--------+---------------+---------+------------+----------------+------------+---------------+--------------+------------+-------+
|Transaction_ID|Customer_ID|               City|  Country|Age|Gender|Income|Customer_Segment|      Date|Year|    Month|    Time|Total_Purchases|   Amount|Total_Amount|Product_Category|Product_Type|Shipping_Method|Payment_Method|Order_Status|Ratings|
+--------------+-----------+-------------------+---------+---+------+------+----------------+----------+----+---------+--------+---------------+---------+------------+----------------+------------+---------------+--------------+------------+-------+
|       1000043|      91680|         Fort Worth|      USA| 19|  Male|   Low|             New|2023-11-23|2023| November| 8:23:26|             10|285.67474|   2856.7476|     Electronics|  Smartphone|       Same-Day|        PayPal|   Delivered|      4|


In [83]:
df = df.groupBy("Product_Category", "Date").agg(F.sum(F.col('Total_Purchases')).alias("Total_Purchases")).orderBy("Date", "Product_Category")
df.show(50)

+----------------+----------+---------------+
|Product_Category|      Date|Total_Purchases|
+----------------+----------+---------------+
|           Books|2001-01-24|            747|
|        Clothing|2001-01-24|            732|
|     Electronics|2001-01-24|           1062|
|         Grocery|2001-01-24|            882|
|      Home Decor|2001-01-24|            774|
|           Books|2001-02-24|            954|
|        Clothing|2001-02-24|            733|
|     Electronics|2001-02-24|            982|
|         Grocery|2001-02-24|            821|
|      Home Decor|2001-02-24|            651|
|           Books|2001-03-24|            840|
|        Clothing|2001-03-24|            838|
|     Electronics|2001-03-24|            868|
|         Grocery|2001-03-24|            954|
|      Home Decor|2001-03-24|            897|
|           Books|2001-04-24|            775|
|        Clothing|2001-04-24|            743|
|     Electronics|2001-04-24|            949|
|         Grocery|2001-04-24|     

In [84]:
df.createOrReplaceTempView("sales_data")

In [None]:
min_max_dates = spark.sql("""
    SELECT 
        Product_Category, 
        MIN(Date) AS min_date, 
        MAX(Date) AS max_date 
    FROM sales_data 
    GROUP BY Product_Category
""")
min_max_dates.createOrReplaceTempView("min_max_dates")


date_series = spark.sql("""
    SELECT 
        Product_Category, 
        date_add(min_date, idx) AS Date
    FROM (
        SELECT 
            Product_Category, 
            min_date, 
            max_date, 
            posexplode(
                split(space(datediff(max_date, min_date)), ' ')
            ) AS (idx, _)
        FROM min_max_dates
    )
""")
date_series.createOrReplaceTempView("date_series")


df_filled = spark.sql("""
    SELECT 
        ds.Product_Category, 
        ds.Date, 
        COALESCE(sd.Total_Purchases, 0) AS Total_Purchases
    FROM date_series ds
    LEFT JOIN sales_data sd
    ON ds.Product_Category = sd.Product_Category AND ds.Date = sd.Date
""")
df_filled.createOrReplaceTempView("filled_data")


df_interpolated = spark.sql("""
    SELECT 
        Product_Category, 
        Date, 
        Total_Purchases,
        COALESCE(
            Total_Purchases,
            AVG(Total_Purchases) OVER (
                PARTITION BY Product_Category 
                ORDER BY Date 
                ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
            )
        ) AS Interpolated_Purchases
    FROM filled_data
""")

df_interpolated.show(60)


+----------------+----------+---------------+----------------------+
|Product_Category|      Date|Total_Purchases|Interpolated_Purchases|
+----------------+----------+---------------+----------------------+
|         Grocery|2001-01-24|            882|                 882.0|
|         Grocery|2001-01-25|              0|                   0.0|
|         Grocery|2001-01-26|              0|                   0.0|
|         Grocery|2001-01-27|              0|                   0.0|
|         Grocery|2001-01-28|              0|                   0.0|
|         Grocery|2001-01-29|              0|                   0.0|
|         Grocery|2001-01-30|              0|                   0.0|
|         Grocery|2001-01-31|              0|                   0.0|
|         Grocery|2001-02-01|              0|                   0.0|
|         Grocery|2001-02-02|              0|                   0.0|
|         Grocery|2001-02-03|              0|                   0.0|
|         Grocery|2001-02-04|     