In [9]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [89]:
# Build (or reuse) the notebook's Spark session, setting
# spark.python.worker.timeout to "120" on the builder.
spark = (
    SparkSession.builder
    .appName('sparkApp')
    .config("spark.python.worker.timeout", "120")
    .getOrCreate()
)

In [91]:
# Load the cleaned dataset and list the distinct product categories it contains.
df = spark.read.parquet("../../data/processed/cleanedData.parquet")
unique_values_df = df.select(F.col("Product_Category")).distinct()
unique_values_df.show()

+----------------+
|Product_Category|
+----------------+
|         Grocery|
|     Electronics|
|        Clothing|
|           Books|
|      Home Decor|
+----------------+



In [21]:
# Collapse the data to one row per (category, date) with the summed purchases,
# ordered by date and then by category for display.
df = (
    df.groupBy("Product_Category", "Date")
    .agg(F.sum("Total_Purchases").alias("Total_Purchases"))
    .orderBy("Date", "Product_Category")
)
df.show(50)

+----------------+----------+---------------+
|Product_Category|      Date|Total_Purchases|
+----------------+----------+---------------+
|           Books|2001-01-24|            747|
|        Clothing|2001-01-24|            732|
|     Electronics|2001-01-24|           1062|
|         Grocery|2001-01-24|            882|
|      Home Decor|2001-01-24|            774|
|           Books|2001-02-24|            954|
|        Clothing|2001-02-24|            733|
|     Electronics|2001-02-24|            982|
|         Grocery|2001-02-24|            821|
|      Home Decor|2001-02-24|            651|
|           Books|2001-03-24|            840|
|        Clothing|2001-03-24|            838|
|     Electronics|2001-03-24|            868|
|         Grocery|2001-03-24|            954|
|      Home Decor|2001-03-24|            897|
|           Books|2001-04-24|            775|
|        Clothing|2001-04-24|            743|
|     Electronics|2001-04-24|            949|
|         Grocery|2001-04-24|     

In [56]:
# Expose the aggregated frame to Spark SQL under the view name "sales_data".
df.createOrReplaceTempView(name="sales_data")

In [86]:
# Step 1: first and last observed date for every product category.
minMaxDates = spark.sql("""
    SELECT 
        Product_Category, 
        MIN(Date) AS min_date, 
        MAX(Date) AS max_date 
    FROM sales_data 
    GROUP BY Product_Category
""")
minMaxDates.createOrReplaceTempView("minMaxDates")


# Step 2: one row per calendar day between min_date and max_date (both
# inclusive) for each category. The built-in `sequence` generator replaces the
# previous space()/split()/posexplode() trick and yields the same day range.
dateSeries = spark.sql("""
    SELECT 
        Product_Category, 
        explode(
            sequence(to_date(min_date), to_date(max_date), interval 1 day)
        ) AS Date
    FROM minMaxDates
""")
dateSeries.createOrReplaceTempView("dateSeries")


# Step 3: left-join the observed totals onto the full calendar; days without a
# matching sales row get Total_Purchases = 0.
dfFilled = spark.sql("""
    SELECT 
        ds.Product_Category, 
        ds.Date, 
        COALESCE(sd.Total_Purchases, 0) AS Total_Purchases
    FROM dateSeries ds
    LEFT JOIN sales_data sd
    ON ds.Product_Category = sd.Product_Category AND ds.Date = sd.Date
""")
dfFilled.createOrReplaceTempView("filled_data")


dfFilled.show()


+----------------+----------+---------------+
|Product_Category|      Date|Total_Purchases|
+----------------+----------+---------------+
|         Grocery|2001-01-24|            882|
|         Grocery|2001-01-25|              0|
|         Grocery|2001-01-26|              0|
|         Grocery|2001-01-27|              0|
|         Grocery|2001-01-28|              0|
|         Grocery|2001-01-29|              0|
|         Grocery|2001-01-30|              0|
|         Grocery|2001-01-31|              0|
|         Grocery|2001-02-01|              0|
|         Grocery|2001-02-02|              0|
|         Grocery|2001-02-03|              0|
|         Grocery|2001-02-04|              0|
|         Grocery|2001-02-05|              0|
|         Grocery|2001-02-06|              0|
|         Grocery|2001-02-07|              0|
|         Grocery|2001-02-08|              0|
|         Grocery|2001-02-09|              0|
|         Grocery|2001-02-10|              0|
|         Grocery|2001-02-11|     

In [81]:
import pandas as pd
from prophet import Prophet

In [82]:
def trainProphetModel(dfFilled):
    """Fit one Prophet model per product category.

    Parameters
    ----------
    dfFilled : pandas.DataFrame
        Must contain the columns ``Product_Category``, ``Date`` and
        ``Total_Purchases``. The frame is left unmodified.

    Returns
    -------
    dict
        Maps each distinct ``Product_Category`` value (in order of first
        appearance) to the ``Prophet`` model fitted on that category's rows.
    """
    # Convert dates on a copy: assigning into `dfFilled` directly would change
    # the caller's frame as a hidden side effect.
    history = dfFilled.assign(Date=pd.to_datetime(dfFilled['Date']))
    models = {}
    for category in history['Product_Category'].unique():
        categoryData = history[history['Product_Category'] == category]
        # Prophet expects the time column to be named `ds` and the target `y`.
        prophetData = categoryData[['Date', 'Total_Purchases']].rename(
            columns={'Date': 'ds', 'Total_Purchases': 'y'}
        )
        model = Prophet()
        model.fit(prophetData)
        models[category] = model
    return models

In [83]:
def predictNext30Days(models, dfFilled, periods=30):
    """Forecast the days following each category's history and total them.

    Parameters
    ----------
    models : dict
        Maps a product category to a fitted model exposing
        ``make_future_dataframe`` and ``predict``.
    dfFilled : pandas.DataFrame
        Must contain ``Product_Category`` and ``Date``; used to find the last
        observed date per category. The frame is left unmodified.
    periods : int, default 30
        Number of future periods requested from each model.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``allPredictions`` with columns ``ds``, ``yhat``, ``Product_Category``
        and ``totalSales`` with columns ``Product_Category``,
        ``Total_Predicted_Sales`` (sum of ``yhat`` per category). Both frames
        are empty when ``models`` is empty.
    """
    # Convert dates on a copy so the caller's frame is not altered.
    history = dfFilled.assign(Date=pd.to_datetime(dfFilled['Date']))
    predictions = {}
    for category, model in models.items():
        isCategory = history['Product_Category'] == category
        lastDate = history.loc[isCategory, 'Date'].max()
        future = model.make_future_dataframe(periods=periods, include_history=False)
        # Keep only dates strictly after the last date present in dfFilled.
        future = future[future['ds'] > lastDate]
        forecast = model.predict(future)
        predictions[category] = forecast[['ds', 'yhat']].assign(Product_Category=category)

    if not predictions:
        # pd.concat raises on an empty sequence; return empty, well-formed frames.
        return (
            pd.DataFrame(columns=['ds', 'yhat', 'Product_Category']),
            pd.DataFrame(columns=['Product_Category', 'Total_Predicted_Sales']),
        )

    allPredictions = pd.concat(predictions.values())
    totalSales = allPredictions.groupby('Product_Category')['yhat'].sum().reset_index()
    totalSales.columns = ['Product_Category', 'Total_Predicted_Sales']
    return allPredictions, totalSales

In [87]:
# Collect the gap-filled series to the driver as a pandas DataFrame. The guard
# keeps this cell re-runnable: after the first run `dfFilled` is already a
# pandas DataFrame, which has no `toPandas` method.
if hasattr(dfFilled, "toPandas"):
    dfFilled = dfFilled.toPandas()
models = trainProphetModel(dfFilled)
predictions, totalSales = predictNext30Days(models, dfFilled)

print(totalSales)

11:32:46 - cmdstanpy - INFO - Chain [1] start processing
11:32:47 - cmdstanpy - INFO - Chain [1] done processing
11:32:48 - cmdstanpy - INFO - Chain [1] start processing
11:32:50 - cmdstanpy - INFO - Chain [1] done processing
11:32:51 - cmdstanpy - INFO - Chain [1] start processing
11:32:52 - cmdstanpy - INFO - Chain [1] done processing
11:32:53 - cmdstanpy - INFO - Chain [1] start processing
11:32:54 - cmdstanpy - INFO - Chain [1] done processing
11:32:55 - cmdstanpy - INFO - Chain [1] start processing
11:32:56 - cmdstanpy - INFO - Chain [1] done processing


  Product_Category  Total_Predicted_Sales
0            Books            9325.399123
1         Clothing            9218.462101
2      Electronics           11999.799691
3          Grocery           11003.799034
4       Home Decor            9121.143987
