#04_analytics_insights – Demand Forecasting

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    to_date, year, month, dayofmonth, last_day,
    sum as spark_sum, col, lag, avg, dayofweek, when
)
from pyspark.sql.window import Window
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder.appName("DailySalesWithLags").getOrCreate()


# 1) Load silver data and aggregate to monthly quantity per item


In [0]:
silver = spark.read.format("delta").load("dbfs:/FileStore/silver/online_retail")
daily_sales = (
    silver
      .withColumn("date", to_date("InvoiceDate"))
      .withColumn("dom", dayofmonth("InvoiceDate"))
      .withColumn("last_dom", dayofmonth(last_day("InvoiceDate")))
      .withColumn("month", month("InvoiceDate"))
      .groupBy("StockCode","date","dom","last_dom","month")
      .agg(spark_sum("Quantity").alias("daily_quantity"))
)

# 2) Define a window partitioned by item, ordered by date


In [0]:
w = Window.partitionBy("StockCode").orderBy("date")


# 3) Add lag & rolling features + weekend flag


In [0]:
fe = (
    daily_sales
      .withColumn("lag_1",  lag("daily_quantity", 1).over(w))
      .withColumn("lag_7",  lag("daily_quantity", 7).over(w))
      .withColumn("ma_7",   avg("daily_quantity").over(w.rowsBetween(-7, -1)))
      .withColumn(
          "is_weekend",
          when(dayofweek("date").isin([1,7]), 1).otherwise(0)
      )
      .na.fill({"lag_1":0, "lag_7":0, "ma_7":0})
)


# 4) Train/test: everything NOT in last 7 days vs last 7 days


In [0]:
train = fe.filter(col("dom") <= col("last_dom") - 7)
test  = fe.filter(col("dom") >  col("last_dom") - 7)

# 5) Build feature pipeline including new vars


In [0]:
indexer = StringIndexer(inputCol="StockCode", outputCol="item_index", handleInvalid="keep")
encoder  = OneHotEncoder(inputCol="item_index", outputCol="item_vec", handleInvalid="keep")
assembler= VectorAssembler(
    inputCols=["item_vec","month","dom","lag_1","lag_7","ma_7","is_weekend"],
    outputCol="features"
)
rf = RandomForestRegressor(
    featuresCol="features", labelCol="daily_quantity",
    numTrees=50, maxDepth=8
)
pipeline = Pipeline(stages=[indexer, encoder, assembler, rf])

# 6) Fit & predict

In [0]:
model = pipeline.fit(train)
predictions = model.transform(test)

#7) Evaluate


In [0]:
rmse = RegressionEvaluator(
    labelCol="daily_quantity",
    predictionCol="prediction",
    metricName="rmse"
).evaluate(predictions)

print(f"Test RMSE with lags & rolling: {rmse:.2f}")


Test RMSE with lags & rolling: 86.77


# 8) Sample output

In [0]:
predictions.select(
    "StockCode","date","daily_quantity","prediction","lag_1","lag_7","ma_7","is_weekend"
).display()

StockCode,date,daily_quantity,prediction,lag_1,lag_7,ma_7,is_weekend
10123C,2011-03-31,1,10.684824814604344,3,0,2.0,0
10133,2011-01-25,10,31.201576872905488,40,50,24.285714285714285,0
10133,2011-02-24,10,21.470178115666663,4,20,16.285714285714285,0
10133,2011-02-28,3,20.58398442091554,10,20,14.857142857142858,0
10133,2011-03-29,30,14.454675021060082,2,10,11.428571428571429,0
10133,2011-06-24,22,32.40026341020989,22,50,29.571428571428573,0
10133,2011-06-26,2,23.67059262737192,22,14,25.571428571428573,1
10133,2011-06-28,24,25.08850096265446,2,40,23.857142857142858,0
10133,2011-07-26,20,53.20833858235448,100,40,52.71428571428572,0
10133,2011-07-27,27,42.680720544740495,20,60,49.85714285714285,0


# 9) View sample predictions


In [0]:
predictions.select(
    "StockCode","date","daily_quantity","prediction","lag_1","lag_7","ma_7","is_weekend"
).show(10, False)

+---------+----------+--------------+------------------+-----+-----+------------------+----------+
|StockCode|date      |daily_quantity|prediction        |lag_1|lag_7|ma_7              |is_weekend|
+---------+----------+--------------+------------------+-----+-----+------------------+----------+
|10123C   |2011-03-31|1             |10.684824814604344|3    |0    |2.0               |0         |
|10133    |2011-01-25|10            |31.201576872905488|40   |50   |24.285714285714285|0         |
|10133    |2011-02-24|10            |21.470178115666663|4    |20   |16.285714285714285|0         |
|10133    |2011-02-28|3             |20.58398442091554 |10   |20   |14.857142857142858|0         |
|10133    |2011-03-29|30            |14.454675021060082|2    |10   |11.428571428571429|0         |
|10133    |2011-06-24|22            |32.40026341020989 |22   |50   |29.571428571428573|0         |
|10133    |2011-06-26|2             |23.670592627371917|22   |14   |25.571428571428573|1         |
|10133    