In [1]:
import findspark

# Run findspark's setup before the first pyspark import in the next cell
# (presumably so the local Spark installation becomes importable — confirm for your environment).
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Reuse the already-running Spark session if there is one, otherwise create it.
spark=SparkSession.builder.getOrCreate()

In [4]:
# Echo the session object as the cell output.
spark

In [5]:
from pyspark.sql.types import *
import pyspark.sql.functions as f
 
import pandas as pd
import numpy as np
import math
from datetime import timedelta
    
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

In [7]:
# Load the flagged inventory history; the first line is the header and column types are inferred.
inventory_flagged = (
    spark.read
    .option('header', 'true')
    .csv('Dataset//inventory_flagged_data.csv', inferSchema=True)
)

# Step 2: Generate Forecast
### Define Function to Generate Forecast for a Store-SKU

In [8]:
# Smoothing factor; passed as `smoothing_level` to SimpleExpSmoothing.fit() inside get_forecast.
alpha_value = 0.8
 
# function to generate a forecast for a store-sku
def get_forecast(keys, inventory_pd: pd.DataFrame) -> pd.DataFrame:
    """Fit simple exponential smoothing to one store-SKU history and return its in-sample predictions.

    Parameters
    ----------
    keys
        Grouping key of the current group; element 0 is used as the store id
        and element 1 as the sku.
    inventory_pd : pd.DataFrame
        Rows of that group. The 'date' and 'total_sales_units' columns are read.

    Returns
    -------
    pd.DataFrame
        Columns 'date', 'store_id', 'sku', 'predicted_sales_units', with
        predictions requested from the earliest to the latest date in the input.
    """
    store_id, sku = keys[0], keys[1]

    # the prediction window spans the whole observed history
    first_day = inventory_pd['date'].min()
    last_day = inventory_pd['date'].max()

    # date-indexed, chronologically sorted series of unit sales
    by_date = inventory_pd.set_index('date', drop=True, append=False).sort_index()
    sales_series = by_date['total_sales_units']

    # smoothing level is fixed by the module-level alpha_value
    smoother = SimpleExpSmoothing(sales_series, initialization_method='heuristic')
    fitted = smoother.fit(smoothing_level=alpha_value)

    in_sample = fitted.predict(start=first_day, end=last_day)

    # series -> frame; if the index came back unnamed, reset_index() calls it 'index'
    result = (
        in_sample
        .to_frame(name='predicted_sales_units')
        .reset_index()
        .rename(columns={'index': 'date'})
    )
    result['store_id'] = store_id
    result['sku'] = sku

    return result[['date', 'store_id', 'sku', 'predicted_sales_units']]
 


In [9]:
# Spark schema of the frame returned by get_forecast (same column order as its return value)
forecast_schema = (
    StructType()
    .add('date', DateType())
    .add('store_id', IntegerType())
    .add('sku', IntegerType())
    .add('predicted_sales_units', FloatType())
)

### Generate Forecasts for All Store-SKUs

In [10]:

# get forecasted values for each store-sku combination

# predictions are rounded to whole units after the per-group forecast
rounded_units = f.expr('ROUND(predicted_sales_units,0)')

forecast = (
    inventory_flagged
    .groupby(['store_id', 'sku'])  # one get_forecast call per store-sku group
    .applyInPandas(get_forecast, schema=forecast_schema)
    .withColumn('predicted_sales_units', rounded_units)
)
 


In [11]:
# Preview the forecast rows (date, store_id, sku, predicted_sales_units).
forecast.show()

+----------+--------+---+---------------------+
|      date|store_id|sku|predicted_sales_units|
+----------+--------+---+---------------------+
|2019-01-01|      63| 57|                  0.0|
|2019-01-02|      63| 57|                  0.0|
|2019-01-03|      63| 57|                  0.0|
|2019-01-04|      63| 57|                  0.0|
|2019-01-05|      63| 57|                  0.0|
|2019-01-06|      63| 57|                  2.0|
|2019-01-07|      63| 57|                  2.0|
|2019-01-08|      63| 57|                  0.0|
|2019-01-09|      63| 57|                  0.0|
|2019-01-10|      63| 57|                  2.0|
|2019-01-11|      63| 57|                  2.0|
|2019-01-12|      63| 57|                  0.0|
|2019-01-13|      63| 57|                  0.0|
|2019-01-14|      63| 57|                  0.0|
|2019-01-15|      63| 57|                  0.0|
|2019-01-16|      63| 57|                  0.0|
|2019-01-17|      63| 57|                  0.0|
|2019-01-18|      63| 57|               

In [12]:
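# NOTE: the next step reads Dataset//inventory_forecast.csv. Un-comment the line below
# on a first run (or whenever the forecast changes) so that file exists and is current.
# forecast.toPandas().to_csv("Dataset//inventory_forecast.csv", index=False)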

# Step 3: Identify Off Sales Issues

### Flag Off-Sales Events

In [15]:
# Reload the saved forecast from CSV; the first line is the header and column types are inferred.
inventory_forecast = (
    spark.read
    .option('header', 'true')
    .csv('Dataset//inventory_forecast.csv', inferSchema=True)
)

In [20]:
# Inspect the reloaded forecast; `date` is displayed with a time component at this point.
inventory_forecast.show()

+-------------------+--------+---+---------------------+
|               date|store_id|sku|predicted_sales_units|
+-------------------+--------+---+---------------------+
|2019-01-01 00:00:00|      63| 57|                  0.0|
|2019-01-02 00:00:00|      63| 57|                  0.0|
|2019-01-03 00:00:00|      63| 57|                  0.0|
|2019-01-04 00:00:00|      63| 57|                  0.0|
|2019-01-05 00:00:00|      63| 57|                  0.0|
|2019-01-06 00:00:00|      63| 57|                  2.0|
|2019-01-07 00:00:00|      63| 57|                  2.0|
|2019-01-08 00:00:00|      63| 57|                  0.0|
|2019-01-09 00:00:00|      63| 57|                  0.0|
|2019-01-10 00:00:00|      63| 57|                  2.0|
|2019-01-11 00:00:00|      63| 57|                  2.0|
|2019-01-12 00:00:00|      63| 57|                  0.0|
|2019-01-13 00:00:00|      63| 57|                  0.0|
|2019-01-14 00:00:00|      63| 57|                  0.0|
|2019-01-15 00:00:00|      63| 

In [21]:
from pyspark.sql.functions import col, to_date

# The reloaded CSV shows `date` with a time component; convert it to a plain date.
# The column is referenced by its actual lower-case name, so the lookup does not
# depend on Spark's case-insensitive column resolution (spark.sql.caseSensitive=false).
inventory_forecast = inventory_forecast.withColumn('date', to_date(col('date')))

In [22]:
# Check the result of the to_date conversion: `date` is now displayed without a time component.
inventory_forecast.show()

+----------+--------+---+---------------------+
|      date|store_id|sku|predicted_sales_units|
+----------+--------+---+---------------------+
|2019-01-01|      63| 57|                  0.0|
|2019-01-02|      63| 57|                  0.0|
|2019-01-03|      63| 57|                  0.0|
|2019-01-04|      63| 57|                  0.0|
|2019-01-05|      63| 57|                  0.0|
|2019-01-06|      63| 57|                  2.0|
|2019-01-07|      63| 57|                  2.0|
|2019-01-08|      63| 57|                  0.0|
|2019-01-09|      63| 57|                  0.0|
|2019-01-10|      63| 57|                  2.0|
|2019-01-11|      63| 57|                  2.0|
|2019-01-12|      63| 57|                  0.0|
|2019-01-13|      63| 57|                  0.0|
|2019-01-14|      63| 57|                  0.0|
|2019-01-15|      63| 57|                  0.0|
|2019-01-16|      63| 57|                  0.0|
|2019-01-17|      63| 57|                  0.0|
|2019-01-18|      63| 57|               

In [23]:

# Attach the forecast to every inventory row and measure how far actual sales fall
# short of (or exceed) the prediction.
osa_flag_output = (
    inventory_flagged.alias('inv')
    # Left outer join: inventory rows without a matching forecast are kept.
    # The forecast side is aliased 'fcst' rather than 'for': FOR is a SQL keyword, and an
    # unquoted `for.` qualifier inside selectExpr can fail to parse when reserved
    # keywords are enforced.
    .join(inventory_forecast.alias('fcst'), on=['store_id', 'sku', 'date'], how='leftouter')
    .selectExpr(
        'inv.*',
        'fcst.predicted_sales_units'
    )
    # forecast minus actual units; 0 when either side is missing
    .withColumn('units_difference', f.expr('COALESCE(predicted_sales_units - total_sales_units, 0)'))
)

In [24]:

 
# SQL for each derived column, named up front so the chain below reads as a list of steps.

# 1 when units_difference has risen strictly on each of the last three steps
# (current row vs. the three preceding rows of the same store-sku), else 0
increasing_deviation_sql = '''
      CASE 
        WHEN units_difference > LAG(units_difference, 1) OVER(PARTITION BY store_id, sku ORDER BY date) AND 
             LAG(units_difference, 1) OVER(PARTITION BY store_id, sku ORDER BY date) > LAG(units_difference, 2) OVER(PARTITION BY store_id, sku ORDER BY date) AND 
             LAG(units_difference, 2) OVER(PARTITION BY store_id, sku ORDER BY date) > LAG(units_difference, 3) OVER(PARTITION BY store_id, sku ORDER BY date)
             THEN 1
        ELSE 0 
        END'''

# trailing averages over the current row and the three rows before it
sales_avg_sql = 'AVG(total_sales_units) OVER(PARTITION BY store_id, sku ORDER BY date ROWS BETWEEN 3 PRECEDING AND CURRENT ROW)'
forecast_avg_sql = 'AVG(predicted_sales_units) OVER(PARTITION BY store_id, sku ORDER BY date ROWS BETWEEN 3 PRECEDING AND CURRENT ROW)'

# gap between the two averages relative to the forecast average (+1 in the denominator)
relative_gap_sql = '(predictions_4day_avg - sales_4day_avg) / (predictions_4day_avg+1)'

osa_flag_output = (
    osa_flag_output
    .withColumn('osa_alert_inc_deviation', f.expr(increasing_deviation_sql))
    .withColumn('osa_alert_inc_deviation', f.expr('COALESCE(osa_alert_inc_deviation, 0)'))
    .withColumn('sales_4day_avg', f.expr(sales_avg_sql))
    .withColumn('predictions_4day_avg', f.expr(forecast_avg_sql))
    .withColumn('deviation', f.expr(relative_gap_sql))
    .withColumn('deviation', f.expr('COALESCE(deviation, 0)'))
)

In [25]:
 
# An off-sales alert needs both conditions: rolling deviation above the 20% threshold
# and the increasing-deviation indicator set.
off_sales_alert_sql = '''
      CASE 
        WHEN deviation > 0.20  AND osa_alert_inc_deviation = 1 THEN 1
        ELSE 0
        END'''

# columns kept in the final output, in display order
output_columns = [
    'date',
    'store_id',
    'sku',
    'predicted_sales_units',
    'off_sales_alert',
    'oos_alert',
    'zero_sales_flag',
    'phantom_inventory',
    'phantom_inventory_ind',
]

osa_flag_output = (
    osa_flag_output
    .withColumn('off_sales_alert', f.expr(off_sales_alert_sql))
    .select(*output_columns)
)
 

In [26]:
# Preview the flagged output; `date` is displayed with a time component at this point.
osa_flag_output.show()

+-------------------+--------+---+---------------------+---------------+---------+---------------+-----------------+---------------------+
|               date|store_id|sku|predicted_sales_units|off_sales_alert|oos_alert|zero_sales_flag|phantom_inventory|phantom_inventory_ind|
+-------------------+--------+---+---------------------+---------------+---------+---------------+-----------------+---------------------+
|2019-01-01 00:00:00|      63| 57|                  0.0|              0|        0|              0|             null|                    0|
|2019-01-02 00:00:00|      63| 57|                  0.0|              0|        0|              0|              0.0|                    0|
|2019-01-03 00:00:00|      63| 57|                  0.0|              0|        0|              0|              0.0|                    0|
|2019-01-04 00:00:00|      63| 57|                  0.0|              0|        0|              0|              1.0|                    1|
|2019-01-05 00:00:00|      

In [27]:
from pyspark.sql.functions import col, to_date

# `date` is displayed with a time component in the preview above; convert it to a plain date.
# The column is referenced by its actual lower-case name, so the lookup does not
# depend on Spark's case-insensitive column resolution (spark.sql.caseSensitive=false).
osa_flag_output = osa_flag_output.withColumn('date', to_date(col('date')))

In [28]:
# Check the result of the to_date conversion: `date` is now displayed without a time component.
osa_flag_output.show()

+----------+--------+---+---------------------+---------------+---------+---------------+-----------------+---------------------+
|      date|store_id|sku|predicted_sales_units|off_sales_alert|oos_alert|zero_sales_flag|phantom_inventory|phantom_inventory_ind|
+----------+--------+---+---------------------+---------------+---------+---------------+-----------------+---------------------+
|2019-01-01|      63| 57|                  0.0|              0|        0|              0|             null|                    0|
|2019-01-02|      63| 57|                  0.0|              0|        0|              0|              0.0|                    0|
|2019-01-03|      63| 57|                  0.0|              0|        0|              0|              0.0|                    0|
|2019-01-04|      63| 57|                  0.0|              0|        0|              0|              1.0|                    1|
|2019-01-05|      63| 57|                  0.0|              0|        0|              0| 

In [29]:
# Convert the flagged output to a pandas DataFrame and write it to CSV without the index column.
# NOTE(review): "outputt" in the file name looks like a typo — confirm before renaming,
# since other consumers may already rely on this exact path.
osa_flag_output.toPandas().to_csv("Dataset//osa_flag_outputt.csv", index=False)