## Dynamic Inventory and Alert System

### Dataset Used: M5 Forecasting - Accuracy

#### Files
- calendar.csv - Contains information about the dates on which the products are sold.
- sales_train_validation.csv - Contains the historical daily unit sales data per product and store [d_1 - d_1913]
- sample_submission.csv - The correct format for submissions. Reference the Evaluation tab for more info.
- sell_prices.csv - Contains information about the price of the products sold per store and date.
- sales_train_evaluation.csv - Includes sales [d_1 - d_1941] (labels used for the Public leaderboard)


In [0]:
%sql
-- List the tables registered under workspace.database.
SHOW TABLES IN workspace.database;

In [0]:
# Load the four source tables from workspace.database.
# Note: sales_train_df is backed by the *evaluation* table and
# sales_val_df by the *validation* table.
calendar_df = spark.table("workspace.database.calendar")
sales_train_df = spark.table("workspace.database.sales_train_evaluation")
sales_val_df = spark.table("workspace.database.sales_train_validation")
sell_prices_df = spark.table("workspace.database.sell_prices")


In [0]:
# Preview the first 5 rows of the validation sales table
sales_val_df.limit(5).display()

In [0]:
# Preview the first 5 rows of the sell prices table
sell_prices_df.limit(5).display()

In [0]:
# List the column names of the calendar table
display(calendar_df.columns)

Importing necessary libraries

In [0]:
from pyspark.sql.functions import *
import pyspark.sql.functions as F
from functools import reduce

Working with 3 stores and 2 categories in 1 state (California) over 3 years (2011–2013)

### Starting with calendar.csv 

In [0]:
# Which years does the calendar cover?
calendar_df.select("year").distinct().display()

In [0]:
# Restrict the calendar to the three years under study (2011, 2012, 2013)
calendar_df2 = calendar_df.where(F.col("year").isin(2011, 2012, 2013))
# Confirm that only those years remain
calendar_df2.select("year").distinct().display()

In [0]:
# Remove the snap_TX and snap_WI columns; of the snap_* columns only
# snap_CA is carried into the calendar join later in this notebook.
calendar_df2 = calendar_df2.drop("snap_TX").drop("snap_WI")
display(calendar_df2.columns)

In [0]:
# Rows of the filtered calendar that contain a null in at least one column.
# The per-column isNull() predicates are OR-ed together into a single filter.
null_check = calendar_df2.filter(
    reduce(lambda a, b: a | b, [F.col(c).isNull() for c in calendar_df2.columns])
)

# Surface the result of the check so the null subset is actually inspected.
print(
    f"Calendar rows with at least one null: "
    f"{null_check.count()} of {calendar_df2.count()}"
)

# Distinct value combinations of the event columns
display(calendar_df2.select(["event_type_1", "event_name_2", "event_type_2"]).distinct())

In [0]:
# Collect the day labels ("d_1", "d_2", ...) that fall inside 2011-2013.
# collect() gives no ordering guarantee without an explicit sort, so the
# labels are ordered by their numeric suffix to make the list deterministic.
days_2011_2013 = sorted(
    (row["d"] for row in calendar_df2.select("d").collect()),
    key=lambda label: int(label.split("_")[1]),
)

# Backward-compatible alias: this name says 2023 although the filtered
# calendar stops at 2013. Prefer days_2011_2013 in new code.
Days_2011_2023 = days_2011_2013

print(f"Number of days:{len(Days_2011_2023)}")

In [0]:
# Preview the first 5 rows of the filtered calendar
calendar_df2.limit(5).display()

### Working with sell_prices.csv

In [0]:
# Column names of the unfiltered sell prices table
sell_prices_df.columns


In [0]:
# Which stores appear in the sell prices table?
sell_prices_df.select("store_id").distinct().display()

In [0]:
# Keep only the three California stores: CA_1, CA_2 and CA_3
sell_prices_df2 = sell_prices_df.where(F.col("store_id").isin("CA_1", "CA_2", "CA_3"))
# Confirm the store filter
sell_prices_df2.select("store_id").distinct().display()

In [0]:
# List the distinct item_ids; the category filter applied next works on
# the item_id prefix.
sell_prices_df2.select("item_id").distinct().display()

In [0]:
# Exclude every item whose item_id starts with "FOODS", so that only the
# remaining categories are kept.
sell_prices_df2 = sell_prices_df2.where(~F.col("item_id").like("FOODS%"))
sell_prices_df2.limit(10).display()

In [0]:
# Number of columns in the sales table
len(sales_train_df.columns)


In [0]:
# Show only the identifier columns of the sales table
sales_train_df.select(
    "id", "item_id", "dept_id", "cat_id", "store_id", "state_id"
).display()

In [0]:
# Which states appear in the sales table?
sales_train_df.select("state_id").distinct().display()

In [0]:
# Keep only the three California stores, matching the sell_prices filter
sales_train_df2 = sales_train_df.where(F.col("store_id").isin("CA_1", "CA_2", "CA_3"))
# Confirm the store filter
sales_train_df2.select("store_id").distinct().display()

In [0]:
# Exclude the FOODS category
sales_train_df2 = sales_train_df2.where(~F.col("cat_id").isin("FOODS"))
# Confirm which categories remain
sales_train_df2.select("cat_id").distinct().display()


In [0]:
# Show the categories left in the filtered sales table
sales_train_df2.select("cat_id").distinct().display()

In [0]:
# Preview the first 20 rows of the validation sales table
sales_val_df.limit(20).display()

In [0]:
# Which stores appear in the validation sales table?
sales_val_df.select("store_id").distinct().display()

In [0]:
# Keep only the three California stores in the validation table
sales_val_df2 = sales_val_df.where(F.col("store_id").isin("CA_1", "CA_2", "CA_3"))
# Confirm the store filter
sales_val_df2.select("store_id").distinct().display()

In [0]:
# Exclude the FOODS category from the validation table
sales_val_df2 = sales_val_df2.where(~F.col("cat_id").isin("FOODS"))
# Confirm which categories remain
sales_val_df2.select("cat_id").distinct().display()

In [0]:
# Distinct category / store / state combinations left after filtering
sales_train_df2.select("cat_id", "store_id", "state_id").distinct().display()

In [0]:
# Restrict the wide sales table to the day columns of the 3-year window.

# Every column of the store/category-filtered sales table
all_columns = sales_train_df2.columns
available_columns = set(all_columns)

# Day columns of the 3 years. They are derived from the calendar filter
# (Days_2011_2023) rather than a hard-coded range, so the sales columns and
# the calendar window cannot drift apart. Labels are ordered by their numeric
# suffix and limited to columns that actually exist in the sales table.
days_to_keep = sorted(
    (day for day in Days_2011_2023 if day in available_columns),
    key=lambda day: int(day.split("_")[1]),
)
if not days_to_keep:
    # Fail early with a clear message instead of a cryptic error downstream.
    raise ValueError("No calendar day in Days_2011_2023 matches a sales column")

# ID columns
id_columns = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]

# Keep the identifiers followed by the selected day columns
columns_to_select = id_columns + days_to_keep
sales_train_filtered = sales_train_df2.select(columns_to_select)
display(len(sales_train_filtered.columns))

In [0]:
# Unpivot (melt) the wide day columns into long format.
# stack(n, 'd_1', d_1, 'd_2', d_2, ...) turns each input row into n rows of
# (day_col, sales): the quoted name is the label, the bare name is the value.
stack_expression = ", ".join(f"'{day}',{day}" for day in days_to_keep)
unpivot_sql = f"stack({len(days_to_keep)},{stack_expression}) as (day_col,sales)"

sales_train_filtered2 = sales_train_filtered.selectExpr(*id_columns, unpivot_sql)
sales_train_filtered2.display()

In [0]:
# Sanity check: number of distinct day labels after the melt
display(
    sales_train_filtered2
    .select("day_col")
    .distinct()
    .count()
)

In [0]:
# Attach the calendar attributes to every daily sales row by matching the
# melted day label (day_col) to the calendar's d column. The left join keeps
# sales rows even when no calendar row matches; the join key d is dropped
# afterwards because day_col carries the same value.
merged_df = (
    sales_train_filtered2
    .join(
        calendar_df2.select(
            ["d", "date", "wm_yr_wk", "event_name_1", "event_type_1",
             "event_name_2", "event_type_2", "snap_CA"]
        ),
        on=sales_train_filtered2["day_col"] == calendar_df2["d"],
        how="left",
    )
    .drop("d")
)


In [0]:
# Preview the first 5 rows of the sales + calendar join
merged_df.limit(5).display()

In [0]:
# Final dataset: add sell_price to each row, matched on store, item and
# wm_yr_wk. The left join keeps sales rows that have no price record.
daily_sales_df = merged_df.join(
    sell_prices_df2.select(["store_id", "item_id", "wm_yr_wk", "sell_price"]),
    ["store_id", "item_id", "wm_yr_wk"],
    "left",
)
daily_sales_df.limit(10).display()

In [0]:
# (column name, data type) pairs of the final dataset
daily_sales_df.dtypes

In [0]:
# Print the schema of the final dataset as a tree
daily_sales_df.printSchema()

In [0]:
# Count the nulls in every column: when() yields a non-null value only for
# null cells, and count() counts exactly those non-null results.
display(
    daily_sales_df.select(
        *[
            F.count(F.when(F.col(c).isNull(), c)).alias(c)
            for c in daily_sales_df.columns
        ]
    )
)


### We have to work on null values

In [0]:
# Summary statistics for units sold and sell price
display(daily_sales_df.describe("sales", "sell_price"))

In [0]:
# Category-level analysis: total sales per category
display(
    daily_sales_df
    .groupBy("cat_id")
    .agg(F.sum("sales").alias("total_sales"))
)

In [0]:
# Store-level analysis: total sales per store (the aggregation groups by
# store_id only; state_id is not part of the grouping)
display(
    daily_sales_df
    .groupBy("store_id")
    .agg(F.sum("sales").alias("total_sales"))
)