In [2]:
import pandas as pd
import numpy as np
from scipy.signal import correlate
import plotly.express as px
import plotly.graph_objects as go
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

pd.set_option('display.max_columns', None)

In [3]:
# Cases data
cases = pd.read_csv("./cases_sold.csv")

# Inventory data
inv = pd.read_csv("./inventory.csv")
inv['BRNCH_CD'] = inv['BRNCH_CD'].astype('string')
inv['TEMP_ZONE'] = inv['TEMP_ZONE'].astype('string')
inv['TEMP_ZONE'] = inv['TEMP_ZONE'].replace(['CLR', 'DRY', 'FRZ'], ['Refrigerated', 'Dry', 'Freezer'])

# Spoilage data
spoilage = pd.read_csv("./spoilage.csv")
spoilage['TEMP_ZONE'] = spoilage['TEMP_ZONE'].replace(['CLR', 'DRY', 'FRZ'], ['Refrigerated', 'Dry', 'Freezer'])

# Slot utilization data
slot_util = pd.read_csv("./Slot Utilization.csv")
slot_util['DATE_EXTRACT'] = pd.to_datetime(slot_util['DATE_EXTRACT'])

# Remove rows with no capacity and where branches are equal to X1, X6, or X7
slot_util = slot_util[slot_util['CAPACITY'].notna()]
slot_util = slot_util[~slot_util['BRNCH_CD'].isin(['X1', 'X6', 'X7'])]
slot_util = slot_util[~slot_util['FULL_MARKET_NAME'].str.contains('STOCK YARDS')]
slot_util_main = slot_util[slot_util['CAPACITY'] != 0]

In [4]:
np.unique(slot_util['BRNCH_CD'])

array(['2G', '2I', '2J', '2L', '2N', '2O', '2R', '2Z', '3D', '3F', '3J',
       '3K', '3L', '3M', '3V', '3W', '3Y', '3Z', '4C', '4H', '4I', '4J',
       '4O', '4P', '4Q', '4R', '4U', '4V', '5D', '5E', '5G', '5I', '5O',
       '5T', '5Y', '5Z', '6A', '6B', '6D', '6F', '6G', '6H', '6I', '6J',
       '6N', '6U', '6V', '6W', '6Y', '6Z', '8A', '8B', '8E', '8L', '8N',
       '8O', '8S', '8T', '8U', '8V', '9A', '9B', '9D', '9I', '9J', '9L',
       '9O', '9P', '9Q', '9U'], dtype=object)

In [5]:
slot_util.head()

Unnamed: 0,WAREHOUSE_LOCN,AREA,BRNCH_CD,FULL_MARKET_NAME,STORAGE_TYPE,PICK_TYPE,DATE_EXTRACT,SUM(PALLET_USED),SUM(PALLET_POSITIONS),CAPACITY
0,2125,Freezer,5O,"MANASSAS (5O, 2125)",Main Warehouse,Reserve,2022-05-16,9358.0,13044,0.717417
2,2345,Refrigerated,3J,"BISMARCK (3J, 2345)",Main Warehouse,Reserve,2021-10-19,477.0,905,0.527071
3,4135,Freezer,8T,"PHOENIX SYSTEMS (8T, 4135)",Main Warehouse,Reserve,2021-10-08,1330.0,1694,0.785123
4,1106,Dry,8L,"DETROIT (8L, 1106)",Main Warehouse,Pick,2021-05-12,5585.0,6243,0.894601
7,2099,Freezer,3Y,"CHICAGO (3Y, 2099)",Main Warehouse,Pick,2022-01-25,2917.0,3638,0.801814


# What % of capacity is a virtual slot?
### Calculated as the number of pallet positions used in virtual slots over the total number of pallet positions used per day/branch

### Answer is 0.84% if not already run

In [6]:
# Total number of pallet positions used by day and branch 
brnchs = slot_util.groupby(['DATE_EXTRACT', 'FULL_MARKET_NAME']).agg(np.sum).reset_index()
brnchs['CAPACITY'] = brnchs['SUM(PALLET_USED)'] / brnchs['SUM(PALLET_POSITIONS)']
brnchs = brnchs[['DATE_EXTRACT', 'FULL_MARKET_NAME', 'SUM(PALLET_USED)']]
brnchs = brnchs.rename(columns={'SUM(PALLET_USED)':'TOTAL_PALLETS_USED'})

# Number of pallet positions that are virtual slots by day and branch
virt = slot_util.groupby(['DATE_EXTRACT', 'STORAGE_TYPE', 'FULL_MARKET_NAME']).agg(np.sum).reset_index()
virt = virt.merge(brnchs, how='left', on=['DATE_EXTRACT', 'FULL_MARKET_NAME'])
virt = virt[['DATE_EXTRACT', 'STORAGE_TYPE', 'FULL_MARKET_NAME', 'SUM(PALLET_USED)', 'TOTAL_PALLETS_USED']]
virt = virt.rename(columns={'SUM(PALLET_USED)':'VIRTUAL_SLOT_POSITIONS'})
virt = virt[virt['STORAGE_TYPE'] == 'Virtual Slot']
virt['PERC_VIRTUAL_SLOTS'] = virt['VIRTUAL_SLOT_POSITIONS'] / virt['TOTAL_PALLETS_USED']
print(virt.head())
print()
print('Average % virtual slots: ', round(np.mean(virt['PERC_VIRTUAL_SLOTS'])*100, 2), '%', sep='')

    DATE_EXTRACT  STORAGE_TYPE        FULL_MARKET_NAME  \
147   2021-02-14  Virtual Slot       ALBANY (9B, 2140)   
148   2021-02-14  Virtual Slot  ALBUQUERQUE (8V, 1933)   
149   2021-02-14  Virtual Slot    ALLENTOWN (2J, 2110)   
150   2021-02-14  Virtual Slot      ATLANTA (5I, 2220)   
151   2021-02-14  Virtual Slot       AUSTIN (6Z, 3023)   

     VIRTUAL_SLOT_POSITIONS  TOTAL_PALLETS_USED  PERC_VIRTUAL_SLOTS  
147                   790.0             16525.0            0.047806  
148                    10.0             11717.0            0.000853  
149                    72.0             23313.0            0.003088  
150                    20.0             38967.0            0.000513  
151                     7.0             21520.0            0.000325  

Average % virtual slots: 0.49%


In [7]:
px.violin(virt, 'PERC_VIRTUAL_SLOTS')

In [8]:
# Top 10 branches by average percentage of space that's virtual slots (most commonly overfilled warehouses)
overfilled = virt.groupby('FULL_MARKET_NAME')['PERC_VIRTUAL_SLOTS'].agg(np.mean).sort_values(ascending=False)
top10_branches = overfilled[:10]
top10 = virt[virt['FULL_MARKET_NAME'].isin(top10_branches.index)]
px.line(top10, x='DATE_EXTRACT', y='PERC_VIRTUAL_SLOTS', color='FULL_MARKET_NAME')

In [9]:
# Bottom 10 branches by average percentage of space that's virtual slots (least commonly overfilled warehouses)
bottom10_branches = overfilled[-10:]
bottom10 = virt[virt['FULL_MARKET_NAME'].isin(bottom10_branches.index)]
px.line(bottom10, x='DATE_EXTRACT', y='PERC_VIRTUAL_SLOTS', color='FULL_MARKET_NAME')

# Capacity % by warehouse and temp zone

### Note: refrigerated capacity always lower than dry and freezer but capacities for each area are highly correlated (may indicate that it's not particularly valuable to include disaggregated capacity by area in model)

In [10]:
# Average capacity over time by area
temp = slot_util_main.groupby(['DATE_EXTRACT', 'AREA']).agg(np.mean).reset_index()
px.line(temp, 'DATE_EXTRACT', y='CAPACITY', color='AREA')

In [11]:
# Average capacity over time by branch 
temp = slot_util_main.groupby(['DATE_EXTRACT', 'BRNCH_CD']).agg(np.mean).reset_index()
px.line(temp, 'DATE_EXTRACT', y='CAPACITY', color='BRNCH_CD')

In [12]:
# Average capacity over time by branch and area for top 5 branches by number of cases sold
biggest_sellers = cases.groupby('BRNCH_CD')['CASES_SOLD'].aggregate(np.mean).sort_values(ascending=False)
branches = biggest_sellers.index[:3]
temp = slot_util_main.groupby(['DATE_EXTRACT', 'AREA', 'BRNCH_CD']).agg(np.mean).reset_index()
px.line(temp[temp['BRNCH_CD'].isin(np.array(branches))], 'DATE_EXTRACT', y='CAPACITY', facet_col='BRNCH_CD', color='AREA',
        title='Average capacity over time by branch and area for top 3 branches by number of cases sold')

In [13]:
slot_util[slot_util['BRNCH_CD'] == '5T']

Unnamed: 0,WAREHOUSE_LOCN,AREA,BRNCH_CD,FULL_MARKET_NAME,STORAGE_TYPE,PICK_TYPE,DATE_EXTRACT,SUM(PALLET_USED),SUM(PALLET_POSITIONS),CAPACITY
30,3040,Refrigerated,5T,"NEW ORLEANS (5T, 3040)",Main Warehouse,Reserve,2021-11-14,900.0,1989,0.452488
42,3040,Refrigerated,5T,"NEW ORLEANS (5T, 3040)",Main Warehouse,Reserve,2021-07-16,63.0,1966,0.032044
220,3040,Dry,5T,"NEW ORLEANS (5T, 3040)",Main Warehouse,Reserve,2022-09-14,1724.0,6577,0.262125
259,3040,Freezer,5T,"NEW ORLEANS (5T, 3040)",Main Warehouse,Reserve,2022-02-28,2067.0,4818,0.429016
365,3040,Dry,5T,"NEW ORLEANS (5T, 3040)",Main Warehouse,Reserve,2022-11-20,2888.0,6477,0.445885
...,...,...,...,...,...,...,...,...,...,...
1135608,3040,Freezer,5T,"NEW ORLEANS (5T, 3040)",9999,Reserve,2022-03-31,6.0,0,0.000000
1136057,3040,Dry,5T,"NEW ORLEANS (5T, 3040)",9999,Reserve,2022-02-25,23.0,0,0.000000
1136165,3040,Freezer,5T,"NEW ORLEANS (5T, 3040)",9999,Reserve,2022-09-27,14.0,0,0.000000
1137052,3040,Freezer,5T,"NEW ORLEANS (5T, 3040)",9999,Reserve,2022-08-20,3.0,0,0.000000


In [14]:
# Average capacity over time by branch and area for bottom 5 branches by number of cases sold
biggest_sellers = cases.groupby('BRNCH_CD')['CASES_SOLD'].aggregate(np.mean).sort_values(ascending=False)
branches = biggest_sellers.index[-11:]
temp = slot_util_main.groupby(['DATE_EXTRACT', 'AREA', 'BRNCH_CD']).agg(np.mean).reset_index()
px.line(temp[temp['BRNCH_CD'].isin(np.array(branches))], 'DATE_EXTRACT', y='CAPACITY', facet_col='BRNCH_CD', color='AREA', 
        title='Average capacity over time by branch and area for bottom 3 branches by number of cases sold')

# Aggregate data to have one row for each branch/day/area

In [14]:
# Adding fiscal year and week to slot_util data to allow merging with other tables
fw = slot_util['DATE_EXTRACT'].apply(lambda a : int(str(a.year) + str(a.week)))
slot_util['FISC_YR_WK'] = fw

# Merging slot utilization by week and branch
merged = slot_util.merge(cases, how='left', on=['BRNCH_CD', 'FISC_YR_WK'], validate="m:1")
merged['DIV_NBR'] = merged['DIV_NBR'].fillna(0)

# Merge with spoilage data on branch, week, and area
merged = merged.merge(spoilage, how='left', left_on=['BRNCH_CD', 'FISC_YR_WK', 'AREA'], 
                      right_on=['BRNCH_CD', 'FISC_YR_WK', 'TEMP_ZONE'], validate="m:1")

# Merge with inventory data on branch, week, and area
merged = merged.merge(inv, how='left', left_on=['BRNCH_CD', 'FISC_YR_WK', 'AREA'], 
                      right_on=['BRNCH_CD', 'FISC_YR_WK', 'TEMP_ZONE'], validate="m:1")
merged.head()

# Dropping redundant or useless columns
final = merged.drop(['WAREHOUSE_LOCN', 'FULL_MARKET_NAME', 'FISC_YR_WK',
                    'DIV_NBR', 'DIV_NM', 'CASES_IMPACTED', 'LDR_TM', 'TEMP_ZONE_x', 'TEMP_ZONE_y'], axis=1)

In [15]:
print(final.apply(lambda a : np.sum(a.isna())))
final.head()

AREA                          0
BRNCH_CD                      0
STORAGE_TYPE                  0
PICK_TYPE                     0
DATE_EXTRACT                  0
SUM(PALLET_USED)              0
SUM(PALLET_POSITIONS)         0
CAPACITY                      0
CASES_SOLD                84913
SPOILAGE                 335922
MAX_WKLY_INVENTORY        84095
dtype: int64


Unnamed: 0,AREA,BRNCH_CD,STORAGE_TYPE,PICK_TYPE,DATE_EXTRACT,SUM(PALLET_USED),SUM(PALLET_POSITIONS),CAPACITY,CASES_SOLD,SPOILAGE,MAX_WKLY_INVENTORY
0,Freezer,5O,Main Warehouse,Reserve,2022-05-16,9358.0,13044,0.717417,329538.154,5.0,437449.375
1,Refrigerated,3J,Main Warehouse,Reserve,2021-10-19,477.0,905,0.527071,44478.6486,,25167.027778
2,Freezer,8T,Main Warehouse,Reserve,2021-10-08,1330.0,1694,0.785123,93214.2753,,100198.833333
3,Dry,8L,Main Warehouse,Pick,2021-05-12,5585.0,6243,0.894601,187122.1838,,352921.831941
4,Freezer,3Y,Main Warehouse,Pick,2022-01-25,2917.0,3638,0.801814,,,


In [16]:
# Filling missing values by column
final = final[final['SPOILAGE'].notna()]
final['CASES_SOLD'] = final['CASES_SOLD'].fillna(0)
final['MAX_WKLY_INVENTORY'] = final['MAX_WKLY_INVENTORY'].fillna(0)
final.head()

Unnamed: 0,AREA,BRNCH_CD,STORAGE_TYPE,PICK_TYPE,DATE_EXTRACT,SUM(PALLET_USED),SUM(PALLET_POSITIONS),CAPACITY,CASES_SOLD,SPOILAGE,MAX_WKLY_INVENTORY
0,Freezer,5O,Main Warehouse,Reserve,2022-05-16,9358.0,13044,0.717417,329538.154,5.0,437449.375
5,Dry,5E,Main Warehouse,Reserve,2022-04-27,12781.0,19222,0.664915,440745.0181,55.0,667884.433331
9,Refrigerated,3W,Virtual Slot,Pick,2022-04-23,53.0,0,0.0,200327.2934,98.25,106742.304167
10,Dry,2G,Main Warehouse,Pick,2022-10-25,3578.0,4379,0.817081,163731.426,7.0,276143.541386
17,Dry,6V,Main Warehouse,Reserve,2022-12-10,6484.0,10385,0.624362,233083.9197,117.0,361629.133332


In [17]:
grouped = final.groupby(['AREA', 'BRNCH_CD', 'DATE_EXTRACT']).agg(np.sum).reset_index()
grouped['CAPACITY'] = grouped['SUM(PALLET_USED)'] / grouped['SUM(PALLET_POSITIONS)']
grouped.head()

Unnamed: 0,AREA,BRNCH_CD,DATE_EXTRACT,SUM(PALLET_USED),SUM(PALLET_POSITIONS),CAPACITY,CASES_SOLD,SPOILAGE,MAX_WKLY_INVENTORY
0,Dry,2G,2022-01-01,8339.0,11102,0.751126,380230.9818,537.0,886788.4
1,Dry,2G,2022-01-02,8309.0,11076,0.750181,380230.9818,537.0,886788.4
2,Dry,2G,2022-03-07,9323.0,11209,0.831742,624663.7284,4.0,1140677.0
3,Dry,2G,2022-03-08,9109.0,10951,0.831796,624663.7284,4.0,1140677.0
4,Dry,2G,2022-03-09,9173.0,11061,0.82931,624663.7284,4.0,1140677.0


In [18]:
long = final.drop(['SUM(PALLET_USED)', 'SUM(PALLET_POSITIONS)', 'CAPACITY'], axis=1)
long = long.groupby(['AREA', 'BRNCH_CD', 'DATE_EXTRACT']).agg(np.sum).reset_index()
long = long.merge(grouped[['AREA', 'BRNCH_CD', 'DATE_EXTRACT', 'CAPACITY']], how='left', on=['AREA', 'BRNCH_CD', 'DATE_EXTRACT'])
long.head()

Unnamed: 0,AREA,BRNCH_CD,DATE_EXTRACT,CASES_SOLD,SPOILAGE,MAX_WKLY_INVENTORY,CAPACITY
0,Dry,2G,2022-01-01,380230.9818,537.0,886788.4,0.751126
1,Dry,2G,2022-01-02,380230.9818,537.0,886788.4,0.750181
2,Dry,2G,2022-03-07,624663.7284,4.0,1140677.0,0.831742
3,Dry,2G,2022-03-08,624663.7284,4.0,1140677.0,0.831796
4,Dry,2G,2022-03-09,624663.7284,4.0,1140677.0,0.82931


In [19]:
wide = final.drop(['SUM(PALLET_USED)', 'SUM(PALLET_POSITIONS)', 'CAPACITY'], axis=1)
wide = wide.pivot_table(index=['DATE_EXTRACT', 'AREA', 'BRNCH_CD'], columns=['STORAGE_TYPE', 'PICK_TYPE'], 
           values=['CASES_SOLD', 'SPOILAGE', 'MAX_WKLY_INVENTORY']).reset_index().fillna(0)
wide = wide.merge(grouped[['AREA', 'BRNCH_CD', 'DATE_EXTRACT', 'CAPACITY']], how='left', on=['AREA', 'BRNCH_CD', 'DATE_EXTRACT'])
wide = wide.drop(wide.columns[3:6], axis=1)
wide


merging between different levels can give an unintended result (3 levels on the left,1 on the right)



Unnamed: 0,AREA,BRNCH_CD,DATE_EXTRACT,"(CASES_SOLD, 9999, Pick)","(CASES_SOLD, 9999, Reserve)","(CASES_SOLD, Inactive, Pick)","(CASES_SOLD, Inactive, Reserve)","(CASES_SOLD, Main Warehouse, Pick)","(CASES_SOLD, Main Warehouse, Reserve)","(CASES_SOLD, Off-Site Storage, Pick)","(CASES_SOLD, Off-Site Storage, Reserve)","(CASES_SOLD, Trailer, Pick)","(CASES_SOLD, Trailer, Reserve)","(CASES_SOLD, Virtual Slot, Pick)","(CASES_SOLD, Virtual Slot, Reserve)","(MAX_WKLY_INVENTORY, 9999, Pick)","(MAX_WKLY_INVENTORY, 9999, Reserve)","(MAX_WKLY_INVENTORY, Inactive, Pick)","(MAX_WKLY_INVENTORY, Inactive, Reserve)","(MAX_WKLY_INVENTORY, Main Warehouse, Pick)","(MAX_WKLY_INVENTORY, Main Warehouse, Reserve)","(MAX_WKLY_INVENTORY, Off-Site Storage, Pick)","(MAX_WKLY_INVENTORY, Off-Site Storage, Reserve)","(MAX_WKLY_INVENTORY, Trailer, Pick)","(MAX_WKLY_INVENTORY, Trailer, Reserve)","(MAX_WKLY_INVENTORY, Virtual Slot, Pick)","(MAX_WKLY_INVENTORY, Virtual Slot, Reserve)","(SPOILAGE, 9999, Pick)","(SPOILAGE, 9999, Reserve)","(SPOILAGE, Inactive, Pick)","(SPOILAGE, Inactive, Reserve)","(SPOILAGE, Main Warehouse, Pick)","(SPOILAGE, Main Warehouse, Reserve)","(SPOILAGE, Off-Site Storage, Pick)","(SPOILAGE, Off-Site Storage, Reserve)","(SPOILAGE, Trailer, Pick)","(SPOILAGE, Trailer, Reserve)","(SPOILAGE, Virtual Slot, Pick)","(SPOILAGE, Virtual Slot, Reserve)",CAPACITY
0,Dry,2G,2022-01-01,0.0,0.0000,0.0,0.0,126743.6606,126743.6606,0.0,0.0000,0.0,0.0,126743.6606,0.0000,0.0,0.000000,0.0,0.0,295596.129171,295596.129171,0.0,0.000000,0.0,0.0,295596.129171,0.000000,0.0,0.000000,0.0,0.0,179.000000,179.000000,0.0,0.0,0.0,0.0,179.000000,0.000000,0.751126
1,Dry,2I,2022-01-01,0.0,432281.5961,0.0,0.0,432281.5961,432281.5961,0.0,0.0000,0.0,0.0,432281.5961,432281.5961,0.0,931039.531107,0.0,0.0,931039.531107,931039.531107,0.0,0.000000,0.0,0.0,931039.531107,931039.531107,0.0,235.916667,0.0,0.0,235.916667,235.916667,0.0,0.0,0.0,0.0,235.916667,235.916667,0.743298
2,Dry,2J,2022-01-01,0.0,0.0000,0.0,0.0,169059.8543,169059.8543,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.000000,0.0,0.0,358565.955001,358565.955001,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,6.000000,6.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.770072
3,Dry,2L,2022-01-01,0.0,52477.7926,0.0,0.0,52477.7926,52477.7926,0.0,52477.7926,0.0,0.0,0.0000,0.0000,0.0,164865.731669,0.0,0.0,164865.731669,164865.731669,0.0,164865.731669,0.0,0.0,0.000000,0.000000,0.0,26.000000,0.0,0.0,26.000000,26.000000,0.0,26.0,0.0,0.0,0.000000,0.000000,0.760023
4,Dry,2O,2022-01-01,0.0,246367.8593,0.0,0.0,246367.8593,246367.8593,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0,549087.465004,0.0,0.0,549087.465004,549087.465004,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,1009.008333,0.0,0.0,1009.008333,1009.008333,0.0,0.0,0.0,0.0,0.000000,0.000000,0.702026
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51519,Refrigerated,9L,2023-01-01,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,495.617857,495.617857,0.0,0.0,0.0,0.0,495.617857,0.000000,0.371048
51520,Refrigerated,9O,2023-01-01,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,256.750000,256.750000,0.0,0.0,0.0,0.0,256.750000,0.000000,0.664881
51521,Refrigerated,9P,2023-01-01,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,2429.458333,0.0,0.0,2429.458333,2429.458333,0.0,0.0,0.0,0.0,2429.458333,0.000000,0.456618
51522,Refrigerated,9Q,2023-01-01,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,1729.683333,0.0,0.0,1729.683333,1729.683333,0.0,0.0,0.0,0.0,1729.683333,0.000000,0.566838


# Correlation over time between predictors

In [20]:
temp = final.groupby('DATE_EXTRACT').agg(np.sum).reset_index()
x = temp['CAPACITY']
y = temp['CASES_SOLD']
lag = np.arange(-len(x) + 1, len(x))
coefs = correlate(x-x.mean(), y-y.mean(), mode='full') / (np.std(x) * np.std(y) * len(x))
px.line(x=lag, y=coefs, labels={'x':'Lag', 'y':'Correlation Coef'}, 
        title=('Correlation between ' + x.name + ' and ' + y.name),
        range_y=[-1, 1])

In [23]:
temp = final.groupby('DATE_EXTRACT').agg(np.sum).reset_index()
x = temp['CAPACITY']
y = temp['SUM(PALLET_POSITIONS)']
lag = np.arange(-len(x) + 1, len(x))
coefs = correlate(x-x.mean(), y-y.mean(), mode='full') / (np.std(x) * np.std(y) * len(x))
px.line(x=lag, y=coefs, labels={'x':'Lag', 'y':'Correlation Coef'}, 
        title=('Correlation between ' + x.name + ' and ' + y.name),
        range_y=[-1, 1])

In [None]:
x = temp['CAPACITY']
y = temp['SPOILAGE']
lag = np.arange(-len(x) + 1, len(x))
coefs = correlate(x-x.mean(), y-y.mean(), mode='full') / (np.std(x) * np.std(y) * len(x))
px.line(x=lag, y=coefs, labels={'x':'Lag', 'y':'Correlation Coef'}, 
        title=('Correlation between ' + x.name + ' and ' + y.name),
        range_y=[-1, 1])

In [None]:
x = temp['CAPACITY']
y = temp['MAX_WKLY_INVENTORY']
lag = np.arange(-len(x) + 1, len(x))
coefs = correlate(x-x.mean(), y-y.mean(), mode='full') / (np.std(x) * np.std(y) * len(x))
px.line(x=lag, y=coefs, labels={'x':'Lag', 'y':'Correlation Coef'}, 
        title=('Correlation between ' + x.name + ' and ' + y.name),
        range_y=[-1, 1])

In [None]:
x = temp['CASES_SOLD']
y = temp['SPOILAGE']
lag = np.arange(-len(x) + 1, len(x))
coefs = correlate(x-x.mean(), y-y.mean(), mode='full') / (np.std(x) * np.std(y) * len(x))
px.line(x=lag, y=coefs, labels={'x':'Lag', 'y':'Correlation Coef'}, 
        title=('Correlation between ' + x.name + ' and ' + y.name),
        range_y=[-1, 1])

In [None]:
x = temp['CASES_SOLD']
y = temp['MAX_WKLY_INVENTORY']
lag = np.arange(-len(x) + 1, len(x))
coefs = correlate(x-x.mean(), y-y.mean(), mode='full') / (np.std(x) * np.std(y) * len(x))
px.line(x=lag, y=coefs, labels={'x':'Lag', 'y':'Correlation Coef'}, 
        title=('Correlation between ' + x.name + ' and ' + y.name),
        range_y=[-1, 1])

In [None]:
x = temp['SPOILAGE']
y = temp['MAX_WKLY_INVENTORY']
lag = np.arange(-len(x) + 1, len(x))
coefs = correlate(x-x.mean(), y-y.mean(), mode='full') / (np.std(x) * np.std(y) * len(x))
px.line(x=lag, y=coefs, labels={'x':'Lag', 'y':'Correlation Coef'}, 
        title=('Correlation between ' + x.name + ' and ' + y.name),
        range_y=[-1, 1])

# Covered date ranges and branches after removing missing data

### Not using data before 2022 because there is no spoilage data for that date range

In [None]:
pre_2022_spoilage = merged[merged['DATE_EXTRACT'] < '2022-01-01']['SPOILAGE']
print(sum(pre_2022_spoilage.isna()))
print(pre_2022_spoilage.shape)

In [None]:
print('New data date range: ', long['DATE_EXTRACT'].min(), ', ', long['DATE_EXTRACT'].max(), sep='')
original_slots = pd.read_csv("./Slot Utilization.csv")
original_slots['DATE_EXTRACT'] = pd.to_datetime(original_slots['DATE_EXTRACT'])
print('Slot utilization data date range: ', original_slots['DATE_EXTRACT'].min(), ', ', original_slots['DATE_EXTRACT'].max(), sep='')

# Regression model using newly aggregated data

### Using aggregated cases/inventory/spoilage data

In [None]:
x_train_agg, x_test_agg, y_train_agg, y_test_agg = train_test_split(long.drop(['CAPACITY', 'DATE_EXTRACT'], axis=1), long['CAPACITY'], 
                                                    random_state=12345)
pd.get_dummies(x_train_agg) # showing sample of features with dummy variables generated (done in pipeline in practice)

In [None]:
pipe = Pipeline([('create dummies', OneHotEncoder()),
                 ('robust scaling', RobustScaler(with_centering=False)),
                 ('lm', LinearRegression())])
pipe.fit(x_train_agg, y_train_agg)

### Using disaggregated cases/inventory/spoilage data

In [None]:
x_train_disagg, x_test_disagg, y_train_disagg, y_test_disagg = train_test_split(wide.drop(['CAPACITY', 'DATE_EXTRACT'], axis=1), wide['CAPACITY'], 
                                                    random_state=12345)
pd.get_dummies(x_train_disagg)

# Evaluating new model(s)