In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pprint as pp
import IPython
from IPython.display import display, HTML
from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every top-level expression in a cell, not only the last one.
# The new mode is picked up from the NEXT cell onwards; this cell still runs
# under whatever mode was active when it started.
InteractiveShell.ast_node_interactivity = "all"

# Lay the outputs of a cell out side by side instead of stacking them.
CSS = """
.output {
    flex-direction: row;
}
"""
# Render the stylesheet explicitly: as a bare mid-cell expression it would be
# discarded on a fresh kernel (see note above), so the layout would only be
# applied after running this cell a second time.
display(HTML('<style>{}</style>'.format(CSS)))

# Never truncate wide frames when they are displayed.
pd.set_option('display.max_columns', None)

---

## Mapping store-master categories onto `df1` and removing the outlier store

In [2]:
# Store-master columns that will be copied onto df1.
# Entries must be df2's LOWERED column names.
categories_to_map = [
    'class',
    'store_city',
    'store_state',
    'close_date',
    'gross_feet',
]

# df1: the CSV export; df2: the store-master workbook.
df1 = pd.read_csv('./data/0826export_column_filter_1.csv')
df2 = pd.read_excel('./documentation/store_master.xlsx')

# Lower-case the headers of both frames so later code can rely on
# lower-case column names.
for frame in (df1, df2):
    frame.columns = frame.columns.str.lower()

def map_my_dataframe(df1, df2, a_list):
    """Copy columns from ``df2`` onto ``df1``, matched on the store number.

    For every column name in ``a_list`` a ``store -> value`` lookup is built
    from ``df2`` and mapped onto ``df1['store']``. Stores of ``df1`` that are
    absent from ``df2`` receive NaN. If a store occurs on several rows of
    ``df2``, the value of its last row wins.

    !! df1 and df2 column names must be lower-case !!

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame that receives the new columns. It is modified IN PLACE and is
        also the object that is returned.
    df2 : pandas.DataFrame
        Lookup frame holding ``'store'`` plus the columns named in ``a_list``.
    a_list : iterable of str, or a single str
        Names of the ``df2`` columns to copy. A single string is treated as a
        one-element list.

    Returns
    -------
    pandas.DataFrame
        ``df1`` itself, with one added (or overwritten) column per entry of
        ``a_list``.

    Raises
    ------
    KeyError
        If ``'store'`` is missing from either frame, or a requested column is
        missing from ``df2``. The check happens before anything is written,
        so a failed call leaves ``df1`` untouched.
    """
    joint_var = 'store'

    # A bare string would otherwise be iterated character by character.
    categories = [a_list] if isinstance(a_list, str) else list(a_list)
    if not categories:
        return df1

    # Validate everything up front so an error cannot leave df1 half-mapped.
    if joint_var not in df1.columns:
        raise KeyError("df1 has no '{}' column".format(joint_var))
    missing = [name for name in [joint_var] + categories
               if name not in df2.columns]
    if missing:
        raise KeyError("df2 is missing column(s): {}".format(missing))

    store_keys = df2[joint_var]  # loop-invariant: the lookup keys never change
    for category in categories:
        lookup = dict(zip(store_keys, df2[category]))
        df1[category] = df1[joint_var].map(lookup)

    return df1
    
# Attach the store-master columns to the export (df1 itself gains the columns).
df = map_my_dataframe(df1, df2, categories_to_map)

# Drop the outlier store 2084 from the working frame.
df = df.loc[df['store'] != 2084]

# Rows belonging to stores that have a close_date recorded.
closed = df.loc[df['close_date'].notna()]

---

### Remove the outlier store (2084) and build a DataFrame of the closed stores.

---

## Making dataframes to concat and compare below by `store`:

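Each DataFrame below is grouped by `store` and averaged with `mean()`:
 - `all_avg_by_store`: every store in `df` (the full data frame with outlier store 2084 removed)
 - `top_avg_by_store`: the top 20 stores by mean `comp_pct`
 - `bot_avg_by_store`: the bottom 20 stores by mean `comp_pct`
 - `closed_avg_by_store`: the closed stores only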

In [3]:
# Earlier variant (median of the per store/fiscal_year/fiscal_month means),
# kept commented out for reference:
# all_avg = df.groupby(['store','fiscal_year','fiscal_month']).mean().median().round(3)
# top_avg = df.groupby(['store','fiscal_year','fiscal_month']).mean().nlargest(27, 'comp_pct').median().round(3)
# bot_avg = df.groupby(['store','fiscal_year','fiscal_month']).mean().nsmallest(27, 'comp_pct').median().round(3)
# closed_avg = closed.groupby(['store','fiscal_year','fiscal_month']).mean().median().round(3)

# # Concatenating
# for_compare = pd.concat([
#     all_avg.rename('all_means').to_frame(),
#     top_avg.rename('top_means').to_frame(),
#     bot_avg.rename('bot_means').to_frame(),
#     closed_avg.rename('closed_means').to_frame()
# ], axis=1).iloc[1:, :]

# for_compare

# Per-store means, computed once and reused below.
# numeric_only=True: the frames also carry the columns mapped in from the
# store master; any that are non-numeric cannot be averaged, and pandas >= 2.0
# raises a TypeError for them instead of silently dropping them.
store_means = df.groupby(['store']).mean(numeric_only=True)
closed_store_means = closed.groupby(['store']).mean(numeric_only=True)

# Average of the per-store means: all stores, the 20 stores with the highest /
# lowest mean comp_pct, and the closed stores.
all_avg_by_store = store_means.mean().round(4)
top_avg_by_store = store_means.nlargest(20, 'comp_pct').mean().round(4)
bot_avg_by_store = store_means.nsmallest(20, 'comp_pct').mean().round(4)
closed_avg_by_store = closed_store_means.mean().round(4)

# Concatenating
# NOTE(review): .iloc[3:, :] drops the first three rows by POSITION, so it
# depends on the column order of df — confirm which columns those are.
for_compare_by_store = pd.concat([
#     all_avg_by_store.rename('all_means').to_frame(),
    top_avg_by_store.rename('top_store_avgs').to_frame(),
    bot_avg_by_store.rename('bot_store_avgs').to_frame(),
    closed_avg_by_store.rename('closed_store_avgs').to_frame()
], axis=1).iloc[3:, :]

for_compare_by_store

Unnamed: 0,top_store_avgs,bot_store_avgs,closed_store_avgs
comp_pct,0.1587,-0.0535,-0.0148
mark_down_amt_ty,2124.6918,1711.4393,1144.2139
payroll_adj_hours,143.3224,129.9134,106.2597
strak_traffic,1084.4834,905.5472,779.3127
strak_sales_amt,20370.378,16344.4602,15008.3269
special_order_amt,3087.8884,2167.4445,2039.8378
sales_value,20368.617,16340.8817,15003.0004
shoes_units,94.8406,80.0852,65.7381
shoes_value,12554.7852,10419.3385,9758.1475
multi_value,5018.2321,4499.7414,3764.7171


---
## Making dataframes to concat and compare below by `class`:

### Commented-out classes do not have enough stores for a top-10 vs. bottom-10 store comparison

In [4]:
classes = df['class'].unique()


def _top_bottom_by_class(frame, store_class, label, n=10):
    """Average the metrics of the best and worst stores of one store class.

    Rows of ``frame`` whose ``'class'`` equals ``store_class`` are averaged
    per store; the ``n`` stores with the highest and the ``n`` with the
    lowest mean ``comp_pct`` are then averaged again, column by column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``'class'``, ``'store'`` and ``'comp_pct'`` columns.
    store_class : object
        Value of ``frame['class']`` to select.
    label : str
        Prefix for the result column names (``<label>_top`` / ``<label>_bot``).
    n : int, default 10
        Number of stores taken from each end of the ranking.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        One-column frames ``(top, bottom)`` indexed by metric name.
    """
    # numeric_only=True: non-numeric columns cannot be averaged and make
    # pandas >= 2.0 raise a TypeError instead of being dropped silently.
    per_store = (
        frame[frame['class'] == store_class]
        .groupby(['store'])
        .mean(numeric_only=True)
    )
    # .iloc[:, 3:] skips the first three columns by position, as the original
    # per-class expressions did.
    top = per_store.nlargest(n, 'comp_pct').iloc[:, 3:].mean().rename(label + '_top').to_frame()
    bot = per_store.nsmallest(n, 'comp_pct').iloc[:, 3:].mean().rename(label + '_bot').to_frame()
    return top, bot


# NOTE(review): the positions in `classes` follow the order in which the
# labels first appear in df, so the index -> class pairing below is
# data-dependent — TODO confirm, or select by the explicit label strings.
mall_top, mall_bot = _top_bottom_by_class(df, classes[0], 'mall')

# Skipped, too few stores for a top/bottom-10 comparison:
#   open_air  -> classes[1]
#   downtown  -> classes[2]
#   street    -> classes[3]
#   lifestyle -> classes[5]

airport_top, airport_bot = _top_bottom_by_class(df, classes[4], 'airport')

outlet_top, outlet_bot = _top_bottom_by_class(df, classes[6], 'outlet')

for_compare_by_class = pd.concat([mall_top, mall_bot, airport_top, airport_bot, outlet_top, outlet_bot], axis=1)
# for_compare_by_class

---

## MALL STORES:
 - #### Absolute percentage(%) differences between averages of top and bot mall stores' variables:

In [5]:
# Symmetric % difference between bottom and top mall stores:
# |bot - top| / midpoint(bot, top) * 100. Row 0 is skipped by position.
mall_bot_metrics = mall_bot.iloc[1:, 0]
mall_top_metrics = mall_top.iloc[1:, 0]
mall_midpoint = (mall_bot_metrics + mall_top_metrics) * 0.5
((mall_bot_metrics - mall_top_metrics).abs() / mall_midpoint * 100).abs().to_frame('abs_%_diff')

Unnamed: 0,abs_%_diff
mark_down_amt_ty,25.493565
payroll_adj_hours,1.693765
strak_traffic,7.404775
strak_sales_amt,23.350967
special_order_amt,35.176158
sales_value,23.348071
shoes_units,25.909604
shoes_value,27.212221
multi_value,33.870609
payroll_units,0.937128


---

## AIRPORT STORES:
 - #### Absolute percentage(%) differences between averages of top and bot airport stores' variables:

In [6]:
# Symmetric % difference between bottom and top airport stores:
# |bot - top| / midpoint(bot, top) * 100. Row 0 is skipped by position.
airport_bot_metrics = airport_bot.iloc[1:, 0]
airport_top_metrics = airport_top.iloc[1:, 0]
airport_midpoint = (airport_bot_metrics + airport_top_metrics) * 0.5
((airport_bot_metrics - airport_top_metrics).abs() / airport_midpoint * 100).abs().to_frame('abs_%_diff')

Unnamed: 0,abs_%_diff
mark_down_amt_ty,33.349293
payroll_adj_hours,18.660824
strak_traffic,30.95637
strak_sales_amt,30.115526
special_order_amt,19.93065
sales_value,30.088844
shoes_units,30.580276
shoes_value,28.356516
multi_value,32.473314
payroll_units,19.255683


---

## OUTLET STORES: 
 - #### Absolute percentage(%) differences between averages of top and bot outlet stores' variables:

In [7]:
# outlet store differences
# Symmetric % difference: |bot - top| / midpoint(bot, top) * 100.
# Row 0 is skipped by position.
outlet_bot_metrics = outlet_bot.iloc[1:, 0]
outlet_top_metrics = outlet_top.iloc[1:, 0]
outlet_midpoint = (outlet_bot_metrics + outlet_top_metrics) * 0.5
((outlet_bot_metrics - outlet_top_metrics).abs() / outlet_midpoint * 100).abs().to_frame('abs_%_diff')

Unnamed: 0,abs_%_diff
mark_down_amt_ty,2.549816
payroll_adj_hours,13.399895
strak_traffic,12.273005
strak_sales_amt,16.188806
special_order_amt,10.274374
sales_value,16.211427
shoes_units,19.149166
shoes_value,20.087996
multi_value,11.542833
payroll_units,4.084939


---

## Making dataframes to concat and compare below by `fiscal_week`, top and bot `comp_pct`.

Each observation in `df1` is a `fiscal_week`, and there are 27042 observations in total. We take the top and bottom 1000 fiscal weeks by `comp_pct`, then compare the variables of these fiscal weeks side by side.

In [8]:
# Index every row by (store, fiscal_week) once, then take the 1000 rows with
# the highest / lowest comp_pct. .iloc[:, 2:] drops the first two remaining
# columns by position.
weekly = df.set_index(['store', 'fiscal_week'])
fiscal_wk_top = weekly.nlargest(1000, 'comp_pct').iloc[:, 2:]
fiscal_wk_bot = weekly.nsmallest(1000, 'comp_pct').iloc[:, 2:]

# numeric_only=True: these frames still hold the columns mapped in from the
# store master; non-numeric ones cannot be averaged and make pandas >= 2.0
# raise a TypeError instead of being dropped silently.
for_compare_by_fiscal_week = pd.concat([
    fiscal_wk_top.mean(numeric_only=True).rename('top_comp_pct_weeks'),
    fiscal_wk_bot.mean(numeric_only=True).rename('bot_comp_pct_weeks'),
], axis=1)
for_compare_by_fiscal_week

Unnamed: 0,top_comp_pct_weeks,bot_comp_pct_weeks
comp_pct,0.956945,-0.436951
mark_down_amt_ty,2315.72586,1167.42096
payroll_adj_hours,130.121,118.318
strak_traffic,950.349,704.071
strak_sales_amt,19588.11957,10442.04794
special_order_amt,2834.25166,1497.46008
sales_value,19585.37486,10423.96134
shoes_units,102.545,56.047
shoes_value,12795.6509,6951.2794
multi_value,5961.89614,2716.02131


---

## FISCAL WEEKS: 
 - #### Absolute percentage(%) differences between averages of top and bot fiscal weeks' variables:

In [9]:
# Column means of the top / bottom comp_pct weeks.
# numeric_only=True: non-numeric columns cannot be averaged and make
# pandas >= 2.0 raise a TypeError instead of being dropped silently.
wk_top = fiscal_wk_top.mean(numeric_only=True).rename('top_comp_pct_weeks')
wk_bot = fiscal_wk_bot.mean(numeric_only=True).rename('bot_comp_pct_weeks')

# Symmetric % difference: |top - bot| / midpoint(top, bot) * 100.
# The outer abs() keeps the result positive when the midpoint is negative.
# Caveat: where the two means have opposite signs (comp_pct here) the midpoint
# is close to zero, so the percentage is inflated and not comparable with the
# other rows.
abs(abs(wk_top-wk_bot) / ((wk_top+wk_bot) * 0.5) * 100).to_frame('abs_%_diff')

Unnamed: 0,abs_%_diff
comp_pct,536.120464
mark_down_amt_ty,65.934912
payroll_adj_hours,9.501729
strak_traffic,29.772126
strak_sales_amt,60.912558
special_order_amt,61.72117
sales_value,61.057089
shoes_units,58.638519
shoes_value,59.192709
multi_value,74.807691
