In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import IPython
from IPython.display import display, HTML
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# CSS = """
# .output {
#     flex-direction: row;
# }
# """
# HTML('<style>{}</style>'.format(CSS))
# pd.set_option('display.max_columns', None)

In [2]:
# Read the weekly export and the store master sheet.
main_df = pd.read_csv('./data/0826export.csv')
store_master_df = pd.read_excel('./documentation/store_master.xlsx')

# Lower-case the headers of both frames so column names can be matched
# between them without worrying about capitalisation.
for frame in (main_df, store_master_df):
    frame.columns = frame.columns.str.lower()

---

## Mapping Function:

#### Maps store attributes from `./documentation/store_master.xlsx` onto `main_df`

In [3]:
def map_my_dataframe(main_df, store_master_df, a_list, map_on='store', n_outliers=2) -> 'pandas.DataFrame':
    """
    Map store-level attributes from `store_master_df` onto `main_df`.

    NOTE: `main_df` is modified IN PLACE and the same object is returned, so
    calling this twice on the same frame is not idempotent.

    > a_list     = column names of `store_master_df` whose values are mapped
                   onto `main_df` (one new/overwritten column per name)
    > map_on     = key column present in both frames (default 'store')
    > n_outliers = how many of the largest `comp_pct` rows to drop (default 2)

    Steps:
    > for each name in `a_list`, build a {map_on value -> attribute} lookup
      from `store_master_df` and map it onto `main_df[map_on]`
      (keys missing from the lookup become NaN)
    > add `store_size` by binning `gross_feet` into
      (0, 1000] sml, (1000, 2000] mdm, (2000, 3000] lrg, (3000, 4000] xlrg;
      values outside (0, 4000] become NaN
    > drop the `n_outliers` rows with the largest non-NaN `comp_pct`
    > drop the `store_zip` column
    """

    for category in a_list:
        lookup = dict(zip(store_master_df[map_on], store_master_df[category]))
        main_df[category] = main_df[map_on].map(lookup)

    main_df['store_size'] = pd.cut(
        main_df.gross_feet,
        bins=[0, 1000, 2000, 3000, 4000],
        labels=['sml', 'mdm', 'lrg', 'xlrg'],
    )

    # Pick the outliers by index LABEL. The previous `values.argmax()` returned
    # a position, which stops matching the labels once a row has been dropped,
    # so the second drop could remove the wrong row. NaN is excluded because
    # numpy's argmax would otherwise treat it as the maximum.
    outlier_labels = main_df.comp_pct.dropna().nlargest(n_outliers).index
    main_df.drop(outlier_labels, inplace=True)

    main_df.drop('store_zip', axis=1, inplace=True)

    return main_df

# Map every store-master column from the third one onward onto the export.
# `df` is the same object as `main_df` (the function mutates and returns it).
df = map_my_dataframe(
    main_df=main_df,
    store_master_df=store_master_df,
    a_list=store_master_df.columns[2:],
    map_on='store',
)

In [16]:
def top_bot_var_compare(df, category, rows, sample_frac=0.15) -> 'pandas.DataFrame':
    """
    For each value of `category`, rank the variables that differ most between
    the best and worst `comp_pct` performers.

    > df          = frame containing `category`, 'store', 'fiscal_year',
                    'fiscal_month', 'comp_pct' and the variables to compare
    > category    = column whose distinct (non-NaN) values define the groups
    > rows        = number of variables to keep per category value
    > sample_frac = share of group rows used for the top and for the bottom
                    sample (default 0.15, at least one row is always used)

    Returns a frame with two column levels: the category value on top and,
    below it, '<value> Top Vars', 'mean_%diff', 'mean_top', 'mean_bot'.
    'mean_%diff' is the symmetric percent difference
    |top - bot| / mean(top, bot) * 100.
    """

    levels_list = [category, 'store', 'fiscal_year', 'fiscal_month']
    # groupby discards NaN keys, so a NaN label would fail the .loc lookup below.
    cat_list = df[category].dropna().unique()

    # numeric_only=True: text/categorical columns cannot be averaged and raise
    # a TypeError on pandas >= 2.0 (older versions silently dropped them).
    grouped = df.groupby(levels_list).mean(numeric_only=True)
    tables = {}

    for cat in cat_list:

        # Drops the first two aggregated columns by position.
        # NOTE(review): which columns these are depends on df's column order - confirm.
        cat_df = grouped.loc[[cat]].iloc[:, 2:].droplevel(levels_list[1:])

        # int() truncates to 0 for small groups, which would make every mean NaN.
        sample_size = max(1, int(sample_frac * len(cat_df)))

        top_mean = cat_df.nlargest(sample_size, 'comp_pct').mean().round(3).rename('mean_top')
        bot_mean = cat_df.nsmallest(sample_size, 'comp_pct').mean().round(3).rename('mean_bot')
        pct_diff = (abs(top_mean - bot_mean) / ((top_mean + bot_mean) * 0.5) * 100).abs().rename('mean_%diff')

        together = pd.concat([pct_diff, top_mean, bot_mean], axis=1)
        together = together.sort_values('mean_%diff', ascending=False)
        together.index.names = [f"{cat} Top Vars"]
        # Skip the two variables with the largest difference.
        # NOTE(review): presumably comp_pct itself and a close derivative - confirm.
        together = together.iloc[2:, :].reset_index()
        tables[cat] = together.iloc[:rows, :]

    return pd.concat(list(tables.values()), axis=1, keys=list(tables.keys()))

# Ten most-differing variables between top and bottom performers, per store class.
top_bot_var_compare(df, category='class', rows=10)

Unnamed: 0_level_0,Mall,Mall,Mall,Mall,Open Air,Open Air,Open Air,Open Air,Downtown,Downtown,...,Airport,Airport,Lifestyle,Lifestyle,Lifestyle,Lifestyle,Outlet,Outlet,Outlet,Outlet
Unnamed: 0_level_1,Mall Top Vars,mean_%diff,mean_top,mean_bot,Open Air Top Vars,mean_%diff,mean_top,mean_bot,Downtown Top Vars,mean_%diff,...,mean_top,mean_bot,Lifestyle Top Vars,mean_%diff,mean_top,mean_bot,Outlet Top Vars,mean_%diff,mean_top,mean_bot
0,cp_total_email_trans,47.187131,590.31,954.874,avg_ft_tenure_days,102.758364,987.155,317.06,ft_count,101.176471,...,2.165,0.966,avg_ft_tenure_days,64.640483,1268.4,648.767,ft_tenure_days,37.55255,347.575,508.271
1,cp_total_trans,46.573844,694.131,1115.55,ft_tenure_days,102.758364,987.155,317.06,accessory_other_units,94.212454,...,4.611,2.126,ft_tenure_days,64.640483,1268.4,648.767,avg_ft_tenure_days,36.81379,231.365,335.754
2,cp_valid_mailing_address,46.318965,580.99,931.207,ft_count,62.656642,0.524,0.274,multi_value,89.371802,...,1858.629,994.636,cp_total_trans,56.606975,4263.859,7630.329,trans_cnt_exchg_in,35.302059,3.486,2.44
3,cp_valid_emails,46.226499,379.513,607.687,avg_am_tenure_days,56.403583,284.714,508.381,accessory_shoe_trees_units,88.418115,...,1414.131,783.42,cp_valid_emails,56.561621,2141.819,3830.973,mark_down_amt_ly,34.016126,2128.566,3001.007
4,avg_ft_tenure_days,44.453794,344.732,541.775,am_tenure_days,56.403583,284.714,508.381,accessory_shoe_trees_value,87.089541,...,4009.518,2223.916,cp_total_email_trans,55.937859,3589.972,6377.871,accessory_coats_units,32.410505,8.164,5.887
5,ft_tenure_days,39.335751,481.183,716.801,trans_cnt_exchg_out,55.054432,0.41,0.233,multi_units,85.96508,...,519.983,914.242,cp_valid_mailing_address,55.123693,3567.369,6282.051,accessory_shoe_care_value,31.822999,259.771,188.452
6,mark_down_amt_ly,31.504426,1051.853,1445.193,accessory_shirt_sweater_value,54.183419,2164.877,1241.918,trans_cnt_exchg_in,85.390363,...,4113.656,2374.773,mark_down_amt_ly,45.144971,711.675,1126.625,trans_cnt_exchg_out,31.231231,0.385,0.281
7,multi_value,30.14579,5264.677,3885.483,am_count,49.407115,0.631,0.381,special_order_amt,82.981737,...,585.918,1013.422,tot_avg_tenure_days,35.282244,2390.617,1673.637,mark_down_amt_ty,31.140777,3329.554,2432.396
8,accessory_shoe_care_value,29.896595,367.615,272.003,multi_value,48.239433,4142.166,2532.303,accessory_coats_value,82.12886,...,304.174,505.232,trans_cnt_empl_returns,33.333333,0.07,0.05,multi_units,30.519475,68.549,50.398
9,multi_units,29.081259,35.176,26.245,accessory_shirt_sweater_units,46.97787,32.393,20.07,accessory_plgs_luggage_value,81.200117,...,433.376,718.879,tot_tenure_days,32.04014,7693.683,5568.993,accessory_shoe_care_units,30.306319,42.442,31.272


---

## By Class:
#### `['Mall', 'Open Air', 'Downtown', 'Street', 'Airport', 'Lifestyle','Outlet']`

In [None]:
# NOTE(review): leftover IPython help lookup (not valid plain Python); remove before sharing.
df.reset_index?

In [None]:
# NOTE(review): leftover IPython help lookup (not valid plain Python); remove before sharing.
df.rename?

In [None]:
# Symmetric percent difference between `bot` and `top`, as a one-column frame;
# keeps the 11 largest values and then discards the single largest one.
# NOTE(review): `top` and `bot` are not defined in any visible cell - this
# relies on leftover kernel state and fails on Restart & Run All.
diff_mall = (
    abs(abs(bot - top) / ((bot + top) * 0.5) * 100)
    .to_frame('abs_%_diff')
    .nlargest(11, 'abs_%_diff')
    .iloc[1:, :]
)

In [None]:
# Scratch check of the per-class aggregation for the 'Mall' class only.
category = 'class'

level_list = [category, 'store', 'fiscal_year', 'fiscal_month']

# numeric_only=True: text/categorical columns cannot be averaged and raise a
# TypeError on pandas >= 2.0 (older versions silently dropped them).
# .iloc[:, 2:] drops the first two aggregated columns by position.
test = df.groupby(level_list).mean(numeric_only=True).loc[['Mall']].iloc[:, 2:]

# Top 15% of the aggregated rows by comp_pct (the row count is truncated by int()).
test1 = test.nlargest(int(.15 * len(test)), 'comp_pct')

In [None]:
# Distinct store classes present after mapping.
df.loc[:, 'class'].unique()

In [None]:
# Symmetric percent difference between `bot` and `top`, as a one-column frame;
# keeps the 11 largest values and then discards the single largest one.
# NOTE(review): this repeats an earlier cell verbatim, and `top` / `bot` are
# not defined in any visible cell - it fails on Restart & Run All.
diff_mall = (
    abs(abs(bot - top) / ((bot + top) * 0.5) * 100)
    .to_frame('abs_%_diff')
    .nlargest(11, 'abs_%_diff')
    .iloc[1:, :]
)