In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pprint as pp
import IPython
from IPython.display import display, HTML
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
CSS = """
.output {
    flex-direction: row;
}
"""
HTML('<style>{}</style>'.format(CSS))

In [2]:
def method_check(obj):
    a_list = []
    for method_name in dir(obj):
        if callable(getattr(obj, method_name)) & ~method_name.startswith('_'):
            a_list.append(method_name)
    return a_list

def attribute_check(obj):
    a_list = []
    for method_name in dir(obj):
        if callable(getattr(obj, method_name)) & method_name.startswith('_'):
            a_list.append(method_name)
    return a_list

ac = lambda x : print(IPython.utils.text.columnize(attribute_check(x)))
mc = lambda x : print(IPython.utils.text.columnize(method_check(x)))

---

## Cleaning/Mapping for tidy data.

Contained is primary dataset (`../0826export.csv`) which we will prepare for analysis. To help with this task we also have have another data set(`../store_master.xlsx`) which we will merge, map onto, and otherwise tidy up our primary dataset with. 


In [37]:
df1 = pd.read_csv('./data_genesco/0826export.csv')
df1.columns = df1.columns.str.lower()

df2 = pd.read_excel('./data_genesco/store_master.xlsx')
df2.columns = df2.columns.str.lower()

In [39]:
# column creation and mapping
#############################
# store class column
df2_store_dict = dict(zip(df2['store'], df2['class']))
df1['class'] = df1['store'].map(df2_store_dict)

# store city column 
df2_city_dict = dict(zip(df2['store'], df2['store_city']))
df1['city'] = df1['store'].map(df2_city_dict)

# store state column
df2_state_dict = dict(zip(df2['store'], df2['store_state']))
df1['state'] = df1['store'].map(df2_state_dict)

# store close date
df2_closed_dict = dict(zip(df2['store'], df2['close_date']))
df1['close_date'] = df1['store'].map(df2_closed_dict)

df2_ft_dict = dict(zip(df2['store'], df2['gross_feet']))
df1['gross_feet'] = df1['store'].map(df2_ft_dict)

In [50]:
# df1.dtypes.value_counts()
df1.select_dtypes('O')
# df1.select_dtypes('int64')['fiscal_week']
# df1.select_dtypes('float64').columns


Unnamed: 0,week_end_date,week_end_date_ly,class,city,state,close_date
0,06-FEB-16,07-FEB-15,Mall,LAS VEGAS,NV,
1,13-FEB-16,14-FEB-15,Mall,LAS VEGAS,NV,
2,20-FEB-16,21-FEB-15,Mall,LAS VEGAS,NV,
3,27-FEB-16,28-FEB-15,Mall,LAS VEGAS,NV,
4,05-MAR-16,07-MAR-15,Mall,LAS VEGAS,NV,
5,12-MAR-16,14-MAR-15,Mall,LAS VEGAS,NV,
6,19-MAR-16,21-MAR-15,Mall,LAS VEGAS,NV,
7,26-MAR-16,28-MAR-15,Mall,LAS VEGAS,NV,
8,02-APR-16,04-APR-15,Mall,LAS VEGAS,NV,
9,09-APR-16,11-APR-15,Mall,LAS VEGAS,NV,


109

In [5]:
# df_closed
df_closed = df1[df1['close_date'].notnull()].drop('div', axis=1)
# df_closed = df_closed.groupby(['store','fiscal_year','fiscal_month'])['comp_pct']
# df_closed = df_closed.groupby('store')
# df_closed.head()
df_closed['week_end_date'] = df_closed['week_end_date'].apply(lambda x: pd.to_datetime(x))

---

## Selecing Column Variables for doing a correlation

In [274]:
# new = closed_stripped_df.groupby(['store','fiscal_year'])['comp_pct'].mean().unstack().reset_index()
# new = closed_stripped_df.groupby(['store','fiscal_year']).sum()
# new = df_closed[:, [1,3,8,15,16,17,18,19,20, 27,28,29,30,31,32,33,34,35,36,55,56,57,58,59,60,61,
#                    62,63,64,65,66,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,99,
#                   100,101,102,103,104,105,106,107,108]]

# df_closed

---

## Grouping TEST CELL BELOW

In [None]:

####################################
####################################

# df_pivot = df1.pivot_table(values='comp_pct', index=['store'],columns='fiscal_year')#.nlargest(10, 'fiscal_year')
# df_pivot['comp_pct_sum'] = df_pivot[2017] + df_pivot[2018] + df_pivot[2019] + df_pivot[2020]
# df_pivot.nsmallest(10, 'comp_pct_sum')
    
    
####################################
####################################

# df1.groupby('store')['comp_pct'].sum().nsmallest(10)
# df1[['gross_feet', 'comp_pct']].groupby('comp_pct')['gross_feet'].value_counts(bins=20)#.plot(kind='scatter');
# df1[['gross_feet', 'comp_pct']].groupby('gross_feet').mean().hist(bins=40);

In [130]:
# grouped_class_comp_sum = df1.groupby(['class'])['comp_pct'].sum() # keep these
# grouped_class_comp_mean = df1.groupby(['class'])['comp_pct'].mean() # keep these

# grouped_store_comp_sum = df1.groupby(['store'])['comp_pct'].sum() # keep these
# grouped_store_comp_mean = df1.groupby(['store'])['comp_pct'].mean() # keep these

grouped_store = df_closed.groupby(['fiscal_year','store'])[['comp_pct', 
                                                            'strak_transaction_num', 
                                                            'tot_avg_tenure_days','city',
                                                            'state','gross_feet']].sum()
# grouped_city_comp_sum.nlargest(10)
# grouped_class_comp_sum.nlargest(10)

# grouped_store.index
grouped_store.xs((2017, 1643), axis=0)

comp_pct                    -8.341135
strak_transaction_num     3561.000000
tot_avg_tenure_days       4206.000000
gross_feet               43378.000000
Name: (2017, 1643), dtype: float64

In [69]:
# grouped_store_comp_mean.nsmallest(10)
# grouped_store_comp_sum.nsmallest(10)
# mc(pd.core.groupby.generic.DataFrameGroupBy)
dir(grouped_store.index)

['T',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_priority__',
 '__array_wrap__',
 '__bool__',
 '__bytes__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getslice__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__init__',
 '__init_subclass__',
 '__inv__',
 '__isub__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rmul__',
 '__rpow__',
 '__rsub__',
 '__rtruediv__',
 '__setattr__',
 '__setitem__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__truediv__',
 '__unicode__',
 '__weakref__',
 '__xor__',
 '_accessors',
 '_add_comparison_methods',
 '_add_logical_methods',

In [None]:
grouped_store_pct_largest = df1.groupby(['store'])['comp_pct', 
                                          'accessory_units',
                                          'trans_cnt_sales',
                                          'trans_cnt_total',
                                          'accessory_units'].sum().nlargest(10, 'comp_pct')

grouped_store_pct_smallest = df1.groupby(['store'])['comp_pct', 
                                          'accessory_units',
                                          'trans_cnt_sales',
                                          'trans_cnt_total',
                                          'accessory_units'].sum().nsmallest(10, 'comp_pct')

In [None]:
# grouped_store_pct_largest.corr()
# grouped_store_pct_smallest.corr()

In [None]:
# grouped_store_pct_largest
# grouped_store_pct_smallest

In [401]:
grouped_store_pct_smallest = df1.groupby(['store'])['comp_pct', 
                                          'accessory_units',
                                          'trans_cnt_sales',
                                          'trans_cnt_total',
                                          'accessory_units'].sum()

grouped_store_pct_smallest.nsmallest(10, 'comp_pct')

Unnamed: 0_level_0,comp_pct,accessory_units,trans_cnt_sales,trans_cnt_total,accessory_units
store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1643,-13.52922,6449,5682,6763,6449
1803,-12.364402,17716,10311,12015,17716
1817,-11.544135,29302,14335,16638,29302
1600,-10.686017,13721,12220,14429,13721
2067,-8.856854,29207,16848,18444,29207
1717,-7.597681,21186,12396,12850,21186
1504,-6.772227,27988,14272,15371,27988
1849,-6.053967,4412,3127,3912,4412
1672,-4.96744,14585,10132,12204,14585
1854,-4.891813,73684,39649,44854,73684
