## sidetable

This library makes it easy to build a frequency table and simple summary of missing values in a DataFrame. This is a useful tool when starting data exploration on a new data set. <br>
[reference](https://pbpython.com/sidetable.html)

In [1]:
import pandas as pd
import sidetable

# School-transformation CSV hosted in the pbpython repository; the first CSV
# column is used as the row index.
DATA_URL = 'https://github.com/chris1610/pbpython/blob/master/data/school_transform.csv?raw=True'
df = pd.read_csv(DATA_URL, index_col=0)
df

Unnamed: 0,School Name,City,State,District Name,Model Selected,Award_Amount,Region
0,HOGARTH KINGEEKUK MEMORIAL SCHOOL,SAVOONGA,AK,BERING STRAIT SCHOOL DISTRICT,Transformation,471014,West
1,AKIACHAK SCHOOL,AKIACHAK,AK,YUPIIT SCHOOL DISTRICT,Transformation,520579,West
2,GAMBELL SCHOOL,GAMBELL,AK,BERING STRAIT SCHOOL DISTRICT,Transformation,449592,West
3,BURCHELL HIGH SCHOOL,WASILLA,AK,MATANUSKA-SUSITNA BOROUGH SCHOOL DISTRICT,Transformation,641184,West
4,AKIAK SCHOOL,AKIAK,AK,YUPIIT SCHOOL DISTRICT,Transformation,399686,West
...,...,...,...,...,...,...,...
826,MOUNT HOPE HIGH SCHOOL,MOUNT HOPE,WV,FAYETTE COUNTY SCHOOLS,Transformation,342189,South
827,GEARY ELEMENTARY SCHOOL,LEFT HAND,WV,ROANE COUNTY SCHOOLS,Transformation,260000,South
828,TRIUMPH HIGH SCHOOL,CHEYENNE,WY,LARAMIE COUNTY SCHOOL DISTRICT #1,Transformation,297300,West
829,HEM JUNIOR/SENIOR HIGH SCHOOL,HANNA,WY,CARBON COUNTY SCHOOL DISTRICT #2,Transformation,274000,West


In [2]:
# Column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 757 entries, 0 to 830
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   School Name     757 non-null    object
 1   City            757 non-null    object
 2   State           757 non-null    object
 3   District Name   757 non-null    object
 4   Model Selected  757 non-null    object
 5   Award_Amount    757 non-null    int64 
 6   Region          747 non-null    object
dtypes: int64(1), object(6)
memory usage: 47.3+ KB


In [3]:
# Display floats with thousands separators and two decimals in every rendered frame.
pd.set_option('display.float_format', '{:,.2f}'.format)

In [4]:
# Frequency table of rows per State: count, percent and their cumulative totals.
df.stb.freq(['State'])

Unnamed: 0,State,count,percent,cumulative_count,cumulative_percent
0,CA,92,12.15,92,12.15
1,FL,71,9.38,163,21.53
2,PA,58,7.66,221,29.19
3,OH,35,4.62,256,33.82
4,MO,32,4.23,288,38.04
5,MI,28,3.7,316,41.74
6,GA,26,3.43,342,45.18
7,NY,25,3.3,367,48.48
8,NC,23,3.04,390,51.52
9,SC,19,2.51,409,54.03


In [5]:
# Plain-pandas equivalent: share of rows per State, shown as percentage strings.
(
    df['State']
    .value_counts(normalize=True)
    .map('{:.2%}'.format)
)

CA    12.15%
FL     9.38%
PA     7.66%
OH     4.62%
MO     4.23%
MI     3.70%
GA     3.43%
NY     3.30%
NC     3.04%
MN     2.51%
AZ     2.51%
SC     2.51%
CO     2.51%
VA     2.38%
WA     2.38%
WV     1.98%
CT     1.85%
MA     1.59%
TN     1.59%
OR     1.59%
NJ     1.59%
MD     1.45%
AL     1.45%
WI     1.45%
VT     1.32%
DC     1.32%
IL     1.32%
OK     1.32%
NV     1.32%
KY     1.32%
NM     1.19%
MS     1.06%
AK     0.92%
NH     0.92%
NE     0.92%
IN     0.92%
AR     0.92%
UT     0.92%
MT     0.79%
KS     0.79%
IA     0.53%
WY     0.40%
SD     0.26%
ID     0.26%
DE     0.26%
ND     0.13%
Name: State, dtype: object

In [6]:
# What if we want a quick view of the states that contribute around 50% of the total?
# Use the thresh argument to group all remaining states into a single "others" row:

df.stb.freq(['State'], thresh=50)

Unnamed: 0,State,count,percent,cumulative_count,cumulative_percent
0,CA,92,12.15,92,12.15
1,FL,71,9.38,163,21.53
2,PA,58,7.66,221,29.19
3,OH,35,4.62,256,33.82
4,MO,32,4.23,288,38.04
5,MI,28,3.7,316,41.74
6,GA,26,3.43,342,45.18
7,NY,25,3.3,367,48.48
8,others,390,51.52,757,100.0


In [7]:
# Same 50% cut-off, but name the grouped row 'Rest of States' via other_label.
df.stb.freq(['State'], thresh=50, other_label='Rest of States')

Unnamed: 0,State,count,percent,cumulative_count,cumulative_percent
0,CA,92,12.15,92,12.15
1,FL,71,9.38,163,21.53
2,PA,58,7.66,221,29.19
3,OH,35,4.62,256,33.82
4,MO,32,4.23,288,38.04
5,MI,28,3.7,316,41.74
6,GA,26,3.43,342,45.18
7,NY,25,3.3,367,48.48
8,Rest of States,390,51.52,757,100.0


In [8]:
# sidetable can also group by several columns at once to show their joint distribution.
# For instance, how are the various "Transformation Models" applied across Regions?
df.stb.freq(['Region', 'Model Selected'])

Unnamed: 0,Region,Model Selected,count,percent,cumulative_count,cumulative_percent
0,South,Transformation,185,24.77,185,24.77
1,West,Transformation,142,19.01,327,43.78
2,Midwest,Transformation,111,14.86,438,58.63
3,Northeast,Transformation,102,13.65,540,72.29
4,West,Turnaround,49,6.56,589,78.85
5,South,Turnaround,44,5.89,633,84.74
6,Midwest,Turnaround,43,5.76,676,90.5
7,Northeast,Turnaround,25,3.35,701,93.84
8,South,Restart,11,1.47,712,95.31
9,Northeast,Restart,9,1.2,721,96.52


In [9]:
# Total breakdown by award amount: passing value= makes sidetable sum that column
# per group instead of counting occurrences.

df1=df.stb.freq(['Region'], value='Award_Amount')
df1

Unnamed: 0,Region,Award_Amount,percent,cumulative_Award_Amount,cumulative_percent
0,South,117467481,37.31,117467481,37.31
1,West,74418552,23.64,191886033,60.95
2,Midwest,65736175,20.88,257622208,81.84
3,Northeast,57179654,18.16,314801862,100.0


In [10]:
# Check the dtypes of the summary table's columns.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Region                   4 non-null      object 
 1   Award_Amount             4 non-null      int64  
 2   percent                  4 non-null      float64
 3   cumulative_Award_Amount  4 non-null      int64  
 4   cumulative_percent       4 non-null      float64
dtypes: float64(2), int64(2), object(1)
memory usage: 192.0+ bytes


In [11]:
# dtype of the summed award column in the Region summary.
a = df1.dtypes['Award_Amount']
print(a)

int64


In [12]:
# Column labels of the raw frame.
df.columns

Index(['School Name', 'City', 'State', 'District Name', 'Model Selected',
       'Award_Amount', 'Region'],
      dtype='object')

In [13]:
# Print the dtype of each column, in column order.
for column in df.columns:
    print(df[column].dtype)

object
object
object
object
object
int64
object


In [14]:
def one_thousand_sep(df):
    """Return a copy of ``df`` with integer columns rendered as comma-grouped strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose integer-typed columns should be formatted.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every integer column holds strings such as
        ``'117,467,481'``. Non-integer columns are copied unchanged and the
        caller's frame is not modified, so the cell can be re-run safely.
    """
    formatted = df.copy()
    for column in formatted.columns:
        # Accept any integer width (int32, int64, ...), not only int64.
        if pd.api.types.is_integer_dtype(formatted[column]):
            # ',d' groups thousands; there is no sign-space flag, so positive
            # values get no leading blank. Missing values pass through as-is.
            formatted[column] = formatted[column].map(
                lambda value: value if pd.isna(value) else f'{value:,d}'
            )
    return formatted

In [15]:
# Format the integer columns of the Region summary with thousands separators.
df2=one_thousand_sep(df1)
df2

Unnamed: 0,Region,Award_Amount,percent,cumulative_Award_Amount,cumulative_percent
0,South,117467481,37.31,117467481,37.31
1,West,74418552,23.64,191886033,60.95
2,Midwest,65736175,20.88,257622208,81.84
3,Northeast,57179654,18.16,314801862,100.0


In [16]:
# Replace the generated column labels with presentation labels that carry units.
new_col=['Region', 'Award_Amount ($)', 'Percent (%)', 'Cumulative_Award_Amount ($)', 'Cumulative_Percent (%)']
df2.columns=new_col
df2

Unnamed: 0,Region,Award_Amount ($),Percent (%),Cumulative_Award_Amount ($),Cumulative_Percent (%)
0,South,117467481,37.31,117467481,37.31
1,West,74418552,23.64,191886033,60.95
2,Midwest,65736175,20.88,257622208,81.84
3,Northeast,57179654,18.16,314801862,100.0


#### This view gives us insight that the Northeast has the least amount of dollars spent on these projects and that 37% of the total spend went to schools in the South region.

In [17]:
# 80/20 view of the allocated dollars by Region and model: combinations past
# the 80% cumulative threshold are folded into a single 'Remaining' row.

df3 = df.stb.freq(
    ['Region', 'Model Selected'],
    value='Award_Amount',
    thresh=80,
    other_label='Remaining',
)
df3

Unnamed: 0,Region,Model Selected,Award_Amount,percent,cumulative_Award_Amount,cumulative_percent
0,South,Transformation,88680032,28.17,88680032,28.17
1,West,Transformation,56207890,17.86,144887922,46.03
2,Midwest,Transformation,48702505,15.47,193590427,61.5
3,Northeast,Transformation,41263161,13.11,234853588,74.6
4,Remaining,Remaining,79948274,25.4,314801862,100.0


In [18]:
# Same thousands-separator formatting for the Region / Model summary.
df4=one_thousand_sep(df3)
df4

Unnamed: 0,Region,Model Selected,Award_Amount,percent,cumulative_Award_Amount,cumulative_percent
0,South,Transformation,88680032,28.17,88680032,28.17
1,West,Transformation,56207890,17.86,144887922,46.03
2,Midwest,Transformation,48702505,15.47,193590427,61.5
3,Northeast,Transformation,41263161,13.11,234853588,74.6
4,Remaining,Remaining,79948274,25.4,314801862,100.0


In [19]:
# Put the column labels in a Series to experiment with regex-based renaming.
s=pd.Series(df4.columns)
s

0                     Region
1             Model Selected
2               Award_Amount
3                    percent
4    cumulative_Award_Amount
5         cumulative_percent
dtype: object

In [20]:
# Display the label Series again (same content as the previous cell).
s

0                     Region
1             Model Selected
2               Award_Amount
3                    percent
4    cumulative_Award_Amount
5         cumulative_percent
dtype: object

In [21]:
# Append ' ($)' to every '<letters>_Amount' label; returns a new Series, s is unchanged.
s.str.replace(r'([A-Za-z]+)_Amount', r'\1_Amount ($)', regex=True)

0                         Region
1                 Model Selected
2               Award_Amount ($)
3                        percent
4    cumulative_Award_Amount ($)
5             cumulative_percent
dtype: object

In [22]:
# Capitalise 'percent' (keeping any '<letters>_' prefix) and append ' (%)'.
s.str.replace(r'([A-Za-z]+_)?percent', r'\1Percent (%)', regex=True)

0                     Region
1             Model Selected
2               Award_Amount
3                Percent (%)
4    cumulative_Award_Amount
5     cumulative_Percent (%)
dtype: object

In [23]:
def add_dollar(s):
    """Append ' ($)' to every ``<letters>_Amount`` label; safe to apply repeatedly.

    Parameters
    ----------
    s : pandas.Series or pandas.Index
        String labels, anything exposing the ``.str`` accessor.

    Returns
    -------
    Same type as ``s``
        New labels where e.g. ``'Award_Amount'`` becomes ``'Award_Amount ($)'``.
        Labels that already end in ``' ($)'`` come back unchanged.
    """
    # The optional trailing group swallows a suffix added by an earlier call,
    # so re-running the renaming cell cannot produce 'Award_Amount ($) ($)'.
    return s.str.replace(r'([A-Za-z]+)_Amount(?: \(\$\))?', r'\1_Amount ($)', regex=True)

In [24]:
# Apply the dollar suffix to the summary table's column labels.
df4.columns=add_dollar(df4.columns)

In [25]:
def add_percent(s):
    """Rename ``percent`` labels to ``Percent (%)``, keeping a ``<letters>_`` prefix.

    ``s`` is a Series or Index of strings; a new object of the same kind is
    returned, e.g. ``'cumulative_percent'`` becomes ``'cumulative_Percent (%)'``.
    """
    pattern = r'([A-Za-z]+_)?percent'
    replacement = r'\1Percent (%)'
    return s.str.replace(pattern, replacement, regex=True)

In [26]:
# Apply the percent renaming to the summary table's column labels.
df4.columns=add_percent(df4.columns)

In [27]:
# Confirm the renamed column labels.
df4.columns

Index(['Region', 'Model Selected', 'Award_Amount ($)', 'Percent (%)',
       'cumulative_Award_Amount ($)', 'cumulative_Percent (%)'],
      dtype='object')

In [28]:
# Summary table with the renamed columns.
df4

Unnamed: 0,Region,Model Selected,Award_Amount ($),Percent (%),cumulative_Award_Amount ($),cumulative_Percent (%)
0,South,Transformation,88680032,28.17,88680032,28.17
1,West,Transformation,56207890,17.86,144887922,46.03
2,Midwest,Transformation,48702505,15.47,193590427,61.5
3,Northeast,Transformation,41263161,13.11,234853588,74.6
4,Remaining,Remaining,79948274,25.4,314801862,100.0


### Oh no... I didn't know there was a `style=True` option

In [30]:
# style=True lets sidetable format the output itself (percent columns display with a % sign).
df.stb.freq(['Region'], value='Award_Amount', style=True)

Unnamed: 0,Region,Award_Amount,percent,cumulative_Award_Amount,cumulative_percent
0,South,117467481,37.31%,117467481,37.31%
1,West,74418552,23.64%,191886033,60.95%
2,Midwest,65736175,20.88%,257622208,81.84%
3,Northeast,57179654,18.16%,314801862,100.00%


### Building a simple missing-values table

In [31]:
# Missing-value count, total rows and percent missing for every column.
df.stb.missing()

Unnamed: 0,missing,total,percent
Region,10,757,1.32
School Name,0,757,0.0
City,0,757,0.0
State,0,757,0.0
District Name,0,757,0.0
Model Selected,0,757,0.0
Award_Amount,0,757,0.0


### One way to look at sidetable: it is an expanded version of a crosstab, with some convenience functions that make the data easier to view

In [33]:
# Total award dollars for every Region x Model Selected combination.
crosstabbed = pd.crosstab(
    index=df['Region'],
    columns=df['Model Selected'],
    values=df['Award_Amount'],
    aggfunc='sum',
)
crosstabbed

Model Selected,Closure,Restart,Transformation,Turnaround
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Midwest,86872,1397735,48702505,15549063
Northeast,508773,5728010,41263161,9679710
South,354323,5901714,88680032,22531412
West,272520,2245146,56207890,15692996


In [38]:
# Show every cell with thousands separators (the ',d' spec requires integer values).
crosstabbed.applymap('{:,d}'.format)

Model Selected,Closure,Restart,Transformation,Turnaround
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Midwest,86872,1397735,48702505,15549063
Northeast,508773,5728010,41263161,9679710
South,354323,5901714,88680032,22531412
West,272520,2245146,56207890,15692996
