# Data Aggregation and Group Operations
## DAT540 Introduction to Data Science
## University of Stavanger
### L12
#### Antorweep Chakravorty (antorweep.chakravorty@uis.no)

In [9]:
import pandas as pd
import numpy as np
import seaborn as sns

- Categorizing a dataset and applying a function to each group, whether an aggregation or transformation, is a critical component of data analysis workflow
- After loading, merging and preparing a dataset, computing group statistics or pivot tables for reporting or visualization is often essential.
- pandas offers a flexible *groupby* interface that enables slicing, dicing and summarizing datasets in a natural way
  - Split a pandas object into pieces using one or more keys
  - Calculate group summary statistics, like count, mean, or standard deviation or a user defined function
  - Apply within-group transformations or other manipulations, like normalization, linear regression, rank, or subset selection
  - Perform quantile analysis and other statistical group analyses

- **GroupBy Mechanics**
- *split-apply-combine* describes group operations
  - split: data in a pandas object (Series, DataFrame) is split into groups based on one or more provided keys
    - The split happens on a particular axis of an object
  - apply: After splitting an object into chunks, a function is applied to each group producing a new value
  - combine: Once a function is applied to the chunks the results are combined into a result object
    - The form of the resulting object will depend on the applied transformation on the data

<img src='images/group_ag.png' width='350'>

- Each grouping can take many forms and the keys do not have to be all of the same type:
  - A list or array of values that is the same length as the axis being grouped
  - A value indicating a column name in a DataFrame
  - a dict or Series giving a correspondence between values on the axis being grouped and the group names
  - A function to be invoked on the axis index or the individual labels in the index
- The *as_index* boolean argument can be set to False so that the grouping keys are returned as regular columns instead of becoming the index of the result

In [10]:
# Small demo frame: two key columns plus two columns of standard-normal noise
df = pd.DataFrame(
    {
        'key1': list('aabba'),
        'key2': ['one', 'two', 'one', 'two', 'one'],
        'data1': np.random.randn(5),
        'data2': np.random.randn(5),
    }
)
# Next step: calculate the mean of the data columns for each value of key1
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.573329,1.028183
1,a,two,1.191752,-1.236504
2,b,one,-1.346913,-0.319765
3,b,two,0.142471,1.85095
4,a,one,-0.555388,0.877027


In [11]:
# Move key1 out of the columns and into the row index
h_indexed = df.set_index('key1')
h_indexed

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.573329,1.028183
a,two,1.191752,-1.236504
b,one,-1.346913,-0.319765
b,two,0.142471,1.85095
a,one,-0.555388,0.877027


In [12]:
# Mean per key1 label of the index.
# `DataFrame.mean(level=...)` was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level is the supported equivalent (sort=False keeps the
# labels in order of first appearance, as the old call did).
# numeric_only=True skips the string column key2, for which no mean exists.
h_indexed.groupby(level='key1', sort=False).mean(numeric_only=True).head()

  h_indexed.mean(level='key1').head()
  return self._agg_by_level(


Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.403231,0.222902
b,-0.602221,0.765593


In [13]:
# Group the rows of df by the values of its key1 column, passed as a Series
grouped = df.groupby(by=df['key1'])
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fd6015506d0>

In [14]:
# Per-group mean of the numeric columns. key2 holds strings and is still part of
# each group, so restrict the reduction to numeric columns; pandas >= 2.0 raises
# a TypeError otherwise instead of silently dropping the column.
grouped.mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.403231,0.222902
b,-0.602221,0.765593


In [15]:
# Several keys can be combined: here both key columns are passed as Series,
# so each distinct (key1, key2) pair forms one group
grouped = df.groupby(by=[df['key1'], df['key2']])
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fd5d0b47430>

In [16]:
# Mean of the data columns for every (key1, key2) group
grouped.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.008971,0.952605
a,two,1.191752,-1.236504
b,one,-1.346913,-0.319765
b,two,0.142471,1.85095


In [17]:
# Using hierarchical index: both key columns become levels of the row index
h_indexed = df.set_index(['key1', 'key2'])
# getting the mean per (key1, key2) pair.
# `mean(level=...)` was removed in pandas 2.0; grouping on the index levels is
# the supported equivalent (sort=False keeps first-appearance order, as before).
h_indexed.groupby(level=['key1', 'key2'], sort=False).mean()

  h_indexed.mean(level=['key1', 'key2'])


Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.008971,0.952605
a,two,1.191752,-1.236504
b,one,-1.346913,-0.319765
b,two,0.142471,1.85095


In [18]:
# What is the difference in performance between a hierarchical index and groupby?
# Number of rows in the benchmark frame
n = 10_000_000
# Use map to create the lowercase alphabet from the code points of 'a'..'z'
alphabet = list(map(chr, range(ord('a'), ord('z') + 1)))
# First key: n letters drawn at random from the alphabet
key1 = np.random.choice(alphabet, n)
key1[:10]

array(['h', 'l', 'g', 'e', 'e', 'h', 'm', 't', 'p', 'e'], dtype='<U1')

In [19]:
# Second key: 'k-<i>' labels, with i drawn at random from 0..99
key2 = [f'k-{i}' for i in np.random.randint(0, 100, n)]
key2[:10]

['k-31',
 'k-41',
 'k-56',
 'k-21',
 'k-45',
 'k-12',
 'k-35',
 'k-64',
 'k-58',
 'k-11']

In [20]:
# Benchmark frame: the two generated key columns plus two columns of normal noise
df = pd.DataFrame(
    dict(
        key1=key1,
        key2=key2,
        data1=np.random.randn(n),
        data2=np.random.randn(n),
    )
)
df.shape

(10000000, 4)

In [21]:
%%time
# Using hierarchical index
h_indexed = df.set_index(['key1', 'key2'])
# getting the mean
h_indexed.mean(level=['key1', 'key2'])



CPU times: user 1.32 s, sys: 380 ms, total: 1.7 s
Wall time: 1.75 s


Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
h,k-31,-0.018527,0.001967
l,k-41,0.001478,-0.035679
g,k-56,0.008779,0.008245
e,k-21,0.000702,-0.021857
e,k-45,0.012003,-0.000605
...,...,...,...
c,k-86,0.007216,0.005402
y,k-97,0.021754,0.000881
o,k-44,-0.010390,-0.006987
f,k-20,-0.006534,0.021150


In [22]:
%%time
# We can also groupby using multiple keys/series
# Using grouping
grouped = df.groupby([df['key1'], df['key2']]) # Grouping based on a series
grouped.mean()

CPU times: user 930 ms, sys: 105 ms, total: 1.04 s
Wall time: 1.07 s


Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,k-0,0.015560,-0.012892
a,k-1,0.019011,0.008819
a,k-10,-0.024698,0.000649
a,k-11,0.034488,-0.014345
a,k-12,0.003804,-0.017826
...,...,...,...
z,k-95,0.017820,0.006372
z,k-96,-0.002710,0.002345
z,k-97,0.005238,-0.008751
z,k-98,0.002036,-0.031913


In [23]:
# The size instance method returns a Series containing the size of each group;
# head() limits the display to the first five groups
grouped.size().head()

key1  key2
a     k-0     3811
      k-1     3789
      k-10    3902
      k-11    3823
      k-12    3898
dtype: int64

In [24]:
# Back to a small demo frame: two key columns plus two columns of normal noise
df = pd.DataFrame(
    {
        'key1': list('aabba'),
        'key2': ['one', 'two', 'one', 'two', 'one'],
        'data1': np.random.randn(5),
        'data2': np.random.randn(5),
    }
)
df.shape

(5, 4)

In [25]:
# Performing correlation between data1 and data2 within each key1 group
grouped = df.groupby('key1')
# Select the numeric columns explicitly: key2 holds strings, and pandas >= 2.0
# raises on non-numeric columns in corr() instead of silently dropping them.
grouped[['data1', 'data2']].corr()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,data1,1.0,-0.998416
a,data2,-0.998416,1.0
b,data1,1.0,-1.0
b,data2,-1.0,1.0


- *Iterating Over Groups*
- The GroupBy object supports iterations, generating a sequence of 2-tuples containing the group name along with the chunk of data

In [26]:
# Iterating over a grouped df with a single key: each step yields the group
# label together with that group's rows
for label, chunk in df.groupby('key1'):
    print('[name:', label, ']\n')
    print('[group:', type(chunk), ']\n\n\n')

[name: a ]

[group: <class 'pandas.core.frame.DataFrame'> ]



[name: b ]

[group: <class 'pandas.core.frame.DataFrame'> ]





In [27]:
# Iterating over a grouped df with multiple keys: the group label is a tuple
# with one entry per key, unpacked here directly in the loop header
for (first_key, second_key), chunk in df.groupby(['key1', 'key2']):
    print('[k1:', first_key, 'k2:', second_key, ']\n')
    print('[group:', chunk, ']\n\n\n')

[k1: a k2: one ]

[group:   key1 key2     data1     data2
0    a  one -0.906246  1.620874
4    a  one -0.117601  0.779376 ]



[k1: a k2: two ]

[group:   key1 key2    data1     data2
1    a  two  0.40094  0.332486 ]



[k1: b k2: one ]

[group:   key1 key2    data1     data2
2    b  one  0.78583 -0.743495 ]



[k1: b k2: two ]

[group:   key1 key2     data1     data2
3    b  two  0.745458  0.731622 ]





- Alternatively, a group can be converted into a dict for direct access
- By default axis=0, but it can be changed to any axis

In [28]:
# Map every key1 label to the sub-frame holding that group's rows
pieces = {label: chunk for label, chunk in df.groupby('key1')}
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,0.78583,-0.743495
3,b,two,0.745458,0.731622


In [29]:
# Grouping on columns by *dtype* over axis=1
# First inspect the dtype of every column; this Series is used as the grouping key next
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [30]:
# Group the columns of df by their dtype.
# `DataFrame.groupby(..., axis=1)` is deprecated since pandas 2.1, so group the
# dtypes Series by its own values instead: each group's index then holds the
# labels of the columns sharing that dtype, which are used to select from df.
grouped = df.dtypes.groupby(df.dtypes)
for dtype, cols in grouped:
    print(dtype)
    print(df[cols.index])

float64
      data1     data2
0 -0.906246  1.620874
1  0.400940  0.332486
2  0.785830 -0.743495
3  0.745458  0.731622
4 -0.117601  0.779376
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


- *Selecting a Column or Subset of Columns*
- Indexing a GroupBy object created from a DataFrame with column name or array of column names 
  - has the effect of column sub-setting for aggregation
  - especially for large datasets, it may be desirable to aggregate only a few columns
  - the object returned by the indexing operation is a 
    - grouped DataFrame if a list or array is passed 
    - grouped Series if only a single column name is passed as a scalar

In [31]:
# Selecting columns on a GroupBy object: a scalar name vs. a list of names.
# Only the value of the last expression is displayed by the cell.
df.groupby('key1')['data1'] # SeriesGroupBy. Alternatively df['data1'].groupby(df['key1'])
df.groupby('key1')[['data1']] # DataFrameGroupBy. Alternatively df[['data1']].groupby(df['key1'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fd5d0b47a30>

In [32]:
# Computing the mean and returning a DataFrame
# (the column is selected with a list, so a one-column DataFrame is grouped)
data = df[['data1']].groupby(df['key1'])
type(data.mean())

pandas.core.frame.DataFrame

In [33]:
# Computing the mean and returning a Series
# (the column is selected with a scalar name, so a Series is grouped)
data = df['data1'].groupby(df['key1'])
type(data.mean())

pandas.core.series.Series

- *Grouping with Dict and Series*
- Grouping information may exist in a form other than an array

In [34]:
# 5x5 frame of normal noise, rows labelled by person and columns by letter
people = pd.DataFrame(
    np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)
# Add a few NA values: columns b and c of the row for Wes
people.loc['Wes', ['b', 'c']] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0.70878,-0.728279,1.734172,0.344195,0.597394
Steve,-0.713212,2.087803,0.6642,-0.167204,1.653191
Wes,0.282498,,,-1.313345,-1.498067
Jim,0.491976,0.716298,0.92576,-0.616878,0.603983
Travis,-1.049785,1.270064,-0.979073,0.711607,-1.748057


In [35]:
# Suppose we have a separate dict that maps the column names of people to another value
# (note that 'f' is not a column of people)
mapping = dict(a='red', b='red', c='blue', d='blue', e='red', f='orange')

In [36]:
# We can use this mapping to group the columns of people by color.
# `groupby(..., axis=1)` is deprecated since pandas 2.1: transpose so the
# column labels become the row index, group those labels through the mapping,
# and transpose the aggregated result back.
by_col = people.T.groupby(mapping)
# In this case we sum columns based on their color representation
by_col.sum().T

Unnamed: 0,blue,red
Joe,2.078367,0.577895
Steve,0.496996,3.027783
Wes,-1.313345,-1.215569
Jim,0.308882,1.812257
Travis,-0.267466,-1.527778


In [37]:
# The same functionality also holds true for Series.
# In case of a Series built from a dict, the dict keys become the index labels
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [38]:
# When the Series is used as grouping key, its index labels are matched against
# the labels being grouped (here: the column labels of people).
# `groupby(..., axis=1)` is deprecated since pandas 2.1: transpose, group the
# rows, and transpose the aggregated result back.
by_ser = people.T.groupby(map_series)
by_ser.sum().T

Unnamed: 0,blue,red
Joe,2.078367,0.577895
Steve,0.496996,3.027783
Wes,-1.313345,-1.215569
Jim,0.308882,1.812257
Travis,-0.267466,-1.527778


- *Grouping with Functions*
- Using python functions is a more generic way of defining group mappings
- Any function passed as a group key will be called once per index with the return value used as group names
- Additionally, functions, dicts, Series all can be combined to create a multi key grouping

In [39]:
# Grouping people DataFrame by the length of names
# (shown again here; the names are the row index labels)
people

Unnamed: 0,a,b,c,d,e
Joe,0.70878,-0.728279,1.734172,0.344195,0.597394
Steve,-0.713212,2.087803,0.6642,-0.167204,1.653191
Wes,0.282498,,,-1.313345,-1.498067
Jim,0.491976,0.716298,0.92576,-0.616878,0.603983
Travis,-1.049785,1.270064,-0.979073,0.711607,-1.748057


In [40]:
# len is applied to the index labels; its return value is used as the group key
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,1.483254,-0.011982,2.659932,-1.586029,-0.29669
5,-0.713212,2.087803,0.6642,-0.167204,1.653191
6,-1.049785,1.270064,-0.979073,0.711607,-1.748057


In [41]:
# Mixing function, list for grouping
# key_list supplies one label per row of people
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0.282498,-0.728279,1.734172,-1.313345,-1.498067
3,two,0.491976,0.716298,0.92576,-0.616878,0.603983
5,one,-0.713212,2.087803,0.6642,-0.167204,1.653191
6,two,-1.049785,1.270064,-0.979073,0.711607,-1.748057


- *Grouping by Index Levels*
- Aggregating using one of the levels of an axis index for hierarchically indexed datasets

In [42]:
# Two-level column index: outer level 'cty', inner level 'tenor'
columns = pd.MultiIndex.from_arrays(
    [['US'] * 3 + ['JP'] * 2, [1, 3, 5, 1, 3]],
    names=['cty', 'tenor'],
)

# Four rows of normal noise under the hierarchical columns
hier_df = pd.DataFrame(data=np.random.randn(4, 5), columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,1.771696,0.443243,-0.017367,0.576576,-0.75007
1,0.313527,-2.397293,0.730902,0.20799,-0.400136
2,-0.468912,-1.26766,-0.776434,-0.151177,1.081219
3,-1.301254,-1.306792,0.450797,-0.77953,0.64655


In [43]:
# To group by an index level, pass the level number or name using the *level* keyword.
# The levels live on the columns here; `groupby(..., axis=1)` is deprecated since
# pandas 2.1, so transpose, group on level 0 ('cty') of the row index, and
# transpose the counts back.
hier_df.T.groupby(level=0).count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


- **Advanced GroupBy Use**
- *Group Transformation and "Unwrapped" GroupBys*
- *transform* is an instance method available to grouped objects
- It is similar to apply but imposes more constraints on the kind of function that can be used
  - It can produce a scalar value to be broadcast to the shape of the group
  - It can produce an object of the same shape as the input group
  - It must not mutate its input

In [44]:
# Twelve rows: the keys cycle a, b, c and the values run 0.0 .. 11.0
df = pd.DataFrame(
    {
        'key': ['a', 'b', 'c'] * 4,
        'value': np.arange(12, dtype=float),
    }
)
df.shape

(12, 2)

In [45]:
# Group the value column by the key column
g = df['value'].groupby(df['key'])
type(g)

pandas.core.groupby.generic.SeriesGroupBy

In [46]:
# One mean per key
g.mean()

key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64

In [47]:
# Suppose we want to produce a Series of the same shape as df['value'] but with values replaced by the average grouped by 'key'
# The lambda returns one scalar per group, which transform spreads over that group's rows
g.transform(lambda x: x.mean())

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

- Alternatively, we can also pass built-in aggregation functions. e.g.: ```g.transform('mean')```
- Like apply, transform works with functions that return a Series, but the result must be the *same size* as the input
- Example use cases:
  - Compute ranks in descending order for each group
  - Transform values to compute the norm or normalize each value

In [48]:
# Show the key/value frame again before normalizing it per group
df

Unnamed: 0,key,value
0,a,0.0
1,b,1.0
2,c,2.0
3,a,3.0
4,b,4.0
5,c,5.0
6,a,6.0
7,b,7.0
8,c,8.0
9,a,9.0


In [49]:
# Normalizing
def normalize(x):
    """Center x on its mean and scale it by its standard deviation.

    Uses the ``mean`` and ``std`` methods of ``x`` and returns the result of
    ``(x - mean) / std``.
    """
    centered = x - x.mean()
    return centered / x.std()

In [50]:
# Group by key and keep only the value column
g = df.groupby('key')['value']
type(g)

pandas.core.groupby.generic.SeriesGroupBy

In [51]:
# Normalize every value within its own group
g.transform(normalize) # g.apply(normalize) would yield the same result here, so why do we need transform? See the notes that follow

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

- Built-in aggregate functions like *mean* or *sum* are often much faster than a general apply function
- These also have a *"fast path"* when used with transform
- Allowing us to perform so-called *unwrapped* group operations:

In [52]:
# Same normalization written with built-in aggregations: subtract each row's
# group mean and divide by its group standard deviation
(df['value'] - g.transform('mean')) / g.transform('std')

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

- *Grouped Time Resampling*
- For time series data, the *resample* method is semantically a group operation based on a time intervalization

In [53]:
N = 15
# One timestamp per minute, N of them, starting at 2021-10-04 00:00
times = pd.date_range('2021-10-04 00:00', freq='1min', periods=N)

# Build the frame and index the DataFrame by its time column in one chained expression
df = pd.DataFrame({'time': times, 'value': np.arange(N)}).set_index('time')
df.shape

(15, 1)

In [54]:
# Resample (i.e. group) into 5-minute bins and count the data points in each bin
df.resample('5min').count()

Unnamed: 0_level_0,value
time,Unnamed: 1_level_1
2021-10-04 00:00:00,5
2021-10-04 00:05:00,5
2021-10-04 00:10:00,5


- **Data Aggregation**
- Aggregation refers to any data transformation that produces scalar values from arrays
- Groupby provides multiple optimized methods for data transformation

<img src='images/group_met.png' width='450'>

- Custom aggregation functions can also be used
  - Any function that aggregates an array can be passed to the instance *aggregate* or *agg* method of group object

In [55]:
# Twelve rows: the keys cycle a, b, c and the values are random integers in [0, 100)
df = pd.DataFrame(
    {
        'key': ['a', 'b', 'c'] * 4,
        'value': np.random.randint(low=0, high=100, size=12),
    }
)
df.shape

(12, 2)

In [56]:
# Custom aggregation: the spread (max minus min) of every column within each key group
grouped = df.groupby('key')
grouped.agg(lambda x: x.max() - x.min())

Unnamed: 0_level_0,value
key,Unnamed: 1_level_1
a,97
b,84
c,89


In [57]:
# Summary statistics of each group can be retrieved using the describe instance method
grouped.describe()

Unnamed: 0_level_0,value,value,value,value,value,value,value,value
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
a,4.0,46.5,40.616089,2.0,25.25,42.5,63.75,99.0
b,4.0,47.25,40.598645,7.0,16.0,45.5,76.75,91.0
c,4.0,44.25,39.016022,6.0,18.75,38.0,63.5,95.0


- *Column-Wise and Multiple Function Application*
- A list of functions can be passed to *agg* to apply each of them to every group and collect the results as separate columns

In [58]:
# Seaborn package has a set of sample datasets: https://github.com/mwaskom/seaborn-data
# We can load them as a pandas DataFrame
tips = sns.load_dataset('tips')

# Add tip percentage of total bill (stored as a fraction: tip divided by total_bill)
tips['tip_pct'] = tips['tip'] / tips['total_bill']

print('shape: ', tips.shape)
tips.head()

shape:  (244, 8)


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [59]:
# Group by day and sex; this GroupBy object is reused by the cells that follow
grouped = tips.groupby(['day', 'sex'])

In [61]:
# Checking the mean tip percentage of every (day, sex) group
grouped['tip_pct'].mean() 

day   sex   
Thur  Male      0.165276
      Female    0.157525
Fri   Male      0.143385
      Female    0.199388
Sat   Male      0.151577
      Female    0.156470
Sun   Male      0.162344
      Female    0.181569
Name: tip_pct, dtype: float64

In [62]:
# Computing multiple measures over tip percentage
# When a lambda function is passed, its result column gets a generic lambda name.
#  To specify the column name for the result of a lambda,
#  we pass a tuple (name: string, function) in the list given to the agg method

grouped['tip_pct'].agg(['mean', ('P2P', lambda x: x.max() - x.min()), 'std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,P2P,std
day,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Thur,Male,0.165276,0.187695,0.045866
Thur,Female,0.157525,0.138548,0.030689
Fri,Male,0.143385,0.120221,0.036228
Fri,Female,0.199388,0.120623,0.04217
Sat,Male,0.151577,0.256352,0.046944
Sat,Female,0.15647,0.269299,0.060243
Sun,Male,0.162344,0.644685,0.088529
Sun,Female,0.181569,0.35722,0.07143


In [63]:
# Alternatively, a list of (name, function) pairs can be prepared once
# and applied to several columns at the same time
functions = [('avg', 'mean'), ('p2p', lambda x: x.max() - x.min()), ('sd', 'std')]
grouped[['tip_pct', 'total_bill']].agg(functions)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,avg,p2p,sd,avg,p2p,sd
day,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Thur,Male,0.165276,0.187695,0.045866,18.714667,33.68,8.019728
Thur,Female,0.157525,0.138548,0.030689,16.715312,34.76,7.759764
Fri,Male,0.143385,0.120221,0.036228,19.857,31.59,10.015847
Fri,Female,0.199388,0.120623,0.04217,14.145556,17.0,4.788547
Sat,Male,0.151577,0.256352,0.046944,20.802542,43.07,9.836306
Sat,Female,0.15647,0.269299,0.060243,19.680357,41.23,8.80647
Sun,Male,0.162344,0.644685,0.088529,21.887241,40.92,9.129142
Sun,Female,0.181569,0.35722,0.07143,19.872222,25.66,7.837513


- **Apply General split-apply-combine**
- *apply* is the most general-purpose GroupBy method
- apply splits the object being manipulated into pieces
- invokes the passed function on each piece
- then attempts to concatenate the pieces together

In [64]:
# A function that selects the rows with the largest values in a particular column
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Parameters
    ----------
    df : DataFrame to select rows from.
    n : number of rows to keep (default 5).
    column : label of the column to rank by (default 'tip_pct').

    Returns
    -------
    DataFrame with the selected rows, sorted ascending by ``column`` so the
    largest value comes last.
    """
    # tail(n) rather than [-n:]: the slice [-0:] is the same as [0:] and would
    # return every row when n == 0, whereas tail(0) correctly returns none.
    return df.sort_values(by=column).tail(n)
# The six rows of the whole tips dataset with the highest tip_pct
top(tips, n=6)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [65]:
# Grouping by sex and applying top
# The top function is called on each group of rows from the DataFrame,
#  and then the results are glued together using pandas.concat,
#  labeling the pieces with the group names
tips.groupby('sex').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Male,181,23.33,5.65,Male,Yes,Sun,Dinner,2,0.242177
Male,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
Male,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Male,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
Male,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345
Female,221,13.42,3.48,Female,Yes,Fri,Lunch,2,0.259314
Female,93,16.32,4.3,Female,Yes,Fri,Dinner,2,0.26348
Female,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
Female,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Female,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667


In [80]:
# Alternatively we can override the default argument values of top:
#  extra keyword arguments given to apply are passed on to the function
tips.groupby(['sex', 'day']).apply(top, n=1, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
sex,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Male,Thur,142,41.19,5.0,Male,No,Thur,Lunch,5,0.121389
Male,Fri,95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.11775
Male,Sat,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812
Male,Sun,156,48.17,5.0,Male,No,Sun,Dinner,6,0.103799
Female,Thur,197,43.11,5.0,Female,Yes,Thur,Lunch,4,0.115982
Female,Fri,94,22.75,3.25,Female,No,Fri,Dinner,2,0.142857
Female,Sat,102,44.3,2.5,Female,Yes,Sat,Dinner,3,0.056433
Female,Sun,11,35.26,5.0,Female,No,Sun,Dinner,4,0.141804


In [67]:
# We could also get descriptive stats for each group using apply
#  (describe is called once per sex group on the selected columns)
tips[['sex', 'tip_pct']].groupby(['sex']).apply(lambda x: x.describe())

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,count,157.0
Male,mean,0.157651
Male,std,0.064778
Male,min,0.035638
Male,25%,0.121389
Male,50%,0.153492
Male,75%,0.18624
Male,max,0.710345
Female,count,87.0
Female,mean,0.166491


In [69]:
# Disable grouping keys as index: with as_index=False the smoker labels are not used as index
tips.groupby(['smoker'], as_index=False).apply(top)

Unnamed: 0,Unnamed: 1,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
0,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
0,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
0,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
0,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345
1,88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
1,185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
1,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
1,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
1,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199


In [70]:
# Disable grouping keys: with group_keys=False the group labels are not added to the result index
tips.groupby(['sex'], group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
181,23.33,5.65,Male,Yes,Sun,Dinner,2,0.242177
149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345
221,13.42,3.48,Female,Yes,Fri,Lunch,2,0.259314
93,16.32,4.3,Female,Yes,Fri,Dinner,2,0.26348
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667


- *Quantile and Bucket Analysis*
- *cut* and *qcut* are tools in pandas for slicing data into buckets / bins
- These functionalities can be combined with *groupby* to perform bucket or quantile analysis on a dataset
- Categorical object returned by cut / qcut can be passed directly to groupby

In [71]:
# 1000 rows with two columns of standard-normal noise
df = pd.DataFrame(
    {
        'data1': np.random.randn(1000),
        'data2': np.random.randn(1000),
    }
)
# Bucket data1 into 4 bins with cut
quartiles = pd.cut(df['data1'], bins=4)
quartiles.value_counts()

(-1.551, 0.0619]    464
(0.0619, 1.675]     422
(-3.17, -1.551]      66
(1.675, 3.287]       48
Name: data1, dtype: int64

In [72]:
# Group data2 by the bins computed from data1
grouped = df.data2.groupby(quartiles)

In [73]:
# Several statistics of data2 per bin; unstack reshapes the table into a single Series
grouped.agg(['min', 'max', 'count', 'mean']).unstack()

       data1           
min    (-3.17, -1.551]      -2.250510
       (-1.551, 0.0619]     -2.777683
       (0.0619, 1.675]      -3.198538
       (1.675, 3.287]       -1.861298
max    (-3.17, -1.551]       2.102095
       (-1.551, 0.0619]      3.031695
       (0.0619, 1.675]       2.645037
       (1.675, 3.287]        2.510545
count  (-3.17, -1.551]      66.000000
       (-1.551, 0.0619]    464.000000
       (0.0619, 1.675]     422.000000
       (1.675, 3.287]       48.000000
mean   (-3.17, -1.551]      -0.001470
       (-1.551, 0.0619]     -0.092988
       (0.0619, 1.675]       0.074376
       (1.675, 3.287]        0.154933
dtype: float64

- **Pivot Tables and Cross-Tabulation**
- A *pivot table* is a data summarization tool
- It aggregates a table of data by one or more keys
- It produces a result with some of the group keys along the rows and others along the columns
- Pivot tables in Python with pandas are enabled through *groupby* facility combined with *reshape* operations utilizing hierarchical indexing
- DataFrame has a *pivot_table* instance method along with the *pd.pivot_table* top-level pandas method
- pivot tables can also use partial totals called *margins*

In [74]:
# Computing the group means of the tips DataFrame, arranged by day and sex.
#  'mean' is the default aggregation type of pivot_table. The numeric columns
#  are listed explicitly because pandas >= 2.0 raises on the categorical
#  columns (smoker, time) instead of silently dropping them.
tips.pivot_table(index=['day', 'sex'], values=['size', 'tip', 'tip_pct', 'total_bill'])

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
day,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Thur,Male,2.433333,2.980333,0.165276,18.714667
Thur,Female,2.46875,2.575625,0.157525,16.715312
Fri,Male,2.1,2.693,0.143385,19.857
Fri,Female,2.111111,2.781111,0.199388,14.145556
Sat,Male,2.644068,3.083898,0.151577,20.802542
Sat,Female,2.25,2.801786,0.15647,19.680357
Sun,Male,2.810345,3.220345,0.162344,21.887241
Sun,Female,2.944444,3.367222,0.181569,19.872222


In [81]:
# Suppose we want to aggregate over tip_pct and size and index by time and day.
#  We also want to differentiate the results based on sex (one column group per sex).
#  Furthermore, let us aggregate with count, sum and mean
tips.pivot_table(aggfunc=['count', 'sum', 'mean'], values=['tip_pct', 'size'], columns='sex', index=['time', 'day'])


Unnamed: 0_level_0,Unnamed: 1_level_0,count,count,count,count,sum,sum,sum,sum,mean,mean,mean,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,size,size,tip_pct,tip_pct,size,size,tip_pct,tip_pct,size,size,tip_pct,tip_pct
Unnamed: 0_level_2,sex,Male,Female,Male,Female,Male,Female,Male,Female,Male,Female,Male,Female
time,day,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
Dinner,Fri,7,5,7,5,16,10,0.91142,0.995573,2.285714,2.0,0.130203,0.199115
Dinner,Sat,59,28,59,28,156,63,8.943033,4.381166,2.644068,2.25,0.151577,0.15647
Dinner,Sun,58,18,58,18,163,53,9.415956,3.268238,2.810345,2.944444,0.162344,0.181569
Dinner,Thur,0,1,0,1,0,2,0.0,0.159744,,2.0,,0.159744
Lunch,Fri,3,4,3,4,5,9,0.522432,0.798922,1.666667,2.25,0.174144,0.199731
Lunch,Sat,0,0,0,0,0,0,0.0,0.0,,,,
Lunch,Sun,0,0,0,0,0,0,0.0,0.0,,,,
Lunch,Thur,30,31,30,31,73,77,4.958295,4.88105,2.433333,2.483871,0.165276,0.157453


- Pivot table options

<img src='images/pivot_options.png' width=600>

- *Cross-Tabulations: Crosstab*
- A cross-tabulation (or *crosstab*) is a special case of a pivot table that computes group frequencies

In [82]:
# Frequency table: rows keyed by (time, day, smoker), columns by sex; margins=True is requested for totals
pd.crosstab([tips.time, tips.day, tips.smoker], tips.sex, margins=True)


Unnamed: 0_level_0,Unnamed: 1_level_0,sex,Male,Female,All
time,day,smoker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Lunch,Thur,Yes,10,7,17
Lunch,Thur,No,20,24,44
Lunch,Fri,Yes,3,3,6
Lunch,Fri,No,0,1,1
Dinner,Thur,No,0,1,1
Dinner,Fri,Yes,5,4,9
Dinner,Fri,No,2,1,3
Dinner,Sat,Yes,27,15,42
Dinner,Sat,No,32,13,45
Dinner,Sun,Yes,15,4,19
