In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np

In [8]:
# groupby mechanics
"""
Hadley Wickham, an author of many popular packages for the R programming language,
coined the term split-apply-combine for describing group operations. In the
first stage of the process, data contained in a pandas object, whether a Series, Data‐
Frame, or otherwise, is split into groups based on one or more keys that you provide.
The splitting is performed on a particular axis of an object. For example, a DataFrame
can be grouped on its rows (axis=0) or its columns (axis=1). Once this is done, a
function is applied to each group, producing a new value. Finally, the results of all
those function applications are combined into a result object. The form of the resulting
object will usually depend on what’s being done to the data. See Figure 10-1 for a
mockup of a simple group aggregation.
"""
# Small demo frame: two label columns to group on, two random numeric columns to aggregate.
df = pd.DataFrame({
    "key1": list("aabba"),
    "key2": ["one", "two", "one", "two", "one"],
    "data1": np.random.randn(5),
    "data2": np.random.randn(5),
})
df
# Mean of data1 per key1 label: group the column by the aligned key1 Series.
grouped = df["data1"].groupby(df["key1"])
grouped
grouped.mean()
# Passing a list of two key Series produces a Series whose hierarchical index
# is made of the unique (key1, key2) pairs observed in the data.
means = df["data1"].groupby([df["key1"], df["key2"]]).mean()
means
# Pivot the inner index level (key2) into columns.
means.unstack()
# Group keys do not have to come from the frame: any arrays of the right length work.
states = np.array(["Ohio", "California", "California", "Ohio", "Ohio"])
years = np.array([2005, 2005, 2006, 2005, 2006])
df["data1"].groupby([states, years]).mean()

'\nHadley Wickham, an author of many popular packages for the R programming language,\ncoined the term split-apply-combine for describing group operations. In the\nfirst stage of the process, data contained in a pandas object, whether a Series, Data‐\nFrame, or otherwise, is split into groups based on one or more keys that you provide.\nThe splitting is performed on a particular axis of an object. For example, a DataFrame\ncan be grouped on its rows (axis=0) or its columns (axis=1). Once this is done, a\nfunction is applied to each group, producing a new value. Finally, the results of all\nthose function applications are combined into a result object. The form of the resulting\nobject will usually depend on what’s being done to the data. See Figure 10-1 for a\nmockup of a simple group aggregation.\n'

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.509412,-0.943135
1,a,two,1.21212,-1.945666
2,b,one,0.575787,-1.237088
3,b,two,-0.211076,-0.054871
4,a,one,-0.539419,0.801023


<pandas.core.groupby.groupby.SeriesGroupBy object at 0x000001950DB81630>

key1
a    0.394037
b    0.182356
Name: data1, dtype: float64

key1  key2
a     one    -0.015004
      two     1.212120
b     one     0.575787
      two    -0.211076
Name: data1, dtype: float64

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.015004,1.21212
b,0.575787,-0.211076


California  2005    1.212120
            2006    0.575787
Ohio        2005    0.149168
            2006   -0.539419
Name: data1, dtype: float64

In [11]:
# Frequently the grouping information is found in the same DataFrame as the data you
# want to work on, so column names can be passed directly as the group keys.
df
# 'key2' holds strings and cannot be averaged. Older pandas silently dropped such
# non-numeric columns from the result; pandas 2.0+ raises TypeError instead, so the
# aggregation is restricted to numeric columns explicitly. The result is unchanged:
# only data1 and data2 are aggregated and 'key2' is excluded.
df.groupby('key1').mean(numeric_only=True)
# Both string columns are consumed as group keys here, leaving only numeric data to sum.
df.groupby(['key1', 'key2']).sum()
# size() reports the number of rows in each (key1, key2) group.
df.groupby(['key1', 'key2']).size()

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.509412,-0.943135
1,a,two,1.21212,-1.945666
2,b,one,0.575787,-1.237088
3,b,two,-0.211076,-0.054871
4,a,one,-0.539419,0.801023


Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.394037,-0.695926
b,0.182356,-0.645979


Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.030007,-0.142113
a,two,1.21212,-1.945666
b,one,0.575787,-1.237088
b,two,-0.211076,-0.054871


key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [17]:
# iterating over groups
# Iterating a GroupBy object yields 2-tuples of (group name, chunk of data).
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')
# With several keys, the first element of each pair is a tuple of the key values.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2), group, sep='\n')
# Collect the chunks into a dict keyed by group name.
pieces = {label: chunk for label, chunk in df.groupby('key1')}
pieces['b']
type(pieces['b'])

a
  key1 key2     data1     data2
0    a  one  0.509412 -0.943135
1    a  two  1.212120 -1.945666
4    a  one -0.539419  0.801023
b
  key1 key2     data1     data2
2    b  one  0.575787 -1.237088
3    b  two -0.211076 -0.054871
('a', 'one')
  key1 key2     data1     data2
0    a  one  0.509412 -0.943135
4    a  one -0.539419  0.801023
('a', 'two')
  key1 key2    data1     data2
1    a  two  1.21212 -1.945666
('b', 'one')
  key1 key2     data1     data2
2    b  one  0.575787 -1.237088
('b', 'two')
  key1 key2     data1     data2
3    b  two -0.211076 -0.054871


Unnamed: 0,key1,key2,data1,data2
2,b,one,0.575787,-1.237088
3,b,two,-0.211076,-0.054871


pandas.core.frame.DataFrame

In [18]:
# By default groupby splits along the rows (axis=0), but the columns can be grouped
# as well; for example, we could group the columns of our example df by dtype.
df.dtypes
# DataFrame.groupby(..., axis=1) is deprecated in recent pandas (2.1+). The supported
# spelling is to transpose so the column labels become the index, then group those
# rows by the dtype Series (which is indexed by the same column labels).
grouped = df.T.groupby(df.dtypes)
for dtype, group in grouped:
    print(dtype)
    # group.index holds the column labels sharing this dtype. Select them from the
    # original frame so every column is printed with its own dtype (the transposed
    # chunk of this mixed-type frame is all-object).
    print(df[group.index])


key1      object
key2      object
data1    float64
data2    float64
dtype: object

float64
      data1     data2
0  0.509412 -0.943135
1  1.212120 -1.945666
2  0.575787 -1.237088
3 -0.211076 -0.054871
4 -0.539419  0.801023
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


In [27]:
# selecting a column or subset of columns
# Indexing a GroupBy with a list or array of column names gives a grouped DataFrame;
# indexing it with a single scalar column name gives a grouped Series.
df.groupby("key1")["data1"]
df.groupby("key1")[["data2"]]
by_both_keys = df.groupby(["key1", "key2"])
by_both_keys[["data2"]].mean()
data2_means = by_both_keys["data2"].mean()
data2_means    # Series with a hierarchical index
data2_means.unstack()

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x000001950DBC4E80>

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x000001950DB818D0>

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.071056
a,two,-1.945666
b,one,-1.237088
b,two,-0.054871


key1  key2
a     one    -0.071056
      two    -1.945666
b     one    -1.237088
      two    -0.054871
Name: data2, dtype: float64

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.071056,-1.945666
b,-1.237088,-0.054871


In [29]:
# grouping with dicts and series
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.iloc[2:3, [1, 2]] = np.nan  # blank out the 'b' and 'c' values in the 'Wes' row
people
# Now, suppose I have a group correspondence for the columns and want to sum together
# the columns by group. The 'f' entry matches no column and is simply unused.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f': 'orange'}
# DataFrame.groupby(..., axis=1) is deprecated in recent pandas (2.1+). Instead,
# transpose so the column labels become the index, group those rows through the
# mapping, and transpose the aggregate back to the original orientation.
by_column = people.T.groupby(mapping)
by_column.sum().T
# A Series works the same way as the dict: it is treated as a fixed-size mapping.
map_series = pd.Series(mapping)
map_series
# count() tallies the non-null values per group, so the 'Wes' row shows fewer.
people.T.groupby(map_series).count().T

Unnamed: 0,a,b,c,d,e
Joe,0.517583,0.616527,0.423045,-1.010504,0.876073
Steve,1.214288,0.670559,-2.831925,-0.883119,-0.132199
Wes,1.165321,,,-1.041807,0.507211
Jim,-0.415641,1.04363,-0.935182,0.496808,0.361499
Travis,0.611651,0.678526,0.913893,1.444755,-2.243905


Unnamed: 0,blue,red
Joe,-0.587459,2.010182
Steve,-3.715044,1.752647
Wes,-1.041807,1.672531
Jim,-0.438374,0.989488
Travis,2.358648,-0.953728


a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3
