In [6]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np

In [2]:
# groupby mechanics
"""
Hadley Wickham, an author of many popular packages for the R programming language,
coined the term split-apply-combine for describing group operations. In the
first stage of the process, data contained in a pandas object, whether a Series, Data‐
Frame, or otherwise, is split into groups based on one or more keys that you provide.
The splitting is performed on a particular axis of an object. For example, a DataFrame
can be grouped on its rows (axis=0) or its columns (axis=1). Once this is done, a
function is applied to each group, producing a new value. Finally, the results of all
those function applications are combined into a result object. The form of the resulting
object will usually depend on what’s being done to the data. See Figure 10-1 for a
mockup of a simple group aggregation.
"""
# Toy frame: two label columns (key1, key2) and two columns of random normal draws.
# NOTE(review): the random generator is not seeded here, so data1/data2 differ on every run.
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})
df
# Suppose you wanted to compute the mean of the data1 column using the labels from key1:
# group the data1 Series by the key1 Series, then aggregate the GroupBy object.
grouped = df['data1'].groupby(df['key1'])
grouped
grouped.mean()
# Passing a list of Series groups by both keys at once.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means
# Here we grouped the data using two keys, and the resulting Series now has a hierarchical
# index consisting of the unique pairs of keys observed.
# unstack() moves the inner level (key2) into the columns.
means.unstack()
# Group keys can be any arrays of the right length; they need not be columns of df.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()

'\nHadley Wickham, an author of many popular packages for the R programming language,\ncoined the term split-apply-combine for describing group operations. In the\nfirst stage of the process, data contained in a pandas object, whether a Series, Data‐\nFrame, or otherwise, is split into groups based on one or more keys that you provide.\nThe splitting is performed on a particular axis of an object. For example, a DataFrame\ncan be grouped on its rows (axis=0) or its columns (axis=1). Once this is done, a\nfunction is applied to each group, producing a new value. Finally, the results of all\nthose function applications are combined into a result object. The form of the resulting\nobject will usually depend on what’s being done to the data. See Figure 10-1 for a\nmockup of a simple group aggregation.\n'

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.884562,-0.069589
1,a,two,0.061619,-0.293185
2,b,one,0.862687,-0.561847
3,b,two,-0.578451,0.663522
4,a,one,1.079134,0.228102


<pandas.core.groupby.groupby.SeriesGroupBy object at 0x000001B0ACECFA20>

key1
a   -0.247937
b    0.142118
Name: data1, dtype: float64

key1  key2
a     one    -0.402714
      two     0.061619
b     one     0.862687
      two    -0.578451
Name: data1, dtype: float64

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.402714,0.061619
b,0.862687,-0.578451


California  2005    0.061619
            2006    0.862687
Ohio        2005   -1.231507
            2006    1.079134
Name: data1, dtype: float64

In [3]:
# Frequently the grouping information is found in the same DataFrame as the data you want to work on,
# so column names can be passed directly as the group keys.
df
# Only the numeric columns can be averaged. 'key2' holds strings, so restrict the mean to
# numeric columns explicitly: pandas >= 2.0 raises TypeError instead of silently dropping it.
df.groupby('key1').mean(numeric_only=True)
# Grouping by both label columns leaves only the numeric data columns to aggregate.
df.groupby(['key1', 'key2']).sum()
# size() reports the number of rows in each group.
df.groupby(['key1', 'key2']).size()

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.884562,-0.069589
1,a,two,0.061619,-0.293185
2,b,one,0.862687,-0.561847
3,b,two,-0.578451,0.663522
4,a,one,1.079134,0.228102


Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.247937,-0.044891
b,0.142118,0.050838


Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.805428,0.158513
a,two,0.061619,-0.293185
b,one,0.862687,-0.561847
b,two,-0.578451,0.663522


key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [4]:
# Iterating over groups
# The GroupBy object supports iteration, generating a sequence of 2-tuples containing
# the group name along with the chunk of data.
for name, group in df.groupby('key1'):
    print(name)
    print(group)
# With several keys, the first element of each pair is itself a tuple of key values.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)
# Materialize the (name, chunk) pairs into a dict keyed by group name.
pieces = dict(list(df.groupby('key1')))
pieces['b']
type(pieces['b'])

a
  key1 key2     data1     data2
0    a  one -1.884562 -0.069589
1    a  two  0.061619 -0.293185
4    a  one  1.079134  0.228102
b
  key1 key2     data1     data2
2    b  one  0.862687 -0.561847
3    b  two -0.578451  0.663522
('a', 'one')
  key1 key2     data1     data2
0    a  one -1.884562 -0.069589
4    a  one  1.079134  0.228102
('a', 'two')
  key1 key2     data1     data2
1    a  two  0.061619 -0.293185
('b', 'one')
  key1 key2     data1     data2
2    b  one  0.862687 -0.561847
('b', 'two')
  key1 key2     data1     data2
3    b  two -0.578451  0.663522


Unnamed: 0,key1,key2,data1,data2
2,b,one,0.862687,-0.561847
3,b,two,-0.578451,0.663522


pandas.core.frame.DataFrame

In [5]:
# By default groupby groups on axis=0 (the rows). To group the columns of our example by
# dtype, group the rows of the transposed frame instead: groupby(..., axis=1) is deprecated
# since pandas 2.1.
df.dtypes
grouped = df.T.groupby(df.dtypes)
for dtype, group in grouped:
    print(dtype)
    # Transpose each chunk back to the original orientation. The transposed frame is
    # object-typed, so infer_objects() restores the numeric dtypes before printing.
    print(group.T.infer_objects())


key1      object
key2      object
data1    float64
data2    float64
dtype: object

float64
      data1     data2
0 -1.884562 -0.069589
1  0.061619 -0.293185
2  0.862687 -0.561847
3 -0.578451  0.663522
4  1.079134  0.228102
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


In [6]:
# Selecting a column or subset of columns
# The object returned by this indexing operation is a grouped DataFrame if a list or
# array is passed, or a grouped Series if only a single column name is passed as a scalar.
# Scalar column name:
df.groupby('key1')['data1']
# List of column names:
df.groupby('key1')[['data2']]
# Aggregating the list selection gives a DataFrame with a single data2 column.
df.groupby(['key1', 'key2'])[['data2']].mean()
df.groupby(['key1', 'key2'])['data2'].mean()    # Series with a hierarchical index
# unstack() moves the inner key2 level into the columns.
df.groupby(['key1', 'key2'])['data2'].mean().unstack()

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x000001B0BF26D358>

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x000001B0BF26D6A0>

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.079256
a,two,-0.293185
b,one,-0.561847
b,two,0.663522


key1  key2
a     one     0.079256
      two    -0.293185
b     one    -0.561847
      two     0.663522
Name: data2, dtype: float64

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.079256,-0.293185
b,-0.561847,0.663522


In [7]:
# Grouping with dicts and Series
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Blank out two of Wes's values so the aggregations below have missing data to skip.
people.iloc[2:3, [1, 2]] = np.nan
people
# Now, suppose I have a group correspondence for the columns and want to sum together
# the columns by group. Keys that match no column ('f') are simply unused.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f': 'orange'}
# groupby(..., axis=1) is deprecated since pandas 2.1: group the rows of the transposed
# frame instead and transpose the aggregate back to people-by-colour.
by_column = people.T.groupby(mapping)
by_column.sum().T
# The same correspondence expressed as a Series works as a group key too.
map_series = pd.Series(mapping)
map_series
people.T.groupby(map_series).count().T

Unnamed: 0,a,b,c,d,e
Joe,-0.272052,1.114006,-0.717385,0.0124,2.295058
Steve,-1.748421,0.017406,-0.009816,0.839375,0.400156
Wes,-0.047105,,,-1.724361,0.187308
Jim,-0.424527,2.5165,1.007408,-0.238226,-0.160591
Travis,-0.578827,1.010006,0.174218,-0.458801,0.458296


Unnamed: 0,blue,red
Joe,-0.704986,3.137012
Steve,0.829558,-1.330859
Wes,-1.724361,0.140203
Jim,0.769182,1.931382
Travis,-0.284583,0.889476


a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


In [8]:
# Grouping with functions
# A function passed as the key is called on each index value and its return value becomes
# the group name; here the rows are grouped by the length of the person's name.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-0.743683,3.630506,0.290023,-1.950187,2.321775
5,-1.748421,0.017406,-0.009816,0.839375,0.400156
6,-0.578827,1.010006,0.174218,-0.458801,0.458296


In [9]:
# Grouping by index levels
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                     [1, 3, 5, 1, 3]],
                                    names=['cty', 'tenor'])
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df
# The hierarchical levels live on the columns. groupby(..., axis=1) is deprecated since
# pandas 2.1, so group the transposed frame by level name and transpose the result back.
hier_df.T.groupby(level='cty').count().T
hier_df.T.groupby(level='tenor').sum().T

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-1.263334,0.50459,-0.92398,-1.786098,1.351311
1,1.089289,0.051636,-0.150884,1.367216,-0.947831
2,1.127189,-1.371314,0.570892,-0.797458,0.194925
3,-0.368907,0.534797,-1.451133,0.335597,1.808383


cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


tenor,1,3,5
0,-3.049433,1.8559,-0.92398
1,2.456504,-0.896195,-0.150884
2,0.329731,-1.176389,0.570892
3,-0.03331,2.34318,-1.451133


In [10]:
# Data Aggregation
df
# Group once and reuse the GroupBy object for several aggregations.
grouped = df.groupby('key1')
grouped['data1'].sum()
# quantile(0.9) reports the 0.9 quantile of data1 within each group.
grouped['data1'].quantile(0.9)
# To use your own aggregation functions, pass any function that aggregates an array to the aggregate or agg method
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest


# agg() applies the user-defined function to the data1 values of each group.
grouped['data1'].agg(peak_to_peak)
# describe() also works on grouped data; it reports several summary statistics per group
# rather than a single aggregated value.
grouped.describe()

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.884562,-0.069589
1,a,two,0.061619,-0.293185
2,b,one,0.862687,-0.561847
3,b,two,-0.578451,0.663522
4,a,one,1.079134,0.228102


key1
a   -0.743810
b    0.284236
Name: data1, dtype: float64

key1
a    0.875631
b    0.718573
Name: data1, dtype: float64

key1
a    2.963697
b    1.441138
Name: data1, dtype: float64

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,-0.247937,1.505903,-1.884562,-0.911472,0.061619,0.570376,1.079134,3.0,-0.044891,0.261519,-0.293185,-0.181387,-0.069589,0.079256,0.228102
b,2.0,0.142118,1.019038,-0.578451,-0.218167,0.142118,0.502402,0.862687,2.0,0.050838,0.866466,-0.561847,-0.255504,0.050838,0.35718,0.663522


In [11]:
# Column-wise and multiple function application
tips = pd.read_csv('examples/tips.csv')
# Tip expressed as a fraction of the total bill.
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips[:6]
grouped = tips.groupby(['day', 'smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
# A list of functions yields one column per function, named after the function.
grouped_pct.agg(['mean', 'std', peak_to_peak])
# (name, function) tuples set the column names explicitly. The aggregation is named by the
# string 'std' rather than np.std: pandas substitutes its own groupby std for np.std anyway,
# and passing the NumPy callable emits a FutureWarning on pandas >= 2.1.
grouped_pct.agg([('foo', 'mean'), ('bar', 'std')])


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808
5,25.29,4.71,Male,No,Sun,Dinner,4,0.18624


day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [12]:
# A dict maps column names to the aggregation(s) to apply to each column.
# 'max' is passed by name rather than as np.max: pandas substitutes its own groupby max
# for the NumPy callable and warns about that substitution on pandas >= 2.1.
grouped.agg({'tip': 'max', 'size': 'sum'})
# A list of aggregations for one column produces hierarchical result columns.
grouped.agg({'tip_pct': ['min', 'max', 'mean', 'std'],
             'size': 'sum'})



Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3.5,9
Fri,Yes,4.73,31
Sat,No,9.0,115
Sat,Yes,10.0,104
Sun,No,6.0,167
Sun,Yes,6.5,49
Thur,No,6.7,112
Thur,Yes,5.0,40


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,0.120385,0.187735,0.15165,0.028123,9
Fri,Yes,0.103555,0.26348,0.174783,0.051293,31
Sat,No,0.056797,0.29199,0.158048,0.039767,115
Sat,Yes,0.035638,0.325733,0.147906,0.061375,104
Sun,No,0.059447,0.252672,0.160113,0.042347,167
Sun,Yes,0.06566,0.710345,0.18725,0.154134,49
Thur,No,0.072961,0.266312,0.160298,0.038774,112
Thur,Yes,0.090014,0.241255,0.163863,0.039389,40


In [13]:
# Returning aggregated data without row indexes
# tips also holds string columns (sex, time) that cannot be averaged, so restrict the mean
# to numeric columns explicitly; pandas >= 2.0 raises TypeError otherwise.
# as_index=False keeps the group keys as ordinary columns instead of a hierarchical index.
tips.groupby(['day', 'smoker'], as_index=False).mean(numeric_only=True)
tips.groupby(['day', 'smoker']).mean(numeric_only=True)

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,size,tip_pct
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,18.42,2.8125,2.25,0.15165
Fri,Yes,16.813333,2.714,2.066667,0.174783
Sat,No,19.661778,3.102889,2.555556,0.158048
Sat,Yes,21.276667,2.875476,2.47619,0.147906
Sun,No,20.506667,3.167895,2.929825,0.160113
Sun,Yes,24.12,3.516842,2.578947,0.18725
Thur,No,17.113111,2.673778,2.488889,0.160298
Thur,Yes,19.190588,3.03,2.352941,0.163863


In [19]:
"""
The most general-purpose GroupBy method is apply, which is the subject of the rest
of this section. As illustrated in Figure 10-2, apply splits the object being manipulated
into pieces, invokes the passed function on each piece, and then attempts to concatenate
the pieces together.
"""
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Parameters
    ----------
    df : DataFrame
        Frame to select rows from.
    n : int, default 5
        Number of rows to keep.
    column : str, default 'tip_pct'
        Column to rank the rows by.

    Returns
    -------
    DataFrame
        The selected rows sorted ascending by ``column``, so the largest
        value comes last.
    """
    # tail(n) rather than [-n:]: the slice [-0:] selects every row, so n=0
    # used to return the whole frame instead of an empty one.
    return df.sort_values(by=column).tail(n)


# The six rows with the largest tip_pct over the whole data set.
top(tips, n=6)
# The same selection (default n and column) made separately within each smoker group.
tips.groupby('smoker').apply(top)
"""
the top function is called on each row group from the DataFrame, then the results are glued together using pandas.concat
"""
# Arguments given after the function are forwarded to it on every call.
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')
# describe() on a grouped column yields one row of summary statistics per group.
result = tips.groupby('smoker')['tip_pct'].describe()
result
# Reshape into a Series indexed by (statistic, smoker).
result.unstack('smoker')
# Suppressing group keys: group_keys=False leaves the group labels out of the result index.
tips.groupby('smoker', group_keys=False).apply(top)

'\nThe most general-purpose GroupBy method is apply, which is the subject of the rest\nof this section. As illustrated in Figure 10-2, apply splits the object being manipulated\ninto pieces, invokes the passed function on each piece, and then attempts to concatenate\nthe pieces together.\n'

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


'\nthe top function is called on each row group from the DataFrame, then the results are glued together using pandas.concat\n'

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
No,Fri,94,22.75,3.25,Female,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,Male,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,Male,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Male,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Female,Yes,Thur,Lunch,4,0.115982


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

In [24]:
# Quantile and bucket analysis
# Slice data up into buckets with bins.
# NOTE(review): the random generator is not seeded, so the bin edges change on every run.
frame = pd.DataFrame({'data1': np.random.randn(1000),
                      'data2': np.random.randn(1000)})
# pd.cut with an integer splits data1 into that many equal-length intervals.
quartiles = pd.cut(frame.data1, 4)
quartiles[:10]


def get_stats(group):
    """Summarize ``group`` as a dict holding its min, max, count and mean."""
    stat_names = ('min', 'max', 'count', 'mean')
    return {stat: getattr(group, stat)() for stat in stat_names}


# Group data2 by the data1 bucket each row falls into, then summarize every bucket.
grouped = frame.data2.groupby(quartiles)
grouped.apply(get_stats).unstack()
# Those were equal-length buckets.
# Use qcut to compute equal-size buckets instead; labels=False returns the bucket
# numbers rather than the interval labels.
grouping = pd.qcut(frame.data1, 10, labels=False)
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

0    (-1.479, 0.0931]
1    (-3.058, -1.479]
2    (-1.479, 0.0931]
3    (-1.479, 0.0931]
4    (-1.479, 0.0931]
5    (-1.479, 0.0931]
6    (-3.058, -1.479]
7    (-1.479, 0.0931]
8     (0.0931, 1.666]
9    (-1.479, 0.0931]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.058, -1.479] < (-1.479, 0.0931] < (0.0931, 1.666] < (1.666, 3.238]]

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.058, -1.479]",68.0,1.877046,-0.086183,-2.681646
"(-1.479, 0.0931]",477.0,2.767757,-0.065827,-3.093511
"(0.0931, 1.666]",408.0,2.903425,-0.032802,-3.428837
"(1.666, 3.238]",47.0,2.056882,-0.243854,-2.26424


Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,100.0,2.124823,-0.075581,-2.681646
1,100.0,2.223855,-0.145973,-3.093511
2,100.0,2.767757,-0.025191,-2.976334
3,100.0,2.328294,-0.030459,-2.675267
4,100.0,2.332634,-0.0474,-2.404348
5,100.0,2.545211,-0.226247,-2.531758
6,100.0,2.565502,0.077436,-3.428837
7,100.0,1.932389,-0.028386,-2.956624
8,100.0,2.903425,-0.000322,-2.719489
9,100.0,2.056882,-0.11892,-2.26424


In [32]:
# Example: filling missing values with group-specific values
s = pd.Series(np.random.randn(6))
# Knock out every other value so there is something to fill.
s[::2] = np.nan
s
# Simplest case: fill with the mean of the values that remain.
s.fillna(s.mean())
states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']
# First four states are labelled East, last four West.
group_key = ['East'] * 4 + ['West'] * 4
data = pd.Series(np.random.randn(8), index=states)
data
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data
data.groupby(group_key).mean()
# Fill each group's gaps with that group's own mean.
fill_mean = lambda g: g.fillna(g.mean())
# group_keys=False keeps the result indexed by state only: pandas >= 2.0 otherwise
# prepends the East/West label as an extra index level.
data.groupby(group_key, group_keys=False).apply(fill_mean)
# Alternatively use a predefined fill value per group; each chunk passed to apply
# carries its group label in .name.
fill_values = {'East': 0.5, 'West': -1}
fill_func = lambda g: g.fillna(fill_values[g.name])
data.groupby(group_key, group_keys=False).apply(fill_func)

0         NaN
1    0.893306
2         NaN
3    0.866907
4         NaN
5    0.483354
dtype: float64

0    0.747856
1    0.893306
2    0.747856
3    0.866907
4    0.747856
5    0.483354
dtype: float64

Ohio          0.863587
New York      0.029725
Vermont       0.745976
Florida       2.327391
Oregon        1.145853
Nevada        0.212859
California   -0.743920
Idaho        -0.703130
dtype: float64

Ohio          0.863587
New York      0.029725
Vermont            NaN
Florida       2.327391
Oregon        1.145853
Nevada             NaN
California   -0.743920
Idaho              NaN
dtype: float64

East    1.073568
West    0.200966
dtype: float64

Ohio          0.863587
New York      0.029725
Vermont       1.073568
Florida       2.327391
Oregon        1.145853
Nevada        0.200966
California   -0.743920
Idaho         0.200966
dtype: float64

Ohio          0.863587
New York      0.029725
Vermont       0.500000
Florida       2.327391
Oregon        1.145853
Nevada       -1.000000
California   -0.743920
Idaho        -1.000000
dtype: float64

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np

In [7]:
# Hearts, Spades, Clubs, Diamonds
suits = ['H', 'S', 'C', 'D']
# Values for one suit: ace=1, 2-10 at face value, three face cards worth 10; repeated per suit.
card_val = (list(range(1, 11)) + [10] * 3) * len(suits)
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
# Card names are rank followed by suit letter, suit by suit, in the same order as card_val.
# The suits list drives both the names and the values so the two cannot drift apart.
cards = [str(name) + suit for suit in suits for name in base_names]

deck = pd.Series(card_val, index=cards)
deck[:13]

def draw(deck, n=5, random_state=None):
    """Draw ``n`` cards from ``deck`` at random, without replacement.

    Parameters
    ----------
    deck : Series
        Cards to draw from.
    n : int, default 5
        Number of cards to draw.
    random_state : optional
        Seed or random state forwarded to ``Series.sample``. Pass a value to
        make the draw reproducible; the default ``None`` keeps the original
        unseeded behaviour.

    Returns
    -------
    Series
        The ``n`` sampled entries of ``deck``.
    """
    return deck.sample(n, random_state=random_state)


# Draw a random hand of the default size.
draw(deck)
# The suit is the last character of each card name.
get_suit = lambda card: card[-1]
# Group the deck by suit and draw two cards from each suit.
deck.groupby(get_suit).apply(draw, n=2)


AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64

8D    8
4H    4
8H    8
2C    2
7H    7
dtype: int64

C  9C      9
   2C      2
D  10D    10
   QD     10
H  4H      4
   2H      2
S  QS     10
   6S      6
dtype: int64

In [9]:
# Group weighted average
# NOTE(review): the original heading also mentions correlation, but this cell only computes
# a weighted average.
# df and grouped are rebound here, replacing the objects used in the earlier cells.
df = pd.DataFrame({'category': ['a', 'a', 'a', 'a',
                                'b', 'b', 'b', 'b'],
                   'data': np.random.randn(8),
                   'weights': np.random.rand(8)})
df
grouped = df.groupby('category')
# Average of each group's data column, weighted by that group's weights column.
get_wavg = lambda g: np.average(g['data'], weights=g['weights'])
grouped.apply(get_wavg)


Unnamed: 0,category,data,weights
0,a,-2.136617,0.548933
1,a,-0.525955,0.747061
2,a,1.194383,0.446415
3,a,-1.046036,0.780008
4,b,-0.739207,0.419755
5,b,0.233295,0.916794
6,b,-0.151551,0.237077
7,b,1.756796,0.579137


category
a   -0.732832
b    0.411143
dtype: float64

In [19]:
# pivot tables and cross-Tabulation
"""
A pivot table is a data summarization tool frequently found in spreadsheet programs
and other data analysis software. It aggregates a table of data by one or more keys,
arranging the data in a rectangle with some of the group keys along the rows and
some along the columns. Pivot tables in Python with pandas are made possible
through the groupby facility described in this chapter combined with reshape operations
utilizing hierarchical indexing. DataFrame has a pivot_table method, and
there is also a top-level pandas.pivot_table function. In addition to providing a
convenience interface to groupby, pivot_table can add partial totals, also known as
margins.
"""
tips = pd.read_csv('examples/tips.csv')
# Group means arranged by day and smoker. The value columns are listed explicitly because
# the default mean cannot be applied to the remaining string columns (sex, time);
# pandas >= 2.0 raises instead of silently dropping them.
tips.pivot_table(index=['day', 'smoker'], values=['size', 'tip', 'total_bill'])
tips['tip_pct'] = tips['tip'] / tips['total_bill']
# Restrict the aggregation to two columns and spread smoker across the table columns.
tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'], columns='smoker')
# We could augment this table to include partial totals by passing margins=True
tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'], columns='smoker', margins=True)
# Pass aggfunc= to use a different aggregation function; 'count' gives group sizes.
tips.pivot_table(['tip_pct'], index=['time', 'smoker'], columns='day', aggfunc='count', margins=True)
# fill_value replaces the empty combinations with 0 instead of NaN.
tips.pivot_table(['tip_pct'], index=['time', 'smoker', 'size'], columns='day', aggfunc='mean', fill_value=0)

'\nA pivot table is a data summarization tool frequently found in spreadsheet programs\nand other data analysis software. It aggregates a table of data by one or more keys,\narranging the data in a rectangle with some of the group keys along the rows and\nsome along the columns. Pivot tables in Python with pandas are made possible\nthrough the groupby facility described in this chapter combined with reshape operations\nutilizing hierarchical indexing. DataFrame has a pivot_table method, and\nthere is also a top-level pandas.pivot_table function. In addition to providing a\nconvenience interface to groupby, pivot_table can add partial totals, also known as\nmargins.\n'

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,total_bill
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,2.25,2.8125,18.42
Fri,Yes,2.066667,2.714,16.813333
Sat,No,2.555556,3.102889,19.661778
Sat,Yes,2.47619,2.875476,21.276667
Sun,No,2.929825,3.167895,20.506667
Sun,Yes,2.578947,3.516842,24.12
Thur,No,2.488889,2.673778,17.113111
Thur,Yes,2.352941,3.03,19.190588


Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,No,Yes
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Dinner,Fri,2.0,2.222222,0.139622,0.165347
Dinner,Sat,2.555556,2.47619,0.158048,0.147906
Dinner,Sun,2.929825,2.578947,0.160113,0.18725
Dinner,Thur,2.0,,0.159744,
Lunch,Fri,3.0,1.833333,0.187735,0.188937
Lunch,Thur,2.5,2.352941,0.160311,0.163863


Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Dinner,Fri,2.0,2.222222,2.166667,0.139622,0.165347,0.158916
Dinner,Sat,2.555556,2.47619,2.517241,0.158048,0.147906,0.153152
Dinner,Sun,2.929825,2.578947,2.842105,0.160113,0.18725,0.166897
Dinner,Thur,2.0,,2.0,0.159744,,0.159744
Lunch,Fri,3.0,1.833333,2.0,0.187735,0.188937,0.188765
Lunch,Thur,2.5,2.352941,2.459016,0.160311,0.163863,0.161301
All,,2.668874,2.408602,2.569672,0.159328,0.163196,0.160803


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,day,Fri,Sat,Sun,Thur,All
time,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Dinner,No,3.0,45.0,57.0,1.0,106
Dinner,Yes,9.0,42.0,19.0,,70
Lunch,No,1.0,,,44.0,45
Lunch,Yes,6.0,,,17.0,23
All,,19.0,87.0,76.0,62.0,244


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tip_pct,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,Unnamed: 1_level_1,day,Fri,Sat,Sun,Thur
time,smoker,size,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Dinner,No,1,0.0,0.137931,0.0,0.0
Dinner,No,2,0.139622,0.162705,0.168859,0.159744
Dinner,No,3,0.0,0.154661,0.152663,0.0
Dinner,No,4,0.0,0.150096,0.148143,0.0
Dinner,No,5,0.0,0.0,0.206928,0.0
Dinner,No,6,0.0,0.0,0.103799,0.0
Dinner,Yes,1,0.0,0.325733,0.0,0.0
Dinner,Yes,2,0.171297,0.148668,0.207893,0.0
Dinner,Yes,3,0.0,0.144995,0.15266,0.0
Dinner,Yes,4,0.11775,0.124515,0.19337,0.0


In [23]:
# crosstab
"""
A cross-tabulation (or crosstab for short) is a special case of a pivot table that computes group frequencies
"""
# Counts of smokers vs. non-smokers for each (time, day) pair; margins=True adds the
# 'All' row and column of totals.
pd.crosstab([tips.time, tips.day], tips.smoker, margins=True)

'\nA cross-tabulation (or crosstab for short) is a special case of a pivot table that computes group frequencies\n'

Unnamed: 0_level_0,smoker,No,Yes,All
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dinner,Fri,3,9,12
Dinner,Sat,45,42,87
Dinner,Sun,57,19,76
Dinner,Thur,1,0,1
Lunch,Fri,1,6,7
Lunch,Thur,44,17,61
All,,151,93,244
