In [56]:
import pandas as pd
import numpy as np
df=pd.DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','two','one'],'dat1':[1,2,3,4,5],'data2':np.random.randn(5)})
df

Unnamed: 0,key1,key2,dat1,data2
0,a,one,1,0.565781
1,a,two,2,-1.606766
2,b,one,3,0.23391
3,b,two,4,1.162502
4,a,one,5,0.770933


In [57]:
# numeric_only=True restricts the mean to dat1/data2; the string column key2
# cannot be averaged and makes recent pandas versions raise a TypeError.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,dat1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.666667,-0.090017
b,3.5,0.698206


In [58]:
df.groupby(['key1','key2']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,dat1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,6,1.336714
a,two,2,-1.606766
b,one,3,0.23391
b,two,4,1.162502


###### Finding mean with groupby

In [59]:
df.groupby('key1')['dat1'].mean()

key1
a    2.666667
b    3.500000
Name: dat1, dtype: float64

In [60]:
df['dat1'].groupby(df['key1']).mean()

key1
a    2.666667
b    3.500000
Name: dat1, dtype: float64

The result index has the name 'key1' because the DataFrame column df['key1'] did.

In [61]:
df['dat1'].groupby([df['key1'],df['key2']]).mean()

key1  key2
a     one     3.0
      two     2.0
b     one     3.0
      two     4.0
Name: dat1, dtype: float64

 Here we grouped the data using two keys, and the resulting Series now has a
 hierarchical index consisting of the unique pairs of keys observed.

In [62]:
rslt=df['dat1'].groupby([df['key1'],df['key2']]).mean()
rslt

key1  key2
a     one     3.0
      two     2.0
b     one     3.0
      two     4.0
Name: dat1, dtype: float64

In [63]:
rslt.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3.0,2.0
b,3.0,4.0


In this example, the group keys are all Series, though they could be any arrays of the right length:

In [64]:
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])

In [65]:
df['dat1']

0    1
1    2
2    3
3    4
4    5
Name: dat1, dtype: int64

In [66]:
df['dat1'].groupby([states,years]).mean()

California  2005    2.0
            2006    3.0
Ohio        2005    2.5
            2006    5.0
Name: dat1, dtype: float64

In [67]:
dt=df.rename(columns={'dat1':'data1'})
dt

Unnamed: 0,key1,key2,data1,data2
0,a,one,1,0.565781
1,a,two,2,-1.606766
2,b,one,3,0.23391
3,b,two,4,1.162502
4,a,one,5,0.770933


##### .size()

In [68]:
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

Take note that any missing values in a group key will be excluded from the result

In [69]:
def f(data):
    """Sum each value with its predecessor for int64/float64 columns.

    Parameters
    ----------
    data : pandas.Series
        One column, as handed over by ``DataFrame.apply``.

    Returns
    -------
    list or pandas.Series
        For int64/float64 input, a list as long as a non-empty ``data``
        whose first entry is NaN (the first value has no predecessor) and
        whose k-th entry is ``data[k-1] + data[k]`` by position. Any other
        dtype is returned unchanged.
    """
    if data.dtype in ['int64','float64']:
        # Work on the raw values so the pairing is positional; indexing the
        # Series with integers is label-based and fails on a custom index.
        values = data.to_numpy()
        out = [np.nan]
        out.extend(prev + cur for prev, cur in zip(values[:-1], values[1:]))
    else:
        out = data
    return out
df.apply(f)

Unnamed: 0,key1,key2,dat1,data2
0,a,one,,
1,a,two,3.0,-1.040984
2,b,one,5.0,-1.372856
3,b,two,7.0,1.396412
4,a,one,9.0,1.933435


In [70]:
df.apply(np.dtype)

key1      object
key2      object
dat1       int64
data2    float64
dtype: object

### Iterating Over Groups

The GroupBy object supports iteration, generating a sequence of 2-tuples containing
 the group name along with the chunk of data. Consider the following.

In [71]:
for group_name,group in df.groupby('key1'):
    print(group_name)
    print(group)

a
  key1 key2  dat1     data2
0    a  one     1  0.565781
1    a  two     2 -1.606766
4    a  one     5  0.770933
b
  key1 key2  dat1     data2
2    b  one     3  0.233910
3    b  two     4  1.162502


In [72]:
for (k1,k2),group in df.groupby(['key1','key2']):
    print((k1,k2))
    print(group)

('a', 'one')
  key1 key2  dat1     data2
0    a  one     1  0.565781
4    a  one     5  0.770933
('a', 'two')
  key1 key2  dat1     data2
1    a  two     2 -1.606766
('b', 'one')
  key1 key2  dat1    data2
2    b  one     3  0.23391
('b', 'two')
  key1 key2  dat1     data2
3    b  two     4  1.162502


#### axis=1

 By default groupby groups on axis=0, but you can group on any of the other axes.
 For example, we could group the columns of our example df here by dtype like so:

In [84]:
# Cast dat1 to float64 so that it shares a dtype with data2.
# Note that this rebinds df: every later cell sees dat1 as float.
df=df.astype({'dat1':'float64'})
# Group the columns (axis=1) by their dtype instead of grouping rows.
# NOTE(review): groupby(..., axis=1) is deprecated in pandas >= 2.1 - confirm
# the target pandas version before re-running this cell.
group=df.groupby(df.dtypes,axis=1)

# Iterating yields (dtype, sub-frame holding the columns of that dtype).
for key , groups in group :
    print(key)
    print(groups)

float64
   dat1     data2
0   1.0  0.565781
1   2.0 -1.606766
2   3.0  0.233910
3   4.0  1.162502
4   5.0  0.770933
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


In [89]:
group.sum()

Unnamed: 0,float64,object
0,1.565781,aone
1,0.393234,atwo
2,3.23391,bone
3,5.162502,btwo
4,5.770933,aone


###  Selecting a Column or Subset of Columns

In [81]:
dt.groupby('key1')['data1'].mean()  #series

key1
a    2.666667
b    3.500000
Name: data1, dtype: float64

In [83]:
df.groupby('key1')[['data2']].mean()  #result is dataframe

Unnamed: 0_level_0,data2
key1,Unnamed: 1_level_1
a,-0.090017
b,0.698206


#### Grouping with Dicts and Series

In [90]:
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people    

Unnamed: 0,a,b,c,d,e
Joe,1.086584,1.169883,-0.485867,-0.934152,0.601311
Steve,-0.160151,0.431341,-1.031827,-0.583152,0.745574
Wes,-1.369583,0.855408,-0.089537,-0.539692,0.349162
Jim,0.151899,0.711476,0.01237,0.08249,-0.865539
Travis,0.813028,0.247858,-2.119541,0.648248,0.665476


In [92]:
people.iloc[2:3,[1,2]]=np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,1.086584,1.169883,-0.485867,-0.934152,0.601311
Steve,-0.160151,0.431341,-1.031827,-0.583152,0.745574
Wes,-1.369583,,,-0.539692,0.349162
Jim,0.151899,0.711476,0.01237,0.08249,-0.865539
Travis,0.813028,0.247858,-2.119541,0.648248,0.665476


Now, suppose I have a group correspondence for the columns and want to sum
 together the columns by group

In [97]:
# Column label -> colour group. The extra key 'f' matches no column and is
# simply ignored by groupby.
mapping = {'a': 'red','b': 'red',  'c': 'blue',
            'd': 'blue', 'e': 'red', 'f' : 'orange'}
# Group the transposed frame by label and transpose back. This is equivalent
# to groupby(mapping, axis='columns') but avoids the deprecated axis argument.
people.T.groupby(mapping).sum().T

Unnamed: 0,blue,red
Joe,-1.420019,2.857777
Steve,-1.614979,1.016764
Wes,-0.539692,-1.020421
Jim,0.09486,-0.002163
Travis,-1.471293,1.726362


In [95]:
# Per-colour column sums, then a grand total per colour. The transpose form
# replaces the deprecated groupby(..., axis='columns').
people.T.groupby(mapping).sum().T.sum()

blue   -4.951124
red     4.578319
dtype: float64

### Grouping with functions

Using Python functions is a more generic way of defining a group mapping compared
 with a dict or Series. Any function passed as a group key will be called once per index
 value, with the return values being used as the group names. 

In [99]:
people.index

Index(['Joe', 'Steve', 'Wes', 'Jim', 'Travis'], dtype='object')

In [100]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-0.1311,1.881359,-0.473497,-1.391355,0.084934
5,-0.160151,0.431341,-1.031827,-0.583152,0.745574
6,0.813028,0.247858,-2.119541,0.648248,0.665476


In [103]:
people

Unnamed: 0,a,b,c,d,e
Joe,1.086584,1.169883,-0.485867,-0.934152,0.601311
Steve,-0.160151,0.431341,-1.031827,-0.583152,0.745574
Wes,-1.369583,,,-0.539692,0.349162
Jim,0.151899,0.711476,0.01237,0.08249,-0.865539
Travis,0.813028,0.247858,-2.119541,0.648248,0.665476


Mixing functions with arrays, dicts, or Series is not a problem, as everything gets
converted to arrays internally:
 

In [105]:
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len,key_list]).sum()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.282999,1.169883,-0.485867,-1.473844,0.950473
3,two,0.151899,0.711476,0.01237,0.08249,-0.865539
5,one,-0.160151,0.431341,-1.031827,-0.583152,0.745574
6,two,0.813028,0.247858,-2.119541,0.648248,0.665476


### Grouping by index levels

In [110]:
columns=pd.MultiIndex.from_arrays([['us','us','up','us','up'],['1','2','1','3','2']],names=['city','tnr'])

dt=pd.DataFrame(np.arange(20).reshape(4,5),columns=columns)
dt

city,us,us,up,us,up
tnr,1,2,1,3,2
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [112]:
# Count columns per 'city' level. The columns are moved to the index via .T
# so the deprecated groupby(..., axis=1) is not needed, then transposed back.
dt.T.groupby(level='city').count().T

city,up,us
0,2,3
1,2,3
2,2,3
3,2,3


### 10.2  Data Aggregation

Aggregations refer to any data transformation that produces scalar values from
 arrays. The preceding examples have used several of them, including mean, sum,
 size, and count. Others, such as min and max, work the same way.

 To use your own aggregation functions, pass any function that aggregates an array to
 the aggregate or agg method:

In [113]:
df=pd.DataFrame({'data1':np.random.randn(5),'data2':np.random.randn(5),'key1':['one','two','one','one','two'],'key2':['a','b','c','a','b']})
df

Unnamed: 0,data1,data2,key1,key2
0,0.003384,0.477294,one,a
1,0.717271,0.198483,two,b
2,0.762244,-1.651884,one,c
3,-0.384858,-2.028989,one,a
4,-1.840236,0.279252,two,b


In [115]:
def distnce(x):
    """Return the minimum of ``x`` minus its maximum.

    The result is the negated range of the values, so it is never positive
    for ordinary numeric data.
    """
    lowest = x.min()
    highest = x.max()
    return lowest - highest
# Aggregate only the numeric columns: distnce subtracts values, which is
# undefined for the string column key2 (it triggered the warning shown below
# and raises on recent pandas versions).
df.groupby('key1')[['data1','data2']].agg(distnce)

  results[key] = self.aggregate(func)


Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
one,-1.147101,-2.506283
two,-2.557507,-0.080768


You may notice that some methods like describe also work, even though they are not
 aggregations, strictly speaking

In [116]:
df.groupby('key1').describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
one,3.0,0.126923,0.583444,-0.384858,-0.190737,0.003384,0.382814,0.762244,3.0,-1.06786,1.351361,-2.028989,-1.840437,-1.651884,-0.587295,0.477294
two,2.0,-0.561483,1.80843,-1.840236,-1.200859,-0.561483,0.077894,0.717271,2.0,0.238868,0.057112,0.198483,0.218676,0.238868,0.25906,0.279252


###  Column-Wise and Multiple Function Application.

In [118]:
import pandas as pd

# Tips dataset hosted on GitHub; reading it requires network access.
url = 'https://raw.githubusercontent.com/mono0926/python-for-data-analytics/master/ch08/tips.csv'

# No delimiter is passed here, so read_csv falls back to its default (a comma).
tips = pd.read_csv(url)
tips.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [119]:
tips['tip_pct']=tips['tip']/tips['total_bill']
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587


In [121]:
#applying single function

group=tips.groupby('day')
group[['tip_pct']].agg('mean')

Unnamed: 0_level_0,tip_pct
day,Unnamed: 1_level_1
Fri,0.169913
Sat,0.153152
Sun,0.166897
Thur,0.161276


In [133]:
def distnce(x):
    """Return the minimum of ``x`` minus its maximum (the negated range).

    Uses the Series methods rather than the builtin ``min``/``max``: the
    builtins compare element by element, so their result depends on where a
    NaN sits in the data, while ``Series.min``/``Series.max`` skip NaN. This
    also matches the earlier definition of ``distnce`` in this notebook.

    Parameters
    ----------
    x : pandas.Series
        The values of one group, as passed by ``agg``.
    """
    return x.min()-x.max()


Applying multiple functions.

In [125]:
group['tip_pct'].agg(['mean','std',distnce])

Unnamed: 0_level_0,mean,std,distnce
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,0.169913,0.047665,-0.159925
Sat,0.153152,0.051293,-0.290095
Sun,0.166897,0.084739,-0.650898
Thur,0.161276,0.038652,-0.19335


In [126]:
# Select the numeric columns explicitly; mean/std/distnce are undefined for
# the string columns (sex, smoker, time) and recent pandas raises on them.
group[['total_bill','tip','size','tip_pct']].agg(['mean','std',distnce])

Unnamed: 0_level_0,total_bill,total_bill,total_bill,tip,tip,tip,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,mean,std,distnce,mean,std,distnce,mean,std,distnce,mean,std,distnce
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Fri,17.151579,8.30266,-34.42,2.734737,1.019577,-3.73,2.105263,0.567131,-3,0.169913,0.047665,-0.159925
Sat,20.441379,9.480419,-47.74,2.993103,1.631014,-9.0,2.517241,0.819275,-4,0.153152,0.051293,-0.290095
Sun,21.41,8.832122,-40.92,3.255132,1.23488,-5.49,2.842105,1.007341,-4,0.166897,0.084739,-0.650898
Thur,17.682742,7.88617,-35.6,2.771452,1.240223,-5.45,2.451613,1.066285,-5,0.161276,0.038652,-0.19335


In [127]:
tips.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542


 If you pass a list of (name, function) tuples, the first element of each tuple
 will be used as the name of the corresponding DataFrame column.

In [129]:
tips.groupby(['day','smoker'])['tip'].agg([('avrge_tips','mean'),('minimum_tips','min'),('tips_dstnce',distnce)])

Unnamed: 0_level_0,Unnamed: 1_level_0,avrge_tips,minimum_tips,tips_dstnce
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,2.8125,1.5,-2.0
Fri,Yes,2.714,1.0,-3.73
Sat,No,3.102889,1.0,-8.0
Sat,Yes,2.875476,1.0,-9.0
Sun,No,3.167895,1.01,-4.99
Sun,Yes,3.516842,1.5,-5.0
Thur,No,2.673778,1.25,-5.45
Thur,Yes,3.03,2.0,-3.0


In [130]:
# Select several columns with a list (double brackets). The bare
# ['tip','tip_pct'] form is a tuple key, which is deprecated and was removed
# in pandas 2.0.
tips.groupby(['day','smoker'])[['tip','tip_pct']].agg([('avrge_tips','mean'),('minimum_tips','min'),('tips_dstnce',distnce)])

  tips.groupby(['day','smoker'])['tip','tip_pct'].agg([('avrge_tips','mean'),('minimum_tips','min'),('tips_dstnce',distnce)])


Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip,tip,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,Unnamed: 1_level_1,avrge_tips,minimum_tips,tips_dstnce,avrge_tips,minimum_tips,tips_dstnce
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,2.8125,1.5,-2.0,0.15165,0.120385,-0.067349
Fri,Yes,2.714,1.0,-3.73,0.174783,0.103555,-0.159925
Sat,No,3.102889,1.0,-8.0,0.158048,0.056797,-0.235193
Sat,Yes,2.875476,1.0,-9.0,0.147906,0.035638,-0.290095
Sun,No,3.167895,1.01,-4.99,0.160113,0.059447,-0.193226
Sun,Yes,3.516842,1.5,-5.0,0.18725,0.06566,-0.644685
Thur,No,2.673778,1.25,-5.45,0.160298,0.072961,-0.19335
Thur,Yes,3.03,2.0,-3.0,0.163863,0.090014,-0.15124


Now, suppose you wanted to apply potentially different functions to one or more of
 the columns. To do this, pass a dict to agg that contains a mapping of column names
 to any of the function specifications listed so far.

In [137]:

tips.groupby(['day','smoker']).agg({'tip':['mean','min',distnce],'size':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip,tip,size
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,distnce,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,2.8125,1.5,-2.0,9
Fri,Yes,2.714,1.0,-3.73,31
Sat,No,3.102889,1.0,-8.0,115
Sat,Yes,2.875476,1.0,-9.0,104
Sun,No,3.167895,1.01,-4.99,167
Sun,Yes,3.516842,1.5,-5.0,49
Thur,No,2.673778,1.25,-5.45,112
Thur,Yes,3.03,2.0,-3.0,40


 A DataFrame will have hierarchical columns only if multiple functions are applied to
 at least one column.

###  Returning Aggregated Data Without Row Indexes

In [141]:
# as_index=False keeps day/smoker as ordinary columns instead of an index.
# numeric_only=True skips the string columns (sex, time), which cannot be
# averaged and make recent pandas versions raise a TypeError.
clctn=tips.groupby(['day','smoker'],as_index=False).mean(numeric_only=True)
clctn

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


##  10.3 Apply: General split-apply-combine

Suppose you wanted to select the top
 five tip_pct values by group. First, write a function that selects the rows with the
 largest values in a particular column:

In [143]:
tips.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542


In [172]:
def top_tip(df,n=5,col='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame (or group chunk) to select rows from.
    n : int, default 5
        Number of rows to keep; ``n=0`` yields an empty frame.
    col : str, default 'tip_pct'
        Column to rank by.

    Returns
    -------
    pandas.DataFrame
        The selected rows, sorted ascending by ``col`` (largest value last).
    """
    # tail(n) rather than [-n:], because [-0:] would return every row.
    return df.sort_values(by=col).tail(n)

In [147]:
tips.groupby('smoker').apply(top_tip)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


 If you pass a function to apply that takes other arguments or keywords, you can pass
 these after the function:

In [152]:
tips.groupby(['day','smoker']).apply(top_tip,n=2,col='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Fri,No,91,22.49,3.5,Male,No,Fri,Dinner,2,0.155625
Fri,No,94,22.75,3.25,Female,No,Fri,Dinner,2,0.142857
Fri,Yes,90,28.97,3.0,Male,Yes,Fri,Dinner,2,0.103555
Fri,Yes,95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.11775
Sat,No,59,48.27,6.73,Male,No,Sat,Dinner,4,0.139424
Sat,No,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
Sat,Yes,102,44.3,2.5,Female,Yes,Sat,Dinner,3,0.056433
Sat,Yes,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812
Sun,No,112,38.07,4.0,Male,No,Sun,Dinner,3,0.10507
Sun,No,156,48.17,5.0,Male,No,Sun,Dinner,6,0.103799


In [168]:
tips.groupby('smoker')['tip_pct'].apply(lambda x: x.describe()).unstack(level=0)

smoker,No,Yes
count,151.0,93.0
mean,0.159328,0.163196
std,0.03991,0.085119
min,0.056797,0.035638
25%,0.136906,0.106771
50%,0.155625,0.153846
75%,0.185014,0.195059
max,0.29199,0.710345


### Suppressing the Group Keys

In [173]:
tips.groupby('smoker').apply(top_tip)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [175]:
tips.groupby('smoker',group_keys=False).apply(top_tip)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


##  Quantiles and bucket analysis.

Combining cut and qcut with groupby makes it convenient to perform bucket or
 quantile analysis on a dataset. Consider a simple random dataset and an equal-length
 bucket categorization using cut:

In [180]:
df=pd.DataFrame({'data1':np.random.randn(1000),'data2':np.random.randn(1000)})
bins=pd.cut(df['data1'],4)
group=df['data2'].groupby(bins)

In [182]:
def summary(group):
    """Collect the min, max and non-null count of ``group`` in a dict.

    The keys are 'min', 'max' and 'count', in that order.
    """
    stats = {}
    stats['min'] = group.min()
    stats['max'] = group.max()
    stats['count'] = group.count()
    return stats

group.apply(summary).unstack()

Unnamed: 0_level_0,min,max,count
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(-3.1, -1.298]",-1.747493,3.767249,104.0
"(-1.298, 0.498]",-3.930439,2.624286,557.0
"(0.498, 2.293]",-3.180473,3.029283,327.0
"(2.293, 4.089]",-1.42236,1.353759,12.0


 These were equal-length buckets; to compute equal-size buckets based on sample
 quantiles, use qcut. I’ll pass labels=False to just get quantile numbers:

In [187]:
group=pd.qcut(df.data1,10,labels=False)
df.data2.groupby(group).apply(summary).unstack()

Unnamed: 0_level_0,min,max,count
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-1.747493,3.767249,100.0
1,-3.930439,2.611205,100.0
2,-2.099951,2.159147,100.0
3,-2.470464,2.487199,100.0
4,-2.65402,2.594858,100.0
5,-3.585051,2.624286,100.0
6,-2.204021,2.417867,100.0
7,-3.180473,1.711967,100.0
8,-2.228244,3.029283,100.0
9,-2.338924,2.516366,100.0


###  Filling Missing Values with Group-Specific Values.

In [188]:
s=pd.Series(np.arange(5))
s[::2]=np.nan
s

0    NaN
1    1.0
2    NaN
3    3.0
4    NaN
dtype: float64

In [189]:
s.fillna(s.mean())

0    2.0
1    1.0
2    2.0
3    3.0
4    2.0
dtype: float64

In [195]:
states = ['Ohio', 'New York', 'Vermont', 'Florida','Oregon', 'Nevada', 'California', 'Idaho']
side=['east']*4+['west']*4 
pop=[4,5,6,np.nan,7,8,9,np.nan]

dt=pd.DataFrame({'states':states,'side':side,'pop':pop})
dt

Unnamed: 0,states,side,pop
0,Ohio,east,4.0
1,New York,east,5.0
2,Vermont,east,6.0
3,Florida,east,
4,Oregon,west,7.0
5,Nevada,west,8.0
6,California,west,9.0
7,Idaho,west,


In [196]:
# Fill each missing 'pop' with the mean of its own side. group_keys=False
# keeps the original flat row index; without it, recent pandas versions
# prepend the group key as an extra index level.
dt.groupby('side',group_keys=False)['pop'].apply(lambda x :x.fillna(x.mean()))

0    4.0
1    5.0
2    6.0
3    5.0
4    7.0
5    8.0
6    9.0
7    8.0
Name: pop, dtype: float64

##### using dictionary and group method

In [202]:
# Fixed fill value per group; x.name is the group key ('east' or 'west').
dictn={'east':4,'west':5}
# group_keys=False keeps the original flat row index on recent pandas versions.
dt.groupby('side',group_keys=False)['pop'].apply(lambda x:x.fillna(dictn[x.name]))

0    4.0
1    5.0
2    6.0
3    4.0
4    7.0
5    8.0
6    9.0
7    5.0
Name: pop, dtype: float64