In [64]:
import pandas as pd
import numpy as np
df=pd.DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','two','one'],'dat1':[1,2,3,4,5],'data2':np.random.randn(5)})
df

Unnamed: 0,key1,key2,dat1,data2
0,a,one,1,0.167545
1,a,two,2,-0.232053
2,b,one,3,0.807767
3,b,two,4,0.366331
4,a,one,5,1.394397


In [65]:
df.groupby('key1').mean()

Unnamed: 0_level_0,dat1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.666667,0.443296
b,3.5,0.587049


In [66]:
df.groupby(['key1','key2']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,dat1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,6,1.561942
a,two,2,-0.232053
b,one,3,0.807767
b,two,4,0.366331


###### Finding mean with groupby

In [67]:
df.groupby('key1')['dat1'].mean()

key1
a    2.666667
b    3.500000
Name: dat1, dtype: float64

In [68]:
df['dat1'].groupby(df['key1']).mean()

key1
a    2.666667
b    3.500000
Name: dat1, dtype: float64

The result index has the name 'key1' because the DataFrame column df['key1'] did.

In [69]:
df['dat1'].groupby([df['key1'],df['key2']]).mean()

key1  key2
a     one     3.0
      two     2.0
b     one     3.0
      two     4.0
Name: dat1, dtype: float64

 Here we grouped the data using two keys, and the resulting Series now has a
 hierarchical index consisting of the unique pairs of keys observed.

In [70]:
rslt=df['dat1'].groupby([df['key1'],df['key2']]).mean()
rslt

key1  key2
a     one     3.0
      two     2.0
b     one     3.0
      two     4.0
Name: dat1, dtype: float64

In [71]:
rslt.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3.0,2.0
b,3.0,4.0


In this example, the group keys are all Series, though they could be any arrays of the right length:

In [72]:
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])

In [73]:
df['dat1']

0    1
1    2
2    3
3    4
4    5
Name: dat1, dtype: int64

In [74]:
df['dat1'].groupby([states,years]).mean()

California  2005    2.0
            2006    3.0
Ohio        2005    2.5
            2006    5.0
Name: dat1, dtype: float64

In [75]:
dt=df.rename(columns={'dat1':'data1'})
dt

Unnamed: 0,key1,key2,data1,data2
0,a,one,1,0.167545
1,a,two,2,-0.232053
2,b,one,3,0.807767
3,b,two,4,0.366331
4,a,one,5,1.394397


##### .size()

In [76]:
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

Take note that any missing values in a group key will be excluded from the result

In [77]:
def f(data):
    """Add each element of a numeric column to the element before it.

    Parameters
    ----------
    data : pandas.Series
        A single column, e.g. as passed in by ``DataFrame.apply``.

    Returns
    -------
    list or pandas.Series
        For ``int64``/``float64`` data, a list that starts with NaN (the
        first element has no predecessor) followed by the sum of every
        pair of neighbouring elements. Data of any other dtype is
        returned unchanged.
    """
    if data.dtype in ['int64', 'float64']:
        # Pair neighbours by position on the underlying array; ``data[i]``
        # is a label lookup and fails when the index is not 0..n-1.
        values = data.to_numpy()
        out = [np.nan]
        out.extend(values[:-1] + values[1:])
    else:
        out = data
    return out
df.apply(f)

Unnamed: 0,key1,key2,dat1,data2
0,a,one,,
1,a,two,3.0,-0.064507
2,b,one,5.0,0.575714
3,b,two,7.0,1.174098
4,a,one,9.0,1.760728


In [78]:
# Per-column dtypes, read directly from the frame
df.dtypes

key1      object
key2      object
dat1       int64
data2    float64
dtype: object

### Iterating Over Groups

The GroupBy object supports iteration, generating a sequence of 2-tuples containing
 the group name along with the chunk of data. Consider the following.

In [79]:
for group_name,group in df.groupby('key1'):
    print(group_name)
    print(group)

a
  key1 key2  dat1     data2
0    a  one     1  0.167545
1    a  two     2 -0.232053
4    a  one     5  1.394397
b
  key1 key2  dat1     data2
2    b  one     3  0.807767
3    b  two     4  0.366331


In [80]:
for (k1,k2),group in df.groupby(['key1','key2']):
    print((k1,k2))
    print(group)

('a', 'one')
  key1 key2  dat1     data2
0    a  one     1  0.167545
4    a  one     5  1.394397
('a', 'two')
  key1 key2  dat1     data2
1    a  two     2 -0.232053
('b', 'one')
  key1 key2  dat1     data2
2    b  one     3  0.807767
('b', 'two')
  key1 key2  dat1     data2
3    b  two     4  0.366331


#### axis=1

 By default groupby groups on axis=0, but you can group on any of the other axes.
 For example, we could group the columns of our example df here by dtype like so:

In [81]:
df=df.astype({'dat1':'float64'})
group=df.groupby(df.dtypes,axis=1)

for key , groups in group :
    print(key)
    print(groups)

float64
   dat1     data2
0   1.0  0.167545
1   2.0 -0.232053
2   3.0  0.807767
3   4.0  0.366331
4   5.0  1.394397
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


In [82]:
group.sum()

Unnamed: 0,float64,object
0,1.167545,aone
1,1.767947,atwo
2,3.807767,bone
3,4.366331,btwo
4,6.394397,aone


###  Selecting a Column or Subset of Columns

In [83]:
dt.groupby('key1')['data1'].mean()  #series

key1
a    2.666667
b    3.500000
Name: data1, dtype: float64

In [84]:
df.groupby('key1')[['data2']].mean()  #result is dataframe

Unnamed: 0_level_0,data2
key1,Unnamed: 1_level_1
a,0.443296
b,0.587049


#### Grouping with Dicts and Series

In [85]:
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people    

Unnamed: 0,a,b,c,d,e
Joe,0.852295,0.048825,0.36497,0.586306,-0.369619
Steve,1.705422,0.28498,0.293446,-1.499636,0.307423
Wes,0.539596,-1.206069,0.361283,-1.294013,0.434639
Jim,-0.845012,0.247692,0.528833,0.118299,1.685191
Travis,-1.078579,-1.018839,-0.590414,-1.424182,0.516718


In [86]:
people.iloc[2:3,[1,2]]=np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0.852295,0.048825,0.36497,0.586306,-0.369619
Steve,1.705422,0.28498,0.293446,-1.499636,0.307423
Wes,0.539596,,,-1.294013,0.434639
Jim,-0.845012,0.247692,0.528833,0.118299,1.685191
Travis,-1.078579,-1.018839,-0.590414,-1.424182,0.516718


Now, suppose I have a group correspondence for the columns and want to sum
 together the columns by group

In [87]:
mapping = {'a': 'red','b': 'red',  'c': 'blue',
            'd': 'blue', 'e': 'red', 'f' : 'orange'}
people.groupby(mapping,axis='columns').sum()

Unnamed: 0,blue,red
Joe,0.951276,0.531501
Steve,-1.206189,2.297825
Wes,-1.294013,0.974236
Jim,0.647131,1.087871
Travis,-2.014596,-1.580699


In [88]:
people.groupby(mapping,axis='columns').sum().sum()

blue   -2.916392
red     3.310733
dtype: float64

### Grouping with functions

Using Python functions is a more generic way of defining a group mapping compared
 with a dict or Series. Any function passed as a group key will be called once per index
 value, with the return values being used as the group names. 

In [89]:
people.index

Index(['Joe', 'Steve', 'Wes', 'Jim', 'Travis'], dtype='object')

In [90]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,0.54688,0.296517,0.893802,-0.589408,1.750211
5,1.705422,0.28498,0.293446,-1.499636,0.307423
6,-1.078579,-1.018839,-0.590414,-1.424182,0.516718


In [91]:
people

Unnamed: 0,a,b,c,d,e
Joe,0.852295,0.048825,0.36497,0.586306,-0.369619
Steve,1.705422,0.28498,0.293446,-1.499636,0.307423
Wes,0.539596,,,-1.294013,0.434639
Jim,-0.845012,0.247692,0.528833,0.118299,1.685191
Travis,-1.078579,-1.018839,-0.590414,-1.424182,0.516718


Mixing functions with arrays, dicts, or Series is not a problem, as everything gets
 converted to arrays internally:
 

In [92]:
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len,key_list]).sum()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,1.391891,0.048825,0.36497,-0.707707,0.06502
3,two,-0.845012,0.247692,0.528833,0.118299,1.685191
5,one,1.705422,0.28498,0.293446,-1.499636,0.307423
6,two,-1.078579,-1.018839,-0.590414,-1.424182,0.516718


### Grouping by index levels

In [93]:
columns=pd.MultiIndex.from_arrays([['us','us','up','us','up'],['1','2','1','3','2']],names=['city','tnr'])

dt=pd.DataFrame(np.arange(20).reshape(4,5),columns=columns)
dt

city,us,us,up,us,up
tnr,1,2,1,3,2
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [94]:
dt.groupby('city',axis=1).count()

city,up,us
0,2,3
1,2,3
2,2,3
3,2,3


### 10.2  Data Aggregation

Aggregations refer to any data transformation that produces scalar values from
 arrays. The preceding examples have used several of them, including mean, count,
 min, and max

 To use your own aggregation functions, pass any function that aggregates an array to
 the aggregate or agg method:

In [95]:
df=pd.DataFrame({'data1':np.random.randn(5),'data2':np.random.randn(5),'key1':['one','two','one','one','two'],'key2':['a','b','c','a','b']})
df

Unnamed: 0,data1,data2,key1,key2
0,0.767425,1.955515,one,a
1,0.423441,0.034958,two,b
2,-0.021476,0.790199,one,c
3,-2.683165,-0.765673,one,a
4,-0.536937,-0.765732,two,b


In [96]:
def distnce(x):
    """Return the smallest value of ``x`` minus its largest value.

    Because the minimum is subtracted from, not by, the maximum, the
    result is the negated range of the data.
    """
    lowest = x.min()
    highest = x.max()
    return lowest - highest
df.groupby('key1').agg(distnce)

  results[key] = self.aggregate(func)


Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
one,-3.45059,-2.721188
two,-0.960379,-0.80069


You may notice that some methods like describe also work, even though they are not
 aggregations, strictly speaking

In [97]:
df.groupby('key1').describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
one,3.0,-0.645739,1.808016,-2.683165,-1.352321,-0.021476,0.372975,0.767425,3.0,0.660013,1.365257,-0.765673,0.012263,0.790199,1.372857,1.955515
two,2.0,-0.056748,0.67909,-0.536937,-0.296843,-0.056748,0.183347,0.423441,2.0,-0.365387,0.566173,-0.765732,-0.56556,-0.365387,-0.165215,0.034958


###  Column-Wise and Multiple Function Application.

In [98]:
import pandas as pd

url = 'https://raw.githubusercontent.com/mono0926/python-for-data-analytics/master/ch08/tips.csv'

# Load the tips dataset; no delimiter is passed, so read_csv's default separator applies
tips = pd.read_csv(url)
tips.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [99]:
tips['tip_pct']=tips['tip']/tips['total_bill']
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587


In [100]:
#applying single function

group=tips.groupby('day')
group[['tip_pct']].agg('mean')

Unnamed: 0_level_0,tip_pct
day,Unnamed: 1_level_1
Fri,0.169913
Sat,0.153152
Sun,0.166897
Thur,0.161276


In [101]:
def distnce(x):
    """Return the smallest value of ``x`` minus its largest value, ignoring NaN.

    Parameters
    ----------
    x : array-like of numbers
        For example the values of one group when used with ``agg``.

    Returns
    -------
    scalar
        ``min - max`` over the non-missing values, i.e. the negated range.
    """
    # The built-in min()/max() compare element by element, so a NaN in the
    # data makes the answer depend on where it sits; nanmin/nanmax skip NaN.
    return np.nanmin(x) - np.nanmax(x)


Applying multiple functions.

In [102]:
group['tip_pct'].agg(['mean','std',distnce])

Unnamed: 0_level_0,mean,std,distnce
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,0.169913,0.047665,-0.159925
Sat,0.153152,0.051293,-0.290095
Sun,0.166897,0.084739,-0.650898
Thur,0.161276,0.038652,-0.19335


In [103]:
group.agg(['mean','std',distnce])

Unnamed: 0_level_0,total_bill,total_bill,total_bill,tip,tip,tip,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,mean,std,distnce,mean,std,distnce,mean,std,distnce,mean,std,distnce
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Fri,17.151579,8.30266,-34.42,2.734737,1.019577,-3.73,2.105263,0.567131,-3,0.169913,0.047665,-0.159925
Sat,20.441379,9.480419,-47.74,2.993103,1.631014,-9.0,2.517241,0.819275,-4,0.153152,0.051293,-0.290095
Sun,21.41,8.832122,-40.92,3.255132,1.23488,-5.49,2.842105,1.007341,-4,0.166897,0.084739,-0.650898
Thur,17.682742,7.88617,-35.6,2.771452,1.240223,-5.45,2.451613,1.066285,-5,0.161276,0.038652,-0.19335


In [104]:
tips.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542


 If you
 pass a list of (name, function) tuples, the first element of each tuple will be used as
 the DataFrame column names

In [105]:
tips.groupby(['day','smoker'])['tip'].agg([('avrge_tips','mean'),('minimum_tips','min'),('tips_dstnce',distnce)])

Unnamed: 0_level_0,Unnamed: 1_level_0,avrge_tips,minimum_tips,tips_dstnce
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,2.8125,1.5,-2.0
Fri,Yes,2.714,1.0,-3.73
Sat,No,3.102889,1.0,-8.0
Sat,Yes,2.875476,1.0,-9.0
Sun,No,3.167895,1.01,-4.99
Sun,Yes,3.516842,1.5,-5.0
Thur,No,2.673778,1.25,-5.45
Thur,Yes,3.03,2.0,-3.0


In [106]:
# Select several columns with a list; indexing a GroupBy with a bare tuple of names is deprecated
tips.groupby(['day','smoker'])[['tip','tip_pct']].agg([('avrge_tips','mean'),('minimum_tips','min'),('tips_dstnce',distnce)])

  tips.groupby(['day','smoker'])['tip','tip_pct'].agg([('avrge_tips','mean'),('minimum_tips','min'),('tips_dstnce',distnce)])


Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip,tip,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,Unnamed: 1_level_1,avrge_tips,minimum_tips,tips_dstnce,avrge_tips,minimum_tips,tips_dstnce
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,2.8125,1.5,-2.0,0.15165,0.120385,-0.067349
Fri,Yes,2.714,1.0,-3.73,0.174783,0.103555,-0.159925
Sat,No,3.102889,1.0,-8.0,0.158048,0.056797,-0.235193
Sat,Yes,2.875476,1.0,-9.0,0.147906,0.035638,-0.290095
Sun,No,3.167895,1.01,-4.99,0.160113,0.059447,-0.193226
Sun,Yes,3.516842,1.5,-5.0,0.18725,0.06566,-0.644685
Thur,No,2.673778,1.25,-5.45,0.160298,0.072961,-0.19335
Thur,Yes,3.03,2.0,-3.0,0.163863,0.090014,-0.15124


Now, suppose you wanted to apply potentially different functions to one or more of
 the columns. To do this, pass a dict to agg that contains a mapping of column names
 to any of the function specifications listed so far.

In [107]:

tips.groupby(['day','smoker']).agg({'tip':['mean','min',distnce],'size':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip,tip,size
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,distnce,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,2.8125,1.5,-2.0,9
Fri,Yes,2.714,1.0,-3.73,31
Sat,No,3.102889,1.0,-8.0,115
Sat,Yes,2.875476,1.0,-9.0,104
Sun,No,3.167895,1.01,-4.99,167
Sun,Yes,3.516842,1.5,-5.0,49
Thur,No,2.673778,1.25,-5.45,112
Thur,Yes,3.03,2.0,-3.0,40


 A DataFrame will have hierarchical columns only if multiple functions are applied to
 at least one column.

###  Returning Aggregated Data Without Row Indexes

In [108]:
clctn=tips.groupby(['day','smoker'],as_index=False).mean()
clctn

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


##  10.3 Apply: General split-apply-combine

Suppose you wanted to select the top
 five tip_pct values by group. First, write a function that selects the rows with the
 largest values in a particular column:

In [109]:
tips.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542


In [110]:
def top_tip(df, n=5, col='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select rows from.
    n : int, default 5
        Number of rows to keep.
    col : str, default 'tip_pct'
        Column to rank the rows by.

    Returns
    -------
    pandas.DataFrame
        The selected rows, in ascending order of ``col``.
    """
    # tail(0) is empty, whereas the slice [-0:] would hand back every row.
    return df.sort_values(by=col).tail(n)

In [111]:
tips.groupby('smoker').apply(top_tip)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


 If you pass a function to apply that takes other arguments or keywords, you can pass
 these after the function:

In [112]:
tips.groupby(['day','smoker']).apply(top_tip,n=2,col='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Fri,No,91,22.49,3.5,Male,No,Fri,Dinner,2,0.155625
Fri,No,94,22.75,3.25,Female,No,Fri,Dinner,2,0.142857
Fri,Yes,90,28.97,3.0,Male,Yes,Fri,Dinner,2,0.103555
Fri,Yes,95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.11775
Sat,No,59,48.27,6.73,Male,No,Sat,Dinner,4,0.139424
Sat,No,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
Sat,Yes,102,44.3,2.5,Female,Yes,Sat,Dinner,3,0.056433
Sat,Yes,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812
Sun,No,112,38.07,4.0,Male,No,Sun,Dinner,3,0.10507
Sun,No,156,48.17,5.0,Male,No,Sun,Dinner,6,0.103799


In [113]:
tips.groupby('smoker')['tip_pct'].apply(lambda x: x.describe()).unstack(level=0)

smoker,No,Yes
count,151.0,93.0
mean,0.159328,0.163196
std,0.03991,0.085119
min,0.056797,0.035638
25%,0.136906,0.106771
50%,0.155625,0.153846
75%,0.185014,0.195059
max,0.29199,0.710345


### Suppressing the Group Keys

In [114]:
tips.groupby('smoker').apply(top_tip)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [115]:
tips.groupby('smoker',group_keys=False).apply(top_tip)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


##  Quantiles and bucket analysis.

Combining cut and qcut with groupby makes it convenient to perform bucket or
 quantile analysis on a dataset. Consider a simple random dataset and an equal-length
 bucket categorization using cut

In [116]:
df=pd.DataFrame({'data1':np.random.randn(1000),'data2':np.random.randn(1000)})
bins=pd.cut(df['data1'],4)
group=df['data2'].groupby(bins)

In [117]:
def summary(group):
    """Collect the minimum, maximum and count of ``group`` in a dict."""
    stats = {}
    stats['min'] = group.min()
    stats['max'] = group.max()
    stats['count'] = group.count()
    return stats

group.apply(summary).unstack()

Unnamed: 0_level_0,min,max,count
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(-2.875, -1.369]",-1.906966,2.600014,91.0
"(-1.369, 0.132]",-2.787813,2.943028,486.0
"(0.132, 1.632]",-3.135234,3.423296,380.0
"(1.632, 3.133]",-2.131012,2.289561,43.0


 These were equal-length buckets; to compute equal-size buckets based on sample
 quantiles, use qcut. I’ll pass labels=False to just get quantile numbers:

In [118]:
group=pd.qcut(df.data1,10,labels=False)
df.data2.groupby(group).apply(summary).unstack()

Unnamed: 0_level_0,min,max,count
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-1.906966,2.600014,100.0
1,-2.787813,2.709218,100.0
2,-2.242061,2.307512,100.0
3,-2.398226,2.152408,100.0
4,-1.876735,2.390509,100.0
5,-1.798088,2.943028,100.0
6,-3.135234,2.529707,100.0
7,-1.93386,2.833456,100.0
8,-2.234678,3.423296,100.0
9,-2.131012,2.600293,100.0


###  Filling Missing Values with Group-Specific Values.

In [119]:
s=pd.Series(np.arange(5))
s[::2]=np.nan
s

0    NaN
1    1.0
2    NaN
3    3.0
4    NaN
dtype: float64

In [120]:
s.fillna(s.mean())

0    2.0
1    1.0
2    2.0
3    3.0
4    2.0
dtype: float64

In [6]:
import numpy as np
import pandas as pd

states = ['Ohio', 'New York', 'Vermont', 'Florida','Oregon', 'Nevada', 'California', 'Idaho']
side=['east']*4+['west']*4 
pop=[4,5,6,np.nan,7,8,9,np.nan]

dt=pd.DataFrame({'states':states,'side':side,'pop':pop})
dt

Unnamed: 0,states,side,pop
0,Ohio,east,4.0
1,New York,east,5.0
2,Vermont,east,6.0
3,Florida,east,
4,Oregon,west,7.0
5,Nevada,west,8.0
6,California,west,9.0
7,Idaho,west,


In [4]:
# group_keys=False keeps the original row index on the filled result instead of prefixing it with the 'side' labels
dt.groupby('side',group_keys=False)['pop'].apply(lambda x :x.fillna(x.mean()))

0    4.0
1    5.0
2    6.0
3    5.0
4    7.0
5    8.0
6    9.0
7    8.0
Name: pop, dtype: float64

##### using dictionary and group method

In [123]:
dictn={'east':4,'west':5}
# x.name is the group label, used here to look up that side's fill value;
# group_keys=False keeps the original row index on the result
dt.groupby('side',group_keys=False)['pop'].apply(lambda x:x.fillna(dictn[x.name]))

0    4.0
1    5.0
2    6.0
3    4.0
4    7.0
5    8.0
6    9.0
7    5.0
Name: pop, dtype: float64

#### Random sampling and permutation with group by

In [124]:
suit=['H','S','D','C']
card_val=(list(range(1,11))+[10]*3)
base=['A']+list(range(2,11))+['J',"Q",'K']
card=[]
for i in suit:
    for j in base:
        card.append(str(j)+i)
card[:13]

['AH', '2H', '3H', '4H', '5H', '6H', '7H', '8H', '9H', '10H', 'JH', 'QH', 'KH']

In [125]:
srs=pd.Series(card,index=card_val*4)

In [126]:
def draw(deck, n=5):
    """Return ``n`` randomly sampled entries from ``deck``."""
    hand = deck.sample(n=n)
    return hand

In [127]:
draw(srs)

2     2H
6     6H
10    JD
3     3S
3     3D
dtype: object

In [128]:
cards=pd.DataFrame({'card':card,'value':card_val*4})
cards.groupby(cards['card'].map(lambda x: x[-1]),group_keys=False).apply(draw,n=2)

Unnamed: 0,card,value
51,KC,10
43,5C,5
38,KD,10
27,2D,2
2,3H,3
7,8H,8
19,7S,7
13,AS,1


### Group Weighted Average and Correlation

In [129]:
df = pd.DataFrame({'category': ['a', 'a', 'a', 'a','b', 'b', 'b', 'b'],'data': np.random.randn(8),'weights': np.random.rand(8)})
df.head(5)

Unnamed: 0,category,data,weights
0,a,0.983672,0.295165
1,a,-0.381909,0.621283
2,a,-0.834146,0.219922
3,a,-1.028813,0.073309
4,b,0.868782,0.086791


In [130]:
def fn(g):
    """Weighted average of the group's 'data' column, weighted by its 'weights' column."""
    return np.average(g['data'],weights=g['weights'])

df.groupby('category').apply(fn)

category
a   -0.170125
b   -0.128290
dtype: float64

### Pivot table and cross tabulation.

In [131]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [132]:
tips.pivot_table(index=['day','smoker'])

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,2.25,2.8125,0.15165,18.42
Fri,Yes,2.066667,2.714,0.174783,16.813333
Sat,No,2.555556,3.102889,0.158048,19.661778
Sat,Yes,2.47619,2.875476,0.147906,21.276667
Sun,No,2.929825,3.167895,0.160113,20.506667
Sun,Yes,2.578947,3.516842,0.18725,24.12
Thur,No,2.488889,2.673778,0.160298,17.113111
Thur,Yes,2.352941,3.03,0.163863,19.190588


In [133]:
tips.groupby(['day','smoker'])['size'].agg('mean')

day   smoker
Fri   No        2.250000
      Yes       2.066667
Sat   No        2.555556
      Yes       2.476190
Sun   No        2.929825
      Yes       2.578947
Thur  No        2.488889
      Yes       2.352941
Name: size, dtype: float64

In [137]:
tips.pivot_table(index=['time','day'],columns=['smoker'])

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip,tip,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,smoker,No,Yes,No,Yes,No,Yes,No,Yes
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Dinner,Fri,2.0,2.222222,2.75,3.003333,0.139622,0.165347,19.233333,19.806667
Dinner,Sat,2.555556,2.47619,3.102889,2.875476,0.158048,0.147906,19.661778,21.276667
Dinner,Sun,2.929825,2.578947,3.167895,3.516842,0.160113,0.18725,20.506667,24.12
Dinner,Thur,2.0,,3.0,,0.159744,,18.78,
Lunch,Fri,3.0,1.833333,3.0,2.28,0.187735,0.188937,15.98,12.323333
Lunch,Thur,2.5,2.352941,2.666364,3.03,0.160311,0.163863,17.075227,19.190588


 We could augment this table to include partial totals by passing margins=True. This
 has the effect of adding All row and column labels, with corresponding values being
 the group statistics for all the data within a single tier:

In [141]:
tips.pivot_table(['tip','tip_pct'],index=['time','day'],columns='smoker',margins=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip,tip,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Dinner,Fri,2.75,3.003333,2.94,0.139622,0.165347,0.158916
Dinner,Sat,3.102889,2.875476,2.993103,0.158048,0.147906,0.153152
Dinner,Sun,3.167895,3.516842,3.255132,0.160113,0.18725,0.166897
Dinner,Thur,3.0,,3.0,0.159744,,0.159744
Lunch,Fri,3.0,2.28,2.382857,0.187735,0.188937,0.188765
Lunch,Thur,2.666364,3.03,2.767705,0.160311,0.163863,0.161301
All,,2.991854,3.00871,2.998279,0.159328,0.163196,0.160803


Here, the All values are means without taking into account smoker versus
 non-smoker (the All columns) or any of the two levels of grouping on the rows (the All
 row).
 To use a different aggregation function, pass it to aggfunc. For example, 'count' or
 len will give you a cross-tabulation (count or frequency) of group sizes:

In [142]:
tips.pivot_table('tip',index=['day','smoker'],columns='time',aggfunc=len,margins=True)

Unnamed: 0_level_0,time,Dinner,Lunch,All
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,3.0,1.0,4
Fri,Yes,9.0,6.0,15
Sat,No,45.0,,45
Sat,Yes,42.0,,42
Sun,No,57.0,,57
Sun,Yes,19.0,,19
Thur,No,1.0,44.0,45
Thur,Yes,,17.0,17
All,,176.0,68.0,244


If some combinations are empty (or otherwise NA), you may wish to pass a
 fill_value

In [147]:
tips.pivot_table('tip',index=['day','smoker'],columns='time',aggfunc=len,margins=True,fill_value=0)

Unnamed: 0_level_0,time,Dinner,Lunch,All
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,3,1,4
Fri,Yes,9,6,15
Sat,No,45,0,45
Sat,Yes,42,0,42
Sun,No,57,0,57
Sun,Yes,19,0,19
Thur,No,1,44,45
Thur,Yes,0,17,17
All,,176,68,244


###### cross table

In [148]:
import pandas as pd

# Creating the DataFrame
data = {
    'Sample': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Nationality': ['USA', 'Japan', 'USA', 'Japan', 'Japan', 'Japan', 'USA', 'USA', 'Japan', 'USA'],
    'Handedness': ['Right-handed', 'Left-handed', 'Right-handed', 'Right-handed', 'Left-handed', 'Right-handed', 'Right-handed', 'Left-handed', 'Right-handed', 'Right-handed']
}

df = pd.DataFrame(data)

# Displaying the DataFrame
df


Unnamed: 0,Sample,Nationality,Handedness
0,1,USA,Right-handed
1,2,Japan,Left-handed
2,3,USA,Right-handed
3,4,Japan,Right-handed
4,5,Japan,Left-handed
5,6,Japan,Right-handed
6,7,USA,Right-handed
7,8,USA,Left-handed
8,9,Japan,Right-handed
9,10,USA,Right-handed


In [152]:
pd.crosstab(df['Nationality'],df['Handedness'],margins=True)

Handedness,Left-handed,Right-handed,All
Nationality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Japan,2,3,5
USA,1,4,5
All,3,7,10


In [153]:
pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)

Unnamed: 0_level_0,smoker,No,Yes,All
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dinner,Fri,3,9,12
Dinner,Sat,45,42,87
Dinner,Sun,57,19,76
Dinner,Thur,1,0,1
Lunch,Fri,1,6,7
Lunch,Thur,44,17,61
All,,151,93,244
