In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy frame for the groupby demos: `key1` is the grouping key, `key2` is a
# numeric column containing one missing value, `data` holds string labels.
temp_df = pd.DataFrame(
    data={
        'key1': [1, 1, 1, 2, 2, 3],
        'key2': [12, np.nan, 45, 12, 45, 12],
        'data': ['Dynamo', 'Snax', 'Mortal', 'Scout', 'Joker', 'Krutika'],
    }
)

# Group the rows by `key1`, then narrow the grouped object to the `key2` column.
groupby_obj = temp_df.groupby(by=['key1'])
key_2_gp = groupby_obj['key2']

### Aggregation - groupby().agg()

In [5]:
# A single aggregation named by string: per-group mean of `key2`.
key_2_gp.agg(func='mean')

key1
1    28.5
2    28.5
3    12.0
Name: key2, dtype: float64

In [6]:
# Custom aggregation via a callable: per-group spread (largest minus smallest value).
key_2_gp.agg(lambda group_values: group_values.max() - group_values.min())

key1
1    33.0
2    33.0
3     0.0
Name: key2, dtype: float64

In [7]:
# Several aggregations at once: a list of names yields one output column per function.
key_2_gp.agg(
    func=['mean', 'std', 'min', 'max'],
)

Unnamed: 0_level_0,mean,std,min,max
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,28.5,23.334524,12.0,45.0
2,28.5,23.334524,12.0,45.0
3,12.0,,12.0,12.0


In [8]:
# Named aggregation: each keyword is the output column label and its value is
# the function applied, giving columns `agg1` (mean) and `agg2` (std).
key_2_gp.agg(agg1='mean', agg2='std')

Unnamed: 0_level_0,agg1,agg2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
1,28.5,23.334524
2,28.5,23.334524
3,12.0,


In [38]:
# Per-column aggregation: the dict maps each column to the function(s) applied
# to it -- four statistics for `key2`, and the group size (via `len`) for `data`.
groupby_obj.agg(
    func={
        'key2': ['mean', 'std', 'min', 'max'],
        'data': len,
    }
)

Unnamed: 0_level_0,key2,key2,key2,key2,data
Unnamed: 0_level_1,mean,std,min,max,len
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,28.5,23.334524,12.0,45.0,3
2,28.5,23.334524,12.0,45.0,2
3,12.0,,12.0,12.0,1


In [36]:
# Three identical integer columns (1..5) on a lettered index.
df = pd.DataFrame(
    data={column_label: list(range(1, 6)) for column_label in ('data1', 'data2', 'data3')},
    index=['a', 'b', 'c', 'd', 'e'],
)

# unstack() on a frame with a single-level index pivots it into a Series keyed
# by (column label, row label).
df.unstack()

data1  a    1
       b    2
       c    3
       d    4
       e    5
data2  a    1
       b    2
       c    3
       d    4
       e    5
data3  a    1
       b    2
       c    3
       d    4
       e    5
dtype: int64

### Split Apply Combine - apply()

In [41]:
import seaborn as sns
# Rebind `df` to seaborn's 'tips' example dataset (this replaces the small demo
# frame defined in the earlier unstack cell) and preview the first five rows.
df = sns.load_dataset('tips')
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [46]:
def tempfun(df, n = 5, column = 'col_name'):
    """Return the last `n` rows of `df` after sorting it ascending by `column`.

    Intended for use with ``groupby(...).apply``, where `df` is one group's
    sub-frame and the result is that group's top-`n` rows by `column`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select rows from.
    n : int, default 5
        Number of rows to keep from the end of the sorted frame.
    column : str, default 'col_name'
        Label of the column to sort by. The default is only a placeholder;
        callers are expected to pass a column that exists in `df`.

    Returns
    -------
    pandas.DataFrame
        The selected rows, in ascending order of `column`.
    """
    # Use tail(n) rather than the slice [-n:]: with n = 0 the slice becomes
    # [0:] and would return every row instead of none.
    return df.sort_values(by=column).tail(n)

# For every (smoker, day) group, keep the single row with the highest total_bill.
# Extra keyword arguments given to apply() are forwarded to tempfun.
(
    df
    .groupby(['smoker', 'day'], observed=False)
    [['total_bill', 'tip']]
    .apply(tempfun, n=1, column='total_bill')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Yes,Thur,197,43.11,5.0
Yes,Fri,95,40.17,4.73
Yes,Sat,170,50.81,10.0
Yes,Sun,182,45.35,3.5
No,Thur,142,41.19,5.0
No,Fri,94,22.75,3.25
No,Sat,212,48.33,9.0
No,Sun,156,48.17,5.0


### Pivot Tables - pivot_table()

In [56]:
# Number of bills for each (day, smoker) row and meal `time` column; aggfunc=len
# counts the rows falling into each cell.
# observed=False is passed explicitly, matching the groupby call earlier in the
# notebook: the grouping keys appear to be categorical, and relying on the
# implicit default presumably triggers a pandas FutureWarning and would change
# the table once that default changes -- confirm against your pandas version.
df.pivot_table(
    values=['total_bill'],
    index=['day', 'smoker'],
    columns=['time'],
    aggfunc=len,
    observed=False,
)

  df.pivot_table(['total_bill'], index = ['day', 'smoker'], columns = ['time'], aggfunc = len)


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill
Unnamed: 0_level_1,time,Lunch,Dinner
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2
Thur,Yes,17.0,
Thur,No,44.0,1.0
Fri,Yes,6.0,9.0
Fri,No,1.0,3.0
Sat,Yes,,42.0
Sat,No,,45.0
Sun,Yes,,19.0
Sun,No,,57.0


### Cross Tabulation - crosstab()

In [58]:
# Frequency table: rows are (smoker, time) combinations, columns are (day, sex)
# combinations, and each cell counts the matching records.
pd.crosstab(
    index=[df['smoker'], df['time']],
    columns=[df['day'], df['sex']],
)

Unnamed: 0_level_0,day,Thur,Thur,Fri,Fri,Sat,Sat,Sun,Sun
Unnamed: 0_level_1,sex,Male,Female,Male,Female,Male,Female,Male,Female
smoker,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Yes,Lunch,10,7,3,3,0,0,0,0
Yes,Dinner,0,0,5,4,27,15,15,4
No,Lunch,20,24,0,1,0,0,0,0
No,Dinner,0,1,2,1,32,13,43,14
