In [1]:
import pandas as pd
import numpy as np

# Read the Titanic CSV into a DataFrame, then build a GroupBy object keyed on
# the 'Embarked' column; the aggregation cells below reuse `embarked_grouped`.
df = pd.read_csv(filepath_or_buffer='titanic.csv')
embarked_grouped = df.groupby(by='Embarked')

Lambda (inline) functions are really useful with **agg**, as they provide a quick and convenient way of applying a custom function across the grouped data.

In [2]:
# Take the first value of every column within each 'Embarked' group.
# Positional lookup keeps a leading missing value (NaN) instead of skipping it.
embarked_grouped.agg(lambda column: column.iloc[0])

Unnamed: 0_level_0,Unnamed: 0,Cabin,Fare,Pclass,Ticket,Age,Name,Parch,Sex,SibSp,Survived
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
C,1,C85,71.2833,1,PC 17599,38.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,female,1,1.0
Q,5,,8.4583,3,330877,,"Moran, Mr. James",0,male,0,0.0
S,0,,7.25,3,A/5 21171,22.0,"Braund, Mr. Owen Harris",0,male,1,0.0


We can also apply multiple functions at once:

In [3]:
# mean/std are only defined for numeric data. Older pandas silently dropped the
# text columns (Cabin, Ticket, Name, Sex) here; pandas >= 2.0 raises instead,
# so select the numeric columns explicitly before aggregating.
numeric_columns = df.select_dtypes(include='number').columns.tolist()

# Apply three aggregations to every numeric column of each 'Embarked' group:
# the first value (NaN kept), the mean, and the sample standard deviation.
# The string aliases replace np.mean/np.std, which pandas >= 2.1 deprecates as
# agg arguments; they resolve to the same groupby mean/std (ddof=1) as before.
embarked_grouped[numeric_columns].agg([lambda x: x.values[0], 'mean', 'std'])

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Fare,Fare,Fare,Pclass,Pclass,Pclass,Age,Age,Age,Parch,Parch,Parch,SibSp,SibSp,SibSp,Survived,Survived,Survived
Unnamed: 0_level_1,<lambda_0>,mean,std,<lambda_0>,mean,std,<lambda_0>,mean,std,<lambda_0>,...,std,<lambda_0>,mean,std,<lambda_0>,mean,std,<lambda_0>,mean,std
Embarked,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
C,1,689.655556,383.355741,71.2833,62.336267,84.185996,1,1.851852,0.936802,38.0,...,15.258092,0,0.37037,0.670579,1,0.4,0.554803,1.0,0.553571,0.498608
Q,5,667.593496,390.418813,8.4583,12.409012,13.616133,3,2.894309,0.380099,,...,15.045784,0,0.113821,0.531056,0,0.341463,0.885487,0.0,0.38961,0.49086
S,0,642.095186,374.290951,7.25,27.418824,37.096402,3,2.347921,0.784126,22.0,...,14.047507,0,0.426696,0.943989,1,0.550328,1.161723,0.0,0.336957,0.473037


We can also use a dictionary to specify different functions to apply to different columns:

In [4]:
# Per-column aggregation on each 'Embarked' group: the total of 'Fare' and the
# first 'Age' value (NaN kept). The string alias 'sum' replaces np.sum, which
# pandas >= 2.1 deprecates as an agg argument; the result is the same.
embarked_grouped.agg({
    'Fare': 'sum',
    'Age': lambda x: x.values[0]
})

Unnamed: 0_level_0,Fare,Age
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
C,16830.7922,38.0
Q,1526.3085,
S,25033.3862,22.0


We can also group by more than one column by providing a list of column names:

In [5]:
# Passing a list of column names groups on each distinct (Sex, Embarked) pair.
# NOTE(review): the variable name says "age" but the grouping key is 'Sex';
# the name is kept as-is because code outside this cell may reference it.
age_embarked_grouped = df.groupby(by=['Sex', 'Embarked'])
# `.groups` maps each (Sex, Embarked) key tuple to that group's row labels.
age_embarked_grouped.groups


{('male',
  'S'): Int64Index([   0,    4,    6,    7,   12,   13,   17,   20,   21,   23,
             ...
             1283, 1284, 1285, 1289, 1292, 1294, 1297, 1304, 1306, 1307],
            dtype='int64', length=623),
 ('female',
  'C'): Int64Index([   1,    9,   19,   31,   39,   43,   52,  111,  114,  128,
             ...
             1238, 1241, 1252, 1255, 1259, 1262, 1266, 1288, 1293, 1305],
            dtype='int64', length=113),
 ('female',
  'S'): Int64Index([   2,    3,    8,   10,   11,   14,   15,   18,   24,   25,
             ...
             1265, 1267, 1273, 1274, 1276, 1282, 1286, 1291, 1300, 1303],
            dtype='int64', length=291),
 ('male',
  'Q'): Int64Index([   5,   16,   46,  116,  126,  143,  171,  188,  196,  214,  245,
              260,  278,  280,  301,  364,  388,  411,  421,  428,  459,  468,
              510,  517,  525,  552,  560,  613,  626,  629,  703,  718,  749,
              768,  776,  778,  787,  790,  825,  828,  890,  891,  893,  907,
