## Load Data

In [1]:
import pandas as pd

In [2]:
# https://rdrr.io/cran/reshape2/man/tips.html
df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/tips.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [4]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## Exploring With Aggregations

In [5]:
df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [6]:
df['tip'].mean()

np.float64(2.99827868852459)

In [7]:
df['tip'].max()

np.float64(10.0)

In [8]:
df[['total_bill','tip']].mean()

total_bill    19.785943
tip            2.998279
dtype: float64

### Basic: Filtering

In [9]:
df[df['sex'] == 'Female']

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
11,35.26,5.00,Female,No,Sun,Dinner,4
14,14.83,3.02,Female,No,Sun,Dinner,2
16,10.33,1.67,Female,No,Sun,Dinner,3
...,...,...,...,...,...,...,...
226,10.09,2.00,Female,Yes,Fri,Lunch,2
229,22.12,2.88,Female,Yes,Sat,Dinner,2
238,35.83,4.67,Female,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2


In [10]:
df[df['sex'] == 'Female']['tip']

0      1.01
4      3.61
11     5.00
14     3.02
16     1.67
       ... 
226    2.00
229    2.88
238    4.67
240    2.00
243    3.00
Name: tip, Length: 87, dtype: float64

In [11]:
print('mean', df[df['sex'] == 'Female']['tip'].mean())
print('max', df[df['sex'] == 'Female']['tip'].max())

mean 2.8334482758620685
max 6.5


In [12]:
male_mask = df['sex'] == 'Male'

In [16]:
print('mean', df[male_mask]['tip'].mean())
print('max', df[male_mask]['tip'].max())

mean 3.0896178343949043
max 10.0


### More Efficient: `groupby`

In [17]:
df.groupby(by='sex')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000026FF45EA0F0>

In [18]:
df.groupby('sex')['tip']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000026FF488B470>

In [19]:
df.groupby('sex')['tip'].mean()

sex
Female    2.833448
Male      3.089618
Name: tip, dtype: float64

In [20]:
group_by_sex = df.groupby('sex')

In [21]:
group_by_sex['tip'].mean()

sex
Female    2.833448
Male      3.089618
Name: tip, dtype: float64

In [22]:
group_by_sex['tip'].max()

sex
Female     6.5
Male      10.0
Name: tip, dtype: float64

In [23]:
group_by_sex['total_bill'].mean()

sex
Female    18.056897
Male      20.744076
Name: total_bill, dtype: float64

#### Alternative Method


##### `agg()` or `aggregate()`

In [28]:
group_by_sex['tip'].agg(max)

  group_by_sex['tip'].agg(max)


sex
Female     6.5
Male      10.0
Name: tip, dtype: float64

In [29]:
group_by_sex['tip'].aggregate(max)

  group_by_sex['tip'].aggregate(max)


sex
Female     6.5
Male      10.0
Name: tip, dtype: float64

In [30]:
group_by_sex['tip'].agg('max')

sex
Female     6.5
Male      10.0
Name: tip, dtype: float64

In [31]:
group_by_sex['tip'].aggregate('mean')

sex
Female    2.833448
Male      3.089618
Name: tip, dtype: float64

`apply()`

In [33]:
group_by_sex['tip'].apply(max)

  group_by_sex['tip'].apply(max)


sex
Female     6.5
Male      10.0
Name: tip, dtype: float64

In [34]:
group_by_sex['tip'].apply('max')

sex
Female     6.5
Male      10.0
Name: tip, dtype: float64

In [35]:
group_by_sex['tip'].apply('mean')

sex
Female    2.833448
Male      3.089618
Name: tip, dtype: float64

#### Multiple Columns

In [36]:
agg_results = group_by_sex['tip'].mean()
print(type(agg_results))
agg_results

<class 'pandas.core.series.Series'>


sex
Female    2.833448
Male      3.089618
Name: tip, dtype: float64

In [37]:
agg_results = group_by_sex[['tip']].mean()
print(type(agg_results))
agg_results

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0_level_0,tip
sex,Unnamed: 1_level_1
Female,2.833448
Male,3.089618


In [38]:
# Fails
group_by_sex.mean()

TypeError: agg function failed [how->mean,dtype->object]

In [39]:
# This works
group_by_sex.count()

Unnamed: 0_level_0,total_bill,tip,smoker,day,time,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,87,87,87,87,87,87
Male,157,157,157,157,157,157


In [40]:
group_by_sex[['total_bill','tip']].mean()

Unnamed: 0_level_0,total_bill,tip
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,18.056897,2.833448
Male,20.744076,3.089618


#### Multiple Aggregations

In [42]:
group_by_sex['tip'].agg(['mean', max, 'count'])

  group_by_sex['tip'].agg(['mean', max, 'count'])


Unnamed: 0_level_0,mean,max,count
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,2.833448,6.5,87
Male,3.089618,10.0,157


In [43]:
group_by_sex[['total_bill','tip']].agg(['mean', max, 'count'])

  group_by_sex[['total_bill','tip']].agg(['mean', max, 'count'])


Unnamed: 0_level_0,total_bill,total_bill,total_bill,tip,tip,tip
Unnamed: 0_level_1,mean,max,count,mean,max,count
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Female,18.056897,44.3,87,2.833448,6.5,87
Male,20.744076,50.81,157,3.089618,10.0,157


In [44]:
group_by_sex[['total_bill','tip']].agg(['mean', max, 'count']).T

  group_by_sex[['total_bill','tip']].agg(['mean', max, 'count']).T


Unnamed: 0,sex,Female,Male
total_bill,mean,18.056897,20.744076
total_bill,max,44.3,50.81
total_bill,count,87.0,157.0
tip,mean,2.833448,3.089618
tip,max,6.5,10.0
tip,count,87.0,157.0


In [45]:
df.groupby('sex')[['tip','total_bill']].describe()

Unnamed: 0_level_0,tip,tip,tip,tip,tip,tip,tip,tip,total_bill,total_bill,total_bill,total_bill,total_bill,total_bill,total_bill,total_bill
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Female,87.0,2.833448,1.159495,1.0,2.0,2.75,3.5,6.5,87.0,18.056897,8.009209,3.07,12.75,16.4,21.52,44.3
Male,157.0,3.089618,1.489102,1.0,2.0,3.0,3.76,10.0,157.0,20.744076,9.246469,7.25,14.0,18.35,24.71,50.81


In [46]:
df.groupby('sex')['tip'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Female,87.0,2.833448,1.159495,1.0,2.0,2.75,3.5,6.5
Male,157.0,3.089618,1.489102,1.0,2.0,3.0,3.76,10.0


In [47]:
df.groupby('sex')['tip'].describe().T

sex,Female,Male
count,87.0,157.0
mean,2.833448,3.089618
std,1.159495,1.489102
min,1.0,1.0
25%,2.0,2.0
50%,2.75,3.0
75%,3.5,3.76
max,6.5,10.0
