In [42]:
import seaborn as sns
import pandas as pd
import numpy as np

In [43]:
# Load the 'tips' example dataset through seaborn and preview the first rows.
tips = sns.load_dataset('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


# GROUPBY

In [44]:
# Once a GroupBy object exists, three kinds of operation can be applied to it:
# aggregate, filter and transform (each is demonstrated in its own section).
# sex and smoker behave as categorical keys here (results come out in
# Male/Yes-first order rather than alphabetically), so `observed` is passed
# explicitly: it pins the behaviour shown in this notebook and avoids the
# FutureWarning newer pandas emits when the argument is omitted.
tips_gb = tips.groupby(["sex", "smoker"], observed=False)
tips_gb

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F1FE0810D0>

# AGG

The aggregate operation reduces all the data in each group to a single value per requested statistic. You pass a dictionary that maps each column to the aggregation(s) you want. In the example below we ask, among other things, for both the mean and the min of the tip column for each group. Commonly used aggregation functions include:
1. mean(): Compute mean of groups
2. sum(): Compute sum of group values
3. size(): Compute group sizes
4. count(): Compute count of group
5. std(): Standard deviation of groups
6. var(): Compute variance of groups
7. sem(): Standard error of the mean of groups
8. describe(): Generates descriptive statistics
9. first(): Compute first of group values
10. last(): Compute last of group values
11. nth() : Take nth value, or a subset if n is a list
12. min(): Compute min of group values
13. max(): Compute max of group values

In [45]:
# Aggregate: the spec maps each column to the list of statistics computed for it,
# yielding one row per group and one (column, statistic) pair per output column.
agg_spec = {
    "tip": ["mean", "min", "size"],
    "day": ["first", "last"],
    "total_bill": ["size", "max", "sum"],
}
tips_agg = tips_gb.agg(agg_spec)

tips_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip,tip,day,day,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,size,first,last,size,max,sum
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Male,Yes,3.051167,1.0,60,Sat,Sat,60,50.81,1337.07
Male,No,3.113402,1.25,97,Sun,Sat,97,48.33,1919.75
Female,Yes,2.931515,1.0,33,Sat,Sat,33,44.3,593.27
Female,No,2.773519,1.0,54,Sun,Thur,54,35.83,977.68


In [46]:
# Show the group keys (sex, smoker) as ordinary columns instead of index levels.
# The result is only displayed; tips_agg itself is not reassigned.
tips_agg.reset_index()

Unnamed: 0_level_0,sex,smoker,tip,tip,tip,day,day,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,min,size,first,last,size,max,sum
0,Male,Yes,3.051167,1.0,60,Sat,Sat,60,50.81,1337.07
1,Male,No,3.113402,1.25,97,Sun,Sat,97,48.33,1919.75
2,Female,Yes,2.931515,1.0,33,Sat,Sat,33,44.3,593.27
3,Female,No,2.773519,1.0,54,Sun,Thur,54,35.83,977.68


In [47]:
# The dict-of-lists aggregation leaves two-level (column, function) column labels.
tips_agg.columns

MultiIndex([(       'tip',  'mean'),
            (       'tip',   'min'),
            (       'tip',  'size'),
            (       'day', 'first'),
            (       'day',  'last'),
            ('total_bill',  'size'),
            ('total_bill',   'max'),
            ('total_bill',   'sum')],
           )

In [48]:
# Move the inner column level (the aggregation name) into the row index.
# Column/function pairs that were never requested appear as NaN.
tips_agg.stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,day,tip,total_bill
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Male,Yes,first,Sat,,
Male,Yes,last,Sat,,
Male,Yes,max,,,50.81
Male,Yes,mean,,3.051167,
Male,Yes,min,,1.0,
Male,Yes,size,,60.0,60.0
Male,Yes,sum,,,1337.07
Male,No,first,Sun,,
Male,No,last,Sat,,
Male,No,max,,,48.33


In [49]:
# The same labels as a plain array of (column, function) tuples.
tips_agg.columns.values

array([('tip', 'mean'), ('tip', 'min'), ('tip', 'size'), ('day', 'first'),
       ('day', 'last'), ('total_bill', 'size'), ('total_bill', 'max'),
       ('total_bill', 'sum')], dtype=object)

In [50]:
"__".join(tips_agg.columns.values[0])

'tip__mean'

In [51]:
"__".join(tips_agg.columns.values[0]).strip()

'tip__mean'

In [52]:
# Flatten the two-level (column, function) labels into single strings such as
# "tip__mean".
# The MultiIndex guard keeps this cell safe to re-run: once the labels are
# already flat strings, "__".join would iterate over the characters of each
# label and corrupt the column names.
if isinstance(tips_agg.columns, pd.MultiIndex):
    tips_agg.columns = ["__".join(col).strip() for col in tips_agg.columns.values]
tips_agg.columns

Index(['tip__mean', 'tip__min', 'tip__size', 'day__first', 'day__last',
       'total_bill__size', 'total_bill__max', 'total_bill__sum'],
      dtype='object')

In [53]:
# Display the aggregated frame again, now with single-level column names.
tips_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,tip__mean,tip__min,tip__size,day__first,day__last,total_bill__size,total_bill__max,total_bill__sum
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Male,Yes,3.051167,1.0,60,Sat,Sat,60,50.81,1337.07
Male,No,3.113402,1.25,97,Sun,Sat,97,48.33,1919.75
Female,Yes,2.931515,1.0,33,Sat,Sat,33,44.3,593.27
Female,No,2.773519,1.0,54,Sun,Thur,54,35.83,977.68


# FILTER

In [55]:
# Regroup by day and time for the filter examples.
# `observed=False` is spelled out so that (day, time) combinations with no rows
# (e.g. Sat/Lunch) stay in aggregated results, as in the recorded outputs, and
# so newer pandas does not warn about the implicit default.
tips_gb = tips.groupby(["day", "time"], observed=False)

In [58]:
# Sum the party sizes within each group, then take the median of those totals.
size_totals = tips_gb.agg({"size": "sum"})
size_totals.median()

size    88.0
dtype: float64

In [59]:
# Median of the per-group party-size totals, extracted as a scalar.
# The value is selected by its label: an integer key on a label-indexed Series
# (`[0]`) relies on a positional fallback that pandas has deprecated.
median_size = tips_gb.agg({"size": "sum"}).median()["size"]

In [63]:
# Show the scalar threshold that the filter on summed party size compares against.
median_size

88.0

In [65]:
# Summed party size for every (day, time) group, for comparison with median_size.
tips_gb["size"].sum()

day   time  
Thur  Lunch     150.0
      Dinner      2.0
Fri   Lunch      14.0
      Dinner     26.0
Sat   Lunch       NaN
      Dinner    219.0
Sun   Lunch       NaN
      Dinner    216.0
Name: size, dtype: float64

In [66]:
def total_size_below_median(group):
    """Return True when the group's summed party size is under median_size."""
    return group["size"].sum() < median_size


# filter returns the original rows of every group for which the predicate holds.
tips_gb.filter(total_size_below_median)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
90,28.97,3.0,Male,Yes,Fri,Dinner,2
91,22.49,3.5,Male,No,Fri,Dinner,2
92,5.75,1.0,Female,Yes,Fri,Dinner,2
93,16.32,4.3,Female,Yes,Fri,Dinner,2
94,22.75,3.25,Female,No,Fri,Dinner,2
95,40.17,4.73,Male,Yes,Fri,Dinner,4
96,27.28,4.0,Male,Yes,Fri,Dinner,2
97,12.03,1.5,Male,Yes,Fri,Dinner,2
98,21.01,3.0,Male,Yes,Fri,Dinner,2
99,12.46,1.5,Male,No,Fri,Dinner,2


In [68]:
def bill_total_below_100(group):
    """Return True when the group's summed total_bill is under 100."""
    return group["total_bill"].sum() < 100


# Keep only the rows of groups whose combined bill stays below the threshold.
tips_gb.filter(bill_total_below_100)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
220,12.16,2.2,Male,Yes,Fri,Lunch,2
221,13.42,3.48,Female,Yes,Fri,Lunch,2
222,8.58,1.92,Male,Yes,Fri,Lunch,1
223,15.98,3.0,Female,No,Fri,Lunch,3
224,13.42,1.58,Male,Yes,Fri,Lunch,2
225,16.27,2.5,Female,Yes,Fri,Lunch,2
226,10.09,2.0,Female,Yes,Fri,Lunch,2
243,18.78,3.0,Female,No,Thur,Dinner,2


In [74]:
def largest_bill_below_25(group):
    """Return True when the group's largest total_bill is under 25."""
    return group["total_bill"].max() < 25


# Keep only the rows of groups in which no single bill reaches the threshold.
tips_gb.filter(largest_bill_below_25)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
220,12.16,2.2,Male,Yes,Fri,Lunch,2
221,13.42,3.48,Female,Yes,Fri,Lunch,2
222,8.58,1.92,Male,Yes,Fri,Lunch,1
223,15.98,3.0,Female,No,Fri,Lunch,3
224,13.42,1.58,Male,Yes,Fri,Lunch,2
225,16.27,2.5,Female,Yes,Fri,Lunch,2
226,10.09,2.0,Female,Yes,Fri,Lunch,2
243,18.78,3.0,Female,No,Thur,Dinner,2


# TRANSFORM

In [81]:
# Regroup by day only for the transform examples.
# `observed=False` is passed explicitly to pin the behaviour for the categorical
# key and avoid the FutureWarning newer pandas emits when it is omitted.
tips_gb = tips.groupby(["day"], observed=False)

In [79]:
# Per-day mean of the numeric columns (total_bill, tip, size).
# `numeric_only=True` is required on recent pandas, where averaging the
# non-numeric columns (sex, smoker, time) raises a TypeError instead of the
# columns being dropped silently. `observed=False` pins the grouping behaviour
# for the categorical key.
tips.groupby(["day"], observed=False).mean(numeric_only=True)

Unnamed: 0_level_0,total_bill,tip,size
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Thur,17.682742,2.771452,2.451613
Fri,17.151579,2.734737,2.105263
Sat,20.441379,2.993103,2.517241
Sun,21.41,3.255132,2.842105


In [82]:
def relative_to_group_mean(values):
    """Divide each value by the mean of the group it belongs to."""
    return values / values.mean()


# transform returns a result aligned with the original rows, one value per row.
tips_gb[["total_bill", "tip"]].transform(relative_to_group_mean).head()

Unnamed: 0,total_bill,tip
0,0.793554,0.310279
1,0.482952,0.509964
2,0.981317,1.075225
3,1.106025,1.016856
4,1.148529,1.109018


In [84]:
def relative_to_group_mean_times_10(values):
    """Divide each value by the mean of its group, then scale by 10."""
    return values / values.mean() * 10


# Same row-aligned transform as before, applied to total_bill only and scaled.
tips_gb[["total_bill"]].transform(relative_to_group_mean_times_10).head()

Unnamed: 0,total_bill
0,7.935544
1,4.829519
2,9.813171
3,11.060252
4,11.485287
