In [6]:
import matplotlib.pyplot as plt
import seaborn as sns

# Load the two seaborn example datasets used by the cells below:
# 'mpg' is bound to `df`, 'penguins' to `penguins`.
df, penguins = (sns.load_dataset(name) for name in ('mpg', 'penguins'))

In [2]:
# aggregate a single column: mean on the first line, median on the second

print(df['acceleration'].mean(), df['acceleration'].median(), sep='\n')

15.568090452261307
15.5


In [4]:
# custom aggregate function

import numpy as np

def iqr(column):
    """Return the interquartile range of ``column``.

    Parameters
    ----------
    column
        Any object exposing ``quantile(q)`` (e.g. a pandas Series).

    Returns
    -------
    The 0.75 quantile minus the 0.25 quantile.
    """
    upper_quartile = column.quantile(0.75)
    lower_quartile = column.quantile(0.25)
    return upper_quartile - lower_quartile

# One row per statistic (custom iqr, built-in median), one column per feature.
print(
    df[['acceleration', 'mpg', 'horsepower']]
    .agg([iqr, 'median'])
)

        acceleration   mpg  horsepower
iqr             3.35  11.5        51.0
median         15.50  23.0        93.5


In [5]:
# cumulative statistics: running total and running maximum of 'weight'

# assign() returns a new frame, so `df` itself is not modified.
df_cum = df.assign(
    cum_weight=df['weight'].cumsum(),
    max_weight=df['weight'].cummax(),
)
print(df_cum[['name', 'weight', 'cum_weight', 'max_weight']])

                          name  weight  cum_weight  max_weight
0    chevrolet chevelle malibu    3504        3504        3504
1            buick skylark 320    3693        7197        3693
2           plymouth satellite    3436       10633        3693
3                amc rebel sst    3433       14066        3693
4                  ford torino    3449       17515        3693
..                         ...     ...         ...         ...
393            ford mustang gl    2790     1172459        5140
394                  vw pickup    2130     1174589        5140
395              dodge rampage    2295     1176884        5140
396                ford ranger    2625     1179509        5140
397                 chevy s-10    2720     1182229        5140

[398 rows x 4 columns]


In [7]:
# drop duplicates: keep the first row of every (species, island) pair

peng = penguins.drop_duplicates(subset=['species', 'island'], keep='first')
print(peng)

       species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0       Adelie  Torgersen            39.1           18.7              181.0   
20      Adelie     Biscoe            37.8           18.3              174.0   
30      Adelie      Dream            39.5           16.7              178.0   
152  Chinstrap      Dream            46.5           17.9              192.0   
220     Gentoo     Biscoe            46.1           13.2              211.0   

     body_mass_g     sex  
0         3750.0    Male  
20        3400.0  Female  
30        3250.0  Female  
152       3500.0  Female  
220       4500.0  Female  


In [8]:
# counting: number of rows for each distinct species

count = penguins.loc[:, 'species'].value_counts()
print(count)

species
Adelie       152
Gentoo       124
Chinstrap     68
Name: count, dtype: int64


In [9]:
# group by a column: total body mass per species

group = (
    penguins
    .groupby(by='species')['body_mass_g']
    .sum()
)
print(group)

species
Adelie       558800.0
Chinstrap    253850.0
Gentoo       624350.0
Name: body_mass_g, dtype: float64


In [11]:
# multiple statistics for each column, computed per species

group = (
    penguins
    .groupby(by='species')[['body_mass_g', 'flipper_length_mm']]
    .agg(['min', 'max', 'mean', 'median'])
)
print(group)

          body_mass_g                              flipper_length_mm         \
                  min     max         mean  median               min    max   
species                                                                       
Adelie         2850.0  4775.0  3700.662252  3700.0             172.0  210.0   
Chinstrap      2700.0  4800.0  3733.088235  3700.0             178.0  212.0   
Gentoo         3950.0  6300.0  5076.016260  5000.0             203.0  231.0   

                              
                 mean median  
species                       
Adelie     189.953642  190.0  
Chinstrap  195.823529  196.0  
Gentoo     217.186992  216.0  
