In [1]:
import pandas as pd
import numpy as np

In [4]:
# Fix NumPy's legacy global seed so the random scores are reproducible.
np.random.seed(0)

# Ten students (A-J): 2018-2020 repeated three times, plus a single 2021 row.
df = pd.DataFrame(
    dict(
        name=list('ABCDEFGHIJ'),
        year=[2018, 2019, 2020] * 3 + [2021],
        score=np.random.randint(80, 100, 10),  # integers in [80, 100)
    )
)
df

Unnamed: 0,name,year,score
0,A,2018,92
1,B,2019,95
2,C,2020,80
3,D,2018,83
4,E,2019,83
5,F,2020,87
6,G,2018,89
7,H,2019,99
8,I,2020,98
9,J,2021,84


In [5]:
# Mean score across all students, regardless of year.
df['score'].mean()

np.float64(89.0)

In [6]:
# Row-level filter: students whose own score is strictly greater than 90.
df[df['score'].gt(90)]

Unnamed: 0,name,year,score
0,A,2018,92
1,B,2019,95
7,H,2019,99
8,I,2020,98


In [7]:
# Mean score for each year.
(
    df
    .groupby('year')['score']
    .mean()
)

year
2018    88.000000
2019    92.333333
2020    88.333333
2021    84.000000
Name: score, dtype: float64

In [8]:
# (number of rows, number of columns) of the frame.
df.shape

(10, 3)

So far, so good. But consider this: we want to determine which years in our school had an average score of at least 90, and *see all the students in those years*. In other words, we want to keep or drop whole groups of students **based on a per-year aggregate calculation**. How can we do that?

The answer, it turns out, is to apply the `filter` method to our `DataFrameGroupBy` object. All we need to do is pass `filter` a function that, given a group of rows, returns either `True` or `False` to indicate whether those rows should be in the resulting data frame.

In [13]:
# using lambda function
# Keep every row of a year whose mean score is at least 90; ">=" (not ">")
# so that a year averaging exactly 90 is included.
df.groupby('year').filter(lambda grp: grp['score'].mean() >= 90)

Unnamed: 0,name,year,score
1,B,2019,95
4,E,2019,83
7,H,2019,99


In [14]:
# using full-fledged python function
def year_average_is_at_least_90(df):
    """Tell whether a group's mean ``score`` is at least 90.

    Written to be passed as the predicate to ``DataFrameGroupBy.filter``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single group; must contain a ``score`` column. Inside this
        function the name shadows the notebook-level ``df``.

    Returns
    -------
    Boolean result of the comparison: truthy when the mean is >= 90.
    """
    # ">=" rather than ">" so a mean of exactly 90 qualifies, as the name says.
    return df['score'].mean() >= 90
# Pass the predicate itself (not a call); it is applied to each year's rows.
(
    df
    .groupby('year')
    .filter(year_average_is_at_least_90)
)

Unnamed: 0,name,year,score
1,B,2019,95
4,E,2019,83
7,H,2019,99


The result of running this code is a data frame whose rows all come from df, from years in which the average final-exam math score was at least 90.

In [21]:
# Preview only: divide every score by 100, group by group. The result keeps
# the original row index, and `df` itself is not modified here.
(
    df
    .groupby('year')['score']
    .transform(lambda scores: scores / 100)
)

0    0.92
1    0.95
2    0.80
3    0.83
4    0.83
5    0.87
6    0.89
7    0.99
8    0.98
9    0.84
Name: score, dtype: float64

In [22]:
# Overwrite `score` with score / 100.
# NOTE: this mutates `df` in place, so re-running the cell divides by 100 again.
df['score'] = df.groupby('year')['score'].transform(lambda scores: scores / 100)

In [23]:
# Show the frame after `score` was overwritten with the rescaled values.
df

Unnamed: 0,name,year,score
0,A,2018,0.92
1,B,2019,0.95
2,C,2020,0.8
3,D,2018,0.83
4,E,2019,0.83
5,F,2020,0.87
6,G,2018,0.89
7,H,2019,0.99
8,I,2020,0.98
9,J,2021,0.84


But we can do much more than this. After all, our lambda function has access to all the rows from each year. *This means we can run aggregate functions, such as `sum` or `mean`.*

In [26]:
# Broadcast each year's maximum score back onto every row of that year.
(
    df
    .groupby('year')['score']
    .transform('max')
)

0    0.92
1    0.99
2    0.98
3    0.92
4    0.99
5    0.98
6    0.92
7    0.99
8    0.98
9    0.84
Name: score, dtype: float64

In [28]:
# Add each year's top score as a new column. `assign` returns a new frame,
# which is bound back to `df`.
df = df.assign(
    max_value_per_year=df.groupby('year')['score'].transform('max'),
)
df

Unnamed: 0,name,year,score,max_value_per_year
0,A,2018,0.92,0.92
1,B,2019,0.95,0.99
2,C,2020,0.8,0.98
3,D,2018,0.83,0.92
4,E,2019,0.83,0.99
5,F,2020,0.87,0.98
6,G,2018,0.89,0.92
7,H,2019,0.99,0.99
8,I,2020,0.98,0.98
9,J,2021,0.84,0.84


In [29]:
# Display the frame again; nothing has changed since the previous cell.
df

Unnamed: 0,name,year,score,max_value_per_year
0,A,2018,0.92,0.92
1,B,2019,0.95,0.99
2,C,2020,0.8,0.98
3,D,2018,0.83,0.92
4,E,2019,0.83,0.99
5,F,2020,0.87,0.98
6,G,2018,0.89,0.92
7,H,2019,0.99,0.99
8,I,2020,0.98,0.98
9,J,2021,0.84,0.84
