ðŸŸ¦ 1. Import

In [1]:
import pandas as pd
import numpy as np

ðŸŸ¦ 2. Example DataFrame

In [2]:
df = pd.DataFrame({
    "route_id":    [10, 10, 10, 20, 20, 30, 30, 30],
    "direction":   [0, 0, 1, 0, 1, 0, 0, 1],
    "delay_min":   [3, 5, 0, 7, 3, 1, 4, 2],
    "trip_duration":[25, 30, 28, 45, 50, 20, 35, 22],
    "score":       [0.80, 0.60, 0.90, 0.50, 0.70, 0.95, 0.40, 0.85]
})

df


Unnamed: 0,route_id,direction,delay_min,trip_duration,score
0,10,0,3,25,0.8
1,10,0,5,30,0.6
2,10,1,0,28,0.9
3,20,0,7,45,0.5
4,20,1,3,50,0.7
5,30,0,1,20,0.95
6,30,0,4,35,0.4
7,30,1,2,22,0.85


ðŸŸ¦ 3. Using transform()

In [3]:
# Mean delay for each route, repeated for each row of that route
df["mean_delay_by_route"] = df.groupby("route_id")["delay_min"].transform("mean")
df

Unnamed: 0,route_id,direction,delay_min,trip_duration,score,mean_delay_by_route
0,10,0,3,25,0.8,2.666667
1,10,0,5,30,0.6,2.666667
2,10,1,0,28,0.9,2.666667
3,20,0,7,45,0.5,5.0
4,20,1,3,50,0.7,5.0
5,30,0,1,20,0.95,2.333333
6,30,0,4,35,0.4,2.333333
7,30,1,2,22,0.85,2.333333


In [6]:
# Z-score normalization within each route
df["delay_zscore"] = df["delay_min"] - df.groupby("route_id")["delay_min"].transform("mean")
df["delay_zscore"] /= df.groupby("route_id")["delay_min"].transform("std")
df

Unnamed: 0,route_id,direction,delay_min,trip_duration,score,mean_delay_by_route,delay_zscore
0,10,0,3,25,0.8,2.666667,0.132453
1,10,0,5,30,0.6,2.666667,0.927173
2,10,1,0,28,0.9,2.666667,-1.059626
3,20,0,7,45,0.5,5.0,0.707107
4,20,1,3,50,0.7,5.0,-0.707107
5,30,0,1,20,0.95,2.333333,-0.872872
6,30,0,4,35,0.4,2.333333,1.091089
7,30,1,2,22,0.85,2.333333,-0.218218


ðŸŸ¦ 4. Using filter()

In [7]:
# Keep only routes with more than 2 trips
df_filtered = df.groupby("route_id").filter(lambda g: len(g) > 2)
df_filtered

Unnamed: 0,route_id,direction,delay_min,trip_duration,score,mean_delay_by_route,delay_zscore
0,10,0,3,25,0.8,2.666667,0.132453
1,10,0,5,30,0.6,2.666667,0.927173
2,10,1,0,28,0.9,2.666667,-1.059626
5,30,0,1,20,0.95,2.333333,-0.872872
6,30,0,4,35,0.4,2.333333,1.091089
7,30,1,2,22,0.85,2.333333,-0.218218


In [8]:
# Keep groups where average delay > 3 minutes
df_delay_heavy = df.groupby("route_id").filter(lambda g: g["delay_min"].mean() > 3)
df_delay_heavy

Unnamed: 0,route_id,direction,delay_min,trip_duration,score,mean_delay_by_route,delay_zscore
3,20,0,7,45,0.5,5.0,0.707107
4,20,1,3,50,0.7,5.0,-0.707107


ðŸŸ¦ 5. Using .apply() for complex group logic

In [9]:
# Compute multiple stats in a custom way for each route
def route_performance(g):
    return pd.Series({
        "n_trips": len(g),
        "avg_delay": g["delay_min"].mean(),
        "pct_long_trips": (g["trip_duration"] > 30).mean(),
        "score_range": g["score"].max() - g["score"].min()
    })

df_group_stats = df.groupby("route_id").apply(route_performance).reset_index()
df_group_stats

  df_group_stats = df.groupby("route_id").apply(route_performance).reset_index()


Unnamed: 0,route_id,n_trips,avg_delay,pct_long_trips,score_range
0,10,3.0,2.666667,0.0,0.3
1,20,2.0,5.0,1.0,0.2
2,30,3.0,2.333333,0.333333,0.55


ðŸŸ¦ 6. Group-wise ranking, sorting, or labeling

In [10]:
df["delay_rank_by_route"] = df.groupby("route_id")["delay_min"].rank(ascending=False)
df

Unnamed: 0,route_id,direction,delay_min,trip_duration,score,mean_delay_by_route,delay_zscore,delay_rank_by_route
0,10,0,3,25,0.8,2.666667,0.132453,2.0
1,10,0,5,30,0.6,2.666667,0.927173,1.0
2,10,1,0,28,0.9,2.666667,-1.059626,3.0
3,20,0,7,45,0.5,5.0,0.707107,1.0
4,20,1,3,50,0.7,5.0,-0.707107,2.0
5,30,0,1,20,0.95,2.333333,-0.872872,3.0
6,30,0,4,35,0.4,2.333333,1.091089,1.0
7,30,1,2,22,0.85,2.333333,-0.218218,2.0


In [11]:
df["is_max_delay"] = df["delay_min"] == df.groupby("route_id")["delay_min"].transform("max")
df

Unnamed: 0,route_id,direction,delay_min,trip_duration,score,mean_delay_by_route,delay_zscore,delay_rank_by_route,is_max_delay
0,10,0,3,25,0.8,2.666667,0.132453,2.0,False
1,10,0,5,30,0.6,2.666667,0.927173,1.0,True
2,10,1,0,28,0.9,2.666667,-1.059626,3.0,False
3,20,0,7,45,0.5,5.0,0.707107,1.0,True
4,20,1,3,50,0.7,5.0,-0.707107,2.0,False
5,30,0,1,20,0.95,2.333333,-0.872872,3.0,False
6,30,0,4,35,0.4,2.333333,1.091089,1.0,True
7,30,1,2,22,0.85,2.333333,-0.218218,2.0,False


ðŸŸ¦ 7. Combining filter + transform + apply

In [12]:
def long_trip_ratio(g):
    return (g["trip_duration"] > 30).mean()

# Step 1: filter eligible routes
eligible = df.groupby("route_id").filter(lambda g: long_trip_ratio(g) >= 0.4)

# Step 2: normalize delay per route
eligible["delay_norm"] = eligible["delay_min"] / eligible.groupby("route_id")["delay_min"].transform("max")
eligible

Unnamed: 0,route_id,direction,delay_min,trip_duration,score,mean_delay_by_route,delay_zscore,delay_rank_by_route,is_max_delay,delay_norm
3,20,0,7,45,0.5,5.0,0.707107,1.0,True,1.0
4,20,1,3,50,0.7,5.0,-0.707107,2.0,False,0.428571


ðŸŸ¦ 8. Broadcasting complex group results back to DataFrame

In [13]:
# Compute per-route IQR of delay and attach it back to each row
def iqr(x):
    return x.quantile(0.75) - x.quantile(0.25)

df["delay_iqr"] = df.groupby("route_id")["delay_min"].transform(iqr)
df

Unnamed: 0,route_id,direction,delay_min,trip_duration,score,mean_delay_by_route,delay_zscore,delay_rank_by_route,is_max_delay,delay_iqr
0,10,0,3,25,0.8,2.666667,0.132453,2.0,False,2.5
1,10,0,5,30,0.6,2.666667,0.927173,1.0,True,2.5
2,10,1,0,28,0.9,2.666667,-1.059626,3.0,False,2.5
3,20,0,7,45,0.5,5.0,0.707107,1.0,True,2.0
4,20,1,3,50,0.7,5.0,-0.707107,2.0,False,2.0
5,30,0,1,20,0.95,2.333333,-0.872872,3.0,False,1.5
6,30,0,4,35,0.4,2.333333,1.091089,1.0,True,1.5
7,30,1,2,22,0.85,2.333333,-0.218218,2.0,False,1.5


## ðŸŸ¦ Summary of Advanced GroupBy Tools

| Method         | Output shape            | Best for                 |
| -------------- | ----------------------- | ------------------------ |
| `.agg()`       | One row per group       | summaries                |
| `.transform()` | Same shape as original  | broadcasting values back |
| `.filter()`    | Subset of rows          | keeping/dropping groups  |
| `.apply()`     | Flexible (row or group) | custom logic             |

---
