# 🧩 Notebook 06: Advanced pandas Techniques: Window Functions, Method Chaining & Performance Optimization

In [1]:
# Notebook import bootstrap: put the project root (the parent of the current
# working directory) on sys.path so the local `scripts` package is importable.
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
if sys.path.count(str(PROJECT_ROOT)) == 0:
    # Only append once, so re-running this cell does not duplicate the entry
    sys.path.append(str(PROJECT_ROOT))

from scripts import utils_io

## 🧭 0. Setup & Data Loading

In [2]:
# Standard setup: make the project root importable so `scripts` resolves
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.append(str(PROJECT_ROOT))

# Standard Imports
import numpy as np
import pandas as pd

from scripts import utils_io, agg_utils

# Read the three cleaned datasets from the assets folder
superstore = utils_io.load_csv("../assets/superstore_final.csv")
covid = utils_io.load_csv("../assets/covid_final.csv")
weather = utils_io.load_csv("../assets/weather_final.csv")

# Parse each dataset's date column into datetime values (needed for the
# .dt accessor, date comparisons and resampling used further down)
for _frame, _date_col in (
    (superstore, "order_date"),
    (covid, "date"),
    (weather, "date"),
):
    _frame[_date_col] = pd.to_datetime(_frame[_date_col])

print("✅ Data loaded and prepped for advanced operations.")

✅ Data loaded and prepped for advanced operations.


## 📦 1. Grouped Window Functions
### 🔄 1.1 Rolling Average of Profit by Region

In [3]:
# Monthly profit per region, smoothed with a trailing 3-row rolling mean.
# "month" is a "YYYY-MM" string label, so sorting it lexically is chronological.
superstore["month"] = superstore["order_date"].dt.to_period("M").astype(str)

region_monthly = (
    superstore
    .groupby(["region", "month"], as_index=False)["profit"]
    .sum()
    .sort_values(["region", "month"])
)

# The window counts rows, so it spans three calendar months only when a region
# has a row for every month. min_periods=1 lets the first rows of each region
# average over however many values are available instead of yielding NaN.
region_monthly = region_monthly.assign(
    rolling_profit=lambda monthly: (
        monthly
        .groupby("region")["profit"]
        .transform(lambda profits: profits.rolling(window=3, min_periods=1).mean())
    )
)

region_monthly.head()

Unnamed: 0,region,month,profit,rolling_profit
0,central,2020-01,431.08,431.08
1,central,2020-02,544.76,487.92
2,central,2020-03,61.89,345.91
3,central,2020-04,4.47,203.706667
4,central,2020-05,271.01,112.456667


### 📊 1.2 Rank Customers by Sales Within Each Region

In [4]:
# Total sales per (region, customer), then each customer's position in their region
customer_rank = (
    superstore
    .groupby(["region", "customer_id"], as_index=False)["sales"]
    .sum()
)

# ascending=False makes rank 1.0 the highest-selling customer of the region;
# with the default tie handling, tied customers share their average rank.
customer_rank = customer_rank.assign(
    sales_rank=lambda totals: totals.groupby("region")["sales"].rank(ascending=False)
)

customer_rank.sort_values(["region", "sales_rank"]).head()

Unnamed: 0,region,customer_id,sales,sales_rank
1637,central,cust-7936,4421.55,1.0
1799,central,cust-8541,3978.62,2.0
1645,central,cust-7972,3929.56,3.0
1398,central,cust-6931,3894.62,4.0
29,central,cust-1158,3856.07,5.0


## 🧪 2. Custom Grouped Evaluations with `grouped_eval()`
### 🧮 2.1 Percent Change in Monthly Sales by Category

In [5]:
# Month-over-month change in sales for each product category.
superstore["month"] = superstore["order_date"].dt.to_period("M").astype(str)

category_monthly = (
    superstore
    .groupby(["category", "month"], as_index=False)["sales"]
    .sum()
    .sort_values(["category", "month"])
)

# grouped_eval is a project helper from scripts.agg_utils; here it receives the
# grouping key, the source column, the name of the output column and the
# callable to apply. pct_change() yields fractions (e.g. -0.83 is an 83% drop),
# with NaN where there is no previous value to compare against.
category_monthly = agg_utils.grouped_eval(
    df=category_monthly,
    group_cols=["category"],
    target_col="sales",
    new_col="pct_change",
    func=lambda sales: sales.pct_change(),
)

category_monthly.head()


Unnamed: 0,category,month,sales,pct_change
0,furniture,2020-01,19001.35,
1,furniture,2020-02,3277.89,-0.827492
2,furniture,2020-03,13562.83,3.137671
3,furniture,2020-04,8463.34,-0.37599
4,furniture,2020-05,3320.68,-0.60764


### 📈 2.2 Rank Monthly Sales Growth by Region

In [6]:
# Region-level monthly sales, then a rank of the month-over-month growth.
superstore["month"] = superstore["order_date"].dt.to_period("M").astype(str)

region_monthly = (
    superstore
    .groupby(["region", "month"], as_index=False)["sales"]
    .sum()
    .sort_values(["region", "month"])
)

# ascending=False gives rank 1 to the largest growth.
# NOTE(review): the ranks displayed for a single region (e.g. 214, 326) are
# larger than the number of months one region can have, which suggests the
# rank is taken across all regions rather than within each region. Confirm
# against the implementation of agg_utils.grouped_eval.
region_monthly = agg_utils.grouped_eval(
    df=region_monthly,
    group_cols=["region"],
    target_col="sales",
    new_col="sales_growth_rank",
    func=lambda sales: sales.pct_change().rank(ascending=False),
)

region_monthly.head()

Unnamed: 0,region,month,sales,sales_growth_rank
0,central,2020-01,8366.32,
1,central,2020-02,6631.29,214.0
2,central,2020-03,6689.8,159.0
3,central,2020-04,269.01,326.0
4,central,2020-05,6252.0,2.0


## 🔗 3. Method Chaining Tricks
### 🔧 3.1 Filter + Transform + Rank in One Chain

In [7]:
# Method chaining example — customers with high recent purchases.
# The cutoff lives in a named constant (referenced from the query with "@")
# instead of a date literal buried inside the query string, so it is easy to
# find and change.
RECENT_CUTOFF = "2022-01-01"

top_recent_customers = (
    superstore
    .query("order_date >= @RECENT_CUTOFF")  # keep orders on or after the cutoff
    .groupby("customer_id")
    .agg(total_sales=("sales", "sum"), avg_profit=("profit", "mean"))
    # rank 1.0 = highest total sales among the recent orders
    .assign(sales_rank=lambda df: df["total_sales"].rank(ascending=False))
    .sort_values("sales_rank")
    .reset_index()  # turn customer_id back into a regular column
)

top_recent_customers.head()

Unnamed: 0,customer_id,total_sales,avg_profit,sales_rank
0,cust-5096,7909.22,138.178,1.0
1,cust-8502,7369.68,64.098,2.0
2,cust-6306,6894.61,65.821667,3.0
3,cust-6569,6889.29,64.252,4.0
4,cust-2041,6715.43,26.276667,5.0


### 📌 3.2 Pipe for Clean Logic Composition

In [8]:
def top_customers(df: pd.DataFrame, n: int = 5, by: str = "total_sales") -> pd.DataFrame:
    """Return the ``n`` rows of ``df`` with the largest values in column ``by``.

    Written to be used with ``DataFrame.pipe`` so the selection step can be
    dropped into a method chain.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the column named by ``by``.
    n : int, default 5
        Number of rows to keep.
    by : str, default "total_sales"
        Column to rank by. The default keeps existing calls unchanged.

    Returns
    -------
    pd.DataFrame
        At most ``n`` rows of ``df``, ordered from largest to smallest ``by``.
    """
    return df.nlargest(n, by)

# Total sales per customer, narrowed to the five largest via the piped helper
(
    superstore
    .groupby("customer_id")["sales"]
    .sum()
    .to_frame("total_sales")
    .pipe(top_customers, n=5)
)

Unnamed: 0_level_0,total_sales
customer_id,Unnamed: 1_level_1
cust-5096,7909.22
cust-8986,7385.57
cust-8502,7369.68
cust-1378,7109.54
cust-6306,6894.61


## 🚨 4. Defensive Patterns & Edge Case Handling
### ⚠️ 4.1 Handling Missing Dates in Time Series

In [9]:
# Daily totals on a gap-free calendar: resample("D") emits one row per day
# between the first and last order, including days that have no orders.
# Resampling with on="order_date" works on the column directly, so
# `superstore` is not re-indexed in place. That keeps this cell re-runnable
# and leaves earlier cells that read superstore["order_date"] working.
resampled = (
    superstore
    .resample("D", on="order_date")[["sales", "profit"]]
    .sum()      # days without orders sum to 0
    .fillna(0)  # explicit zero-fill guard for missing values (not a forward fill)
)

resampled.head()

Unnamed: 0_level_0,sales,profit
order_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-01,1292.63,22.94
2020-01-02,1947.16,205.52
2020-01-03,1774.42,23.84
2020-01-04,591.01,64.85
2020-01-05,1969.55,58.99


## 📌 5. Summary
### ✅ Key Takeaways from Advanced Pandas

- **Grouped window functions** unlock trend and comparative analysis across segments.
- `grouped_eval()` simplifies applying custom transforms such as `rank()` and `pct_change()` within groups.
- **Method chaining** with `.assign()` and `.pipe()` creates elegant, modular code blocks.
- Use **resample + fillna** techniques to handle gaps in time series.
- These patterns build a strong foundation for pandas mastery and real-world pipeline construction.

## 