In [None]:
import pandas as pd

# Key Concept: GroupBy Operations in pandas

`groupby()` is pandas' equivalent to SQL's **GROUP BY**.  
It's one of the most powerful and commonly used tools for data analysis.

## Why use GroupBy?

- Calculate metrics **per customer**, **product**, **region**, **time period**, etc.
- Build **aggregation tables** for data warehouses
- Create **summary reports** from raw transaction data
- Prepare data for **dashboards** and **analytics**

## The Classic Pattern: Split → Apply → Combine

1. **Split** the data into groups based on one or more column values  
2. **Apply** aggregation functions (e.g., `sum`, `count`, `mean`, `min`, `max`, ...) to each group  
3. **Combine** the results into a new DataFrame

## Basic Syntax

```python
df.groupby(by=columns_to_group_by).agg_function()
# or
df.groupby(columns_to_group_by)[columns_to_aggregate].agg_function()
```

In [None]:
import pandas as pd
import numpy as np

# Sample sales data: ten orders (one per day) spread across four customers
data = {
    'order_id': range(1, 11),
    'customer_id': [101, 102, 101, 103, 102, 101, 104, 103, 102, 101],
    'product': ['Laptop', 'Phone', 'Tablet', 'Laptop', 'Laptop',
                'Phone', 'Tablet', 'Phone', 'Tablet', 'Laptop'],
    'quantity': [1, 2, 1, 1, 1, 3, 2, 1, 1, 2],
    'amount': [1200, 800, 500, 1200, 1200, 2400, 1000, 800, 500, 2400],
    'order_date': pd.date_range('2025-01-01', periods=10, freq='D')
}

df = pd.DataFrame(data)
print("ORIGINAL DATA:")
print(df)

# Basic groupby: total sales per customer
sales_per_customer = df.groupby('customer_id')['amount'].sum()
print("\nTotal sales per customer:")
print(sales_per_customer)

# Multiple aggregations at once.
# Named aggregation binds each output column to its source column and
# function explicitly, so the labels cannot drift out of sync with the
# aggregations the way a positional `.columns = [...]` overwrite can.
customer_summary = df.groupby('customer_id').agg(
    total_sales=('amount', 'sum'),
    num_orders=('order_id', 'count'),    # Number of orders
    avg_quantity=('quantity', 'mean'),   # Avg items per order
)
print("\nCustomer summary:")
print(customer_summary)


In [None]:
# Group by multiple columns: one row per (customer, product) pair.
# as_index=False keeps the grouping keys as ordinary columns, so no
# reset_index() is needed afterwards.
product_customer = (
    df.groupby(['customer_id', 'product'], as_index=False)[['amount', 'quantity']]
    .sum()
)

print("\nProduct sales by customer:")
print(product_customer)

# Derived column: total amount divided by total quantity for each pair
product_customer['avg_price'] = product_customer['amount'].div(
    product_customer['quantity']
)
print("\nWith average price:")
print(product_customer)

In [None]:
# Several statistics of the same column, one output column per statistic.
# Each keyword is the output column name; its value is the aggregation.
advanced_summary = (
    df.groupby('customer_id')['amount']
    .agg(
        sum='sum',
        mean='mean',
        min='min',
        max='max',
        count='count',
        std_dev='std',  # Custom column name
    )
    .round(2)
)

print("\nAdvanced customer metrics:")
print(advanced_summary)

# Custom aggregation function
def sales_range(x):
    """Return the spread of a group's values: its maximum minus its minimum."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest

# Mix built-in and custom aggregations, drawing on more than one column.
# `unique_products` is computed straight from the 'product' column rather
# than by looking rows up in the global `df` through each group's index
# labels: that lookup miscounts when index labels are duplicated and ties
# the aggregation to one particular global variable.
custom_agg = df.groupby('customer_id').agg(
    total=('amount', 'sum'),
    range=('amount', sales_range),
    unique_products=('product', 'nunique'),
)
print("\nCustom aggregations:")
print(custom_agg)

In [None]:
# Label every order with its calendar month for grouping.
# The `.dt` accessor requires order_date to be a datetime column.
df['month'] = df['order_date'].dt.to_period('M')

# Monthly sales trend: revenue and number of orders per month
monthly_sales = df.groupby('month').agg(
    revenue=('amount', 'sum'),
    orders=('order_id', 'count'),
)

print("\nMonthly sales trend:")
print(monthly_sales)

# Week-over-week comparison: bucket orders into weekly periods, total the
# amount per week, then express each week's change from the previous week
# as a percentage (the first week has no predecessor, so it is NaN).
df['week'] = df['order_date'].dt.to_period(freq='W')
weekly = df['amount'].groupby(df['week']).sum()
weekly_pct_change = weekly.pct_change().mul(100)

print("\nWeek-over-week growth %:")
print(weekly_pct_change.round(2))