In [1]:
# Import pandas and numpy
import pandas as pd
import numpy as np

# --- 1. Prepare Sample Data ---
data = {
    'Store': ['A', 'B', 'A', 'C', 'B', 'C', 'A', 'B', 'C'],
    'Product': ['Apple', 'Orange', 'Banana', 'Apple', 'Banana', 'Orange', 'Apple', 'Orange', 'Banana'],
    'Sales': [100, 150, 80, 200, 120, 90, 110, 160, 70],
    'Quantity': [10, 15, 8, 20, 12, 9, 11, 16, 7],
    'Region': ['North', 'North', 'South', 'South', 'North', 'South', 'North', 'North', 'South']
}
df = pd.DataFrame(data)

print("--- Sample DataFrame ---")
print(df)
print("-" * 30)


# --- 2. The Split-Apply-Combine Strategy ---
# 1. Split: Divide the data into groups based on specified keys (e.g., 'Store', 'Region').
# 2. Apply: Apply a function to each group independently (e.g., sum(), mean(), count(), custom function).
# 3. Combine: Combine the results of the function application into a new data structure (Series or DataFrame).


# --- 3. Grouping (`.groupby()`) ---
# Creates a GroupBy object, which holds information about the groups.
# No computation is done until an action (aggregation, transformation, filter) is called.

print("--- Grouping ---")

# Group by a single column ('Store')
grouped_by_store = df.groupby('Store')
print("GroupBy object (grouped by 'Store'):\n", grouped_by_store) # Shows the object type

# Group by multiple columns ('Region', 'Store')
grouped_by_region_store = df.groupby(['Region', 'Store'])
print("\nGroupBy object (grouped by 'Region', 'Store'):\n", grouped_by_region_store)

# Iterating through groups (useful for understanding)
print("\nIterating through groups ('Store'):")
for name, group_df in grouped_by_store:
    print(f"\nGroup Name (Store): {name}")
    print(group_df) # group_df is a DataFrame containing only rows for that group
print("-" * 20)

# Selecting a column after grouping
# This creates a SeriesGroupBy object, ready for aggregation on that specific column
sales_grouped_by_store = grouped_by_store['Sales']
print("\nSeriesGroupBy object (grouped 'Sales' by 'Store'):\n", sales_grouped_by_store)
print("-" * 30)


# --- 4. Aggregation ---
# Applying functions that reduce each group to a single summary value.

print("--- Aggregation ---")

# a) Using built-in aggregation methods directly on the GroupBy object
# Applies the function to all valid (usually numerical) columns

# Calculate sum for each store
print("Sum per Store (applied to all numerical columns):\n", grouped_by_store.sum())

# Calculate mean for each store
print("\nMean per Store:\n", grouped_by_store.mean()) # Deprecation warning might appear, use numeric_only=True if needed

# Calculate mean for each store (explicitly selecting numeric columns)
print("\nMean per Store (numeric_only=True):\n", grouped_by_store.mean(numeric_only=True))


# Calculate size of each group (includes NaN, returns Series)
print("\nSize per Store (.size()):\n", grouped_by_store.size())

# Calculate count of non-NA values in each column per group (returns DataFrame)
print("\nCount per Store (.count()):\n", grouped_by_store.count())

# Other common methods: .median(), .std(), .var(), .min(), .max(), .first(), .last(), .nunique()
print("\nMax Sales per Store:\n", grouped_by_store['Sales'].max()) # Aggregate on a specific column
print("\nNumber of unique products per Store:\n", grouped_by_store['Product'].nunique())

# Aggregation with multiple grouping keys
print("\nMean Sales & Quantity per Region and Store:\n", grouped_by_region_store[['Sales', 'Quantity']].mean())
print("-" * 20)

# b) Using the .agg() or .aggregate() method for more flexibility
# Apply multiple aggregation functions at once
agg_funcs = ['sum', 'mean', 'count']
agg_results = grouped_by_store['Sales'].agg(agg_funcs)
print(f"Multiple aggregations on 'Sales' ({agg_funcs}):\n", agg_results)

# Apply different functions to different columns using a dictionary
agg_dict = {
    'Sales': ['sum', 'mean'],       # Apply sum and mean to Sales
    'Quantity': 'sum',              # Apply sum to Quantity
    'Product': 'nunique'            # Apply nunique to Product
}
agg_dict_results = grouped_by_store.agg(agg_dict)
print("\nDifferent aggregations per column using .agg(dict):\n", agg_dict_results)

# Rename aggregated columns using tuples in the dictionary
agg_dict_rename = {
    'Sales': ('TotalSales', 'sum'),
    'Quantity': ('AvgQuantity', 'mean'),
    'Product': ('UniqueProducts', 'nunique')
}
try:
    # This syntax requires pandas >= 1.5 (approximately)
     agg_dict_rename_results = grouped_by_store.agg(
         TotalSales=pd.NamedAgg(column='Sales', aggfunc='sum'),
         AvgQuantity=pd.NamedAgg(column='Quantity', aggfunc='mean'),
         UniqueProducts=pd.NamedAgg(column='Product', aggfunc='nunique')
     )
     print("\nAggregations with renamed columns using NamedAgg:\n", agg_dict_rename_results)
except AttributeError:
     # Older pandas versions might use a different tuple syntax (less explicit)
     # This older syntax might be deprecated or removed in future versions.
     try:
         agg_dict_rename_results_old = grouped_by_store.agg(**agg_dict_rename)
         print("\nAggregations with renamed columns (older syntax):\n", agg_dict_rename_results_old)
     except Exception as e:
         print(f"\nCould not use older rename syntax either: {e}")

# Apply custom aggregation functions
def peak_to_peak(arr):
    return arr.max() - arr.min()

agg_custom = grouped_by_store['Sales'].agg(['sum', peak_to_peak])
print("\nAggregation with custom function (peak_to_peak):\n", agg_custom)
print("-" * 30)


# --- 5. Transformation (`.transform()`) ---
# Applies a function group-wise but returns a result with the *same index* as the original DataFrame.
# Useful for standardizing data within groups or filling missing values group-wise.

print("--- Transformation (.transform()) ---")

# Example: Standardize Sales within each Store (z-score)
# z = (x - mean) / std
zscore = lambda x: (x - x.mean()) / x.std()
df['Sales_ZScore_Store'] = grouped_by_store['Sales'].transform(zscore)
print("DataFrame with Sales Z-Score calculated per Store:\n", df[['Store', 'Sales', 'Sales_ZScore_Store']])

# Example: Fill missing values with the group mean (imagine 'Sales' had NaNs)
# df['Sales_Filled'] = df.groupby('Store')['Sales'].transform(lambda x: x.fillna(x.mean()))
print("-" * 30)


# --- 6. Filtering (`.filter()`) ---
# Keeps or discards entire groups based on a boolean condition evaluated per group.
# The function passed to .filter() should return True (keep group) or False (discard group).

print("--- Filtering (.filter()) ---")

# Keep only stores where the total sales are greater than 300
df_filtered_sales = grouped_by_store.filter(lambda group: group['Sales'].sum() > 300)
print("Filtered DataFrame (only groups where sum(Sales) > 300):\n", df_filtered_sales)

# Keep only stores that sold more than 1 unique product type
df_filtered_products = grouped_by_store.filter(lambda group: group['Product'].nunique() > 1)
print("\nFiltered DataFrame (only groups with > 1 unique product):\n", df_filtered_products)

# Keep only regions where the average quantity is >= 10
df_filtered_region_qty = grouped_by_region_store.filter(lambda group: group['Quantity'].mean() >= 10)
print("\nFiltered DataFrame (only Region/Store groups where mean(Quantity) >= 10):\n", df_filtered_region_qty)
print("-" * 30)

--- Sample DataFrame ---
  Store Product  Sales  Quantity Region
0     A   Apple    100        10  North
1     B  Orange    150        15  North
2     A  Banana     80         8  South
3     C   Apple    200        20  South
4     B  Banana    120        12  North
5     C  Orange     90         9  South
6     A   Apple    110        11  North
7     B  Orange    160        16  North
8     C  Banana     70         7  South
------------------------------
--- Grouping ---
GroupBy object (grouped by 'Store'):
 <pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F6A7978830>

GroupBy object (grouped by 'Region', 'Store'):
 <pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F6A7932D50>

Iterating through groups ('Store'):

Group Name (Store): A
  Store Product  Sales  Quantity Region
0     A   Apple    100        10  North
2     A  Banana     80         8  South
6     A   Apple    110        11  North

Group Name (Store): B
  Store Product  Sales  Quantity Region
1    

TypeError: agg function failed [how->mean,dtype->object]