In [12]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
from IPython.display import display, Markdown

# --- 1. Configuration ---
# Progress banner only: this statement just announces the step; the
# configuration constants themselves are assigned in the following cell.
print("1. Setting up configuration...")

1. Setting up configuration...


In [13]:
# ---- Simulation inputs ----
NUM_USERS = 100_000              # number of synthetic users to generate
TODAY = datetime(2025, 9, 21)    # fixed reference date used for all date arithmetic

# ---- Universe definition ----
UNIVERSE_DAYS = 7                # look-back window (in days) for cart abandonment

# ---- Segment size limits ----
MIN_SEGMENT_SIZE = 500
MAX_SEGMENT_SIZE = 20_000

# ---- Optional business-defined floors for the data-driven thresholds ----
BUSINESS_MINIMUMS = dict(AOV_HIGH=2500)

# ---- Scoring weights (the five weights sum to 1.0) ----
W_CONV = 0.40    # conversion potential
W_PROF = 0.30    # profitability
W_LIFT = 0.15    # lift vs. control
W_STRAT = 0.10   # strategic fit
W_SIZE = 0.05    # segment size share

In [14]:

# --- 2. Data Simulation ---
def generate_mock_data(n_users, today_date):
    """Build a synthetic per-user table for the segmentation pipeline.

    The global NumPy RNG is re-seeded with 42 on every call, so repeated calls
    with the same arguments return identical frames.

    Args:
        n_users: Number of rows (users) to simulate.
        today_date: Reference ``datetime``; both date columns are expressed as
            whole-day offsets before this value.

    Returns:
        pandas.DataFrame with columns ``user_id``, ``cart_abandoned_date``,
        ``last_order_date``, ``avg_order_value``, ``sessions_last_30d``,
        ``num_cart_items``, ``engagement_score`` and ``profitability_score``.
    """
    print("2. Generating mock data...")
    np.random.seed(42)

    def days_before_today(day_offsets):
        # Turn an array of integer day offsets into concrete datetimes.
        return [today_date - timedelta(days=int(offset)) for offset in day_offsets]

    # NOTE: the order of the draws below is load-bearing. Every call consumes
    # the same global RNG stream, so reordering would change the generated data.
    abandon_offsets = np.random.geometric(p=0.2, size=n_users) % 30   # 0..29 days ago
    last_order_offsets = np.random.randint(5, 365, size=n_users)      # 5..364 days ago
    order_values = np.random.lognormal(mean=6, sigma=1, size=n_users).round(2)
    session_counts = np.random.poisson(lam=8, size=n_users)
    cart_item_counts = np.random.randint(1, 15, size=n_users)         # 1..14 items
    engagement = np.random.beta(a=2, b=3, size=n_users).round(4)
    profitability = np.random.beta(a=3, b=2, size=n_users).round(4)

    columns = {
        'user_id': range(1, n_users + 1),
        'cart_abandoned_date': days_before_today(abandon_offsets),
        'last_order_date': days_before_today(last_order_offsets),
        'avg_order_value': order_values,
        'sessions_last_30d': session_counts,
        'num_cart_items': cart_item_counts,
        'engagement_score': engagement,
        'profitability_score': profitability,
    }
    return pd.DataFrame(columns)

In [15]:
# --- 3. Universe Definition & Hybrid Thresholds ---
def setup_universe_and_thresholds(df, days, today_date, business_minimums=None):
    """Select the recent-abandoner universe and derive segmentation thresholds.

    The universe is every row whose ``cart_abandoned_date`` is on or after
    ``today_date - days``. Thresholds are quantiles of the universe; the high
    AOV threshold is additionally floored by a business-defined minimum.

    Args:
        df: User-level frame with ``cart_abandoned_date``,
            ``profitability_score``, ``avg_order_value``, ``engagement_score``,
            ``sessions_last_30d`` and ``num_cart_items`` columns.
        days: Look-back window in days.
        today_date: Reference ``datetime`` the window is measured from.
        business_minimums: Optional mapping of threshold name to minimum value
            (only ``'AOV_HIGH'`` is consulted). Defaults to the module-level
            ``BUSINESS_MINIMUMS``.

    Returns:
        ``(universe, thresholds)`` where ``universe`` is an independent copy of
        the selected rows and ``thresholds`` is a dict of plain Python numbers.

    Raises:
        ValueError: If no rows fall inside the window. Quantiles of an empty
            universe are NaN, which previously surfaced as an opaque
            "cannot convert float NaN to integer" error.
    """
    print("3. Defining universe and calculating hybrid thresholds...")
    if business_minimums is None:
        business_minimums = BUSINESS_MINIMUMS

    cutoff_date = today_date - timedelta(days=days)
    universe = df[df['cart_abandoned_date'] >= cutoff_date].copy()
    print(f"   - Universe size: {len(universe)} users")

    if universe.empty:
        raise ValueError(
            f"Universe is empty: no carts abandoned on or after {cutoff_date}; "
            "cannot compute quantile thresholds."
        )

    def _quantile(column, q):
        # Cast to a built-in float so the thresholds print and serialize
        # cleanly instead of as np.float64(...).
        return float(universe[column].quantile(q))

    # HYBRID THRESHOLDS: Combine data-driven quantiles with business-defined minimums.
    thresholds = {
        'PROFITABILITY_HIGH': round(_quantile('profitability_score', 0.80), 2),
        'AOV_HIGH': round(max(_quantile('avg_order_value', 0.90), business_minimums.get('AOV_HIGH', 0)), 2),
        'AOV_MID': round(_quantile('avg_order_value', 0.60), 2),
        'ENGAGEMENT_HIGH': round(_quantile('engagement_score', 0.80), 2),
        'SESSIONS_HIGH': int(_quantile('sessions_last_30d', 0.85)),
        'CART_ITEMS_HIGH': int(_quantile('num_cart_items', 0.90)),
    }
    print(f"   - Calculated Hybrid Thresholds: {thresholds}")
    return universe, thresholds

In [16]:
# --- 4. MECE Segmentation with Rule Generation ---
def create_mece_segments(df, t):
    """Assign every user to exactly one segment using first-match-wins rules.

    Rules are evaluated in priority order via ``np.select``: a user receives
    the first segment whose condition holds, and users matching no rule land
    in ``"Other Bucket"``. Because of that ordering, each human-readable rule
    in the returned map implicitly excludes users captured by earlier rules.

    The input frame is left untouched; the ``segment`` column is added to a
    copy, so callers (and re-run notebook cells) never see a half-updated
    frame.

    Args:
        df: User-level frame with ``num_cart_items``, ``avg_order_value``,
            ``engagement_score``, ``profitability_score`` and
            ``sessions_last_30d`` columns.
        t: Threshold dict with keys ``CART_ITEMS_HIGH``, ``AOV_HIGH``,
            ``AOV_MID``, ``ENGAGEMENT_HIGH``, ``PROFITABILITY_HIGH`` and
            ``SESSIONS_HIGH``.

    Returns:
        ``(segmented_df, rules_map)`` where ``segmented_df`` is a copy of
        ``df`` with a ``segment`` column and ``rules_map`` maps each segment
        name to its rule text (``"ELSE"`` for ``"Other Bucket"``).
    """
    print("4. Creating MECE segments and generating rules...")

    segmented = df.copy()
    cart_items = segmented['num_cart_items']
    aov = segmented['avg_order_value']
    engagement = segmented['engagement_score']
    profitability = segmented['profitability_score']
    sessions = segmented['sessions_last_30d']

    # (segment name, boolean mask, human-readable rule) in priority order.
    # Keeping the three together prevents the name/condition/rule lists from
    # drifting out of sync when a rule is added or reordered.
    segment_specs = [
        ("High Intent: Large Cart",
         cart_items > t['CART_ITEMS_HIGH'],
         f"num_cart_items > {t['CART_ITEMS_HIGH']}"),
        ("Whales: High AOV, High Engagement",
         (aov > t['AOV_HIGH']) & (engagement > t['ENGAGEMENT_HIGH']),
         f"AOV > {t['AOV_HIGH']} & engagement > {t['ENGAGEMENT_HIGH']}"),
        ("Whales: High AOV, Other",
         aov > t['AOV_HIGH'],
         f"AOV > {t['AOV_HIGH']}"),
        ("Rising Stars: Mid AOV, High Profit",
         (aov > t['AOV_MID']) & (profitability > t['PROFITABILITY_HIGH']),
         f"AOV > {t['AOV_MID']} & profitability > {t['PROFITABILITY_HIGH']}"),
        ("Power Shoppers: High Session Activity",
         sessions > t['SESSIONS_HIGH'],
         f"sessions > {t['SESSIONS_HIGH']}"),
        ("Prospects: Mid AOV, Other",
         aov > t['AOV_MID'],
         f"AOV > {t['AOV_MID']}"),
        ("Engaged Potentials: Low AOV, High Engagement",
         engagement > t['ENGAGEMENT_HIGH'],
         f"engagement > {t['ENGAGEMENT_HIGH']}"),
    ]

    segment_names = [name for name, _, _ in segment_specs]
    conditions = [mask for _, mask, _ in segment_specs]

    segmented['segment'] = np.select(conditions, segment_names, default="Other Bucket")
    rules_map = {name: rule for name, _, rule in segment_specs}
    rules_map["Other Bucket"] = "ELSE"

    print("   - All users assigned to an initial segment.")
    return segmented, rules_map

In [17]:
# --- 5. Enforce Size Constraints ---
def enforce_size_constraints(df, rules_map, min_size=None, max_size=None):
    """Fold undersized segments and split oversized ones by median profitability.

    Segments with fewer than ``min_size`` users are relabelled
    ``'Other Bucket'``. Segments with more than ``max_size`` users are split
    repeatedly into "(High Profit)" / "(Low Profit)" halves around the
    segment's median ``profitability_score`` until every splittable segment
    fits.

    A segment whose rows cannot be separated by the median (for example, every
    row shares the same score) is left oversized and reported, instead of being
    "split" into an identical copy of itself forever.

    The inputs are not modified; updated copies are returned.

    Args:
        df: User-level frame with ``segment`` and ``profitability_score``
            columns.
        rules_map: Mapping of segment name to human-readable rule text.
        min_size: Minimum allowed segment size. Defaults to the module-level
            ``MIN_SEGMENT_SIZE``.
        max_size: Maximum allowed segment size. Defaults to the module-level
            ``MAX_SEGMENT_SIZE``.

    Returns:
        ``(constrained_df, constrained_rules_map)``.
    """
    print("5. Enforcing segment size constraints (Min & Max)...")
    min_size = MIN_SEGMENT_SIZE if min_size is None else min_size
    max_size = MAX_SEGMENT_SIZE if max_size is None else max_size

    # Work on copies so the caller's frame and rules stay intact.
    df = df.copy()
    rules_map = dict(rules_map)

    # Fold small segments
    segment_counts = df['segment'].value_counts()
    small_segments = segment_counts[segment_counts < min_size].index.tolist()
    if small_segments:
        df.loc[df['segment'].isin(small_segments), 'segment'] = 'Other Bucket'
        # Drop rules for segments that no longer exist and make sure the
        # bucket that absorbed them has an entry.
        for folded in small_segments:
            if folded != 'Other Bucket':
                rules_map.pop(folded, None)
        rules_map.setdefault('Other Bucket', 'ELSE')

    # Split large segments
    unsplittable = set()
    while True:
        segment_counts = df['segment'].value_counts()
        large_segments = [
            name for name in segment_counts[segment_counts > max_size].index
            if name not in unsplittable
        ]
        if not large_segments:
            break

        segment_to_split = large_segments[0]
        mask = df['segment'] == segment_to_split
        median_profitability = df.loc[mask, 'profitability_score'].median()
        high_mask = mask & (df['profitability_score'] >= median_profitability)
        low_mask = mask & (df['profitability_score'] < median_profitability)

        # If one side is empty the split would not shrink the segment, and
        # retrying would loop forever; give up on this segment instead.
        if not high_mask.any() or not low_mask.any():
            print(f"     - Cannot split '{segment_to_split}' by profitability "
                  f"(no spread around the median); leaving it oversized.")
            unsplittable.add(segment_to_split)
            continue

        print(f"     - Splitting '{segment_to_split}' by profitability...")

        # Define new names and rules for the sub-segments
        name_high = f"{segment_to_split} (High Profit)"
        name_low = f"{segment_to_split} (Low Profit)"
        original_rule = rules_map.get(segment_to_split, "Complex Rule")

        # Update DataFrame and rules map
        df.loc[high_mask, 'segment'] = name_high
        df.loc[low_mask, 'segment'] = name_low
        rules_map[name_high] = f"({original_rule}) AND profitability >= {median_profitability:.2f}"
        rules_map[name_low] = f"({original_rule}) AND profitability < {median_profitability:.2f}"
        # Rows with a missing score match neither side and keep the original
        # label; only retire the original rule once no rows carry it.
        if not (df['segment'] == segment_to_split).any():
            rules_map.pop(segment_to_split, None)

    print("   - Constraint enforcement complete.")
    return df, rules_map

In [18]:
# --- 6. Audience Scoring ---
def compute_scores(df, universe_size, today_date=None, days=None, weights=None):
    """Score each segment on conversion, profitability, lift, fit and size.

    Per-user columns ``recency_score``, ``conversion_potential``,
    ``lift_vs_control`` and ``strategic_fit`` are added to ``df`` in place,
    then averaged per segment and combined into a weighted ``overall_score``.

    The reference date and window can be passed explicitly so that recency is
    measured against the same window that was used to build the universe,
    rather than silently relying on module globals.

    Args:
        df: User-level frame with ``user_id``, ``segment``,
            ``cart_abandoned_date``, ``engagement_score`` and
            ``profitability_score`` columns.
        universe_size: Total number of users in the universe; the denominator
            for ``size_score``.
        today_date: Reference ``datetime`` for recency. Defaults to the
            module-level ``TODAY``.
        days: Window length in days over which recency decays from 1 to 0.
            Defaults to the module-level ``UNIVERSE_DAYS``.
        weights: Optional mapping with keys ``'conv'``, ``'prof'``,
            ``'lift'``, ``'strat'`` and ``'size'``. Defaults to the
            module-level ``W_CONV``, ``W_PROF``, ``W_LIFT``, ``W_STRAT`` and
            ``W_SIZE``.

    Returns:
        One row per segment with columns ``segment``, ``size``, ``conv_pot``,
        ``profitability``, ``lift``, ``strategic_fit``, ``size_score`` and
        ``overall_score``.
    """
    print("6. Computing audience scores...")
    today_date = TODAY if today_date is None else today_date
    days = UNIVERSE_DAYS if days is None else days
    if weights is None:
        weights = {'conv': W_CONV, 'prof': W_PROF, 'lift': W_LIFT,
                   'strat': W_STRAT, 'size': W_SIZE}

    # Recency decays linearly from 1 (abandoned today) to 0 (abandoned `days`
    # ago or earlier); the clip guards against dates outside the window.
    days_since_abandon = (today_date - df['cart_abandoned_date']).dt.days
    df['recency_score'] = (1 - days_since_abandon / days).clip(0, 1)
    df['conversion_potential'] = df['engagement_score'] * df['recency_score']
    df['lift_vs_control'] = 0.05 + (0.2 * df['engagement_score'])
    df['strategic_fit'] = 0.6 * df['profitability_score'] + 0.4 * df['engagement_score']

    agg_df = df.groupby('segment').agg(
        size=('user_id', 'count'),
        conv_pot=('conversion_potential', 'mean'),
        profitability=('profitability_score', 'mean'),
        lift=('lift_vs_control', 'mean'),
        strategic_fit=('strategic_fit', 'mean'),
    ).reset_index()

    agg_df['size_score'] = agg_df['size'] / universe_size
    agg_df['overall_score'] = (
        weights['conv'] * agg_df['conv_pot']
        + weights['prof'] * agg_df['profitability']
        + weights['lift'] * agg_df['lift']
        + weights['strat'] * agg_df['strategic_fit']
        + weights['size'] * agg_df['size_score']
    )
    return agg_df

In [19]:
# --- 7. Generate Deliverables ---
def create_output_files(scored_df, rules_map, output_dir="submission"):
    """Write the scored segment strategy as CSV, JSON and a README.

    Every scored segment is kept in the deliverable: rules are attached with a
    left join, and a segment without an entry in ``rules_map`` is labelled
    ``"Complex Rule"`` instead of being silently dropped.

    Args:
        scored_df: One row per segment with ``segment``, ``size``,
            ``overall_score``, ``conv_pot``, ``profitability``, ``lift``,
            ``strategic_fit`` and ``size_score`` columns.
        rules_map: Mapping of segment name to human-readable rule text.
        output_dir: Directory the three files are written to; created if
            missing. Defaults to ``"submission"``.

    Returns:
        The final frame, sorted by ``overall_score`` descending, with a
        ``valid`` flag marking segments that satisfy the module-level
        ``MIN_SEGMENT_SIZE`` / ``MAX_SEGMENT_SIZE`` limits and score columns
        rounded to 4 decimals.
    """
    print("7. Generating final deliverables...")
    # Add the explicit rules to the output
    rules_df = pd.DataFrame(list(rules_map.items()), columns=['segment', 'Rules Applied'])
    # Left join: a missing rule must not remove a segment from the deliverable.
    final_df = pd.merge(scored_df, rules_df, on='segment', how='left')
    final_df['Rules Applied'] = final_df['Rules Applied'].fillna("Complex Rule")
    final_df = final_df.sort_values(by='overall_score', ascending=False).reset_index(drop=True)

    # Reorder columns for clarity
    cols_order = ['segment', 'Rules Applied', 'size', 'overall_score', 'conv_pot', 'profitability',
                  'lift', 'strategic_fit', 'size_score']
    final_df = final_df[cols_order]

    os.makedirs(output_dir, exist_ok=True)
    final_df['valid'] = (final_df['size'] >= MIN_SEGMENT_SIZE) & (final_df['size'] <= MAX_SEGMENT_SIZE)
    cols_to_round = ['overall_score', 'conv_pot', 'profitability', 'lift', 'strategic_fit', 'size_score']
    final_df[cols_to_round] = final_df[cols_to_round].round(4)

    csv_path = os.path.join(output_dir, "segment_strategy.csv")
    json_path = os.path.join(output_dir, "segment_strategy.json")
    readme_path = os.path.join(output_dir, "README.md")
    final_df.to_csv(csv_path, index=False)
    final_df.to_json(json_path, orient="records", indent=2)

    readme_content = f"""
# MECE Audience Segmentation Strategy

**Generated on:** {TODAY.strftime('%Y-%m-%d')}

---

## Overview
This project segments recent cart abandoners into MECE audiences. The solution is **data-driven**, **robust**, and **highly interpretable**, automatically adapting to data and enforcing audience size constraints.

## Methodology
1.  **Universe Definition:** Users who abandoned carts in the last {UNIVERSE_DAYS} days.
2.  **Hybrid Thresholds:** Key thresholds are calculated dynamically using a hybrid approach, combining data-driven **quantiles** with optional **business-defined minimums** (e.g., AOV must be at least ${BUSINESS_MINIMUMS.get('AOV_HIGH', 0)}).
3.  **MECE Segmentation:** A decision-tree using `np.select` ensures every user is in exactly one segment. The logic now includes `num_cart_items` to identify high-intent users.
4.  **Constraint Handling:**
    -   **Min Size ({MIN_SEGMENT_SIZE}):** Small segments are folded into an 'Other Bucket'.
    -   **Max Size ({MAX_SEGMENT_SIZE}):** Oversized segments are automatically **split** by their median profitability.
5.  **Interpretable Output:** The final deliverable includes a specific **'Rules Applied'** column, showing the exact logic and values used to define each segment.
6.  **Audience Scoring:** A weighted `Overall Score` prioritizes segments.

## Future Improvements
- **Automated Visualization:** Adding automated plotting of segment sizes and score distributions.
- **Advanced Scoring:** In production, the simulated `lift_vs_control` score would be replaced with data from historical A/B tests or predictive models.
"""
    # Explicit encoding so the README is written identically on every platform.
    with open(readme_path, "w", encoding="utf-8") as f:
        f.write(readme_content)

    print(f"   - Success! Files saved in '{output_dir}/' folder.")
    return final_df

In [20]:
# --- Main Execution ---
# Run the full pipeline end to end: simulate -> select universe -> segment ->
# enforce size limits -> score -> write deliverables -> display the result.
if __name__ == "__main__":
    print("1. Setting up configuration...")
    # Step 2: synthetic user table (seeded inside generate_mock_data).
    mock_data = generate_mock_data(NUM_USERS, TODAY)
    # Step 3: keep recent abandoners and derive the quantile/business thresholds.
    universe_data, thresholds = setup_universe_and_thresholds(mock_data, UNIVERSE_DAYS, TODAY)
    # Step 4: first-match-wins segment assignment plus human-readable rules.
    segmented_data, rules_map = create_mece_segments(universe_data, thresholds)
    # Step 5: fold segments below the minimum size, split those above the maximum.
    constrained_data, final_rules_map = enforce_size_constraints(segmented_data, rules_map)
    # Step 6: per-segment scores; size_score is relative to the whole universe.
    scored_segments = compute_scores(constrained_data, len(universe_data))
    # Step 7: write CSV / JSON / README and get the final table back.
    final_output_df = create_output_files(scored_segments, final_rules_map)

    print("\n--- 🚀 Final Strategy ---")
    # Rich (HTML) rendering of the final table in the notebook.
    display(final_output_df)

1. Setting up configuration...
2. Generating mock data...
3. Defining universe and calculating hybrid thresholds...
   - Universe size: 79257 users
   - Calculated Hybrid Thresholds: {'PROFITABILITY_HIGH': np.float64(0.79), 'AOV_HIGH': 2500, 'AOV_MID': np.float64(522.19), 'ENGAGEMENT_HIGH': np.float64(0.58), 'SESSIONS_HIGH': 11, 'CART_ITEMS_HIGH': 13}
4. Creating MECE segments and generating rules...
   - All users assigned to an initial segment.
5. Enforcing segment size constraints (Min & Max)...
     - Splitting 'Other Bucket' by profitability...
   - Constraint enforcement complete.
6. Computing audience scores...
7. Generating final deliverables...
   - Success! Files saved in 'submission/' folder.

--- 🚀 Final Strategy ---


Unnamed: 0,segment,Rules Applied,size,overall_score,conv_pot,profitability,lift,strategic_fit,size_score,valid
0,"Rising Stars: Mid AOV, High Profit",AOV > 522.19 & profitability > 0.79,5259,0.4387,0.2209,0.865,0.1304,0.6798,0.0664,True
1,"Engaged Potentials: Low AOV, High Engagement",engagement > 0.58,8123,0.432,0.3863,0.6005,0.1891,0.6384,0.1025,True
2,Other Bucket (High Profit),(ELSE) AND profitability >= 0.62,15843,0.3892,0.1807,0.7677,0.1158,0.5922,0.1999,True
3,High Intent: Large Cart,num_cart_items > 13,5605,0.3433,0.2225,0.5981,0.1299,0.5188,0.0707,True
4,Power Shoppers: High Session Activity,sessions > 11,7295,0.3368,0.2217,0.5783,0.1296,0.5062,0.092,True
5,"Prospects: Mid AOV, Other",AOV > 522.19,19220,0.3292,0.2217,0.5355,0.1304,0.4821,0.2425,True
6,"Whales: High AOV, Other",AOV > 2500,2071,0.316,0.1717,0.6,0.1145,0.4889,0.0261,True
7,Other Bucket (Low Profit),(ELSE) AND profitability < 0.62,15841,0.2701,0.1822,0.4349,0.1162,0.3934,0.1999,True
