# 04 - Cohort Retention Analysis

Customer retention and churn analysis for the Olist marketplace.

**Key Questions:**
1. What is the monthly cohort retention rate?
2. Which cohorts perform best/worst and why?
3. What is the revenue impact of improving retention?
4. What is the estimated Customer Lifetime Value (CLV)?

In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
from pathlib import Path

# Display settings: show every DataFrame column and render floats with
# thousands separators and two decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: f'{x:,.2f}')
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Paths are resolved relative to the parent of the current working directory
# (assumes the notebook is launched from a sub-folder of the project - TODO confirm).
PROJECT_ROOT = Path.cwd().parent
DB_PATH = PROJECT_ROOT / 'data' / 'olist_ecommerce.db'
IMAGES_PATH = PROJECT_ROOT / 'images'

# Figures are written into IMAGES_PATH with plt.savefig(), which raises
# FileNotFoundError when the folder is missing, so create it up front.
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

print(f"Database path: {DB_PATH}")
print(f"Images path: {IMAGES_PATH}")

## 1. Data Loading

Load orders, customers, and payments data from the SQLite database.

In [None]:
# Delivered orders joined to their customer record, so that every order
# carries the customer_unique_id used for cohort assignment.
query = """
SELECT
    o.order_id,
    o.customer_id,
    c.customer_unique_id,
    o.order_status,
    o.order_purchase_timestamp,
    o.order_delivered_customer_date
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
WHERE o.order_status = 'delivered'
"""

# One row per order with the sum of all its payment rows
payments_query = """
SELECT order_id, SUM(payment_value) as total_payment
FROM order_payments
GROUP BY order_id
"""

# sqlite3.connect() silently creates an empty database when the file does not
# exist, which would only surface later as a confusing "no such table" error.
if not DB_PATH.exists():
    raise FileNotFoundError(f"SQLite database not found: {DB_PATH}")

# Connect to database and load data; the connection is closed even if a query fails
conn = sqlite3.connect(DB_PATH)
try:
    orders_df = pd.read_sql_query(query, conn)
    payments_df = pd.read_sql_query(payments_query, conn)
finally:
    conn.close()

# Merge orders with payments (left join: orders without a payment row keep
# NaN in total_payment)
df = orders_df.merge(payments_df, on='order_id', how='left')

# Convert timestamps
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])

print(f"Total delivered orders: {len(df):,}")
print(f"Unique customers: {df['customer_unique_id'].nunique():,}")
print(f"Date range: {df['order_purchase_timestamp'].min().date()} to {df['order_purchase_timestamp'].max().date()}")
df.head()

## 2. Cohort Retention Analysis

Build a cohort retention matrix to analyze customer retention over time.

In [None]:
def build_cohort_retention(df):
    """
    Build a monthly cohort retention matrix from order data.

    Each customer is assigned to the calendar month of their first order
    (their cohort). For every cohort the function counts how many distinct
    customers ordered again 0, 1, 2, ... months later and expresses that
    count as a percentage of the cohort's size.

    Parameters:
    -----------
    df : DataFrame with 'customer_unique_id' and 'order_purchase_timestamp'
        (datetime). The input is not modified. Rows whose timestamp is
        missing cannot be placed in a month and are ignored.

    Returns:
    --------
    retention : DataFrame indexed by cohort month with one column per month
        offset, holding retention percentages (NaN where no customer of the
        cohort ordered at that offset)
    cohort_sizes : Series with cohort sizes (distinct customers at offset 0)
    data : DataFrame, the order rows used, with the added columns
        'order_month', 'cohort_month' and 'month_offset'
    """
    # Create a working copy, leaving out rows that have no purchase timestamp
    data = df.dropna(subset=['order_purchase_timestamp']).copy()

    # Get order month for each transaction
    data['order_month'] = data['order_purchase_timestamp'].dt.to_period('M')

    # Get first purchase month per customer (cohort assignment)
    cohorts = data.groupby('customer_unique_id')['order_month'].min().reset_index()
    cohorts.columns = ['customer_unique_id', 'cohort_month']

    # Merge cohort information back
    data = data.merge(cohorts, on='customer_unique_id')

    # Calculate month offset from cohort month with integer year/month
    # arithmetic; this is vectorized, unlike a per-row .apply() on the
    # offset objects produced by subtracting two Period columns.
    order_month = data['order_month']
    cohort_month = data['cohort_month']
    data['month_offset'] = (
        (order_month.dt.year - cohort_month.dt.year) * 12
        + (order_month.dt.month - cohort_month.dt.month)
    ).astype('int64')

    # Build retention matrix: count unique customers per cohort-month combination
    cohort_data = (
        data.groupby(['cohort_month', 'month_offset'])['customer_unique_id']
        .nunique()
        .reset_index()
    )
    cohort_pivot = cohort_data.pivot(
        index='cohort_month', columns='month_offset', values='customer_unique_id'
    )

    # Get cohort sizes (month 0 = first purchase)
    cohort_sizes = cohort_pivot[0]

    # Convert to retention percentages (row-wise division by the cohort size)
    retention = cohort_pivot.divide(cohort_sizes, axis=0) * 100

    return retention, cohort_sizes, data

# Run the cohort builder on the delivered-orders frame and keep all three
# results for the cells that follow.
retention, cohort_sizes, cohort_df = build_cohort_retention(df)

# Quick sanity summary of the matrix that was produced
cohort_count = len(retention)
largest_offset = retention.columns.max()
print(f"Number of cohorts: {cohort_count}")
print(f"Max months tracked: {largest_offset}")
print("\nCohort sizes (first 10):")
print(cohort_sizes.head(10))

### 2.1 Cohort Retention Heatmap

The key visualization showing how customer retention evolves over time for each cohort.

In [None]:
# Create cohort retention heatmap
fig, ax = plt.subplots(figsize=(16, 12))

# Format index for display (convert Period to string)
retention_display = retention.copy()
retention_display.index = retention_display.index.astype(str)

# Limit to the first 12 months for readability. Columns are selected by their
# month-offset label rather than by position, so a missing offset cannot
# shift later months into the plot.
max_months = min(12, retention.columns.max())
retention_subset = retention_display.loc[:, retention_display.columns <= max_months]

sns.heatmap(
    retention_subset,
    annot=True,
    fmt='.1f',
    cmap='YlOrRd_r',
    linewidths=0.5,
    vmin=0,
    vmax=100,
    ax=ax,
    cbar_kws={'label': 'Retention Rate (%)'}
)

ax.set_title('Monthly Cohort Retention Rate (%)', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Months Since First Purchase', fontsize=12)
ax.set_ylabel('Cohort (First Purchase Month)', fontsize=12)

heatmap_file = IMAGES_PATH / 'cohort_retention_heatmap.png'
plt.tight_layout()
plt.savefig(heatmap_file, dpi=150, bbox_inches='tight')
plt.show()

# The check mark was previously stored as mis-decoded UTF-8 bytes
print(f"✓ Heatmap saved to {heatmap_file}")

## 3. Key Retention Metrics

Calculate critical retention metrics and identify best/worst performing cohorts.

In [None]:
# Calculate key retention metrics


def _retention_at_offset(month_offset):
    """Return the non-missing retention values at one month offset (empty if the offset is absent)."""
    if month_offset in retention.columns:
        return retention[month_offset].dropna()
    # Explicit dtype avoids the ambiguous-dtype empty Series of pd.Series([])
    return pd.Series(dtype=float)


def _mean_or_zero(values):
    """Return the mean of values, or 0 when there is nothing to average."""
    return values.mean() if len(values) > 0 else 0


# Month 1 retention (customers who came back the month after first purchase).
# Guarded the same way as months 3 and 6 so a short dataset cannot raise KeyError.
month_1_retention = _retention_at_offset(1)
avg_month_1_retention = _mean_or_zero(month_1_retention)

# Month 3 retention (3 months after first purchase)
month_3_retention = _retention_at_offset(3)
avg_month_3_retention = _mean_or_zero(month_3_retention)

# Month 6 retention
month_6_retention = _retention_at_offset(6)
avg_month_6_retention = _mean_or_zero(month_6_retention)

# Churn rate (inverse of month 1 retention)
churn_rate = 100 - avg_month_1_retention

print("=" * 60)
print("KEY RETENTION METRICS")
print("=" * 60)
print(f"\nAverage Month-1 Retention Rate: {avg_month_1_retention:.2f}%")
print(f"Average Month-3 Retention Rate: {avg_month_3_retention:.2f}%")
print(f"Average Month-6 Retention Rate: {avg_month_6_retention:.2f}%")
print(f"\nCustomer Churn Rate (after 1 month): {churn_rate:.2f}%")
print("\n" + "=" * 60)

In [None]:
# Rank cohorts by month-1 retention and list the five strongest and the five
# weakest together with their cohort size.
best_cohorts = month_1_retention.nlargest(5)
worst_cohorts = month_1_retention.nsmallest(5)

# Both listings share one layout; only the heading and the ranked series differ
ranked_sections = [
    ("BEST PERFORMING COHORTS (Month-1 Retention)", best_cohorts),
    ("\n" + "WORST PERFORMING COHORTS (Month-1 Retention)", worst_cohorts),
]

for heading, ranked in ranked_sections:
    print(heading)
    print("-" * 45)
    for cohort, rate in ranked.items():
        size = cohort_sizes[cohort]
        print(f"{cohort}: {rate:.2f}% retention (cohort size: {size:,})")

In [None]:
# Two side-by-side charts: the average retention curve and the cohort sizes
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left chart: retention averaged over cohorts, first 13 offsets only
avg_retention_by_month = retention.mean()
curve_x = avg_retention_by_month.index[:13]
curve_y = avg_retention_by_month.values[:13]
ax1.plot(curve_x, curve_y,
         marker='o', linewidth=2, markersize=8, color='#e74c3c')
ax1.fill_between(curve_x, curve_y,
                 alpha=0.3, color='#e74c3c')
ax1.set_title('Average Retention Curve', fontsize=13, fontweight='bold')
ax1.set_xlabel('Months Since First Purchase', fontsize=11)
ax1.set_ylabel('Average Retention Rate (%)', fontsize=11)
ax1.set_ylim(0, 105)
ax1.grid(True, alpha=0.3)

# Right chart: number of customers in each cohort, in chronological order
cohort_sizes_sorted = cohort_sizes.sort_index()
bar_count = len(cohort_sizes_sorted)
ax2.bar(range(bar_count), cohort_sizes_sorted.values, color='#3498db', alpha=0.7)
ax2.set_title('Cohort Sizes Over Time', fontsize=13, fontweight='bold')
ax2.set_xlabel('Cohort', fontsize=11)
ax2.set_ylabel('Number of Customers', fontsize=11)

# Label roughly every tenth of the bars so the cohort names stay legible
tick_positions = range(0, bar_count, max(1, bar_count // 10))
tick_labels = [str(cohort_sizes_sorted.index[pos]) for pos in tick_positions]
ax2.set_xticks(tick_positions)
ax2.set_xticklabels(tick_labels,
                    rotation=45, ha='right')
ax2.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig(IMAGES_PATH / 'retention_trends.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Revenue Impact Analysis

Quantify the business impact of improving customer retention.

In [None]:
# Calculate revenue metrics for impact analysis

# Average order value
avg_order_value = df['total_payment'].mean()

# Total customers acquired per month (average)
avg_monthly_new_customers = cohort_sizes.mean()

# Annual new customers
annual_new_customers = avg_monthly_new_customers * 12

# Current month-1 retention rate
current_retention = avg_month_1_retention

# Size of the retention improvement scenario, in percentage points.
# Every figure and label below is derived from this single value so the
# printed report cannot drift away from the calculation.
retention_improvement = 5  # percentage points

# Additional retained customers per cohort
additional_retained_per_cohort = avg_monthly_new_customers * (retention_improvement / 100)

# Annual additional retained customers
annual_additional_retained = additional_retained_per_cohort * 12

# Revenue impact (assuming retained customers make at least one more purchase)
annual_revenue_impact = annual_additional_retained * avg_order_value

print("=" * 60)
print(f"REVENUE IMPACT OF {retention_improvement}% RETENTION IMPROVEMENT")
print("=" * 60)
print("\nCurrent Metrics:")
print(f"  - Average Order Value: R$ {avg_order_value:,.2f}")
print(f"  - Avg Monthly New Customers: {avg_monthly_new_customers:,.0f}")
print(f"  - Current Month-1 Retention: {current_retention:.2f}%")
print(f"\nImprovement Scenario (Month-1 retention +{retention_improvement}%):")
print(f"  - New Month-1 Retention: {current_retention + retention_improvement:.2f}%")
print(f"  - Additional Retained Customers/Month: {additional_retained_per_cohort:,.0f}")
print(f"  - Additional Retained Customers/Year: {annual_additional_retained:,.0f}")
print(f"\n{'=' * 60}")
print(f"ESTIMATED ANNUAL REVENUE IMPACT: R$ {annual_revenue_impact:,.2f}")
print(f"{'=' * 60}")

## 5. Customer Lifetime Value (CLV) Estimation

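Estimate CLV starting from the simple formula: CLV = Avg Order Value x Purchase Frequency x Customer Lifespan. Because purchase frequency is measured here as orders per customer over the whole observed period, the lifespan factor is already folded into it, so the code reports AOV x Purchase Frequency alongside a retention-adjusted estimate derived from the average retention curve.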

In [None]:
# Calculate CLV components

# 1. Average Order Value (already calculated)
aov = avg_order_value

# 2. Purchase Frequency (orders per customer)
orders_per_customer = df.groupby('customer_unique_id')['order_id'].count()
avg_purchase_frequency = orders_per_customer.mean()

# 3. Expected repeat purchases (estimated from retention decay)
# Summing the average retention rates of the months AFTER the first purchase
# and dividing by 100 gives the expected number of later months with an order.
# Month 0 is left out: it is 100% by construction and stands for the first
# purchase, which the CLV formula below already counts through its "1 +" term.
retention_curve = retention.mean()
repeat_retention_curve = retention_curve.drop(labels=0, errors='ignore')
expected_purchases_from_retention = repeat_retention_curve.sum() / 100

# Alternative: Calculate actual observed lifespan (first to last order per customer)
customer_lifespan = cohort_df.groupby('customer_unique_id').agg({
    'order_purchase_timestamp': ['min', 'max']
})
customer_lifespan.columns = ['first_purchase', 'last_purchase']
customer_lifespan['lifespan_days'] = (customer_lifespan['last_purchase'] - customer_lifespan['first_purchase']).dt.days
avg_lifespan_days = customer_lifespan['lifespan_days'].mean()
avg_lifespan_months = avg_lifespan_days / 30  # 30-day months, an approximation

# Calculate CLV (Simple Method)
# Since most customers only order once, CLV ~ AOV for this dataset
simple_clv = aov * avg_purchase_frequency

# Calculate CLV with retention-adjusted lifespan:
# one initial purchase plus the expected repeat purchases from the retention curve
retention_adjusted_clv = aov * (1 + expected_purchases_from_retention)

print("=" * 60)
print("CUSTOMER LIFETIME VALUE (CLV) ESTIMATION")
print("=" * 60)
print("\nCLV Components:")
print(f"  - Average Order Value (AOV): R$ {aov:,.2f}")
print(f"  - Avg Purchase Frequency: {avg_purchase_frequency:.2f} orders/customer")
print(f"  - Avg Customer Lifespan: {avg_lifespan_days:.0f} days ({avg_lifespan_months:.1f} months)")
print("\nCLV Calculations:")
print(f"  - Simple CLV (AOV x Frequency): R$ {simple_clv:,.2f}")
print(f"  - Retention-Adjusted CLV: R$ {retention_adjusted_clv:,.2f}")
print(f"\n{'=' * 60}")

In [None]:
# How often do customers order? A bar chart of orders-per-customer next to a
# pie chart splitting one-time from repeat customers.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Headline counts shared by both charts and the printed summary
total_customers = orders_per_customer.count()
one_time = (orders_per_customer == 1).sum()
repeat = (orders_per_customer > 1).sum()

# Left chart: number of customers per order count, capped at 10 orders for readability
order_counts = orders_per_customer.value_counts().sort_index()
order_counts_limited = order_counts[order_counts.index <= 10]
ax1.bar(order_counts_limited.index, order_counts_limited.values, color='#2ecc71', alpha=0.8)
ax1.set_title('Customer Order Frequency Distribution', fontsize=13, fontweight='bold')
ax1.set_xlabel('Number of Orders', fontsize=11)
ax1.set_ylabel('Number of Customers', fontsize=11)
ax1.grid(True, alpha=0.3, axis='y')

# Write each bar's share of all customers just above the bar
for orders, count in order_counts_limited.items():
    pct = (count / total_customers) * 100
    ax1.annotate(f'{pct:.1f}%', (orders, count), ha='center', va='bottom', fontsize=9)

# Right chart: one-time versus repeat customers, repeat slice pulled out slightly
labels = ['One-time\nCustomers', 'Repeat\nCustomers']
sizes = [one_time, repeat]
colors = ['#e74c3c', '#27ae60']
explode = (0, 0.05)
wedges, texts, autotexts = ax2.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
    textprops={'fontsize': 11},
)
ax2.set_title('One-time vs Repeat Customers', fontsize=13, fontweight='bold')

plt.tight_layout()
plt.savefig(IMAGES_PATH / 'customer_frequency_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"\nOne-time customers: {one_time:,} ({one_time/total_customers*100:.1f}%)")
print(f"Repeat customers: {repeat:,} ({repeat/total_customers*100:.1f}%)")

## 6. Summary & Key Findings

In [None]:
# Executive summary: gather the headline figures computed in the earlier
# cells, then print them as a single report.
delivered_order_count = len(df)
unique_customer_count = df['customer_unique_id'].nunique()
period_start = df['order_purchase_timestamp'].min().strftime('%Y-%m-%d')
period_end = df['order_purchase_timestamp'].max().strftime('%Y-%m-%d')
one_time_share = one_time / total_customers * 100
repeat_share = repeat / total_customers * 100

banner = "=" * 70
print(banner)
print("COHORT RETENTION ANALYSIS - EXECUTIVE SUMMARY")
print(banner)

print(f"""
DATASET OVERVIEW
----------------
- Total Delivered Orders: {delivered_order_count:,}
- Unique Customers: {unique_customer_count:,}
- Analysis Period: {period_start} to {period_end}

RETENTION METRICS
-----------------
- Month-1 Retention Rate: {avg_month_1_retention:.2f}%
- Month-3 Retention Rate: {avg_month_3_retention:.2f}%
- Month-6 Retention Rate: {avg_month_6_retention:.2f}%
- Customer Churn Rate: {churn_rate:.2f}%

CUSTOMER BEHAVIOR
-----------------
- Average Order Value: R$ {avg_order_value:,.2f}
- Avg Purchase Frequency: {avg_purchase_frequency:.2f} orders/customer
- One-time Customers: {one_time_share:.1f}%
- Repeat Customers: {repeat_share:.1f}%

CUSTOMER LIFETIME VALUE
-----------------------
- Simple CLV: R$ {simple_clv:,.2f}
- Retention-Adjusted CLV: R$ {retention_adjusted_clv:,.2f}

REVENUE IMPACT OF 5% RETENTION IMPROVEMENT
------------------------------------------
- Additional Retained Customers/Year: {annual_additional_retained:,.0f}
- Estimated Annual Revenue Impact: R$ {annual_revenue_impact:,.2f}

KEY INSIGHTS
------------
1. Very low repeat purchase rate indicates this is primarily a one-time purchase marketplace
2. The steep retention drop-off after Month 0 suggests opportunities for:
   - Post-purchase engagement campaigns
   - Loyalty programs
   - Cross-selling/upselling strategies
3. Best performing cohorts may correlate with promotional periods or seasonal events
4. Even small improvements in retention can have significant revenue impact
""")
print(banner)