<a href="https://colab.research.google.com/github/P3drio/Customer-Segmentation/blob/main/03_cohort_retention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===============================================
# 03_cohort_retention.ipynb
# Customer Value & Retention Analysis
# ===============================================

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# 1️⃣ Load cleaned data
# Read the cleaned transactions export and parse the invoice timestamps.
df = pd.read_csv('/content/clean_online_retail.csv')
invoice_dates = pd.to_datetime(df['InvoiceDate'])
df['InvoiceDate'] = invoice_dates

# 2️⃣ Extract InvoiceMonth and CohortMonth
# InvoiceMonth: calendar month (monthly Period) of each invoice row.
df['InvoiceMonth'] = invoice_dates.dt.to_period('M')
# CohortMonth: earliest InvoiceMonth seen for the row's 'Customer ID',
# broadcast back onto every row of that customer.
first_purchase_month = df.groupby('Customer ID')['InvoiceMonth'].transform('min')
df['CohortMonth'] = first_purchase_month

# 3️⃣ Compute Cohort Index (months since first purchase)
def get_month_diff(end, start):
    """Return the 1-based number of calendar months from ``start`` to ``end``.

    The month of ``start`` itself counts as 1, the following month as 2,
    and so on. Days within the month are ignored.

    Args:
        end: Later date(s). Either a scalar exposing ``.year`` / ``.month``
            (e.g. ``pd.Timestamp``, ``pd.Period``, ``datetime``) or a
            datetime-like / period pandas Series.
        start: Earlier date(s); same accepted kinds as ``end``.

    Returns:
        An integer for scalar inputs, or an element-wise integer Series
        when either input is a Series.
    """
    # A pandas Series exposes year/month only through its ``.dt`` accessor,
    # so ``series.year`` raises AttributeError. Scalars have no ``.dt`` and
    # expose the fields directly, hence the fallback to the object itself.
    end_parts = getattr(end, 'dt', end)
    start_parts = getattr(start, 'dt', start)

    year_gap = end_parts.year - start_parts.year
    month_gap = end_parts.month - start_parts.month
    return year_gap * 12 + month_gap + 1

# get_month_diff reads ``.year`` / ``.month`` from its arguments. A Series
# exposes those fields only through its ``.dt`` accessor (passing the Series
# itself raises AttributeError), so hand over the accessors of the monthly
# period columns. CohortIndex is 1 for the first-purchase month.
df['CohortIndex'] = get_month_diff(df['InvoiceMonth'].dt, df['CohortMonth'].dt)

# 4️⃣ Build Cohort Table
# Distinct customers active in each (cohort, months-since-first-purchase) cell.
active_customers = df.groupby(['CohortMonth', 'CohortIndex'])['Customer ID'].nunique()
cohort_data = active_customers.reset_index()

# 5️⃣ Pivot table
# Rows: cohorts; columns: CohortIndex; values: distinct customer counts.
cohort_pivot = cohort_data.pivot(
    index='CohortMonth',
    columns='CohortIndex',
    values='Customer ID',
)

# 6️⃣ Calculate retention rates
# The first column holds the lowest CohortIndex, i.e. each cohort's size in
# its first-purchase month; dividing row-wise gives the retained share.
cohort_size = cohort_pivot.iloc[:, 0]
retention = cohort_pivot.divide(cohort_size, axis=0).round(3)

# 7️⃣ Visualize retention
# Draw the retention matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(retention, annot=True, fmt='.0%', cmap='Blues', ax=ax)
ax.set_title('Customer Retention by Monthly Cohort')
ax.set_ylabel('Cohort (First Purchase Month)')
ax.set_xlabel('Months Since First Purchase')
plt.show()

# 8️⃣ Save retention data
# Write the rounded retention matrix (index included) next to the input data.
retention_csv_path = '/content/cohort_retention.csv'
retention.to_csv(retention_csv_path)
print("✅ Cohort retention matrix saved as cohort_retention.csv")