In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

# -------------------------------
# Load and clean data
# -------------------------------
df = pd.read_csv('Telco-Customer-Churn.csv')

# Remove empty TotalCharges rows and convert to float
df = df[df['TotalCharges'] != ' '].copy()
df['TotalCharges'] = df['TotalCharges'].astype(float)

# Encode Churn column
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# Label encode all object columns except customerID
for col in df.select_dtypes(include='object').columns:
    if col != 'customerID':
        df[col] = LabelEncoder().fit_transform(df[col])

# -------------------------------
# Step 1: Customer Clustering
# -------------------------------
features = df.drop(columns=['customerID', 'Churn'])

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

# Perform KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42, n_init='auto')
df['Segment'] = kmeans.fit_predict(X_scaled)

# -------------------------------
# Step 2: Segment-Aware Modeling
# -------------------------------
segment_scores = {}

for seg in df['Segment'].unique():
    seg_data = df[df['Segment'] == seg]
    
    # Drop customerID and Segment from features
    X = seg_data.drop(columns=['customerID', 'Churn', 'Segment'])
    y = seg_data['Churn']

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Random Forest model
    clf = XGBClassifier( eval_metric='logloss', random_state=42)
    clf.fit(X_train, y_train)

    # Predict and evaluate
    y_pred_prob = clf.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, y_pred_prob)
    segment_scores[f"Segment {seg}"] = round(score, 3)

# -------------------------------
# Final Output
# -------------------------------
print("📊 Segment-wise Churn Model AUC Scores:")
for seg, score in segment_scores.items():
    print(f"{seg}: {score}")


📊 Segment-wise Churn Model AUC Scores:
Segment 2: 0.726
Segment 0: 0.752
Segment 1: 0.744


In [4]:
import random

# Load and clean dataset
df = pd.read_csv("Telco-Customer-Churn.csv")
df = df[df['TotalCharges'] != ' ']  # remove empty rows
df['TotalCharges'] = df['TotalCharges'].astype(float)
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})  # convert to binary

# Simulation parameters
reduction = 0.20                 # churn reduction due to intervention
intervention_cost = 10          # cost per user in test group
ARPU = df['MonthlyCharges'].mean()  # average revenue per user per month

# Split dataset randomly into control and test
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
mid = len(df_shuffled) // 2
control_group = df_shuffled.iloc[:mid].copy()
test_group = df_shuffled.iloc[mid:].copy()

# Apply churn reduction to test group (simulate intervention success)
# For test group, flip some churned customers to non-churned based on reduction %
churned_test_indices = test_group[test_group['Churn'] == 1].index.tolist()
num_to_retain = int(len(churned_test_indices) * reduction)
retained_indices = random.sample(churned_test_indices, num_to_retain)
test_group.loc[retained_indices, 'Churn'] = 0  # simulate successful retention

# Churn rates
churn_rate_control = control_group['Churn'].mean()
churn_rate_test = test_group['Churn'].mean()
retention_gain = churn_rate_control - churn_rate_test

# Calculate ROI
customers_saved = int(retention_gain * len(test_group))
revenue_retained = customers_saved * ARPU
total_cost = len(test_group) * intervention_cost
ROI = revenue_retained / total_cost

# Print results
print("🧪 A/B Test Simulation on Telco Dataset")
print(f"Churn Rate (Control): {churn_rate_control:.3f}")
print(f"Churn Rate (Test):    {churn_rate_test:.3f}")
print(f"Churn Reduction:      {retention_gain:.2%}")
print(f"Customers Retained:   {customers_saved}")
print(f"Revenue Retained:     ${revenue_retained:,.2f}")
print(f"Total Intervention Cost: ${total_cost:,.2f}")
print(f"Estimated ROI:        {ROI:.2f}x")


🧪 A/B Test Simulation on Telco Dataset
Churn Rate (Control): 0.268
Churn Rate (Test):    0.211
Churn Reduction:      5.69%
Customers Retained:   199
Revenue Retained:     $12,894.84
Total Intervention Cost: $35,160.00
Estimated ROI:        0.37x


In [5]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Load dataset
df = pd.read_csv('Telco-Customer-Churn.csv')

# Drop rows with missing/blank TotalCharges
df = df[df['TotalCharges'] != ' '].copy()
df['TotalCharges'] = df['TotalCharges'].astype(float)

# Encode target variable
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# Drop non-feature columns
df.drop(columns=['customerID'], inplace=True)

# Label encode categorical features
for col in df.select_dtypes(include='object').columns:
    df[col] = LabelEncoder().fit_transform(df[col])

# 1. Segment customers using KMeans
features = df.drop(columns=['Churn'])
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)
kmeans = KMeans(n_clusters=3, random_state=42)
df['Segment'] = kmeans.fit_predict(X_scaled)

# 2. Segment-specific churn modeling + intervention simulation
ARPU = 50  # Average revenue per user
intervention_effectiveness = {0: 0.08, 1: 0.05, 2: 0.01}
intervention_cost = {0: 15, 1: 5, 2: 0}

results = []

for segment in sorted(df['Segment'].unique()):
    seg_df = df[df['Segment'] == segment]
    X = seg_df.drop(columns=['Churn', 'Segment'])
    y = seg_df['Churn']

    # Train churn prediction model
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)

    # Simulate intervention outcome
    base_churn = y_test.mean()
    reduced_churn = base_churn - intervention_effectiveness[segment]
    churn_reduction = base_churn - reduced_churn
    num_customers = len(seg_df)
    customers_saved = int(churn_reduction * num_customers)
    revenue_retained = customers_saved * ARPU
    cost = num_customers * intervention_cost[segment]
    roi = revenue_retained / cost if cost > 0 else float('inf')

    results.append({
        'Segment': segment,
        'Base Churn Rate': round(base_churn, 3),
        'AUC': round(auc, 3),
        'Churn Reduction': round(churn_reduction, 3),
        'Customers Saved': customers_saved,
        'Revenue Retained ($)': revenue_retained,
        'Intervention Cost ($)': cost,
        'ROI': round(roi, 2) if roi != float('inf') else '∞'
    })

# Display final table
summary_df = pd.DataFrame(results)
print(summary_df.to_string(index=False))


 Segment  Base Churn Rate   AUC  Churn Reduction  Customers Saved  Revenue Retained ($)  Intervention Cost ($)   ROI
       0            0.148 0.783             0.08              185                  9250                  34770  0.27
       1            0.075 0.756             0.05               77                  3850                   7730   0.5
       2            0.445 0.728             0.01               31                  1550                      0     ∞
