In [None]:
%pip install eli5



In [None]:
# telecom_churn_analysis.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
import eli5
from eli5.sklearn import PermutationImportance
from IPython.display import display, HTML

In [None]:

# --- 1. Data Simulation (Conceptual SQL Data Aggregation) ---
# A production version of this analysis would pull these per-customer
# aggregates from a SQL database. Here a synthetic dataset with a similar
# layout stands in for that query result.

print("--- Simulating Telecom Customer Data ---")
np.random.seed(42)  # fixed seed so every fresh run draws the same customers

num_customers = 5000

# NOTE: every column is drawn from the global NumPy RNG in the order listed
# below, so reordering the entries changes the generated values.
data = dict(
    customer_id=range(1, num_customers + 1),
    age=np.random.randint(18, 70, size=num_customers),
    gender=np.random.choice(['Male', 'Female'], size=num_customers, p=[0.5, 0.5]),
    monthly_bill=np.random.uniform(20, 150, size=num_customers),
    total_data_usage_gb=np.random.uniform(5, 100, size=num_customers),
    contract_type=np.random.choice(
        ['Month-to-month', 'One year', 'Two year'],
        size=num_customers,
        p=[0.6, 0.2, 0.2],
    ),
    # The next three columns stand in for values aggregated in SQL.
    num_complaints=np.random.randint(0, 5, size=num_customers),
    avg_call_duration_min=np.random.uniform(10, 300, size=num_customers),
    recharge_frequency_per_month=np.random.randint(1, 10, size=num_customers),
    customer_service_calls=np.random.randint(0, 8, size=num_customers),
    num_dependents=np.random.randint(0, 4, size=num_customers),
    senior_citizen=np.random.choice([0, 1], size=num_customers, p=[0.8, 0.2]),
    internet_service=np.random.choice(
        ['DSL', 'Fiber optic', 'No'],
        size=num_customers,
        p=[0.35, 0.45, 0.2],
    ),
    has_online_security=np.random.choice([0, 1], size=num_customers, p=[0.7, 0.3]),
    has_online_backup=np.random.choice([0, 1], size=num_customers, p=[0.65, 0.35]),
    has_device_protection=np.random.choice([0, 1], size=num_customers, p=[0.6, 0.4]),
    has_tech_support=np.random.choice([0, 1], size=num_customers, p=[0.75, 0.25]),
    # Target label: 0 = no churn, 1 = churn.
    churn=np.random.choice([0, 1], size=num_customers, p=[0.8, 0.2]),
)

df = pd.DataFrame(data)



--- Simulating Telecom Customer Data ---


In [None]:
# Make churn depend on a few features instead of being pure noise.
# Each rule re-draws the churn label for the matching customers with the given
# [P(no churn), P(churn)]. Rules are applied top to bottom, so a customer that
# matches several rules keeps the label drawn by the LAST matching rule.
churn_rules = [
    (df['contract_type'] == 'Month-to-month', [0.6, 0.4]),
    (df['num_complaints'] >= 3, [0.5, 0.5]),
    (df['avg_call_duration_min'] < 50, [0.6, 0.4]),
    (df['monthly_bill'] > 100, [0.65, 0.35]),
]
for rule_mask, rule_probs in churn_rules:
    # One draw per matching row; the masks never depend on `churn` itself.
    df.loc[rule_mask, 'churn'] = np.random.choice(
        [0, 1], size=int(rule_mask.sum()), p=rule_probs
    )

# Keep the label as a plain integer column after the partial overwrites.
df['churn'] = df['churn'].astype(int)

print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows of the dataset:")
print(df.head())
print("\nChurn distribution:")
print(df['churn'].value_counts(normalize=True))


Dataset shape: (5000, 18)

First 5 rows of the dataset:
   customer_id  age  gender  monthly_bill  total_data_usage_gb  \
0            1   56  Female     77.631642            38.828519   
1            2   69    Male     96.162197            21.134746   
2            3   46  Female    146.890183            15.099053   
3            4   32    Male     89.199567            91.140525   
4            5   60  Female    137.691838            99.776782   

    contract_type  num_complaints  avg_call_duration_min  \
0  Month-to-month               3             136.721716   
1  Month-to-month               0             183.666482   
2  Month-to-month               3             290.538904   
3  Month-to-month               3             264.334803   
4  Month-to-month               3              91.415166   

   recharge_frequency_per_month  customer_service_calls  num_dependents  \
0                             5                       3               2   
1                             8     

In [None]:
# --- 2. Data Preprocessing ---
print("\n--- Data Preprocessing ---")

# The simulated frame has no explicit NaNs, so no imputation step is applied.
# With real data an imputation step (e.g. median fill) would belong here.

# Split the columns by dtype; the identifier and the target are not features.
categorical_cols = df.select_dtypes(include=['object']).columns
all_numeric_cols = df.select_dtypes(include=np.number).columns
numerical_cols = all_numeric_cols.drop(['customer_id', 'churn'])

print(f"Categorical columns: {list(categorical_cols)}")
print(f"Numerical columns: {list(numerical_cols)}")

# One-hot encode the categoricals, dropping the first level of each.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Standardise the numeric features.
# NOTE(review): the scaler is fit on the full dataset here, i.e. before the
# train/test split performed later in the notebook, so test rows influence the
# scaling statistics. Consider fitting on the training rows only.
scaler = StandardScaler().fit(df_encoded[numerical_cols])
df_encoded[numerical_cols] = scaler.transform(df_encoded[numerical_cols])

print("\nFirst 5 rows after encoding and scaling:")
print(df_encoded.head())


--- Data Preprocessing ---
Categorical columns: ['gender', 'contract_type', 'internet_service']
Numerical columns: ['age', 'monthly_bill', 'total_data_usage_gb', 'num_complaints', 'avg_call_duration_min', 'recharge_frequency_per_month', 'customer_service_calls', 'num_dependents', 'senior_citizen', 'has_online_security', 'has_online_backup', 'has_device_protection', 'has_tech_support']

First 5 rows after encoding and scaling:
   customer_id       age  monthly_bill  total_data_usage_gb  num_complaints  \
0            1  0.832265     -0.193182            -0.509847        0.685307   
1            2  1.703719      0.302806            -1.155266       -1.448277   
2            3  0.161916      1.660589            -1.375431        0.685307   
3            4 -0.776573      0.116445             1.398349        0.685307   
4            5  1.100405      1.414387             1.713375        0.685307   

   avg_call_duration_min  recharge_frequency_per_month  \
0              -0.200089            

In [None]:
# --- 3. Prepare Data for Modeling ---
# Features are every encoded column except the identifier and the target.
X = df_encoded.drop(columns=['customer_id', 'churn'])
y = df_encoded['churn']

# 75/25 split, stratified on the target so both parts keep the same churn rate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

print(f"\nTraining set shape: {X_train.shape}, Test set shape: {X_test.shape}")
print(f"Training churn distribution:\n{y_train.value_counts(normalize=True)}")
print(f"Test churn distribution:\n{y_test.value_counts(normalize=True)}")


Training set shape: (3750, 18), Test set shape: (1250, 18)
Training churn distribution:
churn
0    0.621333
1    0.378667
Name: proportion, dtype: float64
Test churn distribution:
churn
0    0.6216
1    0.3784
Name: proportion, dtype: float64


In [None]:
# --- 4. Build Binary Classification Model (RandomForestClassifier) ---
print("\n--- Building RandomForestClassifier Model ---")

# class_weight='balanced' re-weights the classes to offset the churn imbalance.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
)
model.fit(X_train, y_train)

print("Model training complete.")


--- Building RandomForestClassifier Model ---
Model training complete.


In [None]:
# --- 5. Model Evaluation ---
print("\n--- Model Evaluation ---")

# Hard class predictions plus the predicted probability of churn (class 1).
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Scalar metrics, all computed on the held-out test set.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

print("\nConfusion Matrix:")
print(conf_matrix)

print("\nClassification Report:")
print(class_report)


--- Model Evaluation ---
Accuracy: 0.6136
Precision: 0.4609
Recall: 0.1247
F1-Score: 0.1963

Confusion Matrix:
[[708  69]
 [414  59]]

Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.91      0.75       777
           1       0.46      0.12      0.20       473

    accuracy                           0.61      1250
   macro avg       0.55      0.52      0.47      1250
weighted avg       0.57      0.61      0.54      1250



In [None]:
# --- 6. Model Explainability (ELI5) ---
# ELI5 is used twice: once for global feature importance and once to break
# down a single prediction.
print("\n--- Model Explainability (ELI5) ---")

# Global view: permutation importance measured on the held-out test set.
perm_importance = PermutationImportance(model, random_state=42).fit(X_test, y_test)
print("\nPermutation Importance (higher score means more important):")
display(
    eli5.show_weights(
        perm_importance,
        feature_names=X.columns.tolist(),
        top=20,
    )
)

# Local view: explain one test-set customer that the model flagged as churn.
print("\n--- Explaining an Individual Prediction ---")
churn_pred_indices = np.where(y_pred == 1)[0]  # positions within the test set
if len(churn_pred_indices) == 0:
    print("No customers predicted to churn in the test set to explain.")
else:
    # The first flagged customer is used as the example.
    example_idx = churn_pred_indices[0]
    sample_customer_data = X_test.iloc[example_idx]
    true_churn_status = y_test.iloc[example_idx]

    print(f"True churn status for this customer: {'Churned' if true_churn_status == 1 else 'Not Churned'}")
    print(f"Model predicted churn status: {'Churned' if y_pred[example_idx] == 1 else 'Not Churned'}")

    # Per-feature contributions to this one prediction.
    print("\nELI5 explanation for this individual prediction:")
    display(
        eli5.show_prediction(
            model,
            sample_customer_data,
            feature_names=X.columns.tolist(),
        )
    )


--- Model Explainability (ELI5) ---

Permutation Importance (higher score means more important):


Weight,Feature
0.0110  ± 0.0031,num_complaints
0.0061  ± 0.0103,monthly_bill
0.0046  ± 0.0063,contract_type_One year
0.0046  ± 0.0049,has_device_protection
0.0019  ± 0.0031,has_online_security
0.0019  ± 0.0084,total_data_usage_gb
0.0018  ± 0.0034,gender_Male
0.0010  ± 0.0031,num_dependents
-0.0005  ± 0.0056,has_tech_support
-0.0005  ± 0.0046,senior_citizen



--- Explaining an Individual Prediction ---
True churn status for this customer: Not Churned
Model predicted churn status: Churned

ELI5 explanation for this individual prediction:




Contribution?,Feature
0.501,<BIAS>
0.06,monthly_bill
0.028,contract_type_One year
0.024,has_online_backup
0.023,num_dependents
0.018,customer_service_calls
0.016,internet_service_Fiber optic
0.015,contract_type_Two year
0.012,recharge_frequency_per_month
0.006,has_device_protection


In [None]:
# --- 7. Customer Segmentation ---
# Customers are grouped using the model's churn probability together with a
# few hand-written rules.
print("\n--- Customer Segmentation ---")

# Score every customer and keep the probability of the churn class.
# NOTE(review): X also contains the rows the model was trained on, so the
# probabilities for those customers are in-sample and likely optimistic.
all_class_probabilities = model.predict_proba(X)
df['churn_probability'] = all_class_probabilities[:, 1]

# Define segmentation rules
# The thresholds are keyword parameters so they can be tuned from business
# knowledge and model insights without editing the rule chain itself.
def segment_customer(
    row,
    *,
    high_risk_probability=0.6,
    month_to_month_probability=0.4,
    complaint_count=2,
    complaint_probability=0.3,
    dormant_call_duration_min=60,
    dormant_recharge_frequency=3,
    dormant_probability=0.2,
):
    """Assign one customer to a retention segment.

    The rules are checked in order and the first match wins.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'churn', 'churn_probability', 'contract_type', 'num_complaints',
            'avg_call_duration_min' and 'recharge_frequency_per_month'.
        high_risk_probability: A churn probability strictly above this value
            puts the customer in 'At Risk / Churned'.
        month_to_month_probability: A churn probability strictly above this
            value marks a 'Month-to-month' customer as 'At Risk'.
        complaint_count: Minimum number of complaints for the complaint rule.
        complaint_probability: A churn probability strictly above this value
            marks a customer meeting `complaint_count` as 'At Risk'.
        dormant_call_duration_min: Average call duration must be strictly
            below this value for the 'Dormant' rule.
        dormant_recharge_frequency: Monthly recharge count must be strictly
            below this value for the 'Dormant' rule.
        dormant_probability: Churn probability must be strictly below this
            value for the 'Dormant' rule.

    Returns:
        One of 'At Risk / Churned', 'At Risk', 'Dormant' or 'Loyal / Stable'.
    """
    churn_probability = row['churn_probability']

    # Already churned (historical label) or a high predicted probability.
    if row['churn'] == 1 or churn_probability > high_risk_probability:
        return 'At Risk / Churned'

    # Month-to-month contracts are treated as risky at a lower probability.
    if (row['contract_type'] == 'Month-to-month'
            and churn_probability > month_to_month_probability):
        return 'At Risk'

    # Repeated complaints are treated as risky at a lower probability still.
    if (row['num_complaints'] >= complaint_count
            and churn_probability > complaint_probability):
        return 'At Risk'

    # Low engagement combined with a low churn probability: possibly dormant.
    if (row['avg_call_duration_min'] < dormant_call_duration_min
            and row['recharge_frequency_per_month'] < dormant_recharge_frequency
            and churn_probability < dormant_probability):
        return 'Dormant'

    return 'Loyal / Stable'

# Label every customer by running the rule function row by row.
df['customer_segment'] = df.apply(segment_customer, axis=1)

print("\nCustomer Segment Distribution:")
segment_shares = df['customer_segment'].value_counts(normalize=True)
print(segment_shares)

# Fixed-seed sample so the preview is the same on every run.
print("\nSample of segmented customers:")
segment_preview_columns = [
    'customer_id',
    'monthly_bill',
    'contract_type',
    'num_complaints',
    'churn_probability',
    'customer_segment',
]
print(df[segment_preview_columns].sample(n=10, random_state=42))


--- Customer Segmentation ---

Customer Segment Distribution:
customer_segment
Loyal / Stable       0.5200
At Risk / Churned    0.3808
At Risk              0.0836
Dormant              0.0156
Name: proportion, dtype: float64

Sample of segmented customers:
      customer_id  monthly_bill   contract_type  num_complaints  \
1501         1502    116.068968  Month-to-month               4   
2586         2587     99.841151  Month-to-month               1   
2653         2654     78.701027        One year               0   
1055         1056     66.167129  Month-to-month               4   
705           706     37.805662        One year               2   
106           107     57.645752        One year               4   
589           590     21.201406  Month-to-month               4   
2468         2469    146.268670  Month-to-month               4   
2413         2414    123.519024  Month-to-month               4   
1600         1601     77.025200  Month-to-month               4   

     

In [None]:
# --- Final Deliverables Considerations ---
# Closing notes on how the notebook feeds the report and recommendations.
# Each entry is written to stdout on its own line, in order.
deliverables_notes = (
    "\n--- Deliverables Notes ---",
    "This script provides the core ML notebook. For the 'Customer Churn Report (PowerPoint)'",
    "and 'Final Recommendations', you would synthesize the findings from this analysis.",
    "Key points to include would be:",
    " - Churn rate overview",
    " - Most important features driving churn (from ELI5)",
    " - Characteristics of 'At Risk', 'Loyal', and 'Dormant' segments",
    " - Actionable strategies tailored to each segment to improve retention.",
    "\nExample strategies for 'At Risk' customers might include proactive outreach, loyalty programs, or targeted offers.",
    "For 'Dormant' customers, re-engagement campaigns or special service bundles could be considered.",
    "For 'Loyal' customers, focus on continued excellent service and upselling/cross-selling relevant products.",
)
print(*deliverables_notes, sep="\n")



--- Deliverables Notes ---
This script provides the core ML notebook. For the 'Customer Churn Report (PowerPoint)'
and 'Final Recommendations', you would synthesize the findings from this analysis.
Key points to include would be:
 - Churn rate overview
 - Most important features driving churn (from ELI5)
 - Characteristics of 'At Risk', 'Loyal', and 'Dormant' segments
 - Actionable strategies tailored to each segment to improve retention.

Example strategies for 'At Risk' customers might include proactive outreach, loyalty programs, or targeted offers.
For 'Dormant' customers, re-engagement campaigns or special service bundles could be considered.
For 'Loyal' customers, focus on continued excellent service and upselling/cross-selling relevant products.
