In [98]:
import os
import warnings
warnings.filterwarnings('ignore')

In [99]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [100]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report, roc_auc_score,
                             roc_curve, mean_squared_error, r2_score)
import joblib
import json

In [101]:
# --------------------------- Configuration ---------------------------
DATA_PATH = "WA_Fn-UseC_-Telco-Customer-Churn.csv"   # change if your file is elsewhere 
OUTPUT_DIR = "telco_project_output"
RANDOM_STATE = 42

os.makedirs(OUTPUT_DIR, exist_ok=True)

In [102]:
# --------------------------- Helper Functions ---------------------------
def save_fig(fig, name):
    path = os.path.join(OUTPUT_DIR, name)
    fig.savefig(path, bbox_inches='tight', dpi=150)
    plt.close(fig)

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [103]:
# --------------------------- 1. Load Data ---------------------------
print("Loading data from:", DATA_PATH)
df = pd.read_csv(DATA_PATH)
print("Loaded data shape:", df.shape)
print()

Loading data from: WA_Fn-UseC_-Telco-Customer-Churn.csv
Loaded data shape: (7043, 21)



In [104]:
# --------------------------- 2. Basic Cleaning ---------------------------
# Convert TotalCharges to numeric (there are spaces for customers with zero tenure)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Fill or drop missing TotalCharges - if tenure==0 then TotalCharges is NaN -> set to 0
df.loc[(df['tenure'] == 0) & (df['TotalCharges'].isna()), 'TotalCharges'] = 0.0

# Print missing values summary
print("Missing values per column before drop:\n", df.isna().sum())
# Drop any remaining missing rows (should be minimal)
df = df.dropna().reset_index(drop=True)
print("Shape after dropping NA:", df.shape)
print()

Missing values per column before drop:
 customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64
Shape after dropping NA: (7043, 21)



In [105]:
# --------------------------- 3. Quick EDA (prints + basic plots saved) ---------------------------
# Churn distribution
churn_counts = df['Churn'].value_counts()
print("Churn distribution:\n", churn_counts)
print()

# Plot churn count
fig = plt.figure(figsize=(6,4))
sns.countplot(data=df, x='Churn')
plt.title('Churn Distribution')
save_fig(fig, 'churn_distribution.png')

# MonthlyCharges distribution
fig = plt.figure(figsize=(8,4))
sns.histplot(df['MonthlyCharges'], bins=30, kde=True)
plt.title('Monthly Charges Distribution')
save_fig(fig, 'monthlycharges_dist.png')

# Tenure distribution
fig = plt.figure(figsize=(8,4))
sns.histplot(df['tenure'], bins=30, kde=True)
plt.title('Tenure Distribution')
save_fig(fig, 'tenure_dist.png')

# Correlation heatmap for numeric columns
num_cols = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
fig = plt.figure(figsize=(6,5))
sns.heatmap(df[num_cols].corr(), annot=True, fmt=".2f", cmap='Blues')
plt.title('Numeric Feature Correlations')
save_fig(fig, 'numeric_correlations.png')

Churn distribution:
 Churn
No     5174
Yes    1869
Name: count, dtype: int64



In [106]:
# --------------------------- 4. Feature Engineering ---------------------------
# Create CLV proxy: CLV = MonthlyCharges * tenure  (simple approach)
df['CLV'] = df['MonthlyCharges'] * df['tenure']

# Average revenue per month (handle tenure==0)
df['AvgRevPerMonth'] = df.apply(lambda r: r['TotalCharges']/r['tenure'] if r['tenure']>0 else r['MonthlyCharges'], axis=1)

# Tenure groups
def tenure_group(t):
    if t == 0:
        return '0'
    elif t <= 12:
        return '0-12'
    elif t <= 24:
        return '13-24'
    elif t <= 48:
        return '25-48'
    elif t <= 60:
        return '49-60'
    else:
        return '60+'
df['TenureGroup'] = df['tenure'].apply(tenure_group)

# Binary encode 'Churn' target
df['ChurnFlag'] = df['Churn'].map({'Yes':1, 'No':0})

# Simplify PaymentMethod categories if too many
df['PaymentMethodSimple'] = df['PaymentMethod'].replace({
    'Bank transfer (automatic)':'BankTransfer',
    'Credit card (automatic)':'CreditCard',
    'Electronic check':'ElectronicCheck',
    'Mailed check':'MailedCheck'
})

# Drop customerID - not useful for modeling
if 'customerID' in df.columns:
    df_model = df.drop(columns=['customerID'])
else:
    df_model = df.copy()

# Save engineered dataset
engineered_csv = os.path.join(OUTPUT_DIR, 'telco_engineered.csv')
df_model.to_csv(engineered_csv, index=False)
print("Saved engineered dataset to:", engineered_csv)
print()


Saved engineered dataset to: telco_project_output\telco_engineered.csv



In [107]:
# --------------------------- 5. Prepare Data for Modeling ---------------------------
# Select features to use (mix of numerical and categorical)
num_features = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'CLV', 'AvgRevPerMonth']
cat_features = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
                'PaymentMethodSimple', 'TenureGroup']

# Define preprocessing pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
], remainder='drop')

In [108]:
# -------------------- Load saved CLV regression model --------------------
clv_regressor_path = os.path.join(OUTPUT_DIR, 'clv_regressor.joblib')
clv_regressor = joblib.load(clv_regressor_path)
print("Loaded CLV regression pipeline.")

# Now you can make predictions
reg_preds = clv_regressor.predict(Xr_test)
reg_rmse = rmse(yr_test, reg_preds)
reg_r2 = r2_score(yr_test, reg_preds)
print(f"CLV Regression RMSE: {reg_rmse:.4f}")
print(f"CLV Regression R2: {reg_r2:.4f}")


Loaded CLV regression pipeline.
CLV Regression RMSE: 15.9151
CLV Regression R2: 1.0000


In [109]:
# --------------------------- 6A. CLV Regression ---------------------------
print("\n----- CLV Regression Modeling -----\n")
X_reg = df_model[num_features + cat_features]
y_reg = df_model['CLV']

Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=RANDOM_STATE)
print("Train/Test shapes (regression):", Xr_train.shape, Xr_test.shape)

reg_pipeline = Pipeline([
    ('preproc', preprocessor),
    ('model', GradientBoostingRegressor(random_state=RANDOM_STATE))
])

# Fit regression model
reg_pipeline.fit(Xr_train, yr_train)
reg_preds = reg_pipeline.predict(Xr_test)
reg_rmse = rmse(yr_test, reg_preds)
reg_r2 = r2_score(yr_test, reg_preds)
print(f"CLV Regression RMSE: {reg_rmse:.4f}")
print(f"CLV Regression R2: {reg_r2:.4f}")

# Save regression model
joblib.dump(reg_pipeline, os.path.join(OUTPUT_DIR, 'clv_regressor.joblib'))
print("Saved CLV regression pipeline.")
print()


----- CLV Regression Modeling -----

Train/Test shapes (regression): (5634, 22) (1409, 22)
CLV Regression RMSE: 15.9151
CLV Regression R2: 1.0000
Saved CLV regression pipeline.



In [110]:
# -------------------- Load saved Churn classifier model --------------------
churn_classifier_path = os.path.join(OUTPUT_DIR, 'churn_classifier.joblib')
clf_pipeline = joblib.load(churn_classifier_path)
print("Loaded Churn classification pipeline.")

# Now you can make predictions
clf_preds = clf_pipeline.predict(Xc_test)
clf_probs = clf_pipeline.predict_proba(Xc_test)[:,1]


Loaded Churn classification pipeline.


In [111]:
# --------------------------- 6B. Churn Classification ---------------------------
print("\n----- Churn Classification Modeling -----\n")
X_clf = df_model[num_features + cat_features]
y_clf = df_model['ChurnFlag']

Xc_train, Xc_test, yc_train, yc_test = train_test_split(X_clf, y_clf, test_size=0.2, stratify=y_clf, random_state=RANDOM_STATE)
print("Train/Test shapes (classification):", Xc_train.shape, Xc_test.shape)

clf_pipeline = Pipeline([
    ('preproc', preprocessor),
    ('model', RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1))
])

# Fit classifier
clf_pipeline.fit(Xc_train, yc_train)
clf_preds = clf_pipeline.predict(Xc_test)
clf_probs = clf_pipeline.predict_proba(Xc_test)[:,1]

acc = accuracy_score(yc_test, clf_preds)
auc = roc_auc_score(yc_test, clf_probs)
print(f"Churn Classification Accuracy: {acc:.4f}")
print(f"Churn Classification AUC: {auc:.4f}")

# Classification report and confusion matrix
print("\nClassification Report:\n", classification_report(yc_test, clf_preds))
cm = confusion_matrix(yc_test, clf_preds)
print("Confusion Matrix:\n", cm)

# Save classifier
joblib.dump(clf_pipeline, os.path.join(OUTPUT_DIR, 'churn_classifier.joblib'))
print("Saved churn classification pipeline.")
print()


----- Churn Classification Modeling -----

Train/Test shapes (classification): (5634, 22) (1409, 22)
Churn Classification Accuracy: 0.7871
Churn Classification AUC: 0.8261

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.90      0.86      1035
           1       0.63      0.48      0.55       374

    accuracy                           0.79      1409
   macro avg       0.73      0.69      0.70      1409
weighted avg       0.77      0.79      0.78      1409

Confusion Matrix:
 [[928 107]
 [193 181]]
Saved churn classification pipeline.



In [112]:
# --------------------------- 7. Feature Importance (from RandomForest) ---------------------------
# Extract feature names after preprocessing (one-hot encoding)
ohe = clf_pipeline.named_steps['preproc'].named_transformers_['cat'].named_steps['onehot']
ohe_features = ohe.get_feature_names_out(cat_features)
feature_names = np.concatenate([num_features, ohe_features])

rf_model = clf_pipeline.named_steps['model']
importances = rf_model.feature_importances_
feat_imp = pd.DataFrame({'feature': feature_names, 'importance': importances}).sort_values('importance', ascending=False).head(25)
feat_imp_csv = os.path.join(OUTPUT_DIR, 'feature_importances_top25.csv')
feat_imp.to_csv(feat_imp_csv, index=False)
print("Saved top feature importances to CSV:", feat_imp_csv)

# Plot top features
fig = plt.figure(figsize=(8,6))
sns.barplot(data=feat_imp, x='importance', y='feature')
plt.title('Top 25 Feature Importances (Churn RF)')
save_fig(fig, 'feature_importances_top25.png')

print()


Saved top feature importances to CSV: telco_project_output\feature_importances_top25.csv



In [113]:
# --------------------------- 8. Model Evaluation - More Metrics ---------------------------
# ROC Curve plot
fpr, tpr, thresholds = roc_curve(yc_test, clf_probs)
fig = plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f'AUC = {auc:.3f}')
plt.plot([0,1], [0,1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Churn Classifier')
plt.legend()
save_fig(fig, 'roc_curve_churn.png')

# Residuals plot CLV regression
residuals = yr_test - reg_preds
fig = plt.figure(figsize=(6,4))
plt.scatter(reg_preds, residuals, alpha=0.4)
plt.axhline(0, color='r', linestyle='--')
plt.xlabel('Predicted CLV')
plt.ylabel('Residuals')
plt.title('CLV Regression Residuals')
save_fig(fig, 'clv_residuals.png')

In [114]:
# --------------------------- 9. Business Insights & Sample Policy ---------------------------
insights = [
    "High monthly charges + low tenure -> elevated churn risk. Prioritize these for retention offers.",
    "Customers on month-to-month contracts churn more. Offer discounted annual/one-year bundles to reduce churn.",
    "Auto-pay (BankTransfer/CreditCard) customers show lower churn; incentivize electronic payments.",
    "Segment customers by predicted CLV to create tiered retention budgets (High CLV -> VIP retention).",
    "Use predicted churn probability to drive intervention prioritization (top X% get immediate offers)."
]
insights_path = os.path.join(OUTPUT_DIR, 'business_insights.txt')
with open(insights_path, 'w') as f:
    f.write('\n'.join(insights))
print("Saved business insights file:", insights_path)
print()

Saved business insights file: telco_project_output\business_insights.txt



In [115]:
# --------------------------- 10. Save outputs summary ---------------------------
summary = {
    'rows': int(df.shape[0]),
    'clv_rmse': float(reg_rmse),
    'clv_r2': float(reg_r2),
    'churn_accuracy': float(acc),
    'churn_auc': float(auc),
    'models_saved': ['clv_regressor.joblib', 'churn_classifier.joblib']
}

summary_path = os.path.join(OUTPUT_DIR, 'summary_metrics.json')
with open(summary_path, 'w') as f:
    json.dump(summary, f, indent=2)

print("Saved summary metrics to JSON:", summary_path)
print()

print('All outputs (models, figures, CSVs, and summary) saved in:', OUTPUT_DIR)

Saved summary metrics to JSON: telco_project_output\summary_metrics.json

All outputs (models, figures, CSVs, and summary) saved in: telco_project_output
