In [2]:
# Core libraries
import pandas as pd
import numpy as np

# Model
from xgboost import XGBClassifier

# Train-test split
from sklearn.model_selection import train_test_split, GridSearchCV

# Metrics
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

# Handling imbalance
from imblearn.over_sampling import SMOTE


In [3]:
# Load your dataset (update path if needed)
df = pd.read_csv("D:\Workspace\Projects\CTS\Dataset\WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Quick look
print(df.head())
print(df["Churn"].value_counts(normalize=True))  # churn distribution


   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [4]:
# Convert TotalCharges to numeric
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"].fillna(0, inplace=True)

# Drop CustomerID (not useful for model)
df.drop("customerID", axis=1, inplace=True)

# Encode categorical (get_dummies one-hot)
df = pd.get_dummies(df, drop_first=True)

# Define features & target
X = df.drop("Churn_Yes", axis=1)  # churn target encoded as "Churn_Yes"
y = df["Churn_Yes"]

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["TotalCharges"].fillna(0, inplace=True)


In [5]:
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", np.bincount(y_train))
print("After SMOTE :", np.bincount(y_train_res))


Before SMOTE: [4139 1495]
After SMOTE : [4139 4139]


In [14]:
baseline_model = XGBClassifier(
    eval_metric="logloss",
    use_label_encoder=False,
    random_state=42
)

baseline_model.fit(X_train_res, y_train_res)

# ✅ use custom threshold 0.45
y_proba = baseline_model.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= 0.45).astype(int)

# Metrics
print("Baseline Accuracy :", accuracy_score(y_test, y_pred))
print("Baseline F1       :", f1_score(y_test, y_pred))
print("Baseline ROC-AUC  :", roc_auc_score(y_test, y_proba))


Baseline Accuracy : 0.7615330021291696
Baseline F1       : 0.58
Baseline ROC-AUC  : 0.8132682321940634


In [8]:
from sklearn.model_selection import RandomizedSearchCV

params = {
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "n_estimators": [100, 200, 300],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}

random_search = RandomizedSearchCV(
    estimator=XGBClassifier(eval_metric="logloss", use_label_encoder=False),
    param_distributions=params,
    n_iter=20,     # test only 20 random combos
    scoring="f1",
    cv=3,
    n_jobs=2,
    verbose=2,
    random_state=42
)

random_search.fit(X_train_res, y_train_res)

best_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'subsample': 0.8, 'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.01, 'colsample_bytree': 0.8}


In [15]:
y_pred_best = best_model.predict(X_test)
y_proba_best = best_model.predict_proba(X_test)[:,1]

print("Tuned Accuracy :", accuracy_score(y_test, y_pred_best))
print("Tuned F1       :", f1_score(y_test, y_pred_best))
print("Tuned ROC-AUC  :", roc_auc_score(y_test, y_proba_best))
print("\nClassification Report:\n", classification_report(y_test, y_pred_best))


Tuned Accuracy : 0.7672107877927609
Tuned F1       : 0.6104513064133017
Tuned ROC-AUC  : 0.8354969128626417

Classification Report:
               precision    recall  f1-score   support

       False       0.88      0.80      0.83      1035
        True       0.55      0.69      0.61       374

    accuracy                           0.77      1409
   macro avg       0.71      0.74      0.72      1409
weighted avg       0.79      0.77      0.77      1409



In [12]:
thresholds = [0.3, 0.35, 0.4, 0.45, 0.5]
for t in thresholds:
    y_pred_adj = (y_proba_best >= t).astype(int)
    print(f"Threshold {t:.2f} → F1 = {f1_score(y_test, y_pred_adj):.4f}")


Threshold 0.30 → F1 = 0.5700
Threshold 0.35 → F1 = 0.5953
Threshold 0.40 → F1 = 0.6078
Threshold 0.45 → F1 = 0.6152
Threshold 0.50 → F1 = 0.6105


In [17]:
import joblib

# Save tuned model
joblib.dump(best_model, "telco_churn_model.pkl")
print("✅ Model saved as telco_churn_model.pkl")


✅ Model saved as telco_churn_model.pkl


In [16]:
# Final predictions with best threshold
best_threshold = 0.45  # update after Cell 8
y_pred_final = (y_proba_best >= best_threshold).astype(int)

# Create output DataFrame
output = pd.DataFrame({
    "Phone Number": df.index[X_test.index],   # replace if phone column exists
    "Churn_Prediction": y_pred_final,
    "Churn_Probability": y_proba_best,
    "Usage_Category": np.where(X_test["MonthlyCharges"] > 70, "High", "Low"),
    "Recommended_Products": np.where(y_pred_final==1, "Discount Offer", "Upgrade Plan")
})

# Save to Excel
output.to_excel("Telco_Churn_Predictions.xlsx", index=False)
print("✅ Predictions exported to Telco_Churn_Predictions.xlsx")


✅ Predictions exported to Telco_Churn_Predictions.xlsx
