
"""

PROJECT: Customer Churn Prediction

TYPE: Supervised Machine Learning – Classification

OBJECTIVE:
- Predict which telecom customers are likely to churn

STEPS:
- Data Cleaning
- Encoding
- Scaling
- Model Training
- Model Comparison
- Evaluation

"""


In [95]:
# Importing Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,f1_score


In [96]:
# Load Data

df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df




Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [97]:
df.shape

(7043, 21)

In [98]:
df.drop("customerID", axis=1, inplace=True)

In [99]:
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")


In [100]:
# Missing values
imputer = SimpleImputer(strategy="mean")
df["TotalCharges"] = imputer.fit_transform(df[["TotalCharges"]])


In [101]:
# Target distribution check
print("\nTarget Distribution:")
print(df["Churn"].value_counts(normalize=True))



Target Distribution:
Churn
No     0.73463
Yes    0.26537
Name: proportion, dtype: float64


In [102]:
# Encoding

df_encoded = pd.get_dummies(df, drop_first=True)



In [103]:
# Split Features / Target

X = df_encoded.drop("Churn_Yes", axis=1)
y = df_encoded["Churn_Yes"]


In [104]:
# Scaling

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [105]:
# Train Test Split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)

In [106]:
print("Train Shape:", X_train.shape)
print("Test Shape:", X_test.shape)


Train Shape: (4930, 30)
Test Shape: (2113, 30)


In [107]:
# Models

models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(n_neighbors=15),
}

In [108]:
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)

    results[name] = {
        "Accuracy": acc,
        "F1 Score": f1
    }

    print(f"\n{name}")
    print("Accuracy:", acc)
    print("F1 Score:", f1)


LogisticRegression
Accuracy: 0.8121154756270705
F1 Score: 0.6244087038789026

DecisionTree
Accuracy: 0.7373402744912447
F1 Score: 0.5135845749342682

SVM
Accuracy: 0.804070042593469
F1 Score: 0.5835010060362174

KNN
Accuracy: 0.7870326549929011
F1 Score: 0.5825602968460112


In [109]:
best_model = max(results, key=lambda x: results[x]["F1 Score"])
print("Best Performing Model:", best_model)

Best Performing Model: LogisticRegression


In [110]:
# Model Comparison

results_df = pd.DataFrame(results).T

print("\nModel Comparison:\n")
print(results_df.sort_values(by="F1 Score", ascending=False))



Model Comparison:

                    Accuracy  F1 Score
LogisticRegression  0.812115  0.624409
SVM                 0.804070  0.583501
KNN                 0.787033  0.582560
DecisionTree        0.737340  0.513585


In [111]:
# Feature Importance (Tree)


tree = models["DecisionTree"]
importances = pd.Series(tree.feature_importances_, index=X.columns)
print("\nTop Important Features:")
print(importances.sort_values(ascending=False).head(10))



Top Important Features:
tenure                            0.219624
TotalCharges                      0.199120
MonthlyCharges                    0.181737
InternetService_Fiber optic       0.102699
Dependents_Yes                    0.024864
OnlineSecurity_Yes                0.024611
Partner_Yes                       0.024477
PaperlessBilling_Yes              0.022498
PaymentMethod_Electronic check    0.022419
SeniorCitizen                     0.021359
dtype: float64


In [112]:
# Business Insights

print("\nKey Business Insights:")

print("- Customers with shorter tenure are more likely to churn.")
print("- Month-to-month contract users show higher churn.")
print("- Higher monthly charges increase churn probability.")
print("- Long-term customers are more stable.")


Key Business Insights:
- Customers with shorter tenure are more likely to churn.
- Month-to-month contract users show higher churn.
- Higher monthly charges increase churn probability.
- Long-term customers are more stable.


In [113]:
print("""
FINAL CONCLUSION:
Multiple classification models were trained and compared.
Performance evaluated using precision, recall, and F1-score.
Decision Tree feature importance shows key churn drivers.
Model can help identify high-risk customers early.
""")



FINAL CONCLUSION:
Multiple classification models were trained and compared.
Performance evaluated using precision, recall, and F1-score.
Decision Tree feature importance shows key churn drivers.
Model can help identify high-risk customers early.

