### Project 03: Ensemble Learning and Model Comparison
* Train and compare multiple ensemble model on real world dataset
* anaylyzing their performance under balanced and imbalanced conditions

In [17]:
# Required Libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [5]:
# Load and Preview Dataset
df = pd.read_csv("Telco-Customer-Churn.csv")

print("--- Sample Data ---\n",df.head(3))
print("\n--- Dataset Information ---\n")
print(df.info())
print("\n--- Class Distribution ---\n")
print(df["Churn"].value_counts())

--- Sample Data ---
    customerID  gender  SeniorCitizen Partner  ...     PaymentMethod  MonthlyCharges TotalCharges Churn
0  7590-VHVEG  Female              0     Yes  ...  Electronic check           29.85        29.85    No
1  5575-GNVDE    Male              0      No  ...      Mailed check           56.95       1889.5    No
2  3668-QPYBK    Male              0      No  ...      Mailed check           53.85       108.15   Yes

[3 rows x 21 columns]

--- Dataset Information ---

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non

In [7]:
# Handling Missing Data  
# for:  19  TotalCharges      7043 non-null   object 
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df.fillna({"TotalCharges": df["TotalCharges"].median()}, inplace=True)
df["TotalCharges"].head(3)

0      29.85
1    1889.50
2     108.15
Name: TotalCharges, dtype: float64

In [9]:
# Encode Categorical Variables
label_encoder = LabelEncoder()
for column in df.select_dtypes(include=["object"]).columns:
    if column != 'Churn':
        df[column] = label_encoder.fit_transform(df[column])

# Encode the Target Variable
df['Churn'] = label_encoder.fit_transform(df['Churn'])

# Scale Numerical Features
scaler = StandardScaler()
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [11]:
# Selecting Features and Target
X = df.drop(columns='Churn')
y = df['Churn']

# SPlit Datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Display Class Distribution
print("\n--- Class Distribution Before SMOTE ---\n")
print(y_train.value_counts())

print("\n--- Class Distribution After SMOTE ---\n")
print(y_train_resampled.value_counts())


--- Class Distribution Before SMOTE ---

Churn
0    4138
1    1496
Name: count, dtype: int64

--- Class Distribution After SMOTE ---

Churn
0    4138
1    4138
Name: count, dtype: int64


In [21]:
# Train and Predict Random Forest Classifier
model_rfc = RandomForestClassifier(random_state=42)
model_rfc.fit(X_train_resampled, y_train_resampled)
y_pred_rfc = model_rfc.predict(X_test)
y_proba_rfc = model_rfc.predict_proba(X_test)[:, 1]

# Evaluate Random Forest
roc_auc_rfc = roc_auc_score(y_test, y_pred_rfc)
cls_rfc = classification_report(y_test, y_pred_rfc)

In [27]:
# Train and Predict XGBoost 
model_xgb = XGBClassifier(eval_metric='logloss', random_state=42)
model_xgb.fit(X_train_resampled, y_train_resampled)
y_pred_xgb = model_xgb.predict(X_test)
y_proba_xgb = model_xgb.predict_proba(X_test)[: , 1]

# Evaluate XGBoost Forest
roc_auc_xgb = roc_auc_score(y_test, y_proba_xgb)
cls_xgb = classification_report(y_test, y_pred_xgb)

In [23]:
# Train and Predict LightGBM
model_lgb = LGBMClassifier(random_state=42)
model_lgb.fit(X_train_resampled, y_train_resampled)
y_pred_lgb = model_lgb.predict(X_test)
y_proba_lgb = model_lgb.predict_proba(X_test)[:,1]

# Evaluate LightGBM
roc_auc_lgb = roc_auc_score(y_test, y_proba_lgb)
cls_lgb = classification_report(y_test, y_pred_lgb)

[LightGBM] [Info] Number of positive: 4138, number of negative: 4138
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1062
[LightGBM] [Info] Number of data points in the train set: 8276, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [26]:
# Train and Predict CatBoost
model_cat = CatBoostClassifier(random_state=42, verbose=0)
model_cat.fit(X_train_resampled, y_train_resampled)
y_pred_cat = model_cat.predict(X_test)
y_proba_cat = model_cat.predict_proba(X_test)[:, 1]

# Evaluate CatBoost Classifier
roc_auc_cat = roc_auc_score(y_test, y_proba_cat)
cls_cat = classification_report(y_test, y_pred_cat)

In [29]:
# Display Classification report
print("--- Random Forest (Bagging) Classification Report ---\n", cls_rfc)
print("\n--- Extreme Gradient Boosting (XGB) Classification Report ---\n", cls_xgb)
print("\n--- Light Gradient Boosting Machine (LGB) Classification Report ---\n", cls_lgb)
print("\n--- Categorical Boosting (CatBoost) Classification Report ---\n", cls_cat)

--- Random Forest (Bagging) Classification Report ---
               precision    recall  f1-score   support

           0       0.85      0.85      0.85      1036
           1       0.59      0.59      0.59       373

    accuracy                           0.78      1409
   macro avg       0.72      0.72      0.72      1409
weighted avg       0.78      0.78      0.78      1409


--- Extreme Gradient Boosting (XGB) Classification Report ---
               precision    recall  f1-score   support

           0       0.86      0.86      0.86      1036
           1       0.61      0.61      0.61       373

    accuracy                           0.79      1409
   macro avg       0.74      0.73      0.74      1409
weighted avg       0.79      0.79      0.79      1409


--- Light Gradient Boosting Machine (LGB) Classification Report ---
               precision    recall  f1-score   support

           0       0.86      0.87      0.87      1036
           1       0.63      0.61      0.62     

In [33]:
# Display ROC-AUC Scores(Receiver Operating Characteristic â€“ Area Under Curve
# Display Classification report
print("--- Random Forest Classifier ---\n", roc_auc_rfc)
print("\n--- Extreme Gradient Boosting (XGB) ---\n", roc_auc_xgb)
print("\n--- Light Gradient Boosting Machine (LGB) ---\n", roc_auc_lgb)
print("\n--- Categorical Boosting (CatBoost) ---\n", roc_auc_cat)

--- Random Forest Classifier ---
 0.7228875754344922

--- Extreme Gradient Boosting (XGB) ---
 0.838378688914882

--- Light Gradient Boosting Machine (LGB) ---
 0.8439101203846513

--- Categorical Boosting (CatBoost) ---
 0.8428245365242685


In [31]:
"""LightGBM performed the best overall, both in Accuracy (0.80) and ROC-AUC (0.8439)."""

'LightGBM performed the best overall, both in Accuracy (0.80) and ROC-AUC (0.8439).'