## Importing the dataset and libraries

In [None]:
# !git clone "https://github.com/GeeksforgeeksDS/21-Days-21-Projects-Dataset"

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Set plot style
sns.set_style('whitegrid')

In [None]:
# Load the dataset from the user-provided file
df = pd.read_csv('/content/21-Days-21-Projects-Dataset/Datasets/WA_Fn-UseC_-Telco-Customer-Churn.csv')

print("Dataset loaded successfully.")
print(f"Data shape: {df.shape}")
df.head()

Dataset loaded successfully.
Data shape: (7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


**Key Problem Identified:** The `TotalCharges` column, which should be numerical, is currently an `object` type. This indicates there are non-numeric values in it. We need to fix this.

In [None]:
print(f"Shape before cleaning: {df.shape}")

# Convert TotalCharges to numeric, coercing errors to NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
print(f"Shape after converting TotalCharges to numeric: {df.shape}")


# Find how many rows have missing TotalCharges
print(f"Number of missing TotalCharges: {df['TotalCharges'].isnull().sum()}")

# Impute the missing values with the median
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())
print(f"Shape after imputing TotalCharges: {df.shape}")


# Convert target variable 'Churn' to binary
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
print(f"Shape after converting Churn to binary: {df.shape}")


# Drop rows with missing Churn values
df.dropna(subset=['Churn'], inplace=True)
print(f"Shape after dropping rows with missing Churn: {df.shape}")


# Drop customerID as it's not a predictive feature
# df.drop('customerID', axis=1, inplace=True) # This line is commented out as customerID is already dropped

print("\nData cleaning complete.")

Shape before cleaning: (7043, 21)
Shape after converting TotalCharges to numeric: (7043, 21)
Number of missing TotalCharges: 11
Shape after imputing TotalCharges: (7043, 21)
Shape after converting Churn to binary: (7043, 21)
Shape after dropping rows with missing Churn: (7043, 21)

Data cleaning complete.


In [None]:
df['Churn'].value_counts()

Unnamed: 0_level_0,count
Churn,Unnamed: 1_level_1
0,5174
1,1869


### Model 1 - Baseline Performance (Without Feature Engineering)
First, we'll build a model using only the original, cleaned features. This will serve as our benchmark to see if our feature engineering efforts actually help.

In [None]:
# Define features (X) and target (y)
X_base = df.drop('Churn', axis=1)
y_base = df['Churn']

# Identify categorical and numerical features
numerical_features_base = X_base.select_dtypes(include=np.number).columns.tolist()
categorical_features_base = X_base.select_dtypes(include=['object']).columns.tolist()

# Create the preprocessing pipeline
preprocessor_base = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features_base),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_base)])

# Split data
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(X_base, y_base, test_size=0.2, random_state=42, stratify=y_base)

# Create the full pipeline with a classifier
baseline_model = Pipeline(steps=[('preprocessor', preprocessor_base),
                                 ('classifier', LogisticRegression(random_state=42, max_iter=1000))])

# Train and evaluate the baseline model
baseline_model.fit(X_train_base, y_train_base)
y_pred_base = baseline_model.predict(X_test_base)

print("--- Baseline Model Performance ---")
print(classification_report(y_test_base, y_pred_base))

--- Baseline Model Performance ---
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.65      0.56      0.60       374

    accuracy                           0.80      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.80      0.80      1409



## Feature engineering


In [None]:
# Create Monthly_Total_Ratio
df['Monthly_Total_Ratio'] = df['TotalCharges'] / df['MonthlyCharges']
df['Monthly_Total_Ratio'] = df['Monthly_Total_Ratio'].replace([np.inf, -np.inf], np.nan)
df['Monthly_Total_Ratio'] = df['Monthly_Total_Ratio'].fillna(df['Monthly_Total_Ratio'].median())

# Create Tenure_Monthly_Interaction
df['Tenure_Monthly_Interaction'] = df['tenure'] * df['MonthlyCharges']

# Create Has_Internet
df['Has_Internet'] = (df['InternetService'] != 'No').astype(int)

# Create Has_PhoneService
df['Has_PhoneService'] = (df['PhoneService'] != 'No').astype(int)

# Create features for different contract lengths (One-Hot Encoding for Contract)
df = pd.get_dummies(df, columns=['Contract'], prefix='Contract', drop_first=False)

# Create Is_Senior_Partner
df['Is_Senior_Partner'] = ((df['SeniorCitizen'] == 1) & (df['Partner'] == 'Yes')).astype(int)

# Create Has_Multiple_Services
service_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
df['Has_Multiple_Services'] = df[service_cols].apply(lambda row: (row == 'Yes').sum(), axis=1)

print("New features created:")
display(df.head())

New features created:


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,Churn,Monthly_Total_Ratio,Tenure_Monthly_Interaction,Has_Internet,Has_PhoneService,Contract_Month-to-month,Contract_One year,Contract_Two year,Is_Senior_Partner,Has_Multiple_Services
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,0,1.0,29.85,1,0,True,False,False,0,1
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,0,33.178227,1936.3,1,1,False,True,False,0,2
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,1,2.008357,107.7,1,1,True,False,False,0,2
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,0,43.516548,1903.5,1,0,False,True,False,0,3
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,1,2.144979,141.4,1,1,True,False,False,0,0


## Feature selection


In [None]:
# 1. Separate features (X) and target (y)
X = df.drop('Churn', axis=1)
y = df['Churn']

# 2. Identify categorical and numerical features
# Exclude 'customerID' from features as it's an identifier and not predictive
X = X.drop('customerID', axis=1)
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include='object').columns.tolist()

# 3. Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

# 4. Define a pipeline with Logistic Regression
feature_selection_model = Pipeline(steps=[('preprocessor', preprocessor),
                                         ('classifier', LogisticRegression(random_state=42, max_iter=1000))])

# 5. Train the pipeline
feature_selection_model.fit(X, y)

# 6. Access the trained logistic regression model
logistic_model = feature_selection_model.named_steps['classifier']

# 7. Extract the coefficients
coefficients = logistic_model.coef_[0]

# 8. Create a list of feature names
# Get the feature names from the one-hot encoder
ohe_feature_names = feature_selection_model.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features)
# Combine numerical and one-hot encoded feature names
feature_names = numerical_features + ohe_feature_names.tolist()

# 9. Create a pandas Series of coefficients with feature names
feature_importance = pd.Series(coefficients, index=feature_names)

# 10. Sort the features by the absolute value of their coefficients
sorted_feature_importance = feature_importance.abs().sort_values(ascending=False)

# 11. Print the sorted feature coefficients
print("Feature Importance (based on absolute logistic regression coefficients):")
print(feature_importance[sorted_feature_importance.index])

Feature Importance (based on absolute logistic regression coefficients):
Monthly_Total_Ratio                       -0.988459
tenure                                    -0.671235
InternetService_DSL                       -0.599097
InternetService_Fiber optic                0.509162
TotalCharges                               0.458106
MonthlyCharges                            -0.343525
PaperlessBilling_No                       -0.325504
MultipleLines_No                          -0.285300
Has_Internet                               0.279412
TechSupport_Yes                           -0.271559
OnlineSecurity_Yes                        -0.252872
PaymentMethod_Electronic check             0.244154
Dependents_Yes                            -0.234060
PhoneService_Yes                          -0.232166
Tenure_Monthly_Interaction                 0.226391
PaymentMethod_Credit card (automatic)     -0.213850
StreamingMovies_No                        -0.184029
TechSupport_No                             

## Model selection and training


In [None]:
from sklearn.feature_selection import SelectFromModel
import lightgbm

# Splitting the data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define the preprocessor (scaling and one-hot encoding)
# This preprocessor will be applied before feature selection
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include='object').columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

# Definining a list of different classification models
models = [
    ('Logistic Regression', LogisticRegression(random_state=42, max_iter=1000)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42)),
    ('Lightgbm', lightgbm.LGBMClassifier(random_state = 50)),
    ('Catboost', CatBoostClassifier(random_state=42, verbose=0)),
    ('SVM', SVC(random_state=42))
]

# Iterate through the list of models
for name, classifier in models:
    print(f"--- Training and Evaluating: {name} ---")

    temp_lr_selector = LogisticRegression(random_state=42, max_iter=1000)
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('feature_selection', SelectFromModel(temp_lr_selector, threshold=0.1, max_features=None)),
                                     ('classifier', classifier)])
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)
    print(classification_report(y_test, y_pred))
    print("-" * 40)

--- Training and Evaluating: Logistic Regression ---
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1035
           1       0.65      0.53      0.59       374

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.79      0.80      0.79      1409

----------------------------------------
--- Training and Evaluating: Random Forest ---
              precision    recall  f1-score   support

           0       0.83      0.88      0.85      1035
           1       0.59      0.49      0.54       374

    accuracy                           0.78      1409
   macro avg       0.71      0.68      0.69      1409
weighted avg       0.76      0.78      0.77      1409

----------------------------------------
--- Training and Evaluating: XGBoost ---
              precision    recall  f1-score   support

           0       0.83      0.88      0.85      1035
           1  



              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1035
           1       0.63      0.52      0.57       374

    accuracy                           0.79      1409
   macro avg       0.74      0.70      0.72      1409
weighted avg       0.78      0.79      0.79      1409

----------------------------------------
--- Training and Evaluating: Catboost ---
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1035
           1       0.66      0.52      0.59       374

    accuracy                           0.80      1409
   macro avg       0.75      0.71      0.73      1409
weighted avg       0.79      0.80      0.79      1409

----------------------------------------
--- Training and Evaluating: SVM ---
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1035
           1       0.68      0.46      0.55       374

    accuracy         

## Hyperparameter tuning



In [None]:
# Define parameter grid for XGBoost
param_grid_xgb = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__subsample': [0.7, 0.9, 1.0],
    'classifier__colsample_bytree': [0.7, 0.9, 1.0]
}

# Define parameter grid for CatBoost
param_grid_cat = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__l2_leaf_reg': [1, 3, 5]
}

# Define parameter grid for LightGBM
param_grid_lgbm = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [10, 20, -1],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__num_leaves': [31, 50, 100]
}

# Define parameter grid for SVC
param_grid_svc = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__gamma': ['scale', 'auto']
}

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Create a GridSearchCV object for XGBoost
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', XGBClassifier(random_state=42))])

grid_search_xgb = GridSearchCV(xgb_pipeline, param_grid_xgb, cv=5, scoring='recall', n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search_xgb.fit(X_train, y_train)

# Print the best hyperparameters found
print("\nBest hyperparameters for XGBoost:")
print(grid_search_xgb.best_params_)


Best hyperparameters for XGBoost:
{'classifier__colsample_bytree': 0.7, 'classifier__learning_rate': 0.2, 'classifier__max_depth': 5, 'classifier__n_estimators': 200, 'classifier__subsample': 0.7}


In [None]:
from catboost import CatBoostClassifier

# Create a GridSearchCV object for CatBoost
cat_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', CatBoostClassifier(random_state=42, verbose=0))])

grid_search_cat = GridSearchCV(cat_pipeline, param_grid_cat, cv=5, scoring='recall', n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search_cat.fit(X_train, y_train)

# Print the best hyperparameters found
print("\nBest hyperparameters for CatBoost:")
print(grid_search_cat.best_params_)

Starting GridSearchCV for CatBoost...
GridSearchCV for CatBoost finished.

Best hyperparameters for CatBoost:
{'classifier__depth': 3, 'classifier__l2_leaf_reg': 1, 'classifier__learning_rate': 0.1, 'classifier__n_estimators': 100}


In [None]:
import lightgbm

# Create a GridSearchCV object for LightGBM
lgbm_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', lightgbm.LGBMClassifier(random_state=42))])

grid_search_lgbm = GridSearchCV(lgbm_pipeline, param_grid_lgbm, cv=5, scoring='recall', n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search_lgbm.fit(X_train, y_train)

# Print the best hyperparameters found
print("\nBest hyperparameters for LightGBM:")
print(grid_search_lgbm.best_params_)

Starting GridSearchCV for LightGBM...
[LightGBM] [Info] Number of positive: 1495, number of negative: 4139
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001184 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1190
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265353 -> initscore=-1.018328
[LightGBM] [Info] Start training from score -1.018328
GridSearchCV for LightGBM finished.

Best hyperparameters for LightGBM:
{'classifier__learning_rate': 0.2, 'classifier__max_depth': 10, 'classifier__n_estimators': 100, 'classifier__num_leaves': 31}


In [None]:
from sklearn.svm import SVC

# Create a GridSearchCV object for SVC
svc_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', SVC(random_state=42))])

grid_search_svc = GridSearchCV(svc_pipeline, param_grid_svc, cv=5, scoring='recall', n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search_svc.fit(X_train, y_train)

# Print the best hyperparameters found
print("\nBest hyperparameters for SVC:")
print(grid_search_svc.best_params_)

Starting GridSearchCV for SVC...
GridSearchCV for SVC finished.

Best hyperparameters for SVC:
{'classifier__C': 0.1, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear'}


In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Logistic Regression
param_grid_lr = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__solver': ['liblinear', 'lbfgs']
}

# Create a GridSearchCV object for Logistic Regression
lr_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', LogisticRegression(random_state=42, max_iter=1000))])

grid_search_lr = GridSearchCV(lr_pipeline, param_grid_lr, cv=5, scoring='recall', n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search_lr.fit(X_train, y_train)

# Print the best hyperparameters found
print("\nBest hyperparameters for Logistic Regression:")
print(grid_search_lr.best_params_)

Starting GridSearchCV for Logistic Regression...
GridSearchCV for Logistic Regression finished.

Best hyperparameters for Logistic Regression:
{'classifier__C': 100, 'classifier__solver': 'liblinear'}


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Define parameter grid for Random Forest
param_grid_rf = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5, 10]
}

# Create a GridSearchCV object for Random Forest
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                             ('classifier', RandomForestClassifier(random_state=42))])

grid_search_rf = GridSearchCV(rf_pipeline, param_grid_rf, cv=5, scoring='recall', n_jobs=-1)

grid_search_rf.fit(X_train, y_train)
print(grid_search_rf.best_params_)

Starting GridSearchCV for Random Forest...
GridSearchCV for Random Forest finished.

Best hyperparameters for Random Forest:
{'classifier__max_depth': 10, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50}


## Model evaluation and comparison


In [None]:
# Get the best performing of the model list.
best_lr_model = grid_search_lr.best_estimator_
best_rf_model = grid_search_rf.best_estimator_
best_svc_model = grid_search_svc.best_estimator_
best_lgbm_model = grid_search_lgbm.best_estimator_
best_cat_model = grid_search_cat.best_estimator_
best_xgb_model = grid_search_xgb.best_estimator_

best_models = [
    ('Logistic Regression', best_lr_model),
    ('Random Forest', best_rf_model),
    ('SVM', best_svc_model),
    ('LightGBM', best_lgbm_model),
    ('CatBoost', best_cat_model),
    ('XGBoost', best_xgb_model)
]

# Evaluate each best model on the test set
for name, model in best_models:
  print(f"--- Best {name} Model Performance on Test Set ---")
  y_pred = model.predict(X_test)
  print(classification_report(y_test, y_pred))
  print("-" * 40)

--- Best Logistic Regression Model Performance on Test Set ---
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1035
           1       0.66      0.53      0.59       374

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.79      0.80      0.80      1409

----------------------------------------
--- Best Random Forest Model Performance on Test Set ---
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1035
           1       0.65      0.51      0.57       374

    accuracy                           0.80      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409

----------------------------------------
--- Best SVM Model Performance on Test Set ---
              precision    recall  f1-score   support

           0       0.84      0.89      0.8



## Analysis & Report:

### Approaches Tried:

1.  **Baseline Model**: Trained a Logistic Regression model on the original, cleaned dataset without any feature engineering or selection.
2.  **Feature Engineering**: Created several new features, including:
    *   `Monthly_Total_Ratio`: Ratio of `TotalCharges` to `MonthlyCharges`.
    *   `Tenure_Monthly_Interaction`: Interaction term between `tenure` and `MonthlyCharges`.
    *   `Has_Internet`: Binary flag indicating if the customer has internet service.
    *   `Has_PhoneService`: Binary flag indicating if the customer has phone service.
    *   One-hot encoded `Contract` types.
    *   `Is_Senior_Partner`: Binary flag indicating if the customer is both a senior citizen and a partner.
    *   `Has_Multiple_Services`: Count of additional services (Online Security, Online Backup, Device Protection, Tech Support, Streaming TV, Streaming Movies).
3.  **Feature Selection**: Used `SelectFromModel` with a Logistic Regression estimator within the pipeline to select features based on coefficients, applying this after preprocessing.
4.  **Model Selection and Training**: Evaluated several classification models: Logistic Regression, Random Forest, XGBoost, LightGBM, CatBoost, and SVM.
5.  **Hyperparameter Tuning**: Performed GridSearchCV with 5-fold cross-validation for each model, optimizing for 'recall' to improve the identification of churned customers.

### Model Performance Comparison (on Test Set - Optimized for Recall):

| Model                 | Precision (Class 1) | Recall (Class 1) | F1-Score (Class 1) | Accuracy |
| :-------------------- | :------------------ | :--------------- | :----------------- | :------- |
| Logistic Regression   | 0.66                | 0.53             | 0.59               | 0.80     |
| Random Forest         | 0.65                | 0.51             | 0.57               | 0.80     |
| SVM                   | 0.64                | 0.53             | 0.58               | 0.79     |
| LightGBM              | 0.61                | 0.51             | 0.56               | 0.78     |
| CatBoost              | 0.68                | 0.53             | 0.60               | 0.81     |
| XGBoost               | 0.58                | 0.47             | 0.52               | 0.77     |


### Discussion:

*   **Effectiveness of Techniques**:
    *   Feature engineering and hyperparameter tuning, while implemented, did not lead to a substantial improvement in recall for the churn class compared to the baseline Logistic Regression model (which had a recall of 0.56).
    *   Among the evaluated models, **CatBoost** performed slightly better in terms of F1-score and accuracy on the test set with the chosen hyperparameter tuning strategy focused on recall. Logistic Regression and SVM also showed comparable recall and F1-scores.
    *   Feature selection using `SelectFromModel` with a Logistic Regression base seems to have retained most features, as indicated by the performance not drastically changing from the initial model evaluations. Experimenting with a stricter threshold for feature selection could be explored.

*   **Most Important Features**:
    *   Based on the absolute coefficients from the initial Logistic Regression feature importance analysis (`cell 5058d8a2`), features with the largest influence on churn prediction included: `Monthly_Total_Ratio`, `tenure`, `InternetService_DSL`, `InternetService_Fiber optic`, and `TotalCharges`. These features likely capture key aspects of customer engagement, service usage, and financial commitment.

*   **Limitations**:
    *   The dataset is imbalanced, with significantly more non-churn customers than churn customers. While `stratify` was used in splitting, the models may still struggle to accurately predict the minority class (churn).
    *   The hyperparameter tuning focused solely on 'recall'. While important for identifying potential churners, a balance between precision and recall (e.g., optimizing for F1-score) might be more suitable depending on the business objective and cost of false positives vs. false negatives.
    *   The feature selection approach was based on linear model coefficients. Tree-based models might benefit from feature selection methods that consider non-linear relationships.
    *   The feature engineering was exploratory. Domain expertise or more in-depth analysis of feature interactions could yield more impactful features.