In [68]:
# Basic imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import warnings

# Preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Feature selection
from sklearn.feature_selection import RFECV

# Models
from sklearn.linear_model import LogisticRegression

# Metrics
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    accuracy_score,
    precision_recall_curve,
    precision_recall_fscore_support
)

import joblib


## Load Final Encoded Dataset

In [55]:
# Path to the encoded dataset CSV.
data_path = "D:/Sajid/Chrun_Prediction/data/processed/final_encoded_data.csv"

# Read the CSV into a DataFrame and report its (rows, columns) as a quick sanity check.
df_encoded = pd.read_csv(filepath_or_buffer=data_path)
print("Encoded Data Shape:", df_encoded.shape)

Encoded Data Shape: (7032, 43)


In [56]:
# Separate the target column from the predictor columns.
target_col = "Churn"
y = df_encoded[target_col]
X = df_encoded.drop(columns=[target_col])

## Train/Test Split

- We split before resampling so that SMOTE is applied only to the training data, which avoids data leakage into the test set.

In [57]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class proportions
# the same in both partitions; the fixed seed makes the split reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Address Class Imbalance using SMOTE

- The dataset is imbalanced (churn rate ~26%). We apply SMOTE to oversample the minority class (churners) in the training set.

In [58]:
# Oversample the minority class in the training partition only; the test partition
# is left untouched. sampling_strategy=0.75 is the requested minority/majority ratio
# after resampling.
smote = SMOTE(random_state=42, sampling_strategy=0.75)
resampled = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled

## Feature Scaling
We apply:
- **StandardScaler** to 'tenure' and 'MonthlyCharges'
- **Yeo-Johnson + StandardScaler** to 'TotalCharges' (reduces skewness; unlike Box-Cox, Yeo-Johnson also accepts zero and negative values)

Scaling is important for models like Logistic Regression.

In [59]:
numeric_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
standard_cols = ["tenure", "MonthlyCharges"]   # standardized directly
skewed_cols = ["TotalCharges"]                 # power-transformed, then standardized

# Work on copies so the unscaled train/test frames stay available.
X_train_scaled = X_train_resampled.copy()
X_test_scaled = X_test.copy()

# tenure / MonthlyCharges: statistics are learned from the (resampled) training data
# only and then reused unchanged for the test data.
scaler = StandardScaler()
X_train_scaled[standard_cols] = scaler.fit_transform(X_train_resampled[standard_cols])
X_test_scaled[standard_cols] = scaler.transform(X_test[standard_cols])

# TotalCharges, step 1: Yeo-Johnson power transform fitted on the training data.
pt = PowerTransformer(method="yeo-johnson")
X_train_scaled[skewed_cols] = pt.fit_transform(X_train_resampled[skewed_cols])
X_test_scaled[skewed_cols] = pt.transform(X_test[skewed_cols])

# TotalCharges, step 2: standardize the already power-transformed column.
scaler_tc = StandardScaler()
X_train_scaled[skewed_cols] = scaler_tc.fit_transform(X_train_scaled[skewed_cols])
X_test_scaled[skewed_cols] = scaler_tc.transform(X_test_scaled[skewed_cols])


X_train_scaled.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,is_long_term_contract,...,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1-2yrs,tenure_group_2-4yrs,tenure_group_4-5yrs,tenure_group_5+yrs
0,1,0,1,1,1.479047,1,0,0.940327,1.39503,1,...,0,0,1,1,0,0,0,0,0,1
1,1,0,0,0,-0.134604,0,0,-1.081816,-0.105905,0,...,0,0,0,0,1,0,0,1,0,0
2,0,0,1,0,1.603174,1,0,0.790729,1.431336,1,...,0,0,1,1,0,0,0,0,0,1
3,1,0,0,0,-1.086244,1,0,0.587827,-0.961772,0,...,1,0,0,0,1,0,0,0,0,0
4,0,0,1,0,0.817036,0,0,-0.913304,0.417466,0,...,0,0,0,0,0,0,0,0,1,0


##   Feature Selection using RFECV

- We use Recursive Feature Elimination with Cross Validation (RFECV) to select the most informative features, optimizing for ROC AUC.

In [60]:
# Base estimator used by the recursive elimination.
model = LogisticRegression(solver="liblinear", max_iter=500)

# Remove one feature per iteration and score each candidate subset with
# 5-fold cross-validated ROC AUC.
rfecv = RFECV(estimator=model, step=1, cv=5, scoring="roc_auc")
rfecv.fit(X_train_scaled, y_train_resampled)

# Collect the selection results, keyed by the training frame's column names.
feature_names = X_train_scaled.columns
rfecv_mask = rfecv.support_        # boolean mask of the selected features
rfecv_ranks = rfecv.ranking_       # per-feature ranking reported by RFECV
rfecv_rank_series = pd.Series(rfecv_ranks, index=feature_names)
rfecv_features = feature_names[rfecv_mask].tolist()

## Drop Highly Correlated Features (among selected)

- We remove highly correlated features (absolute correlation > 0.8) among the RFECV-selected ones, keeping the more important feature of each pair as judged by RFECV.

In [61]:
# Absolute pairwise correlations among the RFECV-selected features. Only the strict
# upper triangle is kept (every other cell becomes NaN) so each pair is visited once.
corr_matrix = X_train_scaled[rfecv_features].corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# RFECV assigns rank 1 to every selected feature, so ranking_ cannot decide which
# member of a correlated pair to keep (the comparison would always be a tie).
# Use the coefficient magnitude of the estimator that RFECV refit on the selected
# features instead; its columns are in the same order as rfecv_features.
rfecv_importance = pd.Series(
    np.abs(rfecv.estimator_.coef_).ravel(), index=rfecv_features
)

# Find and drop highly correlated features
to_drop = set()
for col1 in upper_tri.columns:
    for col2 in upper_tri.index:
        # A pair that involves an already-dropped feature needs no further action;
        # without this guard a feature could be removed only because it correlates
        # with something that is no longer in the model.
        if col1 in to_drop or col2 in to_drop:
            continue
        # NaN cells (diagonal / lower triangle) compare False and are skipped.
        if upper_tri.loc[col2, col1] > 0.8:
            # Drop the less important member of the pair.
            if rfecv_importance[col1] < rfecv_importance[col2]:
                to_drop.add(col1)
            else:
                to_drop.add(col2)

# Final selected features (original RFECV column order is preserved)
final_features = [f for f in rfecv_features if f not in to_drop]

# Filter dataset
X_train_selected = X_train_scaled[final_features]
X_test_selected = X_test_scaled[final_features]

print(f"Final Selected Features after RFECV + Corr Filter: {X_train_selected.shape[1]}")
print('\n', final_features)

Final Selected Features after RFECV + Corr Filter: 16

 ['PhoneService', 'TotalCharges', 'total_services', 'has_bundle', 'low_tenure_high_charge', 'MultipleLines_Yes', 'InternetService_Fiber optic', 'OnlineSecurity_Yes', 'OnlineBackup_Yes', 'DeviceProtection_Yes', 'TechSupport_Yes', 'StreamingTV_Yes', 'StreamingMovies_No internet service', 'StreamingMovies_Yes', 'Contract_One year', 'Contract_Two year']


## Grid Search for Best Logistic Regression Parameters

We create a pipeline that applies:
- SMOTE (to handle class imbalance)
- MinMaxScaler (to scale features)
- Logistic Regression

Then we perform GridSearchCV using StratifiedKFold and score on **F1** to find the best model.

In [62]:
# Standalone estimator instance (the pipeline below builds its own classifier step).
log_reg = LogisticRegression(random_state=42)

# Search space for the pipeline's 'classifier' step: regularization strength,
# solver, and iteration cap.
# The penalty type is not searched (L1 only works with liblinear & saga).
param_grid = dict(
    classifier__C=[0.001, 0.01, 0.1, 1, 10, 100],
    classifier__solver=['liblinear', 'lbfgs', 'saga', 'newton-cg'],
    classifier__max_iter=[100, 500, 1000, 2000],
)

# Shuffled, stratified 5-fold CV so every fold keeps the class balance of y_train.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# SMOTE and scaling are refit inside every CV fold, so neither sees the fold's
# validation rows. Feature selection is NOT part of this pipeline: the columns
# in final_features were chosen beforehand.
pipeline_steps = [
    ('smote', SMOTE(sampling_strategy=0.6, random_state=42)),
    ('scaler', MinMaxScaler()),
    ('classifier', LogisticRegression(random_state=42)),
]
pipeline = Pipeline(pipeline_steps)

# Fit on the original (un-resampled, un-scaled) training rows restricted to the
# pre-selected features; F1 is the selection metric.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=kf,
    n_jobs=-1,
)
grid_search.fit(X_train[final_features], y_train)

# Best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)



Best Parameters: {'classifier__C': 0.1, 'classifier__max_iter': 100, 'classifier__solver': 'lbfgs'}


## Final Model Training with Best Parameters

- We retrain the logistic regression model with the best hyperparameters (plus `class_weight='balanced'`) on the full SMOTE-resampled and scaled training data.

In [65]:
# Strip the pipeline step prefix so the tuned values can be passed straight to
# LogisticRegression.
step_prefix = 'classifier__'
best_params_cleaned = {}
for param_name, param_value in best_params.items():
    best_params_cleaned[param_name.replace(step_prefix, '')] = param_value
print(best_params_cleaned)

# Added after printing, so the printed dict shows only the tuned parameters.
best_params_cleaned['class_weight'] = 'balanced'


# Fit the final classifier on the resampled, scaled, feature-filtered training data.
final_model = LogisticRegression(random_state=42, **best_params_cleaned)
final_model.fit(X_train_selected, y_train_resampled)

 

{'C': 0.1, 'max_iter': 100, 'solver': 'lbfgs'}


## Model Evaluation
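- We compute precision, recall, and F1 score across different probability thresholds and select the threshold that maximizes F1.
Final performance metrics are reported using the optimized threshold. Note that the threshold is chosen on the test set itself, so these metrics are optimistic estimates.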

In [67]:

# Predicted probability of the positive class (column 1) for every test row.
y_test_pred_proba = final_model.predict_proba(X_test_selected)[:, 1]

# NOTE(review): the decision threshold below is tuned on the same test set that the
# metrics are reported on, so the reported scores are optimistic.
precision, recall, thresholds = precision_recall_curve(y_test, y_test_pred_proba)

# F1 per operating point. Where precision + recall == 0 the ratio is 0/0; define F1
# as 0 there instead of NaN, because np.argmax would otherwise select the NaN entry.
pr_sum = precision + recall
f1_scores = np.divide(
    2 * (precision * recall),
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# precision/recall carry one trailing point (precision=1, recall=0) that has no
# matching threshold, so only the first len(thresholds) F1 values are candidates.
best_threshold = thresholds[np.argmax(f1_scores[:-1])]

print(f"\nBest threshold: {best_threshold:.4f}")

# Hard labels at the selected threshold.
y_test_pred_labels = (y_test_pred_proba >= best_threshold).astype(int)

# AUC is threshold-independent; accuracy and the report use the thresholded labels.
auc = roc_auc_score(y_test, y_test_pred_proba)
accuracy = accuracy_score(y_test, y_test_pred_labels)
report = classification_report(y_test, y_test_pred_labels)

print(f"Accuracy: {accuracy:.4f}")
print(f"Final Test Set AUC: {auc:.4f}")
print("Classification Report:\n", report)


Best threshold: 0.5413
Accuracy: 0.7719
Final Test Set AUC: 0.8318
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.79      0.84      1033
           1       0.55      0.72      0.63       374

    accuracy                           0.77      1407
   macro avg       0.72      0.76      0.73      1407
weighted avg       0.80      0.77      0.78      1407



## Save Final Logistic Regression Model

- Save the trained model for deployment using `joblib`.

In [69]:
# The classifier expects inputs that were already scaled/transformed and restricted
# to final_features, and its labels are cut at best_threshold. Persist those fitted
# objects next to the model so the same preprocessing can be reproduced at inference.
preprocessing_artifacts = {
    "final_features": final_features,   # column order the model was trained on
    "scaler": scaler,                   # StandardScaler for tenure / MonthlyCharges
    "power_transformer": pt,            # Yeo-Johnson transform for TotalCharges
    "scaler_tc": scaler_tc,             # StandardScaler applied after the power transform
    "best_threshold": float(best_threshold),
}
joblib.dump(
    preprocessing_artifacts,
    "D:/Sajid/Chrun_Prediction/deployment/final_logreg_preprocessing.pkl",
)

# Model artifact: same object and path as before. Kept as the last expression so the
# cell still displays the written model path.
joblib.dump(final_model, "D:/Sajid/Chrun_Prediction/deployment/final_logreg_model.pkl")

['D:/Sajid/Chrun_Prediction/deployment/final_logreg_model.pkl']