# Churn Modeling (Milestone 4)
This notebook develops predictive models to analyze churn using Logistic Regression and Random Forest.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import sklearn
# Show the installed scikit-learn version; the preprocessing cell further down
# branches on sklearn.__version__ when constructing the OneHotEncoder.
print('scikit-learn version:', sklearn.__version__)

In [None]:
# Read the finalized churn table from disk and preview its first rows.
DATASET_PATH = 'finalized_churn_dataset.csv'
df = pd.read_csv(DATASET_PATH)
df.head()

In [None]:
# Features and target: every column except 'churn' is a predictor.
X = df.drop('churn', axis=1)
y = df['churn']

# Partition the feature columns by dtype family so that every column lands in
# exactly one group. Matching only int64/float64 and object would leave out
# int32/float32/bool/category/string columns, and the ColumnTransformer built
# later in this notebook uses remainder='drop', so such columns would silently
# vanish from the model inputs.
# NOTE(review): every non-numeric column is now treated as categorical; this
# assumes the finalized dataset has no datetime or free-text columns -- confirm.
numeric_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(exclude='number').columns

In [None]:
# Numeric branch: standardize each numeric feature.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# The dense-output flag of OneHotEncoder is spelled `sparse_output` from
# scikit-learn 1.2 onward and `sparse` before that. Compare (major, minor) as
# integers: a plain string comparison is lexicographic, so a version such as
# '1.10' would sort before '1.2' and wrongly pick the old keyword.
sklearn_major_minor = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
if sklearn_major_minor >= (1, 2):
    cat_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
else:
    cat_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))])

# Route each column group to its transformer; columns in neither group are
# dropped (remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', cat_transformer, cat_cols)
    ],
    remainder='drop'
)
print('Preprocessor ready')

In [None]:
# Hold out 30% of the rows for testing, stratified on the target, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Logistic Regression with SMOTE
# Pipeline stages, in order: preprocessing -> SMOTE oversampling -> classifier.
lr_steps = [
    ('preproc', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('clf', LogisticRegression(max_iter=1000)),
]
pipe_lr = ImbPipeline(steps=lr_steps)
pipe_lr.fit(X_train, y_train)

# Per-class precision/recall/F1 on the held-out split.
y_pred_lr = pipe_lr.predict(X_test)
print(classification_report(y_test, y_pred_lr))

# ROC AUC is scored on column 1 of predict_proba.
lr_test_scores = pipe_lr.predict_proba(X_test)[:, 1]
print('ROC AUC:', roc_auc_score(y_test, lr_test_scores))

In [None]:
# Random Forest with SMOTE + Hyperparameter tuning
# Same stage layout as the logistic-regression pipeline, with a random forest
# as the final estimator.
rf_steps = [
    ('preproc', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('clf', RandomForestClassifier(random_state=42)),
]
pipe_rf = ImbPipeline(steps=rf_steps)

# Candidate settings for the forest; the 'clf__' prefix targets the 'clf' step.
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [5, 10],
}

# 3-fold grid search over param_grid, scored by ROC AUC.
grid_rf = GridSearchCV(
    pipe_rf,
    param_grid,
    cv=3,
    scoring='roc_auc',
)
grid_rf.fit(X_train, y_train)

# Evaluate the fitted search object on the held-out split.
y_pred_rf = grid_rf.predict(X_test)
print('Best Params:', grid_rf.best_params_)
print(classification_report(y_test, y_pred_rf))

rf_test_scores = grid_rf.predict_proba(X_test)[:, 1]
print('ROC AUC:', roc_auc_score(y_test, rf_test_scores))

In [None]:
# ROC Curve comparison
# Scores for each model: column 1 of predict_proba on the test split.
y_pred_prob_lr = pipe_lr.predict_proba(X_test)[:, 1]
y_pred_prob_rf = grid_rf.predict_proba(X_test)[:, 1]

# roc_curve returns (fpr, tpr, thresholds); the thresholds are not used here.
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_prob_lr)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_prob_rf)

plt.figure(figsize=(6, 4))
# Draw the model curves in a fixed order so the legend order stays the same.
for fpr, tpr, model_name in (
    (fpr_lr, tpr_lr, 'Logistic Regression'),
    (fpr_rf, tpr_rf, 'Random Forest'),
):
    plt.plot(fpr, tpr, label=model_name)
# Dashed diagonal as a visual reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()