In [None]:
import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN
from sklearn.model_selection import RandomizedSearchCV, train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Step 1: Load and Preprocess Data
df = pd.read_csv('/kaggle/input/e-commerce-customer-churn/data_ecommerce_customer_churn.csv')

# Encode categorical features. Each column gets its own encoder so that every
# fitted mapping stays available in `label_encoders`; `label_encoder` is left
# bound to the last encoder fitted (MaritalStatus).
label_encoders = {}
for column in ('PreferedOrderCat', 'MaritalStatus'):
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

# Features and target
X = df.drop(['Churn'], axis=1)
y = df['Churn']

# Step 2: Train-Test Split
# Split BEFORE imputing, scaling, resampling and feature selection. Fitting any
# of those on the full dataset lets information from the held-out rows leak
# into training and makes the reported test metrics optimistic.
# Stratify so both partitions keep the original class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Seed NumPy's global RNG as well, for anything that does not take random_state.
np.random.seed(42)

# Handle missing values with means computed on the training rows only.
# NOTE: `df` itself is no longer imputed or scaled in place; only the
# train/test partitions are transformed.
train_means = X_train.select_dtypes(include=[np.number]).mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Feature scaling: fit on the training rows, reuse the fitted scaler for test.
scaler = StandardScaler()
features_to_scale = ['Tenure', 'WarehouseToHome', 'NumberOfDeviceRegistered', 'SatisfactionScore',
                     'NumberOfAddress', 'DaySinceLastOrder', 'CashbackAmount']
X_train[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

# Feature Engineering
# The interaction term is built from the already-scaled columns, as before.
X_train['Tenure_Satisfaction'] = X_train['Tenure'] * X_train['SatisfactionScore']
X_test['Tenure_Satisfaction'] = X_test['Tenure'] * X_test['SatisfactionScore']

# Step 3: Handle Class Imbalance
# Resample the training partition only; the test set must stay untouched so it
# reflects the real class distribution.
smote_enn = SMOTEENN(random_state=42)
X_res, y_res = smote_enn.fit_resample(X_train, y_train)

# Step 4: Feature Selection
# k='all' keeps every feature; the selector is still fitted on training data
# only and then applied unchanged to the test partition.
selector = SelectKBest(f_classif, k='all')
X_res_selected = selector.fit_transform(X_res, y_res)
X_selected_columns = X_res.columns[selector.get_support()]

# Final matrices consumed by the evaluation steps below.
X_train, y_train = X_res_selected, y_res
X_test = selector.transform(X_test)

# Step 5: Initialize Models
# Keys are used verbatim as labels in the printed reports and plot titles.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
}

# Step 6: Hyperparameter Tuning (optional)
def tune_model(model, param_grid, X_train, y_train):
    """Run an exhaustive 5-fold grid search over ``param_grid`` for ``model``.

    Returns a ``(estimator, params)`` pair. On success this is the best
    estimator found and its parameter dict. If anything raises during the
    search, the error is printed and ``(model, None)`` is returned so the
    caller can carry on with the untuned model.
    """
    try:
        search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            cv=5,
            n_jobs=-1,  # use every available core
            verbose=0,
        )
        search.fit(X_train, y_train)
        tuned = (search.best_estimator_, search.best_params_)
        return tuned
    except Exception as err:
        print(f"Tuning error: {err}")
        return model, None

# Step 7: Evaluate Models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and score it on both partitions.

    Returns a 5-tuple ``(train_accuracy, test_accuracy, report,
    training_time, y_pred)``: accuracies are percentages, ``report`` is the
    text classification report for the test set, ``training_time`` is the
    wall-clock fit duration in seconds and ``y_pred`` holds the test-set
    predictions. If any step raises, the error is printed and a tuple of five
    ``None`` values is returned instead.
    """
    try:
        # Time only the fit call, not prediction or scoring.
        fit_started = time.time()
        model.fit(X_train, y_train)
        training_time = time.time() - fit_started

        y_pred = model.predict(X_test)
        train_pred = model.predict(X_train)

        results = (
            accuracy_score(y_train, train_pred) * 100,
            accuracy_score(y_test, y_pred) * 100,
            classification_report(y_test, y_pred),
            training_time,
            y_pred,
        )
        return results
    except Exception as err:
        print(f"Evaluation error: {err}")
        return (None,) * 5

# Step 8: Train and Evaluate Models
# For every registered model: fit and score it, print the metrics, then show
# its ROC curve and confusion matrix on the test set.
for name, model in models.items():
    print(f"Evaluating {name}...")
    (train_accuracy, test_accuracy, report,
     training_time, y_pred) = evaluate_model(model, X_train, X_test, y_train, y_test)

    # evaluate_model returns None values when fitting/scoring failed;
    # there is nothing to report or plot in that case.
    if train_accuracy is None:
        continue

    print(
        f"{name} Training Accuracy: {train_accuracy:.2f}%",
        f"{name} Test Accuracy: {test_accuracy:.2f}%",
        f"Time: {training_time:.2f}s",
        report,
        sep="\n",
    )

    # ROC curve from the predicted probability of the second class (column 1).
    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(10, 6))
    plt.plot(fpr, tpr, lw=2, label=f'{name} ROC (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], linestyle='--')  # chance-level diagonal
    plt.title(f'ROC Curve - {name}')
    plt.legend()
    plt.show()

    # Confusion matrix of test-set predictions.
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=['No Churn', 'Churn'],
        yticklabels=['No Churn', 'Churn'],
    )
    plt.title(f'Confusion Matrix - {name}')
    plt.show()

# Step 9: Stacking Ensemble
# Combine three base classifiers under a gradient-boosting meta-learner, then
# report the same metrics and plots as for the individual models.
print("Evaluating Stacking Ensemble...")
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', xgb.XGBClassifier(random_state=42)),
    ('lr', LogisticRegression(random_state=42))
]
meta_learner = GradientBoostingClassifier(random_state=42)
# cv=5: the meta-learner is trained on 5-fold cross-validated predictions of
# the base learners; n_jobs=-1 fits the base learners in parallel.
stack_model = StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5, n_jobs=-1)

train_acc, test_acc, report, train_time, y_pred_stack = evaluate_model(stack_model, X_train, X_test, y_train, y_test)

# evaluate_model returns None values on failure; only report a successful run.
if train_acc is not None:
    print(f"Stacking Training Accuracy: {train_acc:.2f}%")
    print(f"Stacking Test Accuracy: {test_acc:.2f}%")
    print(f"Execution Time: {train_time:.2f} seconds")
    print("Classification Report:")
    print(report)

    # ROC curve from the predicted probability of the second class (column 1).
    fpr, tpr, _ = roc_curve(y_test, stack_model.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(10, 6))
    plt.plot(fpr, tpr, lw=2, label=f'Stacking ROC (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], linestyle='--')  # chance-level diagonal
    plt.title('ROC Curve - Stacking Ensemble')
    plt.legend()
    plt.show()

    # Confusion matrix, labelled the same way as the per-model heatmaps so the
    # plots can be compared directly.
    cm = confusion_matrix(y_test, y_pred_stack)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Purples',
                xticklabels=['No Churn', 'Churn'], yticklabels=['No Churn', 'Churn'])
    plt.title('Confusion Matrix - Stacking Ensemble')
    plt.show()