In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv(r"C:\Users\PAVAN TEJA\Downloads\projects\ecommerce-churn-prediction\data\processed\model_ready_dataset.csv")

print("Dataset Shape:", df.shape)
df.head()


Dataset Shape: (3227, 37)


Unnamed: 0,churn,Recency,Frequency,Monetary,AvgOrderValue,CustomerLifetimeDays,AvgDaysBetweenPurchases,PurchaseRatePerMonth,WeekendPurchaseRatio,EveningPurchaseRatio,...,Recency_Frequency,Monetary_Frequency,Recency_Monetary,Log_Monetary,Log_Frequency,Log_Recency,High_Recency_Flag,Low_Frequency_Flag,Low_Monetary_Flag,Recency_Bucket
0,1,73,11,372.86,33.896364,196,19.6,1.460177,0.0,0.0,...,803,4101.46,27218.78,5.923881,2.484907,4.304065,0,0,1,2
1,0,133,1,794.52,794.52,0,0.0,1.0,0.0,0.0,...,133,794.52,105671.16,6.678996,0.693147,4.89784,1,1,0,3
2,1,111,1,437.11,437.11,0,0.0,1.0,0.0,0.0,...,111,437.11,48519.21,6.08247,0.693147,4.718499,1,1,0,3
3,0,94,2,382.13,191.065,181,181.0,0.28436,0.0,0.0,...,188,764.26,35920.22,5.948374,1.098612,4.553877,1,0,1,3
4,0,79,5,1236.48,247.296,198,49.5,0.657895,0.362319,0.0,...,395,6182.4,97681.92,7.120832,1.791759,4.382027,1,0,0,2


In [3]:
X = df.drop(columns=["churn"])
y = df["churn"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=7,
    stratify=y
)

print("Train Shape:", X_train.shape)
print("Test Shape:", X_test.shape)


Train Shape: (2581, 36)
Test Shape: (646, 36)


In [4]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [5]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    
    "Random Forest": RandomForestClassifier(
        n_estimators=200,
        random_state=42
    ),
    
    "XGBoost": XGBClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=3,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42
    ),
    
    "Neural Network (MLP)": MLPClassifier(
        hidden_layer_sizes=(64,32),
        max_iter=500,
        random_state=42
    )
}


In [6]:
results = []

for name, model in models.items():
    
    # Use scaled data for Logistic & MLP
    if name in ["Logistic Regression", "Neural Network (MLP)"]:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_prob = model.predict_proba(X_test_scaled)[:, 1]
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)
    
    results.append([
        name,
        accuracy,
        precision,
        recall,
        f1,
        roc_auc
    ])

baseline_results = pd.DataFrame(
    results,
    columns=["Model", "Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC"]
)

baseline_results.sort_values(by="ROC-AUC", ascending=False)


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC-AUC
0,Logistic Regression,0.653251,0.581395,0.641026,0.609756,0.726944
3,XGBoost,0.653251,0.587189,0.604396,0.595668,0.716112
2,Random Forest,0.651703,0.591603,0.567766,0.579439,0.712695
4,Neural Network (MLP),0.620743,0.551471,0.549451,0.550459,0.670241
1,Decision Tree,0.625387,0.551495,0.608059,0.578397,0.623064


In [7]:
baseline_results.sort_values(by="ROC-AUC", ascending=False).reset_index(drop=True)


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC-AUC
0,Logistic Regression,0.653251,0.581395,0.641026,0.609756,0.726944
1,XGBoost,0.653251,0.587189,0.604396,0.595668,0.716112
2,Random Forest,0.651703,0.591603,0.567766,0.579439,0.712695
3,Neural Network (MLP),0.620743,0.551471,0.549451,0.550459,0.670241
4,Decision Tree,0.625387,0.551495,0.608059,0.578397,0.623064


In [8]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV


In [9]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

log_param_grid = {
    "C": [0.01, 0.1, 1, 5, 10],
    "class_weight": [None, "balanced"]
}

log_grid = GridSearchCV(
    LogisticRegression(max_iter=3000),
    log_param_grid,
    cv=cv,
    scoring="roc_auc",
    n_jobs=-1
)

log_grid.fit(X_train_scaled, y_train)

print("Best Logistic Params:", log_grid.best_params_)
print("Best Logistic CV ROC-AUC:", log_grid.best_score_)


Best Logistic Params: {'C': 0.01, 'class_weight': 'balanced'}
Best Logistic CV ROC-AUC: 0.7314699949509703


In [10]:
rf_param_grid = {
    "n_estimators": [300, 500],
    "max_depth": [5, 8, 12],
    "min_samples_split": [2, 5],
    "class_weight": [None, "balanced"]
}

rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    cv=cv,
    scoring="roc_auc",
    n_jobs=-1
)

rf_grid.fit(X_train, y_train)

print("Best RF Params:", rf_grid.best_params_)
print("Best RF CV ROC-AUC:", rf_grid.best_score_)


Best RF Params: {'class_weight': 'balanced', 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 500}
Best RF CV ROC-AUC: 0.7265051837565479


In [11]:
xgb_param_grid = {
    "n_estimators": [300, 500],
    "learning_rate": [0.03, 0.05, 0.1],
    "max_depth": [2, 3, 4],
    "subsample": [0.8],
    "colsample_bytree": [0.8]
}

xgb_grid = GridSearchCV(
    XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42
    ),
    xgb_param_grid,
    cv=cv,
    scoring="roc_auc",
    n_jobs=-1
)

xgb_grid.fit(X_train, y_train)

print("Best XGB Params:", xgb_grid.best_params_)
print("Best XGB CV ROC-AUC:", xgb_grid.best_score_)


Best XGB Params: {'colsample_bytree': 0.8, 'learning_rate': 0.03, 'max_depth': 2, 'n_estimators': 300, 'subsample': 0.8}
Best XGB CV ROC-AUC: 0.7201485872622031


In [12]:
best_log = log_grid.best_estimator_

best_log.fit(X_train_scaled, y_train)

log_prob = best_log.predict_proba(X_test_scaled)[:,1]
log_pred = best_log.predict(X_test_scaled)

print("Logistic Test ROC-AUC:", roc_auc_score(y_test, log_prob))
print("Precision:", precision_score(y_test, log_pred))
print("Recall:", recall_score(y_test, log_pred))
print("F1:", f1_score(y_test, log_pred))


Logistic Test ROC-AUC: 0.7302733013188778
Precision: 0.5694822888283378
Recall: 0.7655677655677655
F1: 0.653125


In [13]:
best_probs = y_prob  # from best model

for t in [0.35, 0.4, 0.45, 0.5]:
    preds = (best_probs >= t).astype(int)
    print("\nThreshold:", t)
    print("Precision:", precision_score(y_test, preds))
    print("Recall:", recall_score(y_test, preds))



Threshold: 0.35
Precision: 0.5507692307692308
Recall: 0.6556776556776557

Threshold: 0.4
Precision: 0.5540983606557377
Recall: 0.6190476190476191

Threshold: 0.45
Precision: 0.5567010309278351
Recall: 0.5934065934065934

Threshold: 0.5
Precision: 0.5514705882352942
Recall: 0.5494505494505495


In [14]:
best_rf = rf_grid.best_estimator_

best_rf.fit(X_train, y_train)

rf_prob = best_rf.predict_proba(X_test)[:,1]
rf_pred = best_rf.predict(X_test)

print("RF Test ROC-AUC:", roc_auc_score(y_test, rf_prob))
print("Precision:", precision_score(y_test, rf_pred))
print("Recall:", recall_score(y_test, rf_pred))
print("F1:", f1_score(y_test, rf_pred))


RF Test ROC-AUC: 0.7321097133429573
Precision: 0.5702479338842975
Recall: 0.7582417582417582
F1: 0.6509433962264151


In [15]:
best_xgb = xgb_grid.best_estimator_

best_xgb.fit(X_train, y_train)

xgb_prob = best_xgb.predict_proba(X_test)[:,1]
xgb_pred = best_xgb.predict(X_test)

print("XGB Test ROC-AUC:", roc_auc_score(y_test, xgb_prob))
print("Precision:", precision_score(y_test, xgb_pred))
print("Recall:", recall_score(y_test, xgb_pred))
print("F1:", f1_score(y_test, xgb_pred))


XGB Test ROC-AUC: 0.7318985750621139
Precision: 0.6040268456375839
Recall: 0.6593406593406593
F1: 0.6304728546409807


In [16]:
from sklearn.preprocessing import PolynomialFeatures

# Interaction-only (no squared explosion)
poly = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False
)

X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

print("Original feature count:", X_train_scaled.shape[1])
print("New feature count:", X_train_poly.shape[1])


Original feature count: 36
New feature count: 666


In [17]:
elastic_log = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.5,
    C=0.1,
    max_iter=5000,
    class_weight='balanced'
)

elastic_log.fit(X_train_poly, y_train)

y_prob = elastic_log.predict_proba(X_test_poly)[:,1]
y_pred = elastic_log.predict(X_test_poly)

print("ElasticNet Logistic ROC-AUC:", roc_auc_score(y_test, y_prob))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))


ElasticNet Logistic ROC-AUC: 0.7386991917823017
Precision: 0.5769230769230769
Recall: 0.8241758241758241


In [18]:
xgb_strong = XGBClassifier(
    n_estimators=800,
    learning_rate=0.03,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight = len(y_train[y_train==0]) / len(y_train[y_train==1]),
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

xgb_strong.fit(X_train, y_train)

y_prob = xgb_strong.predict_proba(X_test)[:,1]
y_pred = xgb_strong.predict(X_test)

print("Strong XGB ROC-AUC:", roc_auc_score(y_test, y_prob))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))


Strong XGB ROC-AUC: 0.7110548075695529
Precision: 0.5791139240506329
Recall: 0.6703296703296703


In [19]:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

best_log = LogisticRegression(
    C=0.01,
    class_weight='balanced',
    max_iter=3000
)

rskf = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=5,
    random_state=42
)

cv_scores = cross_val_score(
    best_log,
    X_train_scaled,
    y_train,
    scoring='roc_auc',
    cv=rskf,
    n_jobs=-1
)

print("Repeated CV ROC-AUC Mean:", cv_scores.mean())
print("Repeated CV ROC-AUC Std:", cv_scores.std())


Repeated CV ROC-AUC Mean: 0.7309056823944394
Repeated CV ROC-AUC Std: 0.018298586991345577


In [20]:
# Final chosen model (baseline logistic)
final_model = LogisticRegression(
    max_iter=3000,
    random_state=42
)

# Train using scaled data
final_model.fit(X_train_scaled, y_train)

# Evaluate once more for confirmation
final_prob = final_model.predict_proba(X_test_scaled)[:,1]

print("Final Model Test ROC-AUC:",
      roc_auc_score(y_test, final_prob))


Final Model Test ROC-AUC: 0.7269441907511612


In [26]:
import joblib
import os

# Create directory if not exists
os.makedirs("../models", exist_ok=True)

# Save trained model
joblib.dump(final_model, r"C:\Users\PAVAN TEJA\Downloads\projects\ecommerce-churn-prediction\models\final_churn_model.pkl")

# Save scaler
joblib.dump(scaler, r"C:\Users\PAVAN TEJA\Downloads\projects\ecommerce-churn-prediction\models\scaler.pkl")

print("Model and scaler saved successfully.")


Model and scaler saved successfully.


In [28]:
feature_columns = X.columns.tolist()

joblib.dump(feature_columns, r"C:\Users\PAVAN TEJA\Downloads\projects\ecommerce-churn-prediction\models\feature_columns.pkl")

print("Feature list saved.")


Feature list saved.
