In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import joblib

In [2]:
# Read the raw customer table; the path is relative to the kernel's working directory.
data = pd.read_csv(filepath_or_buffer='E Commerce Dataset.csv')
# A bare trailing expression makes Jupyter render the frame as the cell output.
data

Unnamed: 0,CustomerID,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,HourSpendOnApp,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount
0,50001,1,4.0,Mobile Phone,3,6.0,Debit Card,Female,3.0,3,Laptop & Accessory,2,Single,9,1,11.0,1.0,1.0,5.0,160
1,50002,1,,Phone,1,8.0,UPI,Male,3.0,4,Mobile,3,Single,7,1,15.0,0.0,1.0,0.0,121
2,50003,1,,Phone,1,30.0,Debit Card,Male,2.0,4,Mobile,3,Single,6,1,14.0,0.0,1.0,3.0,120
3,50004,1,0.0,Phone,3,15.0,Debit Card,Male,2.0,4,Laptop & Accessory,5,Single,8,0,23.0,0.0,1.0,3.0,134
4,50005,1,0.0,Phone,1,12.0,CC,Male,,3,Mobile,5,Single,3,0,11.0,1.0,1.0,3.0,130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5625,55626,0,10.0,Computer,1,30.0,Credit Card,Male,3.0,2,Laptop & Accessory,1,Married,6,0,18.0,1.0,2.0,4.0,151
5626,55627,0,13.0,Mobile Phone,1,13.0,Credit Card,Male,3.0,5,Fashion,5,Married,6,0,16.0,1.0,2.0,,225
5627,55628,0,1.0,Mobile Phone,1,11.0,Debit Card,Male,3.0,2,Laptop & Accessory,4,Married,3,1,21.0,1.0,2.0,4.0,186
5628,55629,0,23.0,Computer,3,9.0,Credit Card,Male,4.0,5,Laptop & Accessory,4,Married,4,0,15.0,2.0,2.0,9.0,179


In [3]:
# Summary statistics for every column; include='all' adds the non-numeric columns
# (unique / top / freq) alongside the numeric summaries. The count row also shows
# which columns contain missing values (count below the total number of rows).
data.describe(include='all')

Unnamed: 0,CustomerID,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,HourSpendOnApp,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount
count,5630.0,5630.0,5366.0,5630,5630.0,5379.0,5630,5630,5375.0,5630.0,5630,5630.0,5630,5630.0,5630.0,5365.0,5374.0,5372.0,5323.0,5630.0
unique,,,,3,,,7,2,,,6,,3,,,,,,,
top,,,,Mobile Phone,,,Debit Card,Male,,,Laptop & Accessory,,Married,,,,,,,
freq,,,,2765,,,2314,3384,,,2050,,2986,,,,,,,
mean,52815.5,0.168384,10.189899,,1.654707,15.639896,,,2.931535,3.688988,,3.066785,,4.214032,0.284902,15.707922,1.751023,3.008004,4.543491,177.221492
std,1625.385339,0.37424,8.557241,,0.915389,8.531475,,,0.721926,1.023999,,1.380194,,2.583586,0.451408,3.675485,1.894621,2.93968,3.654433,49.193869
min,50001.0,0.0,0.0,,1.0,5.0,,,0.0,1.0,,1.0,,1.0,0.0,11.0,0.0,1.0,0.0,0.0
25%,51408.25,0.0,2.0,,1.0,9.0,,,2.0,3.0,,2.0,,2.0,0.0,13.0,1.0,1.0,2.0,146.0
50%,52815.5,0.0,9.0,,1.0,14.0,,,3.0,4.0,,3.0,,3.0,0.0,15.0,1.0,2.0,3.0,163.0
75%,54222.75,0.0,16.0,,3.0,20.0,,,3.0,4.0,,4.0,,6.0,1.0,18.0,2.0,3.0,7.0,196.0


### Extracting the numerical and Categorical columns

#### Dropping 'CustomerID' from the dataset, as it's an identifier rather than a feature

In [4]:
# CustomerID is a row identifier, not a predictor, so remove it before modelling.
# errors='ignore' keeps this cell idempotent: because `data` is reassigned, a
# second run would otherwise raise KeyError on the already-removed column.
data = data.drop(columns="CustomerID", errors="ignore")

In [5]:
# Separate the feature matrix from the prediction target (the 'Churn' column).
X = data.drop(columns=['Churn'])
y = data['Churn']

### Identify categorical and numerical features

In [6]:
# Partition the columns by dtype. The generic 'number' selector (instead of an
# explicit int64/float64 list) keeps numeric columns of any width, and taking
# the complement for the categorical set guarantees that every column lands in
# exactly one of the two lists, so none is left out of the preprocessing.
numerical_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(exclude='number').columns

## Data Splitting (Training and Test set)

In [7]:
# Hold out 20% of the rows for the final evaluation. Stratifying on y keeps the
# churn / non-churn ratio the same in both partitions, and the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Creating preprocessing pipelines for numerical and categorical data

In [8]:
# Numeric branch: fill missing values from the 5 nearest neighbours, then standardise.
numerical_transformer = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler()),
])

In [9]:
# Categorical branch: one-hot encode. With handle_unknown='ignore', a category
# not seen during fit is encoded as all zeros instead of raising an error.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

### Creating a preprocessor using ColumnTransformer

In [10]:
# Route each column group through its own branch and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

### Creating the full machine learning pipeline with SMOTE and the model

In [11]:
# End-to-end estimator: preprocess -> oversample the minority class -> boost.
# `Pipeline` is the imblearn one (see the imports cell), which is what allows a
# sampler such as SMOTE to appear as a pipeline step.
# NOTE(review): SMOTE runs after one-hot encoding, so synthetic rows may carry
# fractional dummy values — confirm this is acceptable or evaluate SMOTENC.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', GradientBoostingClassifier(random_state=42)),
])

### Defining the Hyperparameter Grid

In [16]:
# Search space for the boosting step. The "classifier__" prefix routes each
# setting to the pipeline step named 'classifier'.
param_grid = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__learning_rate=[0.01, 0.1, 0.2],
    classifier__max_depth=[3, 5, 7],
)

### Performing GridSearchCV

In [17]:
# Exhaustive search over param_grid, scoring every candidate by F1 under
# 5-fold cross-validation; n_jobs=-1 spreads the fits over all available CPU cores.
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)

# One pipeline is trained per parameter combination per fold, so this cell is slow.
# Only the training split is passed in; the test split stays untouched.
grid_search.fit(X_train, y_train)

### Analyzing the results

In [18]:
# Report the winning parameter combination and its cross-validated F1 score.
# NOTE(review): in the recorded run every selected value is the largest one
# offered in param_grid, so the optimum may lie outside the searched range —
# consider widening the grid.
print("Best Parameters found: ", grid_search.best_params_)
print("Best F1 Score: ", grid_search.best_score_)

Best Parameters found:  {'classifier__learning_rate': 0.2, 'classifier__max_depth': 7, 'classifier__n_estimators': 300}
Best F1 Score:  0.8921820874480011


### Saving the new optimized model

In [19]:
# Take the best pipeline found by the search and persist it to disk.
best_model = grid_search.best_estimator_
joblib.dump(value=best_model, filename='gradient_boosting_model_pipeline.joblib')

['gradient_boosting_model_pipeline.joblib']

In [20]:
# Confirmation message. It prints unconditionally and does not itself verify the file.
print("\nNew optimized model saved successfully as 'gradient_boosting_model_pipeline.joblib'")


New optimized model saved successfully as 'gradient_boosting_model_pipeline.joblib'


### Load the newly saved optimized model

In [21]:
# Reload the pipeline from disk so the evaluation below exercises the saved artifact.
# NOTE(review): joblib files are pickle-based — only load files you created or trust.
optimized_model = joblib.load('gradient_boosting_model_pipeline.joblib')

In [22]:
# Predict churn labels for the held-out test rows with the reloaded pipeline.
y_pred = optimized_model.predict(X_test)

### Evaluating the optimized model

In [23]:
# Per-class precision / recall / F1 on the test split, printed under a banner line.
print(
    "--- Classification Report ---",
    classification_report(y_test, y_pred),
    sep="\n",
)

--- Classification Report ---
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       936
           1       0.99      0.96      0.98       190

    accuracy                           0.99      1126
   macro avg       0.99      0.98      0.99      1126
weighted avg       0.99      0.99      0.99      1126



In [24]:
# Confusion matrix on the test split: rows are true labels, columns are predictions.
print(
    "\n--- Confusion Matrix ---",
    confusion_matrix(y_test, y_pred),
    sep="\n",
)


--- Confusion Matrix ---
[[934   2]
 [  7 183]]


### Getting the feature importances from the trained model

In [25]:
print("\n--- Feature Importances ---")
# Read the feature names from the fitted preprocessor inside the loaded pipeline
# rather than rebuilding them from notebook globals and an assumed transformer
# order: the names then follow the column order the classifier was trained on.
# Needs a scikit-learn release in which every preprocessing step implements
# get_feature_names_out.
fitted_preprocessor = optimized_model.named_steps['preprocessor']
# ColumnTransformer prefixes each name with "<transformer>__" (e.g. "num__Tenure");
# strip that prefix so the table shows the plain column names.
feature_names = [
    name.split('__', 1)[-1]
    for name in fitted_preprocessor.get_feature_names_out()
]

importances = optimized_model.named_steps['classifier'].feature_importances_

# Guard against a silent mismatch between the labels and the importance values.
assert len(feature_names) == len(importances), (
    f"{len(feature_names)} feature names but {len(importances)} importances"
)

feature_importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values('importance', ascending=False)
)
print(feature_importance_df.to_markdown(index=False))


--- Feature Importances ---
| feature                               |   importance |
|:--------------------------------------|-------------:|
| Tenure                                |  0.397432    |
| NumberOfAddress                       |  0.0700677   |
| Complain                              |  0.0614973   |
| SatisfactionScore                     |  0.0567269   |
| NumberOfDeviceRegistered              |  0.0438063   |
| WarehouseToHome                       |  0.0387102   |
| DaySinceLastOrder                     |  0.0373263   |
| CashbackAmount                        |  0.0296467   |
| PreferedOrderCat_Laptop & Accessory   |  0.0284426   |
| PreferedOrderCat_Fashion              |  0.0274684   |
| CityTier                              |  0.0256181   |
| OrderAmountHikeFromlastYear           |  0.0236015   |
| MaritalStatus_Single                  |  0.0222083   |
| MaritalStatus_Married                 |  0.0188132   |
| CouponUsed                            |  0.0161946   |
| 