In [1]:
# Import necessary libraries
import kagglehub
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Download latest version
path = kagglehub.dataset_download("shriyashjagtap/e-commerce-customer-for-behavior-analysis")
print("Path to dataset files:", path)

df = pd.read_csv(path + "/ecommerce_customer_data_custom_ratios.csv")
print(df.head())

Downloading from https://www.kaggle.com/api/v1/datasets/download/shriyashjagtap/e-commerce-customer-for-behavior-analysis?dataset_version_number=4...


100%|██████████| 9.94M/9.94M [00:00<00:00, 26.3MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/shriyashjagtap/e-commerce-customer-for-behavior-analysis/versions/4
   Customer ID        Purchase Date Product Category  Product Price  Quantity  \
0        46251  2020-09-08 09:38:32      Electronics             12         3   
1        46251  2022-03-05 12:56:35             Home            468         4   
2        46251  2022-05-23 18:18:01             Home            288         2   
3        46251  2020-11-12 13:13:29         Clothing            196         1   
4        13593  2020-11-27 17:55:11             Home            449         1   

   Total Purchase Amount Payment Method  Customer Age  Returns  \
0                    740    Credit Card            37      0.0   
1                   2739         PayPal            37      0.0   
2                   3196         PayPal            37      0.0   
3                   3509         PayPal            37      0.0   
4                   3452    Credit Card            49      0

In [3]:
# Drop unnecessary columns
features_to_remove = ["Customer ID", "Purchase Date", "Product Price", "Total Purchase Amount", "Customer Name", "Age"]
df = df.drop(columns=features_to_remove, errors="ignore")

In [4]:
# Encode categorical variables
le = LabelEncoder()
df["Gender"] = le.fit_transform(df["Gender"])

oheProduct = OneHotEncoder(sparse_output=False)
ohePayment = OneHotEncoder(sparse_output=False)

productCategoryEncoded = oheProduct.fit_transform(df[["Product Category"]])
paymentEncoded = ohePayment.fit_transform(df[["Payment Method"]])

productCatdf = pd.DataFrame(productCategoryEncoded, columns=oheProduct.get_feature_names_out(["Product Category"]))
paymentdf = pd.DataFrame(paymentEncoded, columns=ohePayment.get_feature_names_out(["Payment Method"]))

df = df.drop(columns=["Product Category", "Payment Method"])
df = pd.concat([df, productCatdf, paymentdf], axis=1)

In [5]:
# Handle missing values
df["Returns"] = df["Returns"].fillna(0)

In [8]:
# Prepare dataset for XGBoost
X = df.drop(columns=["Churn"])
y = df["Churn"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Train XGBoost model
model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42, use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

Parameters: { "use_label_encoder" } are not used.



In [10]:
# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 0.8003
Classification Report:
               precision    recall  f1-score   support

           0       0.80      1.00      0.89     40016
           1       0.00      0.00      0.00      9984

    accuracy                           0.80     50000
   macro avg       0.40      0.50      0.44     50000
weighted avg       0.64      0.80      0.71     50000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
import joblib
from google.colab import files

joblib.dump(model, "xgboost_model.pkl")
files.download("xgboost_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>