In [19]:
import kagglehub
import pandas as pd

# Fetch the latest published version of the Kaggle dataset; the call returns
# the local directory the files were downloaded to.
path = kagglehub.dataset_download(
    "shriyashjagtap/e-commerce-customer-for-behavior-analysis"
)
print("Path to dataset files:", path)

# Read the transactions CSV from that directory and preview the first rows.
csv_file = path + "/ecommerce_customer_data_custom_ratios.csv"
df = pd.read_csv(csv_file)
print(df.head())

Path to dataset files: /root/.cache/kagglehub/datasets/shriyashjagtap/e-commerce-customer-for-behavior-analysis/versions/4
   Customer ID        Purchase Date Product Category  Product Price  Quantity  \
0        46251  2020-09-08 09:38:32      Electronics             12         3   
1        46251  2022-03-05 12:56:35             Home            468         4   
2        46251  2022-05-23 18:18:01             Home            288         2   
3        46251  2020-11-12 13:13:29         Clothing            196         1   
4        13593  2020-11-27 17:55:11             Home            449         1   

   Total Purchase Amount Payment Method  Customer Age  Returns  \
0                    740    Credit Card            37      0.0   
1                   2739         PayPal            37      0.0   
2                   3196         PayPal            37      0.0   
3                   3509         PayPal            37      0.0   
4                   3452    Credit Card            49      0

In [20]:
# Columns that are not used as model inputs.
features_to_remove = [
    "Customer ID",
    "Purchase Date",
    "Product Price",
    "Total Purchase Amount",
    "Customer Name",
    "Age",
]
# errors="ignore" makes drop skip any listed name that is not in the frame,
# so this cell can be re-run without raising.
df = df.drop(features_to_remove, axis="columns", errors="ignore")




In [21]:
# Sanity check: print the first five rows of the current frame.
print(df.head(n=5))

  Product Category  Quantity Payment Method  Customer Age  Returns  Gender  \
0      Electronics         3    Credit Card            37      0.0    Male   
1             Home         4         PayPal            37      0.0    Male   
2             Home         2         PayPal            37      0.0    Male   
3         Clothing         1         PayPal            37      0.0    Male   
4             Home         1    Credit Card            49      0.0  Female   

   Churn  
0      0  
1      0  
2      0  
3      0  
4      1  


In [22]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Encode Gender as integer codes. LabelEncoder numbers classes in sorted
# order, which in the recorded run maps Female -> 0 and Male -> 1.
le = LabelEncoder()
df["Gender"] = le.fit_transform(df["Gender"])

# One encoder per nominal column, so each keeps its own fitted categories
# (dense output so the result can be wrapped in a DataFrame directly).
oheProduct = OneHotEncoder(sparse_output = False)
ohePayment = OneHotEncoder(sparse_output = False)

productCategoryEncoded = oheProduct.fit_transform(df[["Product Category"]])
paymentEncoded = ohePayment.fit_transform(df[["Payment Method"]])

# The encoded arrays are positional (row i corresponds to row i of df), but
# pd.concat(axis=1) aligns on index labels. Carry df's index over explicitly
# so the encoded frames still line up with df if its index is ever not the
# default 0..n-1 (e.g. after filtering rows); otherwise the concat would
# silently introduce NaN rows.
productCatdf = pd.DataFrame(
    productCategoryEncoded,
    index=df.index,
    columns=oheProduct.get_feature_names_out(["Product Category"]),
)
paymentdf = pd.DataFrame(
    paymentEncoded,
    index=df.index,
    columns=ohePayment.get_feature_names_out(["Payment Method"]),
)




In [24]:
# Swap the two raw categorical columns for their one-hot encoded frames.
# NOTE: not re-runnable as-is -- a second run raises KeyError because the raw
# columns are already gone.
encoded_parts = [
    df.drop(["Product Category", "Payment Method"], axis="columns"),
    productCatdf,
    paymentdf,
]
df = pd.concat(encoded_parts, axis="columns")

In [25]:
# Print the assembled frame (pandas elides the middle rows) to confirm the
# one-hot columns were appended alongside the remaining features.
print(df)

        Quantity  Customer Age  Returns  Gender  Churn  \
0              3            37      0.0       1      0   
1              4            37      0.0       1      0   
2              2            37      0.0       1      0   
3              1            37      0.0       1      0   
4              1            49      0.0       0      1   
...          ...           ...      ...     ...    ...   
249995         2            55      1.0       1      1   
249996         1            42      1.0       0      1   
249997         5            41      0.0       1      0   
249998         2            34      0.0       0      1   
249999         4            36      1.0       0      1   

        Product Category_Books  Product Category_Clothing  \
0                          0.0                        0.0   
1                          0.0                        0.0   
2                          0.0                        0.0   
3                          0.0                        1.0  

In [26]:
# Replace missing Returns values with 0; all other columns are left untouched.
# NOTE(review): presumably 0 means "not returned" -- confirm that treating
# missing as 0 is the intended imputation.
df = df.fillna({"Returns": 0})


In [27]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


# Target is the Churn flag; every other remaining column is a feature.
y = df["Churn"]
x = df.drop("Churn", axis="columns")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): the raw preview shows several rows sharing one Customer ID,
# and those rows carry the same Churn value. A row-level random split can put
# the same customer's rows in both sets -- consider a group-aware split.
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

#print(xTrain.shape)


In [28]:
# Fit a 100-tree random forest (seeded for reproducibility) on the training
# split; fit() returns the estimator itself, so construction and fitting chain.
# NOTE(review): the recorded run reports ~0.79 accuracy but only 0.02 recall
# for class 1, so accuracy mostly reflects the majority class -- consider
# class weighting or resampling before relying on this model.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(xTrain, yTrain)

# Score the held-out rows.
yPred = model.predict(xTest)
accuracy = accuracy_score(yTest, yPred)
print(f"Model Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1 on the test split.
report = classification_report(yTest, yPred)
print("Classification Report:\n", report)

Model Accuracy: 0.7905
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.98      0.88     40016
           1       0.21      0.02      0.03      9984

    accuracy                           0.79     50000
   macro avg       0.51      0.50      0.46     50000
weighted avg       0.68      0.79      0.71     50000



In [10]:
# Show the feature names, in order, of the training matrix the model was fit on.
print(xTrain.columns)

Index(['Quantity', 'Customer Age', 'Returns', 'Gender',
       'Product Category_Books', 'Product Category_Clothing',
       'Product Category_Electronics', 'Product Category_Home',
       'Payment Method_Cash', 'Payment Method_Credit Card',
       'Payment Method_Crypto', 'Payment Method_PayPal'],
      dtype='object')


In [29]:
import joblib
from google.colab import files

# Serialize the fitted forest to a file, then ask Colab to download that file
# through the browser (files comes from google.colab).
# NOTE(review): only the model is saved; the fitted encoders (le, oheProduct,
# ohePayment) are not, so whoever loads this file must reproduce the same
# encoding and column order themselves.
model_file = "random_forest_model.pkl"
joblib.dump(model, model_file)
files.download(model_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>