In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras import models, layers

# 1. Load Datasets
df_purchase = pd.read_csv("User_product_purchase_details_p2.csv")
df_user = pd.read_csv("user_demographics.csv")

df = pd.merge(df_purchase, df_user, on="User_ID", how="left")

In [12]:
# 2. Create Target Column
df["High_Value_Purchase"] = (df["Purchase"] >= 10000).astype(int)

df = df.fillna(0)

In [13]:
#Drop unused column
if "Product_ID" in df.columns:
    df = df.drop("Product_ID", axis=1)

In [None]:
# 3. Encode Categorical Columns
le = LabelEncoder()

for col in df.select_dtypes(include=["object"]).columns:
    df[col] = le.fit_transform(df[col].astype(str))

In [15]:
# 4. Train-Test Split
X = df.drop(["High_Value_Purchase", "Purchase"], axis=1)
y = df["High_Value_Purchase"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [16]:
# 5. Scale Numeric Columns
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
# 6. Logistic Regression
print("\n---------------- Logistic Regression ----------------")
lr = LogisticRegression(max_iter=2000)
lr.fit(X_train_scaled, y_train)
pred_lr = lr.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, pred_lr))
print(confusion_matrix(y_test, pred_lr))
print(classification_report(y_test, pred_lr))


---------------- Logistic Regression ----------------
Accuracy: 0.7667569582053193
[[64634  7472]
 [18188 19720]]
              precision    recall  f1-score   support

           0       0.78      0.90      0.83     72106
           1       0.73      0.52      0.61     37908

    accuracy                           0.77    110014
   macro avg       0.75      0.71      0.72    110014
weighted avg       0.76      0.77      0.76    110014



In [18]:
# 7. Random Forest
print("\n---------------- Random Forest ----------------")
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train_scaled, y_train)
pred_rf = rf.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, pred_rf))
print(confusion_matrix(y_test, pred_rf))
print(classification_report(y_test, pred_rf))


---------------- Random Forest ----------------
Accuracy: 0.892150089988547
[[64926  7180]
 [ 4685 33223]]
              precision    recall  f1-score   support

           0       0.93      0.90      0.92     72106
           1       0.82      0.88      0.85     37908

    accuracy                           0.89    110014
   macro avg       0.88      0.89      0.88    110014
weighted avg       0.89      0.89      0.89    110014



In [19]:
# 8. XGBoost

print("\n---------------- XGBoost ----------------")
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    eval_metric="logloss"
)
xgb_model.fit(X_train_scaled, y_train)
pred_xgb = xgb_model.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, pred_xgb))
print(confusion_matrix(y_test, pred_xgb))
print(classification_report(y_test, pred_xgb))


---------------- XGBoost ----------------
Accuracy: 0.9022942534586507
[[63137  8969]
 [ 1780 36128]]
              precision    recall  f1-score   support

           0       0.97      0.88      0.92     72106
           1       0.80      0.95      0.87     37908

    accuracy                           0.90    110014
   macro avg       0.89      0.91      0.90    110014
weighted avg       0.91      0.90      0.90    110014



In [20]:
# 9. Keras MLP
print("\n---------------- Keras MLP ----------------")

input_dim = X_train_scaled.shape[1]

model = models.Sequential([
    layers.Dense(128, activation="relu", input_shape=(input_dim,)),
    layers.Dense(64, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["precision"])

model.fit(X_train_scaled, y_train, epochs=10, batch_size=64, verbose=1)

pred_mlp_prob = model.predict(X_test_scaled).flatten()
pred_mlp = (pred_mlp_prob >= 0.5).astype(int)

print("Accuracy:", accuracy_score(y_test, pred_mlp))
print(confusion_matrix(y_test, pred_mlp))
print(classification_report(y_test, pred_mlp))


---------------- Keras MLP ----------------
Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6876/6876[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 991us/step - loss: 0.3474 - precision: 0.7838
Epoch 2/10
[1m6876/6876[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 735us/step - loss: 0.2862 - precision: 0.7888
Epoch 3/10
[1m6876/6876[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 774us/step - loss: 0.2734 - precision: 0.7890
Epoch 4/10
[1m6876/6876[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 815us/step - loss: 0.2651 - precision: 0.7904
Epoch 5/10
[1m6876/6876[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 805us/step - loss: 0.2607 - precision: 0.7910
Epoch 6/10
[1m6876/6876[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 776us/step - loss: 0.2582 - precision: 0.7925
Epoch 7/10
[1m6876/6876[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 816us/step - loss: 0.2564 - precision: 0.7934
Epoch 8/10
[1m6876/6876[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 796us/step - loss: 0.2546 - precision: 0.7936
Epoch 9/10
