In [2]:
import pandas as pd
import numpy as np

# -----------------------------
# 1. Data Preparation
# -----------------------------
# Load datasets
df_purchase = pd.read_csv("User_product_purchase_details_p2.csv")
df_user = pd.read_csv("user_demographics.csv")

# Merge on User_ID
df = pd.merge(df_purchase, df_user, on="User_ID", how="left")

# Create binary target
df["High_Value_Purchase"] = (df["Purchase"] >= 10000).astype(int)

# Drop unnecessary columns
df = df.drop(["Product_ID"], axis=1)

# Handle missing values
df = df.fillna(0)

# Encode categorical variables
df_encoded = pd.get_dummies(df, columns=["Gender", "Age", "City_Category", "Stay_In_Current_City_Years"], drop_first=True)

# Features and target
X = df_encoded.drop(["High_Value_Purchase", "Purchase", "User_ID"], axis=1)
y = df_encoded["High_Value_Purchase"]

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale numerical features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -----------------------------
# 2. Logistic Regression Baseline
# -----------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

log = LogisticRegression(max_iter=2000)
log.fit(X_train_scaled, y_train)
pred_lr = log.predict(X_test_scaled)

print("Logistic Regression Accuracy:", accuracy_score(y_test, pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, pred_lr))

# -----------------------------
# 3. MLP Classifier (Keras)
# -----------------------------
from tensorflow.keras import models, layers
from sklearn.metrics import precision_score

model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

history = model.fit(X_train_scaled, y_train, epochs=15, batch_size=32, validation_split=0.2, verbose=0)

loss, acc = model.evaluate(X_test_scaled, y_test, verbose=0)
print("MLP Accuracy:", acc)

# -----------------------------
# 4. Compare Models
# -----------------------------
print("\nModel Comparison:")
print("Logistic Regression Accuracy:", accuracy_score(y_test, pred_lr))
print("MLP Accuracy:", acc)

#precision calculation now
# Predictions for MLP (convert probabilities to class labels)
pred_mlp = (model.predict(X_test_scaled) > 0.5).astype(int).ravel()

# Precision scores
precision_lr = precision_score(y_test, pred_lr)
precision_mlp = precision_score(y_test, pred_mlp)

print("Precision (Logistic Regression):", precision_lr)
print("Precision (MLP):", precision_mlp)



Logistic Regression Accuracy: 0.7661752140636646
Confusion Matrix:
 [[64702  7404]
 [18320 19588]]


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


MLP Accuracy: 0.8960950374603271

Model Comparison:
Logistic Regression Accuracy: 0.7661752140636646
MLP Accuracy: 0.8960950374603271
[1m3438/3438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 456us/step
Precision (Logistic Regression): 0.725696502667457
Precision (MLP): 0.7881254488867608
