In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pickle

In [None]:
import pandas as pd

# Load the raw purchase dataset.
# The path is a raw string so the Windows backslashes are taken literally
# instead of being parsed as (invalid) escape sequences such as "\p" and "\d".
df = pd.read_csv(r"D:\product_purchase_prediction\data\product_purchase_ - product_purchase_impure.csv.csv")

# Rows without a target value are dropped. The explicit .copy() makes
# df_clean an independent frame, so the column assignments below never
# write through to (or warn about) the raw `df`.
df_clean = df.dropna(subset=["Purchase"]).copy()

# Fill missing values in numerical columns with each column's median.
# Assigning the result back (rather than calling fillna(inplace=True) on a
# column selection) is required for the fill to take effect under pandas
# Copy-on-Write, where the chained in-place form silently does nothing.
numeric_cols = ["TimeOnSite", "Age", "AdsClicked", "PreviousPurchases"]
df_clean[numeric_cols] = df_clean[numeric_cols].fillna(df_clean[numeric_cols].median())

# Encode gender numerically: Male -> 1, Female -> 0.
# NOTE(review): Series.map leaves NaN for any value not in the mapping
# (including missing genders); confirm the data has none before modelling.
df_clean["Gender"] = df_clean["Gender"].map({"Male": 1, "Female": 0})

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One boxplot per feature, each split by the Purchase outcome, laid out on
# a 3x2 grid. Five features are plotted, so the sixth panel is left blank.
sns.set(style="whitegrid")

features = ['TimeOnSite', 'Age', 'Gender', 'AdsClicked', 'PreviousPurchases']

fig, axes = plt.subplots(3, 2, figsize=(14, 12))
fig.suptitle("Feature Distributions by Purchase Outcome", fontsize=16)

# Walk the grid row by row, pairing the i-th panel with the i-th feature.
panels = axes.flatten()
for idx in range(len(features)):
    ax = panels[idx]
    feature = features[idx]
    sns.boxplot(x='Purchase', y=feature, data=df_clean, ax=ax, palette="Set2")
    ax.set_title(f'{feature} vs Purchase')

# Bottom-right panel has no feature assigned; hide its frame and ticks.
axes[2, 1].axis('off')

# Reserve the top 5% of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

In [None]:
# Build the feature matrix and the integer target.
# Gender is already numerically encoded in the cleaning cell, so no further
# encoding is needed here.
X = df_clean.drop("Purchase", axis=1)
y = df_clean["Purchase"].astype(int)

# Train-test split (80/20, fixed seed). This happens BEFORE scaling so the
# held-out rows never contribute to the scaler's mean and standard deviation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features: fit on the training rows only, then apply the same
# transform to both splits. The results are wrapped back into DataFrames so
# column names and row indices are preserved for the models.
# Previously the scaled matrix was computed but the unscaled X was split,
# so the models never saw scaled features.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

# Full feature matrix under the train-fitted scaling; kept so that any cell
# still referring to X_scaled continues to work.
X_scaled = scaler.transform(X)

# NOTE(review): models fitted on X_train now expect scaled inputs, so
# `scaler` must be applied to new data before calling predict.

In [None]:
# Fit three classifiers on the same training split so they can be compared
# on the same held-out rows.

# Logistic Regression
# NOTE(review): max_iter=25 is a tight iteration budget; if a
# ConvergenceWarning appears here, raise it.
log_model = LogisticRegression(max_iter=25)
log_model.fit(X_train, y_train)

# Decision Tree
# random_state is pinned (same seed as the split and the forest) so the
# fitted tree, and therefore its reported metrics, are reproducible.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# Fitted classifiers keyed by the label shown in the printed report.
models = {
    'Logistic Regression': log_model,
    'Decision Tree': tree_model,
    'RandomForestClassifier': rf_model,
}

# Score every classifier on the held-out split and print, in order, its
# accuracy, per-class report and confusion matrix.
for name in models:
    model = models[name]
    print(f"\nModel: {name}")
    y_pred = model.predict(X_test)
    for label, metric in (
        ("Accuracy:", accuracy_score),
        ("Classification Report:\n", classification_report),
        ("Confusion Matrix:\n", confusion_matrix),
    ):
        print(label, metric(y_test, y_pred))

In [None]:
# Persist the fitted logistic-regression and decision-tree models with pickle.
# The paths are raw strings so the Windows backslashes are taken literally;
# as plain literals, "\p", "\m", "\l" and "\d" are invalid escape sequences
# that newer Python versions warn about. The resulting paths are unchanged.
with open(r"D:\product_purchase_prediction\model\logistic_model.pkl", "wb") as f:
    pickle.dump(log_model, f)

with open(r"D:\product_purchase_prediction\model\decision_tree_model.pkl", "wb") as f:
    pickle.dump(tree_model, f)

In [None]:
# Persist the fitted random-forest model with pickle.
rf_model_path = r"D:\product_purchase_prediction\model\random_forest_model.pkl"
with open(rf_model_path, "wb") as model_file:
    pickle.dump(rf_model, model_file)