# Logistic Regression Baseline Training
## Customer Purchase Propensity Prediction

This notebook trains a Logistic Regression model following the plan:
1. Load the processed feature data from the Feast Feature Store's parquet file
2. Preprocessing: StandardScaler for numerical, OneHotEncoder for categorical
3. Train/Val/Test split: 64%/16%/20%
4. Regularization tuning on validation set
5. Retrain on train+validation with the best C, then evaluate on the test set with Accuracy, Precision, Recall, F1, AUC-ROC
6. Save metrics to JSON

In [5]:
import json
import time
import warnings
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Silence only deprecation-style noise. A blanket filterwarnings("ignore")
# would also hide actionable warnings (e.g. a solver that stops at max_iter
# without converging), which matters for the models trained below.
for noisy_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=noisy_category)

print("Libraries imported successfully!")

Libraries imported successfully!


## 1. Load Data from Feast Feature Store

In [6]:
print("=" * 60)
print("LOGISTIC REGRESSION BASELINE TRAINING")
print("=" * 60)

print("\n[1/6] Loading data from Feast Feature Store...")

# Path is relative to the notebook's working directory - adjust based on
# notebook location.
relative_parquet_path = Path("../../../data_pipeline/propensity_feature_store/propensity_features/feature_repo/data/processed_purchase_propensity_data_v1.parquet")
parquet_path = relative_parquet_path.resolve()

# Log the relative path only: the resolved absolute path embeds the local
# home directory / account name and would be persisted in the saved output.
print(f"Loading from: {relative_parquet_path}")

# Fail early with an actionable message instead of a bare reader error.
if not parquet_path.is_file():
    raise FileNotFoundError(
        f"Feature parquet not found at {relative_parquet_path}. "
        "Run the notebook from its own directory or adjust the path above."
    )

df = pd.read_parquet(parquet_path)

print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

LOGISTIC REGRESSION BASELINE TRAINING

[1/6] Loading data from Feast Feature Store...
Loading from: /Users/jky/Library/CloudStorage/GoogleDrive-lethanhquang094@gmail.com/My Drive/FPT/Semester_4/DAP391m/Cart-to-Purchase-Conversion-Prediction/data_pipeline/propensity_feature_store/propensity_features/feature_repo/data/processed_purchase_propensity_data_v1.parquet
Dataset shape: (2933439, 11)
Columns: ['user_id', 'product_id', 'event_timestamp', 'created_timestamp', 'category_code_level1', 'category_code_level2', 'brand', 'event_weekday', 'price', 'activity_count', 'is_purchased']


In [7]:
# Explore the data
print("First 5 rows:")
display(df.head())

print(f"\nData types:")
print(df.dtypes)

First 5 rows:


Unnamed: 0,user_id,product_id,event_timestamp,created_timestamp,category_code_level1,category_code_level2,brand,event_weekday,price,activity_count,is_purchased
0,515903856,2601552,2019-11-17 00:11:39,2026-01-18 22:17:22.150556,unknown,unknown,gorenje,6,486.24,6,0
1,516301799,12702930,2019-11-12 15:40:15,2026-01-18 22:17:22.150556,unknown,unknown,cordiant,1,35.78,2,0
2,516301799,12702930,2019-11-12 15:41:46,2026-01-18 22:17:22.150556,unknown,unknown,cordiant,1,35.78,6,0
3,516301799,12702930,2019-11-12 15:42:05,2026-01-18 22:17:22.150556,unknown,unknown,cordiant,1,35.78,8,0
4,561066382,3800966,2019-11-15 23:36:25,2026-01-18 22:17:22.150556,appliances,iron,elenberg,4,20.57,2,0



Data types:
user_id                          int64
product_id                       int64
event_timestamp         datetime64[ns]
created_timestamp       datetime64[us]
category_code_level1            object
category_code_level2            object
brand                           object
event_weekday                    int64
price                          float64
activity_count                   int64
is_purchased                     int64
dtype: object


## 2. Define Features and Preprocessing

In [8]:
print("\n[2/6] Preparing features and preprocessing pipeline...")

# Column roles used by every later cell: the label, the scaled numeric
# inputs, and the one-hot encoded categorical inputs.
TARGET = "is_purchased"
NUMERICAL_FEATURES = ["price", "activity_count", "event_weekday"]
CATEGORICAL_FEATURES = ["brand", "category_code_level1", "category_code_level2"]
ALL_FEATURES = [*NUMERICAL_FEATURES, *CATEGORICAL_FEATURES]

for description, value in (
    ("Numerical features", NUMERICAL_FEATURES),
    ("Categorical features", CATEGORICAL_FEATURES),
    ("Target", TARGET),
):
    print(f"{description}: {value}")


[2/6] Preparing features and preprocessing pipeline...
Numerical features: ['price', 'activity_count', 'event_weekday']
Categorical features: ['brand', 'category_code_level1', 'category_code_level2']
Target: is_purchased


In [9]:
# Build the model inputs: target vector y and feature matrix X
y = df[TARGET].copy()

# astype() with a per-column mapping returns a new frame, so df itself is
# left untouched while the categorical columns become strings.
X = df[ALL_FEATURES].astype({name: str for name in CATEGORICAL_FEATURES})

# Report how many rows fall in each class (count and share of all rows)
print("\nTarget distribution:")
for heading, class_mask in (
    ("Class 0 (Not Purchased):", y.eq(0)),
    ("Class 1 (Purchased):", y.eq(1)),
):
    print(f"  {heading:<24} {class_mask.sum():,} ({class_mask.mean() * 100:.2f}%)")


Target distribution:
  Class 0 (Not Purchased): 2,170,105 (73.98%)
  Class 1 (Purchased):     763,334 (26.02%)


## 3. Train/Validation/Test Split (64%/16%/20%)

In [10]:
print("\n[3/6] Splitting data (64%/16%/20%)...")

# Both splits hold out 20% with the same seed. Applied twice, that yields
# 20% test, then 16% validation and 64% training of the full data set.
holdout_args = {"test_size": 0.2, "random_state": 42}

# Stage 1: carve the test set out of the full data (stratified on the label)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, stratify=y, **holdout_args
)

# Stage 2: carve the validation set out of the remaining 80%
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, stratify=y_train_val, **holdout_args
)

total_rows = len(X)
for split_label, split_frame in (
    ("Training set:", X_train),
    ("Validation set:", X_val),
    ("Test set:", X_test),
):
    n_rows = split_frame.shape[0]
    print(f"{split_label:<16}{n_rows:,} samples ({n_rows / total_rows * 100:.1f}%)")


[3/6] Splitting data (64%/16%/20%)...
Training set:   1,877,400 samples (64.0%)
Validation set: 469,351 samples (16.0%)
Test set:       586,688 samples (20.0%)


In [11]:
# Column-wise preprocessing: standardise the numeric columns and one-hot
# encode the categorical ones; any other column is dropped.
numeric_scaler = StandardScaler()

# max_categories caps the one-hot width per feature, handle_unknown="ignore"
# tolerates categories not seen during fit, and sparse_output=False makes the
# encoder return a dense array.
category_encoder = OneHotEncoder(
    max_categories=100,
    sparse_output=False,
    handle_unknown="ignore",
)

preprocessor = ColumnTransformer(
    [
        ("num", numeric_scaler, NUMERICAL_FEATURES),
        ("cat", category_encoder, CATEGORICAL_FEATURES),
    ],
    remainder="drop",
)

print("Preprocessor created!")
print(preprocessor)

Preprocessor created!
ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['price', 'activity_count', 'event_weekday']),
                                ('cat',
                                 OneHotEncoder(handle_unknown='ignore',
                                               max_categories=100,
                                               sparse_output=False),
                                 ['brand', 'category_code_level1',
                                  'category_code_level2'])])


## 4. Regularization Tuning on Validation Set

In [12]:
print("\n[4/6] Tuning regularization parameter C...")
print("-" * 50)

# Candidate values for LogisticRegression's C (inverse regularization
# strength: smaller values regularize more strongly).
C_VALUES = [0.001, 0.01, 0.1, 1, 10, 100]

# One dict per candidate: the C value, its validation scores and the fitted
# pipeline, so the best model can be picked afterwards without refitting.
tuning_results = []

for C in C_VALUES:
    # perf_counter is a monotonic clock meant for measuring durations.
    start_time = time.perf_counter()
    print(f"\nTraining with C={C}...", end=" ", flush=True)

    # Give every candidate its own unfitted copy of the preprocessor.
    # Pipeline.fit fits its steps in place, so passing the shared
    # `preprocessor` object would make all stored pipelines alias (and
    # repeatedly refit) one and the same transformer instance.
    pipeline = Pipeline(
        [
            ("preprocessor", clone(preprocessor)),
            (
                "classifier",
                LogisticRegression(
                    C=C,
                    solver="lbfgs",
                    max_iter=1000,
                    class_weight="balanced",
                    random_state=42,
                    n_jobs=-1,
                ),
            ),
        ]
    )

    # Fit on training data only; the validation split is used purely for scoring
    pipeline.fit(X_train, y_train)

    # Hard labels for accuracy/F1, positive-class probabilities for AUC-ROC
    y_val_pred = pipeline.predict(X_val)
    y_val_proba = pipeline.predict_proba(X_val)[:, 1]

    # Calculate metrics
    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred, average="macro")
    val_auc = roc_auc_score(y_val, y_val_proba)

    elapsed = time.perf_counter() - start_time

    result = {
        "C": C,
        "accuracy": val_accuracy,
        "f1_macro": val_f1,
        "auc_roc": val_auc,
        "pipeline": pipeline,
    }
    tuning_results.append(result)

    print(f"Done ({elapsed:.1f}s)")
    print(f"  Accuracy: {val_accuracy:.4f} | F1: {val_f1:.4f} | AUC-ROC: {val_auc:.4f}")


[4/6] Tuning regularization parameter C...
--------------------------------------------------

Training with C=0.001... Done (4.7s)
  Accuracy: 0.5572 | F1: 0.5238 | AUC-ROC: 0.5786

Training with C=0.01... Done (4.6s)
  Accuracy: 0.5536 | F1: 0.5219 | AUC-ROC: 0.5798

Training with C=0.1... Done (5.2s)
  Accuracy: 0.5520 | F1: 0.5212 | AUC-ROC: 0.5800

Training with C=1... Done (5.1s)
  Accuracy: 0.5522 | F1: 0.5213 | AUC-ROC: 0.5800

Training with C=10... Done (4.6s)
  Accuracy: 0.5524 | F1: 0.5214 | AUC-ROC: 0.5800

Training with C=100... Done (5.3s)
  Accuracy: 0.5520 | F1: 0.5211 | AUC-ROC: 0.5800


In [13]:
# Pick the tuning run with the highest validation AUC-ROC
# (on a tie, max() keeps the earliest run in the list)
best_result = max(tuning_results, key=lambda run: run["auc_roc"])
best_pipeline = best_result["pipeline"]
best_C = best_result["C"]

divider = "-" * 50
print("\n" + divider)
print(f"Best C: {best_C} (AUC-ROC: {best_result['auc_roc']:.4f})")

# Tabulate every run, mapping result keys to display column headers
column_labels = {
    "C": "C",
    "accuracy": "Accuracy",
    "f1_macro": "F1 Macro",
    "auc_roc": "AUC-ROC",
}
print("\nTuning Summary:")
tuning_df = pd.DataFrame(
    [
        {label: run[key] for key, label in column_labels.items()}
        for run in tuning_results
    ]
)
display(tuning_df)


--------------------------------------------------
Best C: 100 (AUC-ROC: 0.5800)

Tuning Summary:


Unnamed: 0,C,Accuracy,F1 Macro,AUC-ROC
0,0.001,0.557208,0.52377,0.578627
1,0.01,0.553616,0.521938,0.579848
2,0.1,0.552043,0.521151,0.580002
3,1.0,0.552156,0.521264,0.580002
4,10.0,0.552395,0.521355,0.579962
5,100.0,0.551988,0.521118,0.580027


## 5. Final Training and Evaluation

In [14]:
print("\n[5/6] Final training on train+validation...")

# Combine train and validation so the final model sees all non-test rows
X_train_final = pd.concat([X_train, X_val], axis=0)
y_train_final = pd.concat([y_train, y_val], axis=0)

print(f"Final training set: {len(X_train_final):,} samples")

# clone() returns an unfitted copy carrying exactly the hyperparameters of
# the winning pipeline (same preprocessing, C == best_C). Deriving the final
# model this way, instead of re-typing the configuration, guarantees that the
# model evaluated on the test set is the one that was actually tuned.
final_pipeline = clone(best_pipeline)

# Kept as a separate name for any code that refers to the final preprocessor.
final_preprocessor = final_pipeline.named_steps["preprocessor"]

# perf_counter is a monotonic clock meant for measuring durations.
start_time = time.perf_counter()
final_pipeline.fit(X_train_final, y_train_final)
train_time = time.perf_counter() - start_time
print(f"Training complete ({train_time:.1f}s)")


[5/6] Final training on train+validation...
Final training set: 2,346,751 samples
Training complete (7.0s)


In [15]:
# Score the final pipeline on the held-out test split
print("\nEvaluating on test set...")
y_test_proba = final_pipeline.predict_proba(X_test)[:, 1]
y_test_pred = final_pipeline.predict(X_test)

# Headline metrics: accuracy and AUC-ROC, plus macro-averaged P/R/F1
test_accuracy = accuracy_score(y_test, y_test_pred)
test_auc_roc = roc_auc_score(y_test, y_test_proba)
test_precision_macro, test_recall_macro, test_f1_macro = (
    scorer(y_test, y_test_pred, average="macro")
    for scorer in (precision_score, recall_score, f1_score)
)

# average=None returns one score per class instead of a single aggregate
test_precision_per_class, test_recall_per_class, test_f1_per_class = (
    scorer(y_test, y_test_pred, average=None)
    for scorer in (precision_score, recall_score, f1_score)
)

banner = "=" * 50
print("\n" + banner)
print("TEST SET RESULTS")
print(banner)
for metric_label, metric_value, suffix in (
    ("Accuracy:", test_accuracy, ""),
    ("Precision:", test_precision_macro, " (macro)"),
    ("Recall:", test_recall_macro, " (macro)"),
    ("F1-Score:", test_f1_macro, " (macro)"),
    ("AUC-ROC:", test_auc_roc, ""),
):
    print(f"{metric_label:<11}{metric_value:.4f}{suffix}")

print("\nPer-Class Metrics:")
for class_idx, class_label in enumerate(("Not Purchased", "Purchased")):
    print(f"  Class {class_idx} ({class_label}):")
    print(f"    Precision: {test_precision_per_class[class_idx]:.4f}")
    print(f"    Recall:    {test_recall_per_class[class_idx]:.4f}")
    print(f"    F1-Score:  {test_f1_per_class[class_idx]:.4f}")


Evaluating on test set...

TEST SET RESULTS
Accuracy:  0.5502
Precision: 0.5441 (macro)
Recall:    0.5572 (macro)
F1-Score:  0.5196 (macro)
AUC-ROC:   0.5770

Per-Class Metrics:
  Class 0 (Not Purchased):
    Precision: 0.7827
    Recall:    0.5427
    F1-Score:  0.6410
  Class 1 (Purchased):
    Precision: 0.3054
    Recall:    0.5717
    F1-Score:  0.3982


In [16]:
# Confusion matrix of the test predictions (rows: actual class, columns:
# predicted class), flattened row by row into its four cells
cm = confusion_matrix(y_test, y_test_pred)
true_neg, false_pos, false_neg, true_pos = cm.ravel()

print("Confusion Matrix:")
print(f"  [[TN={true_neg:,}  FP={false_pos:,}]")
print(f"   [FN={false_neg:,}  TP={true_pos:,}]]")

# Readable class names for the per-class report, in label order 0, 1
class_names = ["Not Purchased", "Purchased"]
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred, target_names=class_names))

Confusion Matrix:
  [[TN=235,540  FP=198,481]
   [FN=65,385  TP=87,282]]

Classification Report:
               precision    recall  f1-score   support

Not Purchased       0.78      0.54      0.64    434021
    Purchased       0.31      0.57      0.40    152667

     accuracy                           0.55    586688
    macro avg       0.54      0.56      0.52    586688
 weighted avg       0.66      0.55      0.58    586688



In [17]:
# Re-score the best tuning pipeline (fit on the training split only) on the
# validation split, to compare against the test-set numbers
y_val_proba_best = best_pipeline.predict_proba(X_val)[:, 1]
y_val_pred_best = best_pipeline.predict(X_val)

val_accuracy = accuracy_score(y_val, y_val_pred_best)
val_auc_roc = roc_auc_score(y_val, y_val_proba_best)
val_precision_macro, val_recall_macro, val_f1_macro = (
    scorer(y_val, y_val_pred_best, average="macro")
    for scorer in (precision_score, recall_score, f1_score)
)

# average=None returns one score per class instead of a single aggregate
val_precision_per_class, val_recall_per_class, val_f1_per_class = (
    scorer(y_val, y_val_pred_best, average=None)
    for scorer in (precision_score, recall_score, f1_score)
)

print("Validation Set Metrics (for comparison):")
for metric_label, metric_value, suffix in (
    ("Accuracy:", val_accuracy, ""),
    ("Precision:", val_precision_macro, " (macro)"),
    ("Recall:", val_recall_macro, " (macro)"),
    ("F1-Score:", val_f1_macro, " (macro)"),
    ("AUC-ROC:", val_auc_roc, ""),
):
    print(f"  {metric_label:<11}{metric_value:.4f}{suffix}")

Validation Set Metrics (for comparison):
  Accuracy:  0.5520
  Precision: 0.5453 (macro)
  Recall:    0.5587 (macro)
  F1-Score:  0.5211 (macro)
  AUC-ROC:   0.5800


## 6. Save Metrics to JSON

In [18]:
print("\n[6/6] Saving metrics to JSON...")


def _rounded_scores(macro, per_class):
    """Return one metric's macro and per-class values, rounded to 4 decimals.

    Args:
        macro: Macro-averaged score.
        per_class: Indexable of per-class scores; index 0 is class 0 and
            index 1 is class 1.

    Returns:
        Dict with keys ``macro``, ``class_0`` and ``class_1`` holding built-in
        floats, so the result is directly JSON-serializable.
    """
    return {
        "macro": round(float(macro), 4),
        "class_0": round(float(per_class[0]), 4),
        "class_1": round(float(per_class[1]), 4),
    }


# Read the settings back from the fitted estimator rather than re-typing
# them, so the saved JSON cannot drift from the model that was trained.
final_classifier = final_pipeline.named_steps["classifier"]

metrics = {
    "model": "LogisticRegression",
    "timestamp": datetime.now().isoformat(),
    "hyperparameters": {
        "best_C": best_C,
        "solver": final_classifier.solver,
        "max_iter": final_classifier.max_iter,
        "class_weight": final_classifier.class_weight,
    },
    "data_split": {
        "train_size": int(len(X_train)),
        "val_size": int(len(X_val)),
        "test_size": int(len(X_test)),
        "train_val_size": int(len(X_train_final)),
        "total_size": int(len(X)),
    },
    "features": {
        "numerical": NUMERICAL_FEATURES,
        "categorical": CATEGORICAL_FEATURES,
        "preprocessing": {
            "numerical": "StandardScaler",
            "categorical": "OneHotEncoder (max_categories=100)",
        },
    },
    # One entry per tuned C; the fitted pipelines are deliberately left out
    # because they are not JSON-serializable.
    "regularization_tuning": [
        {
            "C": r["C"],
            "val_accuracy": round(r["accuracy"], 4),
            "val_f1_macro": round(r["f1_macro"], 4),
            "val_auc_roc": round(r["auc_roc"], 4),
        }
        for r in tuning_results
    ],
    "validation_metrics": {
        "accuracy": round(val_accuracy, 4),
        "precision": _rounded_scores(val_precision_macro, val_precision_per_class),
        "recall": _rounded_scores(val_recall_macro, val_recall_per_class),
        "f1": _rounded_scores(val_f1_macro, val_f1_per_class),
        "auc_roc": round(val_auc_roc, 4),
    },
    "test_metrics": {
        "accuracy": round(test_accuracy, 4),
        "precision": _rounded_scores(test_precision_macro, test_precision_per_class),
        "recall": _rounded_scores(test_recall_macro, test_recall_per_class),
        "f1": _rounded_scores(test_f1_macro, test_f1_per_class),
        "auc_roc": round(test_auc_roc, 4),
    },
    "confusion_matrix": {
        "true_negative": int(cm[0, 0]),
        "false_positive": int(cm[0, 1]),
        "false_negative": int(cm[1, 0]),
        "true_positive": int(cm[1, 1]),
    },
}

# Path is relative to the notebook's working directory.
metrics_relative_path = Path("../../metrics/logistic_regression_metrics.json")
metrics_path = metrics_relative_path.resolve()
metrics_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit encoding keeps the file identical across platforms.
with open(metrics_path, "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2)

# Log the relative path only: the resolved absolute path embeds the local
# home directory / account name and would be persisted in the saved output.
print(f"Metrics saved to: {metrics_relative_path}")
print("\n" + "=" * 60)
print("TRAINING COMPLETE!")
print("=" * 60)


[6/6] Saving metrics to JSON...
Metrics saved to: /Users/jky/Library/CloudStorage/GoogleDrive-lethanhquang094@gmail.com/My Drive/FPT/Semester_4/DAP391m/Cart-to-Purchase-Conversion-Prediction/model_pipeline/metrics/logistic_regression_metrics.json

TRAINING COMPLETE!


In [19]:
# Echo the saved metrics as pretty-printed JSON under a heading
summary_json = json.dumps(metrics, indent=2)
print("\nFinal Metrics Summary:", summary_json, sep="\n")


Final Metrics Summary:
{
  "model": "LogisticRegression",
  "timestamp": "2026-01-21T10:15:43.345103",
  "hyperparameters": {
    "best_C": 100,
    "solver": "lbfgs",
    "max_iter": 1000,
    "class_weight": "balanced"
  },
  "data_split": {
    "train_size": 1877400,
    "val_size": 469351,
    "test_size": 586688,
    "train_val_size": 2346751,
    "total_size": 2933439
  },
  "features": {
    "numerical": [
      "price",
      "activity_count",
      "event_weekday"
    ],
    "categorical": [
      "brand",
      "category_code_level1",
      "category_code_level2"
    ],
    "preprocessing": {
      "numerical": "StandardScaler",
      "categorical": "OneHotEncoder (max_categories=100)"
    }
  },
  "regularization_tuning": [
    {
      "C": 0.001,
      "val_accuracy": 0.5572,
      "val_f1_macro": 0.5238,
      "val_auc_roc": 0.5786
    },
    {
      "C": 0.01,
      "val_accuracy": 0.5536,
      "val_f1_macro": 0.5219,
      "val_auc_roc": 0.5798
    },
    {
      "C": 