# Logistic Regression V2 Training
## With Enriched Features (User, Product, Brand aggregates)

This notebook trains a Logistic Regression classifier on the new V2 feature set:
- User behavior features (views, carts, purchases, conversion rates)
- Product features (popularity, conversion rates)
- Brand features (purchase rate)
- Price comparison features

In [1]:
import json
import time
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix,
)

import warnings

# Silence only forward-compatibility notices. A blanket
# warnings.filterwarnings("ignore") would also hide scikit-learn's
# ConvergenceWarning, which is the only signal that an lbfgs fit stopped at
# max_iter before converging.
warnings.filterwarnings("ignore", category=FutureWarning)

print("Libraries imported successfully!")

Libraries imported successfully!


## 1. Load V2 Data with Enriched Features

In [2]:
# Run banner, then stage 1 of 6: read the enriched V2 feature table into `df`.
banner_rule = "=" * 70
print(banner_rule)
print("LOGISTIC REGRESSION V2 TRAINING (With Enriched Features)")
print(banner_rule)

print("\n[1/6] Loading V2 data with enriched features...")

# The location is relative to the notebook's working directory; resolve() makes it
# absolute so the log line below shows exactly which file was read.
parquet_path = Path("../../../data_pipeline/propensity_feature_store/propensity_features/feature_repo/data/processed_purchase_propensity_data_v2.parquet").resolve()

print(f"Loading from: {parquet_path}")
df = pd.read_parquet(parquet_path)

# Report the size and schema of what was loaded.
loaded_shape = df.shape
loaded_columns = df.columns.tolist()
print(f"Dataset shape: {loaded_shape}")
print(f"Columns: {loaded_columns}")

LOGISTIC REGRESSION V2 TRAINING (With Enriched Features)

[1/6] Loading V2 data with enriched features...
Loading from: /Users/jky/Library/CloudStorage/GoogleDrive-lethanhquang094@gmail.com/My Drive/FPT/Semester_4/DAP391m/Cart-to-Purchase-Conversion-Prediction/data_pipeline/propensity_feature_store/propensity_features/feature_repo/data/processed_purchase_propensity_data_v2.parquet
Dataset shape: (2929997, 31)
Columns: ['user_id', 'product_id', 'event_timestamp', 'created_timestamp', 'category_code_level1', 'category_code_level2', 'brand', 'event_weekday', 'price', 'activity_count', 'event_hour', 'user_total_events', 'user_total_views', 'user_total_carts', 'user_total_purchases', 'user_view_to_cart_rate', 'user_cart_to_purchase_rate', 'user_avg_purchase_price', 'user_unique_products', 'user_unique_categories', 'product_total_events', 'product_total_views', 'product_total_carts', 'product_total_purchases', 'product_view_to_cart_rate', 'product_cart_to_purchase_rate', 'product_unique_buye

In [3]:
# Quick look at the loaded frame: the first rows (rich display), then each column's dtype.
print("First 5 rows:")
display(df.head(5))

print("\nData types:")
print(df.dtypes)

First 5 rows:


Unnamed: 0,user_id,product_id,event_timestamp,created_timestamp,category_code_level1,category_code_level2,brand,event_weekday,price,activity_count,...,product_total_views,product_total_carts,product_total_purchases,product_view_to_cart_rate,product_cart_to_purchase_rate,product_unique_buyers,brand_purchase_rate,price_vs_user_avg,price_vs_category_avg,is_purchased
0,94566147,1005007,2019-11-12 15:04:08,2026-01-20 15:40:18.521933,electronics,smartphone,xiaomi,1,93.78,3,...,26505,2589,654,0.09768,0.252607,548,0.262642,1.0,0.221107,0
1,176495092,6301929,2019-11-08 14:01:42,2026-01-20 15:40:18.521933,appliances,kitchen,polaris,4,28.31,3,...,144,3,1,0.020833,0.333333,1,0.303095,1.0,0.120976,0
2,239198635,1003942,2019-11-09 15:29:59,2026-01-20 15:40:18.521933,electronics,smartphone,xiaomi,5,187.24,3,...,6618,84,22,0.012693,0.261905,19,0.262642,1.0,0.441459,0
3,239198635,1003942,2019-11-09 15:30:54,2026-01-20 15:40:18.521933,electronics,smartphone,xiaomi,5,187.24,5,...,6618,84,22,0.012693,0.261905,19,0.262642,1.0,0.441459,0
4,269003139,6000032,2019-11-26 14:38:48,2026-01-20 15:40:18.521933,auto,accessories,cenmax,1,66.39,12,...,9319,291,84,0.031227,0.28866,72,0.291525,1.0,0.443217,0



Data types:
user_id                                   int64
product_id                                int64
event_timestamp                  datetime64[ns]
created_timestamp                datetime64[us]
category_code_level1                     object
category_code_level2                     object
brand                                    object
event_weekday                             int64
price                                   float64
activity_count                            int64
event_hour                                int64
user_total_events                         int64
user_total_views                          int64
user_total_carts                          int64
user_total_purchases                      int64
user_view_to_cart_rate                  float64
user_cart_to_purchase_rate              float64
user_avg_purchase_price                 float64
user_unique_products                      int64
user_unique_categories                    int64
product_total_events       

## 2. Define Features (V2 - Extended)

In [4]:
print("\n[2/6] Preparing features...")

# Numerical inputs, grouped by where they come from. The concatenation order below
# defines the column order handed to the preprocessor.
# NOTE(review): the user_/product_/brand_ aggregates include purchase counts and
# purchase rates. If they were computed over the full dataset (including the rows
# being predicted), they may leak the target — confirm how the feature store
# builds them.
_ORIGINAL_NUMERIC = ["price", "activity_count", "event_weekday"]
_HOUR_NUMERIC = ["event_hour"]
_USER_NUMERIC = [
    "user_total_events",
    "user_total_views",
    "user_total_carts",
    "user_total_purchases",
    "user_view_to_cart_rate",
    "user_cart_to_purchase_rate",
    "user_avg_purchase_price",
    "user_unique_products",
    "user_unique_categories",
]
_PRODUCT_NUMERIC = [
    "product_total_events",
    "product_total_views",
    "product_total_carts",
    "product_total_purchases",
    "product_view_to_cart_rate",
    "product_cart_to_purchase_rate",
    "product_unique_buyers",
]
_BRAND_PRICE_NUMERIC = [
    "brand_purchase_rate",
    "price_vs_user_avg",
    "price_vs_category_avg",
]

NUMERICAL_FEATURES = [
    *_ORIGINAL_NUMERIC,
    *_HOUR_NUMERIC,
    *_USER_NUMERIC,
    *_PRODUCT_NUMERIC,
    *_BRAND_PRICE_NUMERIC,
]

CATEGORICAL_FEATURES = ["brand", "category_code_level1", "category_code_level2"]
TARGET = "is_purchased"
ALL_FEATURES = [*NUMERICAL_FEATURES, *CATEGORICAL_FEATURES]

# Feature-count summary.
n_numerical = len(NUMERICAL_FEATURES)
n_categorical = len(CATEGORICAL_FEATURES)
print(f"Numerical features: {n_numerical}")
print(f"Categorical features: {n_categorical}")
print(f"Total features: {len(ALL_FEATURES)}")

# Numbered listing (1-based) of every numerical feature.
print("\nNumerical features list:")
print(
    "\n".join(
        f"  {position}. {feature_name}"
        for position, feature_name in enumerate(NUMERICAL_FEATURES, start=1)
    )
)


[2/6] Preparing features...
Numerical features: 23
Categorical features: 3
Total features: 26

Numerical features list:
  1. price
  2. activity_count
  3. event_weekday
  4. event_hour
  5. user_total_events
  6. user_total_views
  7. user_total_carts
  8. user_total_purchases
  9. user_view_to_cart_rate
  10. user_cart_to_purchase_rate
  11. user_avg_purchase_price
  12. user_unique_products
  13. user_unique_categories
  14. product_total_events
  15. product_total_views
  16. product_total_carts
  17. product_total_purchases
  18. product_view_to_cart_rate
  19. product_cart_to_purchase_rate
  20. product_unique_buyers
  21. brand_purchase_rate
  22. price_vs_user_avg
  23. price_vs_category_avg


In [5]:
# Build the model inputs (X) and target (y) from the loaded frame. Copies keep
# `df` unmodified so this cell can be re-run safely.
X = df[ALL_FEATURES].copy()
y = df[TARGET].copy()

# Impute each feature family explicitly.
# Categorical: fill missing values with a sentinel BEFORE casting to str. Casting
# first would turn NaN/None into the literal strings "nan"/"None", and a later
# blanket fillna(0) would only skip these columns by accident.
for col in CATEGORICAL_FEATURES:
    X[col] = X[col].fillna("missing").astype(str)

# Numerical: missing values become 0, as before.
X[NUMERICAL_FEATURES] = X[NUMERICAL_FEATURES].fillna(0)

# Class balance of the target, as counts and percentages.
negative_mask = y == 0
positive_mask = y == 1
print(f"\nTarget distribution:")
print(f"  Class 0 (Not Purchased): {negative_mask.sum():,} ({negative_mask.mean() * 100:.2f}%)")
print(f"  Class 1 (Purchased):     {positive_mask.sum():,} ({positive_mask.mean() * 100:.2f}%)")


Target distribution:
  Class 0 (Not Purchased): 2,170,105 (74.07%)
  Class 1 (Purchased):     759,892 (25.93%)


## 3. Train/Validation/Test Split (64%/16%/20%)

In [6]:
print("\n[3/6] Splitting data (64%/16%/20%)...")

# Both cuts use the same holdout fraction and seed, and both stratify on the target.
split_options = {"test_size": 0.2, "random_state": 42}

# Cut 1: hold out 20% of all rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, stratify=y, **split_options
)

# Cut 2: take 20% of the remaining 80% as validation (16% of the total).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, stratify=y_train_val, **split_options
)

# Report each partition's row count and its share of the full dataset.
total_rows = len(X)
for split_label, split_frame in (
    ("Training set:", X_train),
    ("Validation set:", X_val),
    ("Test set:", X_test),
):
    split_rows = split_frame.shape[0]
    print(f"{split_label:<16}{split_rows:,} samples ({split_rows / total_rows * 100:.1f}%)")


[3/6] Splitting data (64%/16%/20%)...
Training set:   1,875,197 samples (64.0%)
Validation set: 468,800 samples (16.0%)
Test set:       586,000 samples (20.0%)


In [7]:
# Preprocessing used by the tuning runs: standardize the numerical columns and
# one-hot encode the categorical ones.
numeric_branch = ("num", StandardScaler(), NUMERICAL_FEATURES)

# handle_unknown="ignore" lets categories unseen during fit pass through transform;
# sparse_output=False yields a dense array; max_categories=100 caps the number of
# encoded columns per categorical feature.
categorical_encoder = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False,
    max_categories=100,
)
categorical_branch = ("cat", categorical_encoder, CATEGORICAL_FEATURES)

# remainder="drop": any column not listed in either branch is discarded.
preprocessor = ColumnTransformer(
    transformers=[numeric_branch, categorical_branch],
    remainder="drop",
)

print("Preprocessor created!")

Preprocessor created!


## 4. Regularization Tuning on Validation Set

In [8]:
print("\n[4/6] Tuning regularization parameter C...")
print("-" * 50)

# Candidate inverse-regularization strengths; each is fit on the training split
# and scored on the validation split.
C_VALUES = [0.001, 0.01, 0.1, 1, 10, 100]
tuning_results = []

for C in C_VALUES:
    start_time = time.time()
    print(f"\nTraining with C={C}...", end=" ", flush=True)

    classifier = LogisticRegression(
        C=C,
        solver="lbfgs",
        max_iter=1000,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1,
    )

    # Pipeline.fit fits its steps in place, so reusing the single `preprocessor`
    # instance would make every stored pipeline share (and silently refit) the
    # same transformer. clone() gives each candidate its own unfitted copy.
    pipeline = Pipeline(
        [
            ("preprocessor", clone(preprocessor)),
            ("classifier", classifier),
        ]
    )

    pipeline.fit(X_train, y_train)

    y_val_pred = pipeline.predict(X_val)
    y_val_proba = pipeline.predict_proba(X_val)[:, 1]

    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred, average="macro")
    val_auc = roc_auc_score(y_val, y_val_proba)

    elapsed = time.time() - start_time

    # Warnings are filtered at the top of the notebook, so check the iteration
    # count directly: reaching max_iter means the solver stopped at the cap.
    n_iter = int(np.max(classifier.n_iter_))
    converged = n_iter < classifier.max_iter

    # "pipeline" keeps the fitted model so the best candidate can be reused later;
    # "n_iter"/"converged" are extra diagnostics that existing consumers can ignore.
    result = {
        "C": C,
        "accuracy": val_accuracy,
        "f1_macro": val_f1,
        "auc_roc": val_auc,
        "pipeline": pipeline,
        "n_iter": n_iter,
        "converged": converged,
    }
    tuning_results.append(result)

    print(f"Done ({elapsed:.1f}s)")
    print(f"  Accuracy: {val_accuracy:.4f} | F1: {val_f1:.4f} | AUC-ROC: {val_auc:.4f}")
    if not converged:
        print(f"  WARNING: lbfgs reached max_iter={classifier.max_iter}; the fit may not have converged.")


[4/6] Tuning regularization parameter C...
--------------------------------------------------

Training with C=0.001... Done (7.0s)
  Accuracy: 0.8109 | F1: 0.7751 | AUC-ROC: 0.8991

Training with C=0.01... Done (7.6s)
  Accuracy: 0.8112 | F1: 0.7754 | AUC-ROC: 0.8992

Training with C=0.1... Done (7.3s)
  Accuracy: 0.8112 | F1: 0.7754 | AUC-ROC: 0.8992

Training with C=1... Done (6.4s)
  Accuracy: 0.8112 | F1: 0.7754 | AUC-ROC: 0.8992

Training with C=10... Done (7.1s)
  Accuracy: 0.8112 | F1: 0.7754 | AUC-ROC: 0.8992

Training with C=100... Done (6.4s)
  Accuracy: 0.8112 | F1: 0.7754 | AUC-ROC: 0.8992


In [9]:
# Pick the tuning run with the highest validation AUC-ROC (the first one wins on ties).
best_result = max(tuning_results, key=lambda run: run["auc_roc"])
best_C = best_result["C"]
best_pipeline = best_result["pipeline"]

print("\n" + "-" * 50)
print(f"Best C: {best_C} (AUC-ROC: {best_result['auc_roc']:.4f})")

# One row per candidate C with its validation scores; the fitted pipelines are
# deliberately left out of the table.
print("\nTuning Summary:")
summary_rows = []
for run in tuning_results:
    summary_rows.append(
        {
            "C": run["C"],
            "Accuracy": run["accuracy"],
            "F1 Macro": run["f1_macro"],
            "AUC-ROC": run["auc_roc"],
        }
    )
tuning_df = pd.DataFrame(summary_rows)
display(tuning_df)


--------------------------------------------------
Best C: 100 (AUC-ROC: 0.8992)

Tuning Summary:


Unnamed: 0,C,Accuracy,F1 Macro,AUC-ROC
0,0.001,0.810907,0.775121,0.899068
1,0.01,0.81116,0.77541,0.899182
2,0.1,0.811156,0.775363,0.899197
3,1.0,0.81116,0.77537,0.899195
4,10.0,0.81116,0.775394,0.899182
5,100.0,0.811184,0.77541,0.899212


## 5. Final Training and Evaluation

In [10]:
print("\n[5/6] Final training on train+validation...")

# Refit on train and validation combined so the final model sees every row that
# is not part of the held-out test set.
X_train_final = pd.concat([X_train, X_val], axis=0)
y_train_final = pd.concat([y_train, y_val], axis=0)

print(f"Final training set: {len(X_train_final):,} samples")

# A fresh, unfitted preprocessor with the same configuration as the tuning one.
final_numeric_branch = ("num", StandardScaler(), NUMERICAL_FEATURES)
final_categorical_branch = (
    "cat",
    OneHotEncoder(handle_unknown="ignore", sparse_output=False, max_categories=100),
    CATEGORICAL_FEATURES,
)
final_preprocessor = ColumnTransformer(
    transformers=[final_numeric_branch, final_categorical_branch],
    remainder="drop",
)

# Same classifier settings as during tuning, using the selected best_C.
final_classifier = LogisticRegression(
    C=best_C,
    solver="lbfgs",
    max_iter=1000,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)

final_pipeline = Pipeline(
    [
        ("preprocessor", final_preprocessor),
        ("classifier", final_classifier),
    ]
)

# Time only the fit itself.
start_time = time.time()
final_pipeline.fit(X_train_final, y_train_final)
train_time = time.time() - start_time
print(f"Training complete ({train_time:.1f}s)")


[5/6] Final training on train+validation...
Final training set: 2,343,997 samples
Training complete (10.4s)


In [11]:
# Score the final pipeline on the held-out test set.
print("\nEvaluating on test set...")
y_test_pred = final_pipeline.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class.
y_test_proba = final_pipeline.predict_proba(X_test)[:, 1]

# Headline metrics: accuracy, macro-averaged precision/recall/F1, and AUC-ROC.
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision_macro = precision_score(y_test, y_test_pred, average="macro")
test_recall_macro = recall_score(y_test, y_test_pred, average="macro")
test_f1_macro = f1_score(y_test, y_test_pred, average="macro")
test_auc_roc = roc_auc_score(y_test, y_test_proba)

# average=None returns one score per class (index 0 = class 0, index 1 = class 1).
test_precision_per_class = precision_score(y_test, y_test_pred, average=None)
test_recall_per_class = recall_score(y_test, y_test_pred, average=None)
test_f1_per_class = f1_score(y_test, y_test_pred, average=None)

section_rule = "=" * 50
print("\n" + section_rule)
print("TEST SET RESULTS (V2 Features)")
print(section_rule)
print(f"Accuracy:  {test_accuracy:.4f}")
print(f"Precision: {test_precision_macro:.4f} (macro)")
print(f"Recall:    {test_recall_macro:.4f} (macro)")
print(f"F1-Score:  {test_f1_macro:.4f} (macro)")
print(f"AUC-ROC:   {test_auc_roc:.4f}")

print("\nPer-Class Metrics:")
for class_index, class_title in enumerate(("Class 0 (Not Purchased)", "Class 1 (Purchased)")):
    print(f"  {class_title}:")
    print(f"    Precision: {test_precision_per_class[class_index]:.4f}")
    print(f"    Recall:    {test_recall_per_class[class_index]:.4f}")
    print(f"    F1-Score:  {test_f1_per_class[class_index]:.4f}")


Evaluating on test set...

TEST SET RESULTS (V2 Features)
Accuracy:  0.8116
Precision: 0.7616 (macro)
Recall:    0.8062 (macro)
F1-Score:  0.7759 (macro)
AUC-ROC:   0.8996

Per-Class Metrics:
  Class 0 (Not Purchased):
    Precision: 0.9193
    Recall:    0.8174
    F1-Score:  0.8654
  Class 1 (Purchased):
    Precision: 0.6039
    Recall:    0.7951
    F1-Score:  0.6864


In [12]:
# Confusion matrix on the test set: rows are true classes, columns are predictions.
cm = confusion_matrix(y_test, y_test_pred)
true_neg, false_pos = cm[0, 0], cm[0, 1]
false_neg, true_pos = cm[1, 0], cm[1, 1]

print("Confusion Matrix:")
print(f"  [[TN={true_neg:,}  FP={false_pos:,}]")
print(f"   [FN={false_neg:,}  TP={true_pos:,}]]")

# Per-class precision/recall/F1 table with readable class names.
class_names = ["Not Purchased", "Purchased"]
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred, target_names=class_names))

Confusion Matrix:
  [[TN=354,762  FP=79,259]
   [FN=31,143  TP=120,836]]

Classification Report:
               precision    recall  f1-score   support

Not Purchased       0.92      0.82      0.87    434021
    Purchased       0.60      0.80      0.69    151979

     accuracy                           0.81    586000
    macro avg       0.76      0.81      0.78    586000
 weighted avg       0.84      0.81      0.82    586000



In [13]:
# Recompute the full metric set on the validation split for the pipeline selected
# during tuning (fit on the training split only), for comparison with the test scores.
y_val_pred_best = best_pipeline.predict(X_val)
y_val_proba_best = best_pipeline.predict_proba(X_val)[:, 1]

# Headline metrics.
val_accuracy = accuracy_score(y_val, y_val_pred_best)
val_precision_macro = precision_score(y_val, y_val_pred_best, average="macro")
val_recall_macro = recall_score(y_val, y_val_pred_best, average="macro")
val_f1_macro = f1_score(y_val, y_val_pred_best, average="macro")
val_auc_roc = roc_auc_score(y_val, y_val_proba_best)

# Per-class scores (index 0 = class 0, index 1 = class 1).
val_precision_per_class = precision_score(y_val, y_val_pred_best, average=None)
val_recall_per_class = recall_score(y_val, y_val_pred_best, average=None)
val_f1_per_class = f1_score(y_val, y_val_pred_best, average=None)

print("Validation Set Metrics (for comparison):")
for summary_line in (
    f"  Accuracy:  {val_accuracy:.4f}",
    f"  Precision: {val_precision_macro:.4f} (macro)",
    f"  Recall:    {val_recall_macro:.4f} (macro)",
    f"  F1-Score:  {val_f1_macro:.4f} (macro)",
    f"  AUC-ROC:   {val_auc_roc:.4f}",
):
    print(summary_line)

Validation Set Metrics (for comparison):
  Accuracy:  0.8112
  Precision: 0.7611 (macro)
  Recall:    0.8057 (macro)
  F1-Score:  0.7754 (macro)
  AUC-ROC:   0.8992


## 6. Save Metrics to JSON

In [14]:
print("\n[6/6] Saving metrics to JSON...")


def _score_family(macro_value, per_class_values):
    """Round one score family: the macro average plus the class-0 and class-1 values."""
    return {
        "macro": round(macro_value, 4),
        "class_0": round(float(per_class_values[0]), 4),
        "class_1": round(float(per_class_values[1]), 4),
    }


def _split_report(accuracy, precision, recall, f1, auc_roc):
    """Assemble the metrics section for one split; precision/recall/f1 are (macro, per_class) pairs."""
    return {
        "accuracy": round(accuracy, 4),
        "precision": _score_family(*precision),
        "recall": _score_family(*recall),
        "f1": _score_family(*f1),
        "auc_roc": round(auc_roc, 4),
    }


# Validation scores of every candidate C, rounded for the report.
tuning_report = []
for run in tuning_results:
    tuning_report.append(
        {
            "C": run["C"],
            "val_accuracy": round(run["accuracy"], 4),
            "val_f1_macro": round(run["f1_macro"], 4),
            "val_auc_roc": round(run["auc_roc"], 4),
        }
    )

# Everything needed to describe this run: configuration, data sizes, features,
# tuning history, validation/test scores, and the test confusion matrix.
metrics = {
    "model": "LogisticRegression",
    "version": "v2_enriched_features",
    "timestamp": datetime.now().isoformat(),
    "hyperparameters": {
        "best_C": best_C,
        "solver": "lbfgs",
        "max_iter": 1000,
        "class_weight": "balanced",
    },
    "data_split": {
        "train_size": int(len(X_train)),
        "val_size": int(len(X_val)),
        "test_size": int(len(X_test)),
        "train_val_size": int(len(X_train_final)),
        "total_size": int(len(X)),
    },
    "features": {
        "numerical": NUMERICAL_FEATURES,
        "categorical": CATEGORICAL_FEATURES,
        "total_count": len(ALL_FEATURES),
        "preprocessing": {
            "numerical": "StandardScaler",
            "categorical": "OneHotEncoder (max_categories=100)",
        },
    },
    "regularization_tuning": tuning_report,
    "validation_metrics": _split_report(
        val_accuracy,
        (val_precision_macro, val_precision_per_class),
        (val_recall_macro, val_recall_per_class),
        (val_f1_macro, val_f1_per_class),
        val_auc_roc,
    ),
    "test_metrics": _split_report(
        test_accuracy,
        (test_precision_macro, test_precision_per_class),
        (test_recall_macro, test_recall_per_class),
        (test_f1_macro, test_f1_per_class),
        test_auc_roc,
    ),
    "confusion_matrix": {
        "true_negative": int(cm[0, 0]),
        "false_positive": int(cm[0, 1]),
        "false_negative": int(cm[1, 0]),
        "true_positive": int(cm[1, 1]),
    },
}

# Resolve the output location relative to the working directory and make sure
# its parent folder exists before writing.
metrics_path = Path("../../metrics/logistic_regression_v2_metrics.json").resolve()
metrics_path.parent.mkdir(parents=True, exist_ok=True)

with open(metrics_path, "w") as f:
    json.dump(metrics, f, indent=2)

print(f"Metrics saved to: {metrics_path}")
closing_rule = "=" * 70
print("\n" + closing_rule)
print("TRAINING COMPLETE!")
print(closing_rule)


[6/6] Saving metrics to JSON...
Metrics saved to: /Users/jky/Library/CloudStorage/GoogleDrive-lethanhquang094@gmail.com/My Drive/FPT/Semester_4/DAP391m/Cart-to-Purchase-Conversion-Prediction/model_pipeline/metrics/logistic_regression_v2_metrics.json

TRAINING COMPLETE!


## Comparison: V1 vs V2

In [15]:
# Side-by-side comparison of this run's test metrics with the V1 baseline, if the
# baseline's metrics file exists at the expected relative path.
v1_metrics_path = Path("../../metrics/logistic_regression_metrics.json")

if not v1_metrics_path.exists():
    print("V1 metrics not found. Run the baseline notebook first to compare.")
else:
    with open(v1_metrics_path, 'r') as f:
        v1_metrics = json.load(f)

    comparison_rule = "=" * 60
    print("\n" + comparison_rule)
    print("COMPARISON: V1 (Baseline) vs V2 (Enriched Features)")
    print(comparison_rule)

    # Pull the five headline test metrics from each report, in the order of the
    # "Metric" labels below.
    v1_test = v1_metrics["test_metrics"]
    v2_test = metrics["test_metrics"]
    v1_values = [
        v1_test["accuracy"],
        v1_test["precision"]["macro"],
        v1_test["recall"]["macro"],
        v1_test["f1"]["macro"],
        v1_test["auc_roc"],
    ]
    v2_values = [
        v2_test["accuracy"],
        v2_test["precision"]["macro"],
        v2_test["recall"]["macro"],
        v2_test["f1"]["macro"],
        v2_test["auc_roc"],
    ]

    comparison_data = {
        "Metric": ["Accuracy", "Precision (macro)", "Recall (macro)", "F1-Score (macro)", "AUC-ROC"],
        "V1 Baseline": v1_values,
        "V2 Enriched": v2_values,
    }

    comparison_df = pd.DataFrame(comparison_data)
    # Absolute gain, then gain relative to the V1 value as a percentage (2 decimals).
    absolute_gain = comparison_df["V2 Enriched"] - comparison_df["V1 Baseline"]
    comparison_df["Improvement"] = absolute_gain
    relative_gain = comparison_df["Improvement"] / comparison_df["V1 Baseline"] * 100
    comparison_df["Improvement %"] = relative_gain.round(2)

    display(comparison_df)


COMPARISON: V1 (Baseline) vs V2 (Enriched Features)


Unnamed: 0,Metric,V1 Baseline,V2 Enriched,Improvement,Improvement %
0,Accuracy,0.5502,0.8116,0.2614,47.51
1,Precision (macro),0.5441,0.7616,0.2175,39.97
2,Recall (macro),0.5572,0.8062,0.249,44.69
3,F1-Score (macro),0.5196,0.7759,0.2563,49.33
4,AUC-ROC,0.577,0.8996,0.3226,55.91
