In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score, average_precision_score

# Baseline CTR model: one-hot + scaled counts -> logistic regression.
# Every name assigned below is a notebook-level global and is kept stable so
# that later cells can keep using df / X_train / pipe / proba / pred.

# --- 1) Load the exported sample --------------------------------------------
path = "gs://avazu-ctr/processed/fe_v1_sample/part-000000000000.csv"
df = pd.read_csv(path)

# --- 2) Column roles ---------------------------------------------------------
# The label, the row id and the non-numeric date never enter the features.
drop_cols = ["click", "id", "date"]
num_cols = [
    "hour_of_day",
    "day_of_week",
    "is_weekend",
    "site_freq",
    "app_freq",
    "device_freq",
]

candidate_feature_cols = [col for col in df.columns if col not in drop_cols]

# Any candidate stored with object dtype is treated as categorical.
# NOTE(review): a candidate that is neither object-dtype nor named in
# num_cols ends up in no list and is therefore left out of X -- confirm that
# is intended for any integer-coded categorical columns in this export.
cat_cols = [
    col
    for col in candidate_feature_cols
    if df[col].dtype == "object"
]

# Keep only the listed numeric columns that this export actually contains.
num_cols = [col for col in num_cols if col in df.columns]

# --- 3) Feature matrix, label, stratified hold-out ---------------------------
feature_cols = cat_cols + num_cols
X = df[feature_cols]
y = df["click"].astype(int)

# stratify=y keeps the click rate the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- 4) Preprocessing + model ------------------------------------------------
# Categories not seen during fit are encoded as all-zero rows, not errors.
cat_encoder = OneHotEncoder(handle_unknown="ignore")
# Scale by standard deviation only; skipping centering keeps the data sparse.
num_scaler = StandardScaler(with_mean=False)

preprocess = ColumnTransformer(
    transformers=[
        ("cat", cat_encoder, cat_cols),
        ("num", num_scaler, num_cols),
    ],
    sparse_threshold=1.0,  # stay sparse unless the stacked output is fully dense
)

clf = LogisticRegression(
    penalty="l2",
    C=1.0,
    solver="saga",  # suited to large, sparse one-hot matrices
    max_iter=200,
    class_weight="balanced",  # reweights classes inversely to their frequency
    n_jobs=-1,
    random_state=42,
)

pipe = Pipeline(
    steps=[
        ("prep", preprocess),
        ("model", clf),
    ]
)

# --- 5) Train ----------------------------------------------------------------
pipe.fit(X_train, y_train)

# --- 6) Evaluate -------------------------------------------------------------
# Column 1 of predict_proba is the second class in classes_ -- presumably the
# click label (1); verify the label column only contains 0/1.
proba = pipe.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

# (label, metric, predictions it consumes); each is computed and printed in turn.
metric_specs = [
    ("AUC:", roc_auc_score, proba),
    ("PR-AUC:", average_precision_score, proba),
    ("LogLoss:", log_loss, proba),
    ("Accuracy:", accuracy_score, pred),
]
for label, metric_fn, y_hat in metric_specs:
    print(label, metric_fn(y_test, y_hat))

# Sanity check: the stratified split should give near-identical click rates.
print("Positives in train/test:", y_train.mean(), y_test.mean())


AUC: 0.7228998731069742
PR-AUC: 0.33750028120750164
LogLoss: 0.5307046715801752
Accuracy: 0.7122319055565857
Positives in train/test: 0.1694588923192934 0.16945423079300329


#  Phase 2 — Baseline Logistic Regression (CTR)

**Data:** `gs://avazu-ctr/processed/fe_v1_sample/part-000000000000.csv`  
**Target:** `click` (binary)  
**Features used:**
- **Time-based:** `hour_of_day`, `day_of_week`, `is_weekend`
- **Count-based:** `site_freq`, `app_freq`, `device_freq`
- **Categorical IDs:** every remaining object-dtype (string) column, e.g. site/app/device identifiers (one-hot encoded)
- **Dropped:** `id`, `date`

**Preprocessing pipeline:**
- `OneHotEncoder(handle_unknown='ignore')` for categorical columns  
- `StandardScaler(with_mean=False)` for numeric columns  
- Combined with `ColumnTransformer` and trained in an `sklearn.Pipeline`

**Model:** `LogisticRegression(solver='saga', penalty='l2', C=1.0, class_weight='balanced', max_iter=200, n_jobs=-1, random_state=42)`

###  Metrics (Test Set)
- **ROC AUC:** 0.7229  
- **PR AUC:** 0.3375  
- **LogLoss:** 0.5307  
- **Accuracy:** 0.7122 (at a 0.5 probability threshold; always predicting "no click" would score ~0.83)  
- **Positive rate (train/test):** 0.1695 / 0.1695

### Interpretation
- AUC ~0.72 is a **solid baseline** for sparse CTR data with simple features.
- PR-AUC ~0.34 is roughly twice the ~0.17 a random ranking would score at this positive rate, which reflects reasonable ranking ability.
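- LogLoss ~0.53 should not be read as a calibration result: `class_weight='balanced'` inflates the predicted click probabilities, and a constant prediction at the ~17% base rate would already score about 0.46. Recalibrate (or refit without class weights) before comparing LogLoss across models; expect improvements with richer features (target encoding, interactions, embeddings) or stronger models (GBDT, deep CTR).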

### What’s next (Phase 2.5 / Phase 3)
- Try **hashing trick** or **count/target encodings** to reduce dimensionality.
- Compare against **XGBoost/LightGBM** and later **PyTorch embeddings** for high-cardinality categoricals.

*Notes:* Keep this notebook as the **baseline reference** to compare future models.
