In [1]:
import time, pandas as pd
import pyarrow.dataset as ds
from pyarrow.fs import GcsFileSystem, FileSelector
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score, average_precision_score
import xgboost as xgb
import joblib

# GCS location (bucket/prefix, no gs:// scheme) of the exported v2 feature Parquet parts.
BUCKET_DIR = "avazu-ctr/processed/fe_v2_parquet"

# Columns requested from the Parquet parts, grouped by feature family.
COLUMNS = [
    # row identifier and label
    "id",
    "click",
    # time
    "hour_of_day",
    "day_of_week",
    "is_weekend",
    # frequencies
    "site_freq",
    "app_freq",
    "device_freq",
    # target encodings
    "te_site_id",
    "te_app_id",
    "te_device_model",
    "te_C14",
    # hashed interactions
    "hash_sitecat_devtype",
    "hash_appcat_devtype",
    "hash_c14_c17",
]

# How many Parquet parts to load: start small, bump to 120/240 later.
N_FILES = 30

In [2]:
# Connect to GCS and enumerate the Parquet parts sitting directly under BUCKET_DIR
# (non-recursive listing, files ending in ".parquet" only), sorted by path.
fs = GcsFileSystem()
infos = fs.get_file_info(FileSelector(BUCKET_DIR, recursive=False))
parts = sorted(
    info.path
    for info in infos
    if info.is_file and info.path.endswith(".parquet")
)
print("Parquet parts found:", len(parts))
assert parts, "No parquet parts found. Re-check your export folder."

# Read only the first N_FILES parts and only the columns listed in COLUMNS.
# NOTE(review): this takes a prefix of the sorted listing, not a random sample
# of parts — confirm that is representative of the full export.
t0 = time.time()
dataset_small = ds.dataset(parts[:N_FILES], filesystem=fs, format="parquet")
# Chain straight into pandas so the intermediate Arrow table is not kept alive.
df = dataset_small.to_table(columns=COLUMNS).to_pandas()
print("Loaded:", df.shape, f"in {time.time()-t0:.1f}s")
df.head()


Parquet parts found: 1500
Loaded: (15374036, 15) in 11.4s


Unnamed: 0,id,click,hour_of_day,day_of_week,is_weekend,site_freq,app_freq,device_freq,te_site_id,te_app_id,te_device_model,te_C14,hash_sitecat_devtype,hash_appcat_devtype,hash_c14_c17
0,1.43198015250962e+19,0,8,2,0,14596137,21579,33358308,0.122886,0.001092,0.107136,0.002479,-4130613151412272962,518379290858953980,4808654653650998822
1,1.4342335790673269e+19,0,7,2,0,14596137,21579,33358308,0.122886,0.001092,0.109834,0.002479,-4130613151412272962,518379290858953980,4808654653650998822
2,1.5949798049261076e+19,0,7,2,0,14596137,21579,33358308,0.122886,0.001092,0.109834,0.002479,-4130613151412272962,518379290858953980,4808654653650998822
3,6.265219636813328e+18,0,10,2,0,14596137,21579,33358308,0.122886,0.001092,0.128226,0.002479,-4130613151412272962,518379290858953980,4808654653650998822
4,7.367371011948923e+18,0,10,2,0,14596137,21579,33358308,0.122886,0.001092,0.106842,0.002479,-4130613151412272962,518379290858953980,4808654653650998822


In [3]:
# Number of rows in the loaded frame.
df.shape[0]

15374036

### prep features + split

In [4]:
target = "click"

# Features: every column except the row id and the label, coerced to numeric.
# Values that cannot be parsed become NaN and are then replaced with 0.0.
X = (
    df.drop(columns=["id", target], errors="ignore")
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0.0)
)
y = df[target].astype(int)

# Random 80/20 split, stratified on the label, with a fixed seed.
split_options = {"test_size": 0.20, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(
    "Train/Test:", X_train.shape, X_test.shape,
    "| pos rate:", y_train.mean(), y_test.mean(),
)


Train/Test: (12299228, 13) (3074808, 13) | pos rate: 0.0986866817982397 0.09868648709122652


### train XGBoost

In [5]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score, average_precision_score

# Downcast both feature frames to float32 to keep them compact.
X_train32, X_test32 = (frame.astype("float32") for frame in (X_train, X_test))

# Class-imbalance weight: negatives per positive in the training labels.
# The denominator is floored at 1.0 so a split with no positives cannot divide by zero.
pos = y_train.sum()
neg = len(y_train) - pos
spw = float(neg) / max(float(pos), 1.0)

# NOTE(review): scale_pos_weight re-weights the positive class (per the XGBoost
# docs), so the LogLoss and the 0.5-threshold accuracy printed below are computed
# on re-weighted scores — confirm this is intended before reading `proba` as a
# calibrated click probability.
xgb_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    objective="binary:logistic",
    eval_metric=["logloss", "auc", "aucpr"],
    tree_method="hist",
    random_state=42,
    scale_pos_weight=spw,
    n_jobs=-1,  # use all CPU cores
)
model = xgb.XGBClassifier(**xgb_params)
# The held-out split is passed as eval_set for metric tracking only;
# no early stopping is configured here.
model.fit(X_train32, y_train, eval_set=[(X_test32, y_test)], verbose=False)

# Positive-class scores, and hard labels at a 0.5 cut-off.
proba = model.predict_proba(X_test32)[:, 1]
pred = (proba >= 0.5).astype(int)

# (label, metric function, predictions it is scored on)
metric_rows = (
    ("AUC     :", roc_auc_score, proba),
    ("PR-AUC  :", average_precision_score, proba),
    ("LogLoss :", log_loss, proba),
    ("Accuracy:", accuracy_score, pred),
)
for label, metric, scores in metric_rows:
    print(label, metric(y_test, scores))


AUC     : 0.8348861356995055
PR-AUC  : 0.3567079322644577
LogLoss : 0.5102630613572855
Accuracy: 0.7184871380587016


In [6]:

import gcsfs
from pyarrow.fs import GcsFileSystem, FileSelector

# Report how much of the export this run actually used.
#
# BUCKET_DIR and N_FILES are deliberately NOT re-assigned here: they come from
# the config cell, and the shard listing is the `parts` list built in the load
# cell. Re-declaring them in this cell would silently reset N_FILES on a
# Run All and make the report disagree with what was really loaded into `df`.
total_parts = len(parts)                 # .parquet parts found under BUCKET_DIR
used_parts = min(N_FILES, total_parts)   # same cap as the parts[:N_FILES] slice used for loading
rows_used = len(df)                      # df is the training dataframe loaded above

print(f"Used shards: {used_parts}/{total_parts} (~{used_parts/total_parts:.1%})")
print(f"Rows used in this run: {rows_used:,}")


Used shards: 30/1500 (~2.0%)
Rows used in this run: 15,374,036


## Phase 3 — Advanced Model (XGBoost with v2 Features)

### Goal:
Go beyond the baseline Logistic Regression by adding stronger features (target encoding + hashed interactions) and training an XGBoost classifier to improve ranking (AUC) and probability quality for CTR.

### Data & Sampling
#### Source table: avazu_processed.train_v2_sample (deterministic sample from the full dataset)

Modeling format: exported to Parquet at gs://avazu-ctr/processed/fe_v2_parquet/

This run (for fast iteration on current compute):

Used shards: 30 / 1500 (~2.0% of Parquet parts; this is a share of files, not of rows)

Rows used: 15,374,036

We purposely trained on a small portion of the Parquet parts to keep the dev loop responsive on the current machine.

### v2 Feature Set

- Time: hour_of_day, day_of_week, is_weekend

- Frequencies: site_freq, app_freq, device_freq

- Target Encoding (leak-safe, smoothed): te_site_id, te_app_id, te_device_model, te_C14

- Hashed Interactions: hash_sitecat_devtype, hash_appcat_devtype, hash_c14_c17

### Model & Metrics 

- Model: XGBoost (tree_method=hist, imbalance via scale_pos_weight)

- AUC: 0.835

- PR-AUC: 0.357

- LogLoss: 0.510

- Accuracy: 0.718


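v2 features + XGBoost deliver a strong lift in ranking (AUC) and a meaningful drop in LogLoss, which is a good sign that the engineered signals are useful. Note that training uses scale_pos_weight, which up-weights clicks, so the raw scores are not calibrated to the ~9.9% base click rate; read the LogLoss and the 0.5-threshold Accuracy with that in mind until the probabilities are calibrated (see the next steps below).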

### Improving Probability Quality (next steps)

Since the end goal is better-calibrated click probabilities, here is a simple, practical path forward:

- Calibration: add Platt (sigmoid) or isotonic calibration on a held-out slice to reduce LogLoss and sharpen probabilities.

- Early stopping + light tuning: use native xgb.train early stopping; sweep a few knobs (depth 5–7, min_child_weight 1–5, subsample/colsample 0.7–0.9, mild reg_alpha).

- More data, gradually: bump the number of Parquet parts and keep early stopping on; or resize the VM.

- Temporal validation: train on earlier dates, test on later dates to mimic production drift (more realistic lift estimates).

- Feature refinements: try time-aware target encoding (per hour/day), a couple more safe interactions, and re-check for leakage.




In [13]:
# --- CELL 1: Upload new model; locate existing endpoint ---
# Project, region and bucket used by the upload and endpoint lookup below.
PROJECT = "click-through-rate-prediction"
REGION = "us-central1"
BUCKET = "avazu-ctr"

# Artifact folder inside BUCKET; point this at the NEW folder you just uploaded.
ART_DIR = "models/ctr_xgb/2025-10-08T06-28-20_named"

# Display names for the endpoint to look up and for the model being uploaded.
ENDPOINT_DISPLAY = "ctr_xgb_endpoint"
MODEL_DISPLAY = "ctr_xgb_v2_named"
from google.cloud import aiplatform, storage
import xgboost as xgb

# Pick the prebuilt Vertex AI serving image that matches the local xgboost version.
#
# The version is compared as a (major, minor) tuple. Checking the two parts
# independently (`major >= 2 and minor >= 1`) mis-routes any release whose minor
# is 0 but whose major is above 2 (e.g. 3.0) to the older 2-0 image.
ver = xgb.__version__.split(".")
major, minor = int(ver[0]), int(ver[1])
if (major, minor) >= (2, 1):
    # 2.1 and anything newer: use the newest image listed here.
    SERVING_IMAGE = "us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.2-1:latest"
elif (major, minor) >= (2, 0):
    # exactly 2.0.x
    SERVING_IMAGE = "us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.2-0:latest"
else:
    # every 1.x release falls back to the 1-7 image
    SERVING_IMAGE = "us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-7:latest"

aiplatform.init(project=PROJECT, location=REGION)

# Fail fast if either expected artifact is missing from the artifact folder.
bkt = storage.Client(project=PROJECT).bucket(BUCKET)
for artifact in ("model.bst", "feature_list.json"):
    assert bkt.blob(f"{ART_DIR}/{artifact}").exists(), f"{artifact} missing"

# Upload the artifact folder as a new model resource.
# NOTE(review): `model` re-binds the name that held the trained XGBClassifier
# in the training cell — confirm nothing later still expects the classifier.
upload_args = {
    "display_name": MODEL_DISPLAY,
    "artifact_uri": f"gs://{BUCKET}/{ART_DIR}",
    "serving_container_image_uri": SERVING_IMAGE,
    "labels": {"project": "ctr", "stage": "v2", "named": "true"},
}
model = aiplatform.Model.upload(**upload_args)
print("Uploaded model:", model.resource_name)

# Reuse the first endpoint whose display name matches; create one only if none exists.
eps = [ep for ep in aiplatform.Endpoint.list() if ep.display_name == ENDPOINT_DISPLAY]
if eps:
    endpoint = eps[0]
else:
    endpoint = aiplatform.Endpoint.create(display_name=ENDPOINT_DISPLAY)
print("Endpoint:", endpoint.resource_name)

# List the models already deployed on that endpoint as (id, display name) pairs.
deployed = [(dm.id, dm.display_name) for dm in endpoint.list_models()]
print("Currently deployed:", deployed)


Uploaded model: projects/87568676021/locations/us-central1/models/5156510522664812544
Endpoint: projects/87568676021/locations/us-central1/endpoints/2995591942085017600
Currently deployed: [('4978543570592989184', 'ctr_xgb_v2_named')]
