# Initial Risk Classifier Training
This notebook demonstrates how to load telemetry events from DuckDB, engineer basic features, and train a baseline LightGBM model before exporting it to ONNX.

In [None]:
import json
import duckdb
import lightgbm as lgb
import numpy as np
import pandas as pd
from pathlib import Path
from onnxmltools.convert.lightgbm import convert
from onnxmltools.utils import save_model
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


In [None]:
# Load telemetry events from DuckDB.
# fetch_df() materialises the result as a pandas DataFrame, so the connection
# is closed right after the fetch instead of holding the database file open
# for the rest of the session.
conn = duckdb.connect('../cmd/ingest/../data/feature_store.duckdb')
try:
    df = conn.execute('SELECT domain, reason, risk_score, timestamp FROM telemetry_events').fetch_df()
finally:
    conn.close()
if df.empty:
    # Fabricate data for local experimentation; the fixed seed keeps the
    # synthetic frame reproducible between runs.
    # NOTE: unlike the query result, this frame has no `timestamp` column.
    rng = np.random.default_rng(0)
    df = pd.DataFrame({
        'domain': [f'example{i}.com' for i in range(500)],
        'reason': rng.choice(['ad_block', 'premium_unlock_required'], 500),
        'risk_score': rng.random(500),
    })
# Binary target: 1 when the event reason is a premium unlock, 0 otherwise.
is_premium = df['reason'] == 'premium_unlock_required'
df['label'] = is_premium.astype(int)

# Single engineered feature: character length of the domain string.
df['domain_length'] = df['domain'].str.len()

feature_cols = ['risk_score', 'domain_length']
X, y = df[feature_cols], df['label']

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline LightGBM binary classifier.
params = dict(
    objective='binary',
    metric='auc',
    learning_rate=0.1,
    num_leaves=31,
)
train_dataset = lgb.Dataset(X_train, label=y_train)
model = lgb.train(params, train_dataset, num_boost_round=50)

# Score the held-out rows and report ROC AUC.
preds = model.predict(X_test)
auc = roc_auc_score(y_test, preds)
print(f'AUC: {auc:.3f}')


In [None]:
# Export the trained booster to ONNX and record its evaluation metric.
from onnxmltools.convert.common.data_types import FloatTensorType

# The converter's third positional argument is `initial_types`: a list of
# (input name, tensor type) pairs describing the model input, not a list of
# column names. The model takes one float tensor with a dynamic batch
# dimension and one column per entry in feature_cols.
initial_types = [('input', FloatTensorType([None, len(feature_cols)]))]
onnx_model = convert(model, 'risk_classifier', initial_types)

# Write the model and its metrics side by side, creating the directory if needed.
model_dir = Path('../models/latest')
model_dir.mkdir(parents=True, exist_ok=True)
save_model(onnx_model, str(model_dir / 'model.onnx'))
(model_dir / 'metrics.json').write_text(json.dumps({'auc': float(auc)}, indent=2))
