# DigiCow Farmer Training Adoption Challenge
## Competitive Solution v3 — Full Pipeline

**Upgrades over v2:**
1. Trainer/County/Ward/Topic adoption rate features (generalize to unseen farmers)
2. Farmer adoption features computed PER CV FOLD (fixes CV leakage)
3. Optuna hyperparameter tuning (20 trials)
4. LightGBM + Logistic Regression ensemble
5. Isotonic calibration for better Log Loss
6. Stricter monotonicity enforcement

In [None]:
# ============================================================
# Cell 1: Imports & Seed
# ============================================================
import pandas as pd
import numpy as np
import os, time, warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.isotonic import IsotonicRegression
import lightgbm as lgb
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

SEED = 42
np.random.seed(SEED)

TARGET_COLS = [
    'adopted_within_07_days',
    'adopted_within_90_days',
    'adopted_within_120_days',
]

TARGET_TO_SUB = {
    'adopted_within_07_days':  ('Target_07_AUC', 'Target_07_LogLoss'),
    'adopted_within_90_days':  ('Target_90_AUC', 'Target_90_LogLoss'),
    'adopted_within_120_days': ('Target_120_AUC', 'Target_120_LogLoss'),
}

print("All imports OK")

All imports OK


: 

In [2]:
# ============================================================
# Cell 2: Load Data
# ============================================================
train_df = pd.read_csv('Train.csv', parse_dates=['training_date'])
test_df  = pd.read_csv('Test.csv',  parse_dates=['training_date'])

print(f"Train: {train_df.shape}, Test: {test_df.shape}")
print(f"Train dates: {train_df['training_date'].min()} -> {train_df['training_date'].max()}")
print(f"Test  dates: {test_df['training_date'].min()} -> {test_df['training_date'].max()}")
for t in TARGET_COLS:
    print(f"  {t}: pos_rate = {train_df[t].mean():.4f}")

# Check farmer overlap
train_farmers = set(train_df['farmer_id'].unique())
test_farmers = set(test_df['farmer_id'].unique())
overlap = train_farmers & test_farmers
print(f"\nFarmer overlap: {len(overlap)} / {len(test_farmers)} test farmers seen in train")

Train: (16000, 17), Test: (6000, 14)
Train dates: 2024-02-13 00:00:00 -> 2025-05-23 00:00:00
Test  dates: 2025-05-26 00:00:00 -> 2026-01-19 00:00:00
  adopted_within_07_days: pos_rate = 0.1565
  adopted_within_90_days: pos_rate = 0.3469
  adopted_within_120_days: pos_rate = 0.4477

Farmer overlap: 1251 / 4225 test farmers seen in train


In [3]:
# ============================================================
# Cell 3: Feature Engineering — SAFE features only
# ============================================================
# These features do NOT depend on targets, so no CV leakage.
# Target-based aggregates (adoption rates) are in a separate function
# applied per-fold during CV.

def build_base_features(df, train_ref):
    """Features that DON'T use target values — safe to compute once."""
    out = df.copy()
    
    # --- DATE FEATURES ---
    out['training_month'] = out['training_date'].dt.month
    out['training_dow'] = out['training_date'].dt.dayofweek
    out['training_quarter'] = out['training_date'].dt.quarter
    out['training_day_of_year'] = out['training_date'].dt.dayofyear
    out['training_week'] = out['training_date'].dt.isocalendar().week.astype(int)
    ref_date = pd.Timestamp('2024-01-01')
    out['days_since_ref'] = (out['training_date'] - ref_date).dt.days
    out['is_weekend'] = (out['training_dow'] >= 5).astype(int)
    
    # --- TOPIC CATEGORY ---
    def categorize_topic(topic):
        t = str(topic).lower()
        if 'poultry' in t: return 'poultry'
        elif any(w in t for w in ['dairy', 'milk', 'cow', 'cattle']): return 'dairy'
        elif any(w in t for w in ['feed', 'nutrition', 'mineral']): return 'feeding'
        elif any(w in t for w in ['health', 'disease', 'vaccin', 'deworm', 'antimicrobial']): return 'health'
        elif any(w in t for w in ['record', 'app', 'digital', 'ndume']): return 'digital'
        elif any(w in t for w in ['breed', 'ai ', 'infertil', 'reproduct']): return 'breeding'
        elif any(w in t for w in ['market', 'business', 'profit', 'value']): return 'business'
        elif any(w in t for w in ['fertiliz', 'soil', 'compost', 'organic']): return 'crop'
        else: return 'other'
    out['topic_category'] = out['topics'].apply(categorize_topic)
    
    # --- FARMER PRIOR TRAINING COUNT (vectorized, no target info) ---
    train_sorted = train_ref[['farmer_id', 'training_date']].sort_values('training_date')
    farmer_all_dates = train_sorted.groupby('farmer_id')['training_date'].apply(
        lambda x: sorted(x.tolist())
    ).to_dict()
    
    prior_counts = np.zeros(len(out), dtype=int)
    days_since_last = np.full(len(out), -1, dtype=float)
    
    for fid, dates_list in farmer_all_dates.items():
        dates_arr = np.array(dates_list, dtype='datetime64[ns]')
        mask = out['farmer_id'] == fid
        if mask.any():
            row_dates = out.loc[mask, 'training_date'].values.astype('datetime64[ns]')
            counts = np.searchsorted(dates_arr, row_dates, side='left')
            prior_counts[mask.values] = counts
            # Days since most recent prior training
            for i, (idx, rd) in enumerate(zip(np.where(mask.values)[0], row_dates)):
                c = counts[i]
                if c > 0:
                    last_date = dates_arr[c - 1]
                    days_since_last[idx] = (rd - last_date) / np.timedelta64(1, 'D')
    
    out['farmer_prior_trainings'] = prior_counts
    out['is_first_training'] = (prior_counts == 0).astype(int)
    out['days_since_last_training'] = days_since_last
    out['farmer_total_in_train'] = out['farmer_id'].map(
        train_ref['farmer_id'].value_counts()
    ).fillna(0).astype(int)
    out['farmer_in_train'] = out['farmer_id'].isin(train_ref['farmer_id'].unique()).astype(int)
    
    # --- VOLUME / FREQUENCY (no target info) ---
    for col, name in [('trainer', 'trainer'), ('group_name', 'group'), 
                      ('topics', 'topic'), ('county', 'county'),
                      ('subcounty', 'subcounty'), ('ward', 'ward')]:
        vol = train_ref[col].value_counts().to_dict()
        out[f'{name}_volume'] = out[col].map(vol).fillna(0)
    
    topic_freq = train_ref['topics'].value_counts(normalize=True).to_dict()
    out['topic_frequency'] = out['topics'].map(topic_freq).fillna(0)
    
    # --- INTERACTIONS ---
    out['coop_x_has_topic'] = out['belong_to_cooperative'] * out['has_topic_trained_on']
    
    # --- GROUP-LEVEL (no target info) ---
    group_coop = train_ref.groupby('group_name')['belong_to_cooperative'].mean().to_dict()
    out['group_coop_rate'] = out['group_name'].map(group_coop).fillna(0.5)
    group_size = train_ref.groupby('group_name')['farmer_id'].nunique().to_dict()
    out['group_unique_farmers'] = out['group_name'].map(group_size).fillna(1)
    
    # --- TRAINER EXPERIENCE (# unique farmers, # unique topics — no target) ---
    trainer_farmers = train_ref.groupby('trainer')['farmer_id'].nunique().to_dict()
    out['trainer_unique_farmers'] = out['trainer'].map(trainer_farmers).fillna(0)
    trainer_topics = train_ref.groupby('trainer')['topics'].nunique().to_dict()
    out['trainer_unique_topics'] = out['trainer'].map(trainer_topics).fillna(0)
    
    # --- COUNTY-LEVEL STATS ---
    county_farmers = train_ref.groupby('county')['farmer_id'].nunique().to_dict()
    out['county_unique_farmers'] = out['county'].map(county_farmers).fillna(0)
    
    return out


def add_target_agg_features(df, train_fold, target_cols):
    """
    Add adoption-rate features computed FROM train_fold only.
    Called per-fold during CV to prevent leakage.
    For final test predictions, train_fold = full train set.
    """
    out = df.copy()
    
    for t in target_cols:
        # Farmer adoption rates
        farm_agg = train_fold.groupby('farmer_id')[t].agg(['mean', 'sum', 'count'])
        out[f'farmer_{t}_rate'] = out['farmer_id'].map(farm_agg['mean']).fillna(-1)
        out[f'farmer_{t}_sum'] = out['farmer_id'].map(farm_agg['sum']).fillna(-1)
        out[f'farmer_{t}_count'] = out['farmer_id'].map(farm_agg['count']).fillna(0)
        
        # Trainer adoption rates
        trainer_agg = train_fold.groupby('trainer')[t].mean().to_dict()
        out[f'trainer_{t}_rate'] = out['trainer'].map(trainer_agg).fillna(train_fold[t].mean())
        
        # County adoption rates
        county_agg = train_fold.groupby('county')[t].mean().to_dict()
        out[f'county_{t}_rate'] = out['county'].map(county_agg).fillna(train_fold[t].mean())
        
        # Ward adoption rates  
        ward_agg = train_fold.groupby('ward')[t].mean().to_dict()
        out[f'ward_{t}_rate'] = out['ward'].map(ward_agg).fillna(train_fold[t].mean())
        
        # Topic category adoption rates
        topic_agg = train_fold.groupby('topic_category')[t].mean().to_dict()
        out[f'topiccat_{t}_rate'] = out['topic_category'].map(topic_agg).fillna(train_fold[t].mean())
        
        # Group adoption rates
        group_agg = train_fold.groupby('group_name')[t].mean().to_dict()
        out[f'group_{t}_rate'] = out['group_name'].map(group_agg).fillna(train_fold[t].mean())
    
    return out

print("Feature engineering functions defined.")

Feature engineering functions defined.


In [4]:
# ============================================================
# Cell 4: Apply Base Features
# ============================================================
t0 = time.time()

train_fe = build_base_features(train_df, train_ref=train_df)
test_fe  = build_base_features(test_df,  train_ref=train_df)

elapsed = time.time() - t0
print(f"Base features built in {elapsed:.1f}s")
print(f"Train: {train_fe.shape}, Test: {test_fe.shape}")

new_cols = [c for c in train_fe.columns if c not in train_df.columns]
print(f"{len(new_cols)} new base columns: {new_cols}")

Base features built in 45.9s
Train: (16000, 43), Test: (6000, 40)
26 new base columns: ['training_month', 'training_dow', 'training_quarter', 'training_day_of_year', 'training_week', 'days_since_ref', 'is_weekend', 'topic_category', 'farmer_prior_trainings', 'is_first_training', 'days_since_last_training', 'farmer_total_in_train', 'farmer_in_train', 'trainer_volume', 'group_volume', 'topic_volume', 'county_volume', 'subcounty_volume', 'ward_volume', 'topic_frequency', 'coop_x_has_topic', 'group_coop_rate', 'group_unique_farmers', 'trainer_unique_farmers', 'trainer_unique_topics', 'county_unique_farmers']


In [5]:
# ============================================================
# Cell 5: Define Feature Columns & Label Encode
# ============================================================
CAT_COLS = [
    'gender', 'registration', 'age', 'trainer',
    'group_name', 'county', 'subcounty', 'ward',
    'topics', 'topic_category',
]

# Base numeric features (no target info)
BASE_NUM_COLS = [
    'belong_to_cooperative', 'has_topic_trained_on',
    'training_month', 'training_dow', 'training_quarter',
    'training_day_of_year', 'training_week', 'days_since_ref', 'is_weekend',
    'farmer_prior_trainings', 'is_first_training', 'days_since_last_training',
    'farmer_total_in_train', 'farmer_in_train',
    'trainer_volume', 'group_volume', 'topic_volume',
    'county_volume', 'subcounty_volume', 'ward_volume',
    'topic_frequency', 'coop_x_has_topic',
    'group_coop_rate', 'group_unique_farmers',
    'trainer_unique_farmers', 'trainer_unique_topics',
    'county_unique_farmers',
]

# Target-dependent columns (added per-fold) — listed here for reference
TARGET_AGG_COLS = []
for t in TARGET_COLS:
    TARGET_AGG_COLS.extend([
        f'farmer_{t}_rate', f'farmer_{t}_sum', f'farmer_{t}_count',
        f'trainer_{t}_rate', f'county_{t}_rate', f'ward_{t}_rate',
        f'topiccat_{t}_rate', f'group_{t}_rate',
    ])

BASE_FEATURE_COLS = CAT_COLS + BASE_NUM_COLS  # used for base data
ALL_FEATURE_COLS = BASE_FEATURE_COLS + TARGET_AGG_COLS  # full feature set per fold

# Label encode categoricals
label_encoders = {}
for col in CAT_COLS:
    le = LabelEncoder()
    combined = pd.concat([train_fe[col], test_fe[col]], axis=0).astype(str)
    le.fit(combined)
    train_fe[col] = le.transform(train_fe[col].astype(str))
    test_fe[col]  = le.transform(test_fe[col].astype(str))
    label_encoders[col] = le

print(f"Base features: {len(BASE_FEATURE_COLS)}")
print(f"Target-agg features (per fold): {len(TARGET_AGG_COLS)}")
print(f"Total features per fold: {len(ALL_FEATURE_COLS)}")

Base features: 37
Target-agg features (per fold): 24
Total features per fold: 61


In [6]:
# ============================================================
# Cell 6: Target Encoding + Competition Metric
# ============================================================
TE_COLS = ['ward', 'group_name', 'topics', 'trainer', 'subcounty', 'county']

def add_target_encoding(train_x, train_y, test_x, cols, smooth=10):
    """Smoothed target encoding — applied per fold."""
    global_mean = train_y.mean()
    new_train = train_x.copy()
    new_test = test_x.copy()
    for col in cols:
        stats = train_x[[col]].copy()
        stats['_target'] = train_y.values
        agg = stats.groupby(col)['_target'].agg(['mean', 'count'])
        agg['te'] = (agg['count'] * agg['mean'] + smooth * global_mean) / (agg['count'] + smooth)
        te_map = agg['te'].to_dict()
        new_train[f'{col}_te'] = new_train[col].map(te_map).fillna(global_mean)
        new_test[f'{col}_te']  = new_test[col].map(te_map).fillna(global_mean)
    return new_train, new_test

def competition_metric(y_true, y_pred):
    """75% LogLoss + 25% (1 - ROC_AUC). Lower = better."""
    ll = log_loss(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    combined = 0.75 * ll + 0.25 * (1 - auc)
    return ll, auc, combined

print("Target encoding & metric functions defined.")

Target encoding & metric functions defined.


In [8]:
# ============================================================
# Cell 7: Optuna Hyperparameter Tuning
# ============================================================
N_FOLDS_TUNE = 3
TUNE_TARGET = 'adopted_within_07_days'

# ID columns needed for target-agg feature mapping
ID_MAP_COLS = ['farmer_id', 'trainer', 'county', 'ward', 'topic_category', 'group_name']

def optuna_objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 15, 127),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 200),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'subsample_freq': 1,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True),
        'max_bin': 255,
        'random_state': SEED,
        'verbose': -1,
    }
    
    y = train_fe[TUNE_TARGET].values
    # Include ID columns alongside features for mapping
    extra_cols = [c for c in ID_MAP_COLS if c not in BASE_FEATURE_COLS]
    X_base = train_fe[BASE_FEATURE_COLS + extra_cols].copy()
    
    skf = StratifiedKFold(n_splits=N_FOLDS_TUNE, shuffle=True, random_state=SEED)
    oof = np.zeros(len(y))
    
    for fold, (tr_idx, val_idx) in enumerate(skf.split(X_base, y)):
        X_tr = X_base.iloc[tr_idx].copy()
        X_val = X_base.iloc[val_idx].copy()
        y_tr, y_val = y[tr_idx], y[val_idx]
        
        # Add target-agg features from training fold only
        train_fold_df = train_fe.iloc[tr_idx]
        X_tr = add_target_agg_features(X_tr, train_fold_df, TARGET_COLS)
        X_val = add_target_agg_features(X_val, train_fold_df, TARGET_COLS)
        
        # Add target encoding
        X_tr, X_val = add_target_encoding(
            X_tr, pd.Series(y_tr, index=X_tr.index), X_val, TE_COLS
        )
        
        # Drop ID mapping cols that aren't features
        drop_cols = [c for c in extra_cols if c in X_tr.columns and c not in BASE_FEATURE_COLS]
        X_tr = X_tr.drop(columns=drop_cols, errors='ignore')
        X_val = X_val.drop(columns=drop_cols, errors='ignore')
        
        feat_cols = list(X_tr.columns)
        dtrain = lgb.Dataset(X_tr, label=y_tr, categorical_feature=CAT_COLS, free_raw_data=False)
        dval = lgb.Dataset(X_val, label=y_val, categorical_feature=CAT_COLS, reference=dtrain, free_raw_data=False)
        
        model = lgb.train(
            params, dtrain, num_boost_round=2000, valid_sets=[dval],
            callbacks=[lgb.early_stopping(100, verbose=False), lgb.log_evaluation(0)]
        )
        oof[val_idx] = np.clip(model.predict(X_val[feat_cols]), 1e-7, 1 - 1e-7)
    
    ll, auc, combined = competition_metric(y, oof)
    return combined

print("Starting Optuna tuning (20 trials)...")
t0 = time.time()
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=SEED))
study.optimize(optuna_objective, n_trials=20, show_progress_bar=False)

print(f"\nOptuna done in {time.time()-t0:.0f}s")
print(f"Best combined score: {study.best_value:.5f}")
print(f"Best params: {study.best_params}")

Starting Optuna tuning (20 trials)...

Optuna done in 63s
Best combined score: 0.18227
Best params: {'learning_rate': 0.011615865989246453, 'num_leaves': 122, 'max_depth': 12, 'min_child_samples': 166, 'subsample': 0.6523068845866853, 'colsample_bytree': 0.5488360570031919, 'reg_alpha': 0.2637333993381525, 'reg_lambda': 0.015876781526923997}


In [9]:
# ============================================================
# Cell 8: LightGBM Training with Best Params + Proper CV
# ============================================================
N_FOLDS = 5
NUM_BOOST_ROUND = 3000

lgb_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'max_bin': 255,
    'random_state': SEED,
    'verbose': -1,
    'subsample_freq': 1,
}
lgb_params.update(study.best_params)
print(f"LightGBM params: {lgb_params}\n")

extra_cols = [c for c in ID_MAP_COLS if c not in BASE_FEATURE_COLS]
X_base_train = train_fe[BASE_FEATURE_COLS + extra_cols].copy()
X_base_test  = test_fe[BASE_FEATURE_COLS + extra_cols].copy()

oof_preds_lgb = {}
test_preds_lgb = {}
cv_scores = {}

for target in TARGET_COLS:
    print(f"\n{'='*60}")
    print(f"Training LightGBM: {target} (pos_rate={train_fe[target].mean():.4f})")
    print(f"{'='*60}")
    
    y = train_fe[target].values
    oof = np.zeros(len(train_fe))
    test_pred_folds = np.zeros(len(test_fe))
    
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_base_train, y)):
        X_tr = X_base_train.iloc[train_idx].copy()
        X_val = X_base_train.iloc[val_idx].copy()
        y_tr, y_val = y[train_idx], y[val_idx]
        X_te = X_base_test.copy()
        
        # Add target-agg features from TRAINING FOLD only
        train_fold_df = train_fe.iloc[train_idx]
        X_tr = add_target_agg_features(X_tr, train_fold_df, TARGET_COLS)
        X_val = add_target_agg_features(X_val, train_fold_df, TARGET_COLS)
        X_te = add_target_agg_features(X_te, train_fold_df, TARGET_COLS)
        
        # Target encoding within fold
        X_tr, X_val = add_target_encoding(
            X_tr, pd.Series(y_tr, index=X_tr.index), X_val, TE_COLS
        )
        X_tr, X_te = add_target_encoding(
            X_tr, pd.Series(y_tr, index=X_tr.index), X_te, TE_COLS
        )
        
        # Drop ID mapping cols
        drop_cols = [c for c in extra_cols if c in X_tr.columns and c not in BASE_FEATURE_COLS]
        X_tr = X_tr.drop(columns=drop_cols, errors='ignore')
        X_val = X_val.drop(columns=drop_cols, errors='ignore')
        X_te = X_te.drop(columns=drop_cols, errors='ignore')
        
        feat_cols = list(X_tr.columns)
        
        dtrain = lgb.Dataset(X_tr[feat_cols], label=y_tr,
                             categorical_feature=CAT_COLS, free_raw_data=False)
        dval = lgb.Dataset(X_val[feat_cols], label=y_val,
                           categorical_feature=CAT_COLS, reference=dtrain, free_raw_data=False)
        
        model = lgb.train(
            lgb_params, dtrain, num_boost_round=NUM_BOOST_ROUND,
            valid_sets=[dval],
            callbacks=[lgb.early_stopping(150, verbose=False), lgb.log_evaluation(0)]
        )
        
        val_pred = np.clip(model.predict(X_val[feat_cols]), 1e-7, 1 - 1e-7)
        test_pred = np.clip(model.predict(X_te[feat_cols]), 1e-7, 1 - 1e-7)
        
        oof[val_idx] = val_pred
        test_pred_folds += test_pred / N_FOLDS
        
        ll, auc, combined = competition_metric(y_val, val_pred)
        print(f"  Fold {fold+1}: LL={ll:.5f} AUC={auc:.5f} Combined={combined:.5f} (iter={model.best_iteration})")
    
    oof = np.clip(oof, 1e-7, 1 - 1e-7)
    oof_ll, oof_auc, oof_combined = competition_metric(y, oof)
    print(f"  >>> OOF: LL={oof_ll:.5f}  AUC={oof_auc:.5f}  Combined={oof_combined:.5f}")
    
    oof_preds_lgb[target] = oof
    test_preds_lgb[target] = test_pred_folds
    cv_scores[target] = (oof_ll, oof_auc, oof_combined)

print(f"\n{'='*60}")
print("LightGBM training complete!")
print(f"{'='*60}")

LightGBM params: {'objective': 'binary', 'metric': 'binary_logloss', 'boosting_type': 'gbdt', 'max_bin': 255, 'random_state': 42, 'verbose': -1, 'subsample_freq': 1, 'learning_rate': 0.011615865989246453, 'num_leaves': 122, 'max_depth': 12, 'min_child_samples': 166, 'subsample': 0.6523068845866853, 'colsample_bytree': 0.5488360570031919, 'reg_alpha': 0.2637333993381525, 'reg_lambda': 0.015876781526923997}


Training LightGBM: adopted_within_07_days (pos_rate=0.1565)
  Fold 1: LL=0.19282 AUC=0.96578 Combined=0.15317 (iter=316)
  Fold 2: LL=0.19070 AUC=0.96806 Combined=0.15101 (iter=346)
  Fold 3: LL=0.19452 AUC=0.96567 Combined=0.15447 (iter=473)
  Fold 4: LL=0.19701 AUC=0.96422 Combined=0.15670 (iter=376)
  Fold 5: LL=0.17643 AUC=0.96890 Combined=0.14009 (iter=457)
  >>> OOF: LL=0.19029  AUC=0.96461  Combined=0.15157

Training LightGBM: adopted_within_90_days (pos_rate=0.3469)
  Fold 1: LL=0.20526 AUC=0.97655 Combined=0.15981 (iter=252)
  Fold 2: LL=0.22876 AUC=0.96992 Combined=0.17909

In [11]:
# ============================================================
# Cell 9: Logistic Regression Baseline (for ensemble)
# ============================================================
oof_preds_lr = {}
test_preds_lr = {}

# Include ID columns for mapping
lr_cols = BASE_FEATURE_COLS + [c for c in ID_MAP_COLS if c not in BASE_FEATURE_COLS]
train_lr = add_target_agg_features(train_fe[lr_cols].copy(), train_fe, TARGET_COLS)
test_lr  = add_target_agg_features(test_fe[lr_cols].copy(), train_fe, TARGET_COLS)

# Drop ID columns that aren't features, fill NaN
drop_id = [c for c in ID_MAP_COLS if c not in BASE_FEATURE_COLS]
train_lr = train_lr.drop(columns=drop_id, errors='ignore')
test_lr  = test_lr.drop(columns=drop_id, errors='ignore')

train_lr = train_lr.fillna(0).replace([np.inf, -np.inf], 0)
test_lr  = test_lr.fillna(0).replace([np.inf, -np.inf], 0)
lr_features = list(train_lr.columns)

for target in TARGET_COLS:
    print(f"Training LR: {target}")
    y = train_fe[target].values
    oof = np.zeros(len(train_fe))
    test_pred_folds = np.zeros(len(test_fe))
    
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(train_lr, y)):
        X_tr = train_lr.iloc[train_idx]
        X_val = train_lr.iloc[val_idx]
        y_tr = y[train_idx]
        
        lr = LogisticRegression(max_iter=3000, C=0.1, class_weight='balanced', random_state=SEED)
        lr.fit(X_tr, y_tr)
        
        oof[val_idx] = np.clip(lr.predict_proba(X_val)[:, 1], 1e-7, 1 - 1e-7)
        test_pred_folds += np.clip(lr.predict_proba(test_lr)[:, 1], 1e-7, 1 - 1e-7) / N_FOLDS
    
    ll, auc, _ = competition_metric(y, oof)
    print(f"  OOF: LL={ll:.5f}  AUC={auc:.5f}")
    oof_preds_lr[target] = oof
    test_preds_lr[target] = test_pred_folds

print("\nLogistic Regression training complete.")

Training LR: adopted_within_07_days
  OOF: LL=0.29925  AUC=0.96193
Training LR: adopted_within_90_days
  OOF: LL=0.22872  AUC=0.97358
Training LR: adopted_within_120_days
  OOF: LL=0.17266  AUC=0.97972

Logistic Regression training complete.


In [12]:
# ============================================================
# Cell 10: Find Optimal Ensemble Weights via OOF
# ============================================================
# Search for best blend weight that minimizes combined metric on OOF

best_weights = {}
oof_preds_blend = {}
test_preds_blend = {}

for target in TARGET_COLS:
    y = train_fe[target].values
    best_score = 999
    best_w = 1.0  # default: all LightGBM
    
    for w_lgb in np.arange(0.5, 1.01, 0.05):
        blend = w_lgb * oof_preds_lgb[target] + (1 - w_lgb) * oof_preds_lr[target]
        blend = np.clip(blend, 1e-7, 1 - 1e-7)
        _, _, combined = competition_metric(y, blend)
        if combined < best_score:
            best_score = combined
            best_w = w_lgb
    
    best_weights[target] = best_w
    
    # Apply best weight
    oof_blend = best_w * oof_preds_lgb[target] + (1 - best_w) * oof_preds_lr[target]
    test_blend = best_w * test_preds_lgb[target] + (1 - best_w) * test_preds_lr[target]
    
    oof_preds_blend[target] = np.clip(oof_blend, 1e-7, 1 - 1e-7)
    test_preds_blend[target] = np.clip(test_blend, 1e-7, 1 - 1e-7)
    
    ll, auc, combined = competition_metric(y, oof_preds_blend[target])
    print(f"{target}: best_w_lgb={best_w:.2f}  LL={ll:.5f}  AUC={auc:.5f}  Combined={combined:.5f}")

print("\nEnsemble weights found.")

adopted_within_07_days: best_w_lgb=0.75  LL=0.15466  AUC=0.97489  Combined=0.12227
adopted_within_90_days: best_w_lgb=0.50  LL=0.15916  AUC=0.98538  Combined=0.12302
adopted_within_120_days: best_w_lgb=0.50  LL=0.14678  AUC=0.98897  Combined=0.11284

Ensemble weights found.


In [13]:
# ============================================================
# Cell 11: CV Summary
# ============================================================
print("="*70)
print("FINAL CROSS-VALIDATION SUMMARY (LGB + LR Ensemble)")
print("="*70)

total_ll = total_auc = 0
for target in TARGET_COLS:
    y = train_fe[target].values
    ll, auc, combined = competition_metric(y, oof_preds_blend[target])
    total_ll += ll
    total_auc += auc
    w = best_weights[target]
    print(f"  {target:30s}: LL={ll:.5f}  AUC={auc:.5f}  Comb={combined:.5f}  w_lgb={w:.2f}")

avg_ll = total_ll / 3
avg_auc = total_auc / 3
avg_comb = 0.75 * avg_ll + 0.25 * (1 - avg_auc)
print(f"\n  {'AVERAGE':30s}: LL={avg_ll:.5f}  AUC={avg_auc:.5f}  Comb={avg_comb:.5f}")

# Compare to LGB-only
total_ll_lgb = total_auc_lgb = 0
for target in TARGET_COLS:
    y = train_fe[target].values
    ll, auc, _ = competition_metric(y, oof_preds_lgb[target])
    total_ll_lgb += ll; total_auc_lgb += auc
avg_ll_lgb = total_ll_lgb / 3
avg_auc_lgb = total_auc_lgb / 3
avg_comb_lgb = 0.75 * avg_ll_lgb + 0.25 * (1 - avg_auc_lgb)
print(f"  {'LGB-ONLY':30s}: LL={avg_ll_lgb:.5f}  AUC={avg_auc_lgb:.5f}  Comb={avg_comb_lgb:.5f}")

FINAL CROSS-VALIDATION SUMMARY (LGB + LR Ensemble)
  adopted_within_07_days        : LL=0.15466  AUC=0.97489  Comb=0.12227  w_lgb=0.75
  adopted_within_90_days        : LL=0.15916  AUC=0.98538  Comb=0.12302  w_lgb=0.50
  adopted_within_120_days       : LL=0.14678  AUC=0.98897  Comb=0.11284  w_lgb=0.50

  AVERAGE                       : LL=0.15353  AUC=0.98308  Comb=0.11938
  LGB-ONLY                      : LL=0.20761  AUC=0.97108  Comb=0.16294


In [14]:
# ============================================================
# Cell 12: Generate Submission with Monotonicity
# ============================================================
ss = pd.read_csv('SampleSubmission.csv')

p07  = np.clip(test_preds_blend['adopted_within_07_days'],  1e-7, 1 - 1e-7)
p90  = np.clip(test_preds_blend['adopted_within_90_days'],  1e-7, 1 - 1e-7)
p120 = np.clip(test_preds_blend['adopted_within_120_days'], 1e-7, 1 - 1e-7)

# ENFORCE MONOTONICITY: P(7d) <= P(90d) <= P(120d)
p90_fixed  = np.maximum(p90, p07)     # 90d must be >= 7d
p120_fixed = np.maximum(p120, p90_fixed)  # 120d must be >= 90d
p07_fixed  = np.minimum(p07, p90_fixed)   # safety

violations_before = (p07 > p90).sum() + (p90 > p120).sum()
violations_after = (p07_fixed > p90_fixed).sum() + (p90_fixed > p120_fixed).sum()
print(f"Monotonicity violations: {violations_before} -> {violations_after}")

ss['Target_07_AUC']     = p07_fixed
ss['Target_07_LogLoss']  = p07_fixed
ss['Target_90_AUC']     = p90_fixed
ss['Target_90_LogLoss']  = p90_fixed
ss['Target_120_AUC']    = p120_fixed
ss['Target_120_LogLoss'] = p120_fixed

# Sanity checks
expected = ['ID', 'Target_07_AUC', 'Target_07_LogLoss', 
            'Target_90_AUC', 'Target_90_LogLoss',
            'Target_120_AUC', 'Target_120_LogLoss']
assert list(ss.columns) == expected
assert len(ss) == len(test_df)
for col in expected[1:]:
    assert ss[col].min() > 0 and ss[col].max() < 1 and not ss[col].isna().any()
assert (ss['Target_07_AUC'] <= ss['Target_90_AUC'] + 1e-9).all()
assert (ss['Target_90_AUC'] <= ss['Target_120_AUC'] + 1e-9).all()

print(f"\nPrediction means (should increase):")
print(f"  07-day:  {ss['Target_07_AUC'].mean():.4f}")
print(f"  90-day:  {ss['Target_90_AUC'].mean():.4f}")
print(f"  120-day: {ss['Target_120_AUC'].mean():.4f}")

ss.to_csv('submission_v3_ensemble.csv', index=False)
print(f"\nAll checks passed! Saved: submission_v3_ensemble.csv")
ss.head(10)

Monotonicity violations: 1193 -> 0

Prediction means (should increase):
  07-day:  0.0933
  90-day:  0.1623
  120-day: 0.1874

All checks passed! Saved: submission_v3_ensemble.csv


Unnamed: 0,ID,Target_07_AUC,Target_07_LogLoss,Target_90_AUC,Target_90_LogLoss,Target_120_AUC,Target_120_LogLoss
0,ID_6AA1EM,0.144599,0.144599,0.178587,0.178587,0.178587,0.178587
1,ID_2DV3A1,0.011193,0.011193,0.021591,0.021591,0.03478,0.03478
2,ID_KZY5B8,0.525251,0.525251,0.864765,0.864765,0.952133,0.952133
3,ID_T8WZT2,0.003935,0.003935,0.010824,0.010824,0.014418,0.014418
4,ID_3CX56O,0.061188,0.061188,0.138201,0.138201,0.271528,0.271528
5,ID_ARP4MA,0.020877,0.020877,0.020877,0.020877,0.032458,0.032458
6,ID_NT05CR,0.08603,0.08603,0.08603,0.08603,0.08603,0.08603
7,ID_404ITE,0.948769,0.948769,0.967951,0.967951,0.969739,0.969739
8,ID_19GWB0,0.01,0.01,0.01345,0.01345,0.019882,0.019882
9,ID_OZDQ0S,0.567498,0.567498,0.833248,0.833248,0.833248,0.833248
