In [1]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# 0. Common preprocessing

In [2]:
def basic_preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the raw event log without modifying the caller's frame.

    Steps:
      - drop rows whose userId equals 0
      - parse 'ts' as epoch milliseconds, and 'registration' / 'time'
        with pandas' default datetime parsing
      - cast 'page' to str and strip surrounding whitespace
      - order rows by (userId, ts)

    Returns a new DataFrame; the original index labels are kept.
    """
    kept = df.loc[df['userId'] != 0]
    cleaned = kept.assign(
        ts=lambda d: pd.to_datetime(d['ts'], unit='ms'),
        registration=lambda d: pd.to_datetime(d['registration']),
        time=lambda d: pd.to_datetime(d['time']),
        page=lambda d: d['page'].astype(str).str.strip(),
    )
    return cleaned.sort_values(['userId', 'ts'])

# 1. Build labels + cutoff (no fixed window; a user is a churner if a "Cancellation Confirmation" (CC) event exists)

In [3]:
def build_labels_and_cutoff(df: pd.DataFrame):
    """
    Derive the churn label and the per-user observation cutoff.

    A user counts as churned when at least one "Cancellation Confirmation"
    (CC) row exists for them.

    Returns:
        y:         Series(index=userId, name="target"), 1 for churners else 0
        cutoff_ts: Series(index=userId, name="cutoff_ts"); the first CC
                   timestamp for churners, otherwise the user's last
                   observed timestamp

    Both Series are sorted by userId.
    """
    events = df.copy()

    # Earliest CC event per churned user
    cc_rows = events.loc[events['page'] == "Cancellation Confirmation"]
    first_cc = cc_rows.groupby('userId')['ts'].min()

    # Last event of every user; its index defines the user universe
    last_ts = events.groupby('userId')['ts'].max()
    all_users = last_ts.index

    churned = all_users.isin(first_cc.index)
    y = pd.Series(churned.astype(int), index=all_users, name="target")

    # Start from the last timestamp and overwrite churners with their first CC
    cutoff_ts = last_ts.copy()
    cutoff_ts.name = "cutoff_ts"
    cutoff_ts.loc[first_cc.index] = first_cc

    return y.sort_index(), cutoff_ts.sort_index()

# 2. Feature engineering

In [4]:
def build_features(df: pd.DataFrame,
                   cutoff_ts: pd.Series,
                   pages_ref=None):
    """
    Aggregate event-level logs into one feature row per user.

    df: preprocessed raw logs. The code reads the columns userId, ts, page,
        gender, level, length, registration, status and sessionId.
        Rows are expected to be ordered by (userId, ts): the 'last' level
        and the level-change count depend on row order.
    cutoff_ts: index=userId, value=cutoff time. Events later than a user's
        cutoff are discarded; the returned frame is reindexed to this index,
        so users without any remaining event get an all-NaN row.
    pages_ref:
        - training: None (the sorted list of observed pages is used)
        - test: pass page list from training to align columns

    Returns:
        feats: DataFrame(index=userId) with datetime helper columns removed
        pages_ref: the page list that defined the cnt_page_* / ratio_page_*
                   columns (pass it back in for the test set)
    """
    df = df.copy()

    # Map cutoff timestamp to each row
    df['cutoff_ts'] = df['userId'].map(cutoff_ts)

    # Remove events after cutoff (rows with no cutoff compare False and drop out)
    df_obs = df[df['ts'] <= df['cutoff_ts']].copy()

    # Remove leakage pages (CC and Cancel)
    LEAK_PAGES = ["Cancel", "Cancellation Confirmation"]
    df_obs = df_obs[~df_obs['page'].isin(LEAK_PAGES)].copy()

    # All userIds
    all_users = cutoff_ts.index

    # -------- A. Basic attributes --------
    # 'last' is the last row in frame order, hence the sort expectation above
    agg_basic = df_obs.groupby('userId').agg(
        gender=('gender', 'first'),
        level=('level', 'last'),
    )

    # -------- B. Activity features --------
    df_obs['date'] = df_obs['ts'].dt.date

    n_events = df_obs.groupby('userId').size().rename('n_events')
    n_active_days = df_obs.groupby('userId')['date'].nunique().rename('n_active_days')

    nextsong = df_obs[df_obs['page'] == 'NextSong']
    total_listen_time = nextsong.groupby('userId')['length'].sum().rename('total_listen_time')
    song_count = nextsong.groupby('userId').size().rename('song_count')

    activity_feat = pd.concat(
        [n_events, n_active_days, total_listen_time, song_count],
        axis=1
    )

    # The 1e-6 terms keep the denominators non-zero
    activity_feat['avg_song_length'] = (
        activity_feat['total_listen_time'] / (activity_feat['song_count'] + 1e-6)
    )
    activity_feat['events_per_day'] = (
        activity_feat['n_events'] / (activity_feat['n_active_days'] + 1e-6)
    )
    activity_feat['listen_time_per_day'] = (
        activity_feat['total_listen_time'] / (activity_feat['n_active_days'] + 1e-6)
    )

    # ===== Recent 7-day behavior (relative to cutoff) =====
    df_obs['days_to_cutoff'] = (
        (df_obs['cutoff_ts'] - df_obs['ts']).dt.total_seconds() / 86400.0
    )
    recent_mask = df_obs['days_to_cutoff'] <= 7

    recent_events = (
        df_obs[recent_mask]
        .groupby('userId')
        .size()
        .rename('events_last_7d')
    )

    recent_songs = (
        df_obs[recent_mask & (df_obs['page'] == 'NextSong')]
        .groupby('userId')
        .size()
        .rename('songs_last_7d')
    )

    activity_feat = pd.concat(
        [activity_feat, recent_events, recent_songs],
        axis=1
    )

    # -------- C. Page counts + ratios --------
    if pages_ref is None:
        pages_ref = sorted(df_obs['page'].unique())

    # Pages outside pages_ref become NaN here and are left out of the pivot
    df_obs['page'] = pd.Categorical(df_obs['page'], categories=pages_ref)

    # observed=False is passed explicitly: the column set must cover every
    # reference page even when a page never occurs in this data set, and
    # relying on the implicit default raises a pandas FutureWarning.
    page_counts = (
        df_obs
        .pivot_table(index='userId',
                     columns='page',
                     values='ts',
                     aggfunc='count',
                     observed=False)
        .fillna(0)
    )

    # Use a plain (non-categorical) column index so that a new column can be
    # added below, and pin the columns to pages_ref regardless of how the
    # installed pandas version treats unobserved categories.
    page_counts.columns = page_counts.columns.astype(object)
    page_counts = page_counts.reindex(columns=list(pages_ref), fill_value=0)

    page_counts['total_events_from_pages'] = page_counts.sum(axis=1)

    # Note: the total column is divided by itself too, so a
    # ratio_page_total_events_from_pages column is produced as well.
    ratio_feat = page_counts.div(
        page_counts['total_events_from_pages'] + 1e-6,
        axis=0
    )

    page_counts = page_counts.add_prefix('cnt_page_')
    ratio_feat = ratio_feat.add_prefix('ratio_page_')

    # -------- D. Time-related features --------
    first_obs_ts = df_obs.groupby('userId')['ts'].min().rename('first_obs_ts')
    last_obs_ts = df_obs.groupby('userId')['ts'].max().rename('last_obs_ts')
    # Registration is taken from the unfiltered frame
    registration_ts = df.groupby('userId')['registration'].first().rename('registration_ts')

    time_feat = pd.concat([first_obs_ts, last_obs_ts, registration_ts], axis=1)

    time_feat['days_since_registration'] = (
        (time_feat['last_obs_ts'] - time_feat['registration_ts']).dt.days
    )
    time_feat['obs_window_days'] = (
        (time_feat['last_obs_ts'] - time_feat['first_obs_ts']).dt.days
    )

    time_feat['days_since_registration'] = time_feat['days_since_registration'].clip(lower=0)
    time_feat['obs_window_days'] = time_feat['obs_window_days'].clip(lower=0)

    # Midpoint split: first vs second half
    mid_ts = first_obs_ts + (last_obs_ts - first_obs_ts) / 2
    mid_ts.name = 'mid_ts'
    df_obs['mid_ts'] = df_obs['userId'].map(mid_ts)

    # Events exactly at the midpoint count towards the first half
    is_second_half = df_obs['ts'] > df_obs['mid_ts']

    events_first_half = (
        df_obs[~is_second_half]
        .groupby('userId')
        .size()
        .rename('events_first_half')
    )

    events_second_half = (
        df_obs[is_second_half]
        .groupby('userId')
        .size()
        .rename('events_second_half')
    )

    time_feat = pd.concat(
        [time_feat, events_first_half, events_second_half],
        axis=1
    )
    time_feat['ratio_second_to_first'] = (
        time_feat['events_second_half'] / (time_feat['events_first_half'] + 1e-6)
    )

    # -------- E. Payment / subscription features --------
    is_paid_last = (agg_basic['level'] == 'paid').astype(int)
    is_paid_last.name = 'is_paid_last'

    ever_paid = (
        df_obs.groupby('userId')['level']
        .apply(lambda s: int((s == 'paid').any()))
        .rename('ever_paid')
    )

    def count_level_change(s):
        # Number of positions where the level differs from the previous row.
        # The first row always differs from its (missing) predecessor,
        # hence the "- 1".
        s = s.dropna()
        if s.empty:
            return 0
        return (s != s.shift(1)).sum() - 1

    n_level_change = (
        df_obs.groupby('userId')['level']
        .apply(count_level_change)
        .rename('n_level_change')
    )

    # -------- F. Status code features --------
    status_counts = (
        df_obs
        .groupby(['userId', 'status'])
        .size()
        .unstack(fill_value=0)
    )

    status_counts = status_counts.add_prefix('status_')

    # Ensure 404 / 307 columns exist
    for code in [404, 307]:
        col = f'status_{code}'
        if col not in status_counts.columns:
            status_counts[col] = 0

    n_404 = status_counts['status_404']
    n_307 = status_counts['status_307']

    status_counts['frac_404'] = n_404 / (n_events + 1e-6)
    status_counts['frac_307'] = n_307 / (n_events + 1e-6)

    # -------- G. Session-related features --------
    session_grp = df_obs.groupby(['userId', 'sessionId'])

    session_stats = session_grp.agg(
        session_start=('ts', 'min'),
        session_end=('ts', 'max'),
        session_event_count=('ts', 'count'),
    )

    session_stats['session_duration'] = (
        session_stats['session_end'] - session_stats['session_start']
    ).dt.total_seconds()

    # Groups by the 'userId' level of the (userId, sessionId) index
    sess_user_grp = session_stats.groupby('userId')

    session_count = sess_user_grp.size().rename('session_count')

    session_duration_stats = sess_user_grp['session_duration'].agg(
        mean_session_duration='mean',
        max_session_duration='max',
        min_session_duration='min',
        std_session_duration='std',
    )

    session_event_stats = sess_user_grp['session_event_count'].agg(
        mean_event_count_per_session='mean',
        max_event_count_per_session='max',
        min_event_count_per_session='min',
        std_event_count_per_session='std',
    )

    # Idle time: first session idle = 0 (safe handling)
    session_stats_sorted = (
        session_stats
        .reset_index()
        .sort_values(['userId', 'session_start'])
    )

    session_stats_sorted['prev_end'] = (
        session_stats_sorted
        .groupby('userId')['session_end']
        .shift(1)
    )

    session_stats_sorted['idle_time'] = (
        session_stats_sorted['session_start'] - session_stats_sorted['prev_end']
    ).dt.total_seconds()

    session_stats_sorted['idle_time'] = session_stats_sorted['idle_time'].fillna(0)

    idle_stats = (
        session_stats_sorted
        .groupby('userId')['idle_time']
        .agg(
            mean_idle_time='mean',
            max_idle_time='max',
            min_idle_time='min',
            latest_idle_time='last',
        )
    )

    # Thresholds in seconds: 86400 = 1 day, 864000 = 10 days
    idle_stats['is_big_idle'] = (idle_stats['max_idle_time'] >= 86400).astype(int)
    idle_stats['is_very_big_idle'] = (idle_stats['max_idle_time'] >= 864000).astype(int)

    song_count_series = activity_feat['song_count']
    song_per_session = (song_count_series / (session_count + 1e-6)).rename('song_per_session')

    has_single_session = (session_count == 1).astype(int).rename("has_single_session")

    # -------- Merge all user-level features --------
    # The order of this list fixes the column order of the output
    feats = pd.concat(
        [
            agg_basic,
            activity_feat,
            page_counts,
            ratio_feat,
            time_feat,
            status_counts,
            session_count.to_frame(),
            session_duration_stats,
            session_event_stats,
            idle_stats,
            is_paid_last.to_frame(),
            ever_paid.to_frame(),
            n_level_change.to_frame(),
            song_per_session.to_frame(),
            has_single_session.to_frame(),
        ],
        axis=1
    )

    feats = feats.reindex(all_users)
    feats = feats.sort_index()

    # Drop datetime helper columns (tz-naive and tz-aware)
    dt_cols = feats.select_dtypes(include=["datetime", "datetimetz"]).columns
    feats = feats.drop(columns=dt_cols)

    return feats, pages_ref

# 3. Load training data and build X, y

In [5]:
# Raw training logs -> cleaned logs
train_raw = pd.read_parquet("train.parquet")
train_df = basic_preprocess(train_raw)

# Labels and per-user cutoffs, then the user-level feature matrix.
# pages_ref is kept so the test features can be aligned to the same pages.
y, cutoff_ts_train = build_labels_and_cutoff(train_df)
X_train_all, pages_ref = build_features(train_df, cutoff_ts_train, pages_ref=None)

# Restrict both objects to the users they share, in the same order
common_users = y.index.intersection(X_train_all.index)
X, y = X_train_all.loc[common_users], y.loc[common_users]

print("Training set shape:", X.shape, " Positive rate:", y.mean())

  .pivot_table(index='userId',


Training set shape: (19140, 77)  Positive rate: 0.22314524555903867


In [6]:
# Working copy of the training matrix. Infinite values are mapped to NaN
# (and then to 0 below) exactly as is done for the test matrix, so both
# sets go through the same cleaning and StandardScaler never sees +/-inf.
X_base = X.replace([np.inf, -np.inf], np.nan)

cat_cols = [c for c in ['gender', 'level'] if c in X_base.columns]
num_cols = X_base.select_dtypes(include=['number']).columns.tolist()
num_cols = [c for c in num_cols if c not in cat_cols]

# NOTE(review): astype(str) presumably turns NaN into the literal string
# 'nan' before fillna runs, so 'missing' may never be used. The test-set
# cell uses the identical expression; change both together or the
# category labels of train and test will disagree.
for c in cat_cols:
    X_base[c] = X_base[c].astype(str).fillna('missing')

X_base[num_cols] = X_base[num_cols].fillna(0)

In [7]:
# Test features, produced by the same functions as the training features

test_raw = pd.read_parquet("test.parquet")
test_df = basic_preprocess(test_raw)

# No labels here: every user's cutoff is simply their last event
cutoff_ts_test = test_df.groupby('userId')['ts'].max().sort_index()
X_test_all, _ = build_features(test_df, cutoff_ts_test, pages_ref=pages_ref)

# Same columns, same order as the training matrix; infinities become NaN
X_test = (
    X_test_all
    .reindex(columns=X_base.columns)
    .replace([np.inf, -np.inf], np.nan)
)

for c in cat_cols:
    if c not in X_test.columns:
        continue
    X_test[c] = X_test[c].astype(str).fillna('missing')

X_test[num_cols] = X_test[num_cols].fillna(0)
X_test.index = X_test.index.astype(int)

  .pivot_table(index='userId',


# 4. Base preprocessing (sparse variant for LR / ET, dense variant for KNN)

In [8]:
# Both preprocessors standardise the numeric columns and one-hot encode the
# categorical ones; categories unseen during fit are ignored at transform.

# Variant for LR / ET: encoder left at its default output format
sparse_encoder = OneHotEncoder(handle_unknown="ignore")
preprocess_sparse = ColumnTransformer(transformers=[
    ("num", StandardScaler(), num_cols),
    ("cat", sparse_encoder, cat_cols),
])

# Variant for KNN: encoder forced to emit a dense array
dense_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
preprocess_dense = ColumnTransformer(transformers=[
    ("num", StandardScaler(), num_cols),
    ("cat", dense_encoder, cat_cols),
])

# Labels as a plain array for positional indexing in the CV loops
y_arr = y.values

# 5. Oversampling strategy

In [9]:
def oversample_dataframe(X_df: pd.DataFrame, y_sr: pd.Series, random_state: int = 42):
    """
    Oversample the positive class by whole-number replication.

    Rows labelled 1 are repeated floor(n_negative / n_positive) times
    (at least once), appended to the rows labelled 0, and the result is
    shuffled with `random_state`. Because the factor is rounded down, the
    two classes end up close to balanced rather than exactly balanced.

    Labels are attached by position (y_sr.values); the index of y_sr is
    not used for alignment.

    Returns:
        X_over: DataFrame with a fresh 0..n-1 index
        y_over: numpy array of labels in the same row order

    Raises:
        ValueError: if there is no row labelled 1.

    Prints the replication factor and the resulting class shares.
    """
    labelled = X_df.assign(label=y_sr.values)

    is_positive = labelled['label'] == 1
    is_negative = labelled['label'] == 0
    minor_df = labelled[is_positive]
    major_df = labelled[is_negative]

    n_minor = len(minor_df)
    if n_minor == 0:
        raise ValueError("No positive samples, cannot oversample.")

    ratio = max(1, int(len(major_df) / n_minor))
    print("Oversample ratio:", ratio)

    replicated_minor = pd.concat([minor_df for _ in range(ratio)], ignore_index=True)

    shuffled = (
        pd.concat([major_df, replicated_minor], axis=0)
        .sample(frac=1.0, random_state=random_state)
        .reset_index(drop=True)
    )

    y_over = shuffled['label'].values
    X_over = shuffled.drop(columns=['label'])

    print("Oversampled class distribution:", np.bincount(y_over) / len(y_over))
    return X_over, y_over

# One shared splitter with a fixed seed: every model cell iterates over the
# same five stratified folds.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# 6. Single Models

In [10]:
# Train Logistic Regression (with oversampling)

log_reg = LogisticRegression(
    max_iter=2000,
    solver='liblinear',
    class_weight=None  # already oversampled
)

# Pipeline.fit fits its steps in place, so the preprocessor is cloned here:
# the pipeline owns a private copy and its fitted state cannot be
# overwritten when another pipeline built from preprocess_sparse is fitted.
pipe_lr = Pipeline([
    ('preprocess', clone(preprocess_sparse)),
    ('clf', log_reg),
])

# Out-of-fold predictions, one slot per training row
oof_lr = np.zeros(len(X_base))

for fold, (trn_idx, val_idx) in enumerate(skf.split(X_base, y_arr), 1):
    X_trn, X_val = X_base.iloc[trn_idx], X_base.iloc[val_idx]
    y_trn, y_val = y_arr[trn_idx], y_arr[val_idx]

    # Oversampling inside each CV fold (validation rows are never replicated)
    X_trn_over, y_trn_over = oversample_dataframe(
        X_trn, pd.Series(y_trn, index=X_trn.index),
        random_state=42 + fold
    )

    pipe_lr.fit(X_trn_over, y_trn_over)
    pred_val = pipe_lr.predict_proba(X_val)[:, 1]
    oof_lr[val_idx] = pred_val

    auc = roc_auc_score(y_val, pred_val)
    print(f"[LR] Fold {fold} AUC: {auc:.4f}")

print("\n[LR] OOF AUC:", roc_auc_score(y_arr, oof_lr))

# Train final LR on full oversampled data
# (the same pipeline object is refit; final_lr is an alias of pipe_lr)
X_over_full, y_over_full = oversample_dataframe(X_base, y, random_state=2025)
pipe_lr.fit(X_over_full, y_over_full)
final_lr = pipe_lr

Oversample ratio: 3
Oversampled class distribution: [0.53721098 0.46278902]
[LR] Fold 1 AUC: 0.8955
Oversample ratio: 3
Oversampled class distribution: [0.53711731 0.46288269]
[LR] Fold 2 AUC: 0.8947
Oversample ratio: 3
Oversampled class distribution: [0.53711731 0.46288269]
[LR] Fold 3 AUC: 0.8896
Oversample ratio: 3
Oversampled class distribution: [0.53711731 0.46288269]
[LR] Fold 4 AUC: 0.8907
Oversample ratio: 3
Oversampled class distribution: [0.53711731 0.46288269]
[LR] Fold 5 AUC: 0.9025

[LR] OOF AUC: 0.894540723158478
Oversample ratio: 3
Oversampled class distribution: [0.53713605 0.46286395]


In [11]:
# Train ExtraTrees (with oversampling)

et_clf = ExtraTreesClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features="sqrt",
    n_jobs=-1,
    class_weight=None,  # classes are rebalanced by oversampling instead
    random_state=42,
)

# Pipeline.fit fits its steps in place. Without clone() this pipeline would
# hold the very same preprocess_sparse object as any other pipeline built
# from it, and every fit below would silently refit that shared
# preprocessor underneath the other, already trained, model.
pipe_et = Pipeline([
    ('preprocess', clone(preprocess_sparse)),
    ('clf', et_clf),
])

# Out-of-fold predictions, one slot per training row
oof_et = np.zeros(len(X_base))

for fold, (trn_idx, val_idx) in enumerate(skf.split(X_base, y_arr), 1):
    X_trn, X_val = X_base.iloc[trn_idx], X_base.iloc[val_idx]
    y_trn, y_val = y_arr[trn_idx], y_arr[val_idx]

    # Oversample the training part of the fold only
    X_trn_over, y_trn_over = oversample_dataframe(
        X_trn, pd.Series(y_trn, index=X_trn.index),
        random_state=100 + fold
    )

    pipe_et.fit(X_trn_over, y_trn_over)
    pred_val = pipe_et.predict_proba(X_val)[:, 1]
    oof_et[val_idx] = pred_val

    auc = roc_auc_score(y_val, pred_val)
    print(f"[ET] Fold {fold} AUC: {auc:.4f}")

print("\n[ET] OOF AUC:", roc_auc_score(y_arr, oof_et))

# Final ET on the full oversampled data (final_et is an alias of pipe_et)
X_over_full_et, y_over_full_et = oversample_dataframe(X_base, y, random_state=303)
pipe_et.fit(X_over_full_et, y_over_full_et)
final_et = pipe_et

Oversample ratio: 3
Oversampled class distribution: [0.53721098 0.46278902]
[ET] Fold 1 AUC: 0.9012
Oversample ratio: 3
Oversampled class distribution: [0.53711731 0.46288269]
[ET] Fold 2 AUC: 0.8985
Oversample ratio: 3
Oversampled class distribution: [0.53711731 0.46288269]
[ET] Fold 3 AUC: 0.8973
Oversample ratio: 3
Oversampled class distribution: [0.53711731 0.46288269]
[ET] Fold 4 AUC: 0.8948
Oversample ratio: 3
Oversampled class distribution: [0.53711731 0.46288269]
[ET] Fold 5 AUC: 0.9142

[ET] OOF AUC: 0.9011470801922208
Oversample ratio: 3
Oversampled class distribution: [0.53713605 0.46286395]


In [12]:
# KNN on the densely encoded features, trained on oversampled folds

knn_clf = KNeighborsClassifier(n_neighbors=50, weights='distance', p=2, n_jobs=-1)

pipe_knn = Pipeline(steps=[
    ('preprocess', preprocess_dense),
    ('clf', knn_clf),
])

# Out-of-fold predictions, one slot per training row
oof_knn = np.zeros(len(X_base))

fold = 0
for trn_idx, val_idx in skf.split(X_base, y_arr):
    fold += 1

    X_trn = X_base.iloc[trn_idx]
    y_trn = y_arr[trn_idx]
    X_val = X_base.iloc[val_idx]
    y_val = y_arr[val_idx]

    # Only the training part of the fold is oversampled
    X_trn_over, y_trn_over = oversample_dataframe(
        X_trn,
        pd.Series(y_trn, index=X_trn.index),
        random_state=200 + fold,
    )

    pipe_knn.fit(X_trn_over, y_trn_over)
    pred_val = pipe_knn.predict_proba(X_val)[:, 1]
    oof_knn[val_idx] = pred_val

    auc = roc_auc_score(y_val, pred_val)
    print(f"[KNN] Fold {fold} AUC: {auc:.4f}")

print("\n[KNN] OOF AUC:", roc_auc_score(y_arr, oof_knn))

# Final KNN on the full oversampled data (final_knn is an alias of pipe_knn)
X_over_full_knn, y_over_full_knn = oversample_dataframe(X_base, y, random_state=404)
pipe_knn.fit(X_over_full_knn, y_over_full_knn)
final_knn = pipe_knn

Oversample ratio: 3
Oversampled class distribution: [0.53721098 0.46278902]
[KNN] Fold 1 AUC: 0.8657
Oversample ratio: 3
Oversampled class distribution: [0.53711731 0.46288269]
[KNN] Fold 2 AUC: 0.8556
Oversample ratio: 3
Oversampled class distribution: [0.53711731 0.46288269]
[KNN] Fold 3 AUC: 0.8545
Oversample ratio: 3
Oversampled class distribution: [0.53711731 0.46288269]
[KNN] Fold 4 AUC: 0.8483
Oversample ratio: 3
Oversampled class distribution: [0.53711731 0.46288269]
[KNN] Fold 5 AUC: 0.8741

[KNN] OOF AUC: 0.8595239130394047
Oversample ratio: 3
Oversampled class distribution: [0.53713605 0.46286395]


In [14]:
def make_submission(name, proba, X_index, pos_rate=0.5,
                    example_path="example_submission.csv"):
    """
    Write a 0/1 submission file that marks the top-scoring users as positive.

    name:         path of the CSV file to write
    proba:        scores, one per entry of X_index (higher = more likely 1)
    X_index:      user ids the scores belong to
    pos_rate:     share of the template rows to label 1 (top-k strategy);
                  k = round(n_rows * pos_rate), clamped to [1, n_rows - 1]
    example_path: template CSV with an 'id' column; its row order and ids
                  are kept and its 'target' column is overwritten

    Only ids that occur in the template compete for the k positive slots,
    so scored users that are absent from the template cannot use them up.
    Template ids without a score receive target 0.

    Prints the resulting target distribution and the output path.
    """
    example_sub = pd.read_csv(example_path)
    proba_by_uid = pd.Series(proba, index=X_index)

    # Rank only the users the template actually asks for. When every scored
    # id is in the template this is the full score series.
    in_template = proba_by_uid.index.isin(example_sub['id'])
    ranked = proba_by_uid[in_template]

    n_unscored = int((~example_sub['id'].isin(proba_by_uid.index)).sum())
    if n_unscored:
        print(f"Warning: {n_unscored} template ids have no score; target set to 0")

    # Fixed positive rate (top-k strategy)
    n_test = len(example_sub)
    k = int(round(n_test * pos_rate))
    k = max(1, min(k, n_test - 1))

    topk_users = ranked.sort_values(ascending=False).index[:k]

    pred = pd.Series(0, index=proba_by_uid.index, dtype=int)
    pred.loc[topk_users] = 1

    example_sub['target'] = example_sub['id'].map(pred).fillna(0).astype(int)

    print(name, "target distribution:")
    print(example_sub['target'].value_counts())

    example_sub.to_csv(name, index=False)
    print("Saved", name)

# Positive-class scores of the three final models on the test features
proba_lr_test = final_lr.predict_proba(X_test)[:, 1]
proba_et_test = final_et.predict_proba(X_test)[:, 1]
proba_knn_test = final_knn.predict_proba(X_test)[:, 1]

# One submission file per model, written in LR, ET, KNN order
single_model_outputs = [
    ("submission_lr_oversample_v1.csv", proba_lr_test),
    ("submission_et_oversample_v1.csv", proba_et_test),
    ("submission_knn_oversample_v1.csv", proba_knn_test),
]
for out_name, test_scores in single_model_outputs:
    make_submission(out_name, test_scores, X_test.index)

submission_lr_oversample_v1.csv target distribution:
target
0    1452
1    1452
Name: count, dtype: int64
Saved submission_lr_oversample_v1.csv
submission_et_oversample_v1.csv target distribution:
target
1    1452
0    1452
Name: count, dtype: int64
Saved submission_et_oversample_v1.csv
submission_knn_oversample_v1.csv target distribution:
target
0    1452
1    1452
Name: count, dtype: int64
Saved submission_knn_oversample_v1.csv


# 7. Voting (LR + KNN)

In [15]:
# Voting ensemble: weighted average of the LR and KNN scores only

print("\n===== Grid Search Voting Weights (LR + KNN) =====")

best_auc_vote_lk = -1
best_weights_lk = None
best_oof_vote_lk = None

# Every (w_lr, w_knn) pair from {1, 2, 3, 4} x {1, 2, 3, 4}, w_lr varying slowest.
# oof_lr, oof_knn, y_arr already computed
weight_grid = [(a, b) for a in [1, 2, 3, 4] for b in [1, 2, 3, 4]]

for w_lr, w_knn in weight_grid:
    w_sum = w_lr + w_knn
    oof_vote = (w_lr * oof_lr + w_knn * oof_knn) / w_sum
    auc_vote = roc_auc_score(y_arr, oof_vote)
    # Strict comparison: on ties the earliest pair in the grid is kept
    if auc_vote > best_auc_vote_lk:
        best_auc_vote_lk, best_weights_lk, best_oof_vote_lk = (
            auc_vote, (w_lr, w_knn), oof_vote
        )

print(
    f"[LR+KNN VOTING] Best OOF AUC = {best_auc_vote_lk:.4f}, "
    f"best weights (w_lr, w_knn) = {best_weights_lk}"
)

# Apply best weights to test predictions
w_lr_best, w_knn_best = best_weights_lk
w_sum_best = w_lr_best + w_knn_best

weighted_test_sum = w_lr_best * proba_lr_test + w_knn_best * proba_knn_test
proba_voting_lr_knn = weighted_test_sum / w_sum_best

print("LR+KNN voting test_proba min/max:",
      proba_voting_lr_knn.min(), proba_voting_lr_knn.max())

make_submission(
    "submission_voting_lr_knn_oversample_v1.csv",
    proba_voting_lr_knn,
    X_test.index
)


===== Grid Search Voting Weights (LR + KNN) =====
[LR+KNN VOTING] Best OOF AUC = 0.8953, best weights (w_lr, w_knn) = (4, 1)
LR+KNN voting test_proba min/max: 0.00040484525713156027 0.9776098675508538
submission_voting_lr_knn_oversample_v1.csv target distribution:
target
0    1452
1    1452
Name: count, dtype: int64
Saved submission_voting_lr_knn_oversample_v1.csv
