In [1]:
import pandas as pd
import numpy as np


## 1. Data preprocessing

In [2]:
# 1) Read raw parquet files
train_df = pd.read_parquet("train.parquet")
test_df  = pd.read_parquet("test.parquet")

# 2) Convert time columns
train_df['ts'] = pd.to_datetime(train_df['ts'], unit='ms')
test_df['ts']  = pd.to_datetime(test_df['ts'], unit='ms')

train_df['registration'] = pd.to_datetime(train_df['registration'])
test_df['registration']  = pd.to_datetime(test_df['registration'])

train_df['time'] = pd.to_datetime(train_df['time'])
test_df['time']  = pd.to_datetime(test_df['time'])

# 3) Clean string columns
train_df['page'] = train_df['page'].astype(str).str.strip()
test_df['page']  = test_df['page'].astype(str).str.strip()

train_df['location'] = train_df['location'].astype(str).str.strip()
test_df['location']  = test_df['location'].astype(str).str.strip()

# 4) Extract state from location (last two characters), e.g. "New York, NY" -> "NY"
train_df['state'] = train_df['location'].str[-2:]
test_df['state']  = test_df['location'].str[-2:]

# 5) Sort by userId and timestamp
train_df = train_df.sort_values(['userId', 'ts'])
test_df  = test_df.sort_values(['userId', 'ts'])

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)


Train shape: (17499636, 20)
Test shape : (4393179, 20)


## 2. Sliding-window labels (cutoff-based)

In [3]:
horizon_days = 10
cutoff_start = pd.to_datetime("2018-10-15")

max_ts_date = train_df['ts'].max().normalize()
cutoff_end  = max_ts_date - pd.Timedelta(days=horizon_days + 1)

print("cutoff_start:", cutoff_start.date())
print("max_ts_date :", max_ts_date.date())
print("cutoff_end  :", cutoff_end.date())

cutoff_dates = pd.date_range(start=cutoff_start, end=cutoff_end, freq="D")
print("Number of cutoff dates:", len(cutoff_dates))

# First churn time per user (if ever)
first_churn_ts = (
    train_df[train_df['page'] == "Cancellation Confirmation"]
    .groupby('userId')['ts']
    .min()
)
print("Users who ever churned:", len(first_churn_ts))

all_samples_list = []
for cutoff_date in cutoff_dates:
    # Observation period: up to cutoff (inclusive)
    obs_mask = (train_df['ts'] <= cutoff_date)

    # Prediction window: (cutoff, cutoff + horizon_days]
    fut_end  = cutoff_date + pd.Timedelta(days=horizon_days)
    fut_mask = (train_df['ts'] > cutoff_date) & (train_df['ts'] <= fut_end)

    # Users observed by cutoff
    users_obs = train_df.loc[obs_mask, 'userId'].unique()
    users_obs = np.sort(users_obs)
    if len(users_obs) == 0:
        continue

    # Keep users who are still "alive" at cutoff:
    # either never churned or churn time is after cutoff
    churn_ts_sub = first_churn_ts.reindex(users_obs)
    alive_mask = (churn_ts_sub.isna()) | (churn_ts_sub > cutoff_date)
    alive_users = users_obs[alive_mask.values]
    if len(alive_users) == 0:
        continue

    # Users who churn within the future window
    cc_future_users = (
        train_df.loc[
            fut_mask & (train_df['page'] == "Cancellation Confirmation"),
            'userId'
        ].unique()
    )
    cc_future_users = np.intersect1d(cc_future_users, alive_users)

    # Build label array
    y_array = np.zeros(len(alive_users), dtype=int)
    y_array[np.isin(alive_users, cc_future_users)] = 1

    tmp = pd.DataFrame({
        "userId": alive_users,
        "cutoff_date": cutoff_date,
        "target": y_array,
    })
    all_samples_list.append(tmp)

sliding_labels = pd.concat(all_samples_list, ignore_index=True)

sliding_labels['sample_id'] = (
    sliding_labels['userId'].astype(str)
    + "_" +
    sliding_labels['cutoff_date'].astype(str)
)
sliding_labels = sliding_labels.set_index('sample_id')
sliding_labels['cutoff_date'] = pd.to_datetime(sliding_labels['cutoff_date'])

y_train = sliding_labels['target']

print("Sliding-window samples:", len(sliding_labels))
print("Positive rate:", y_train.mean())


cutoff_start: 2018-10-15
max_ts_date : 2018-11-20
cutoff_end  : 2018-11-09
Number of cutoff dates: 26
Users who ever churned: 4271
Sliding-window samples: 400774
Positive rate: 0.050260246423171166


## 3. Static categorical features (gender/state)

In [4]:
user_static = (
    train_df
    .sort_values('ts')
    .groupby('userId')
    .agg({
        'gender': 'first',
        'state':  'first',
    })
)

sliding_labels['gender'] = sliding_labels['userId'].map(user_static['gender'])
sliding_labels['state']  = sliding_labels['userId'].map(user_static['state'])


## 4. Lifetime feature: days_since_registration

In [5]:
uid_registration = (
    train_df
    .sort_values('ts')
    .groupby('userId')['registration']
    .first()
)
sliding_labels['registration_ts'] = sliding_labels['userId'].map(uid_registration)

days_since_registration = (
    (sliding_labels['cutoff_date'] - sliding_labels['registration_ts'])
    / np.timedelta64(1, 'D')
).astype('float32').clip(lower=0)


## 5. Grouping (train / labels)

In [6]:
train_df_sorted = train_df.sort_values(['userId', 'ts']).copy()
train_groups = dict(tuple(train_df_sorted.groupby('userId')))
label_groups = dict(tuple(sliding_labels.groupby('userId')))

print("Train users:", len(train_groups))
print("Label users:", len(label_groups))


Train users: 19140
Label users: 17351


## 6. Basic behavior features (n_events / recency / recent windows / active_days / total_listen_time)

In [7]:
# --- n_events: number of events up to cutoff (inclusive)
n_events = pd.Series(0, index=sliding_labels.index, dtype='int32')
for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue
    df_u = train_groups[uid]
    ts_vals = df_u['ts'].values
    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index
    pos = np.searchsorted(ts_vals, cutoffs, side='right')
    n_events.loc[sample_ids] = pos.astype('int32')

# --- recency_hours: hours since last event before cutoff
recency_hours = pd.Series(np.nan, index=sliding_labels.index, dtype='float32')
for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue
    df_u = train_groups[uid]
    ts_vals = df_u['ts'].values
    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index

    hi = np.searchsorted(ts_vals, cutoffs, side='right')
    for j, sid in enumerate(sample_ids):
        h = hi[j]
        if h == 0:
            recency_hours.loc[sid] = 9999.0
        else:
            last_ts = ts_vals[h - 1]
            recency_hours.loc[sid] = (cutoffs[j] - last_ts) / np.timedelta64(1, 'h')

# --- recent window counts: 7d / 3d / 1d (all events)
events_last_7d = pd.Series(0, index=sliding_labels.index, dtype='int32')
events_last_3d = pd.Series(0, index=sliding_labels.index, dtype='int32')
events_last_1d = pd.Series(0, index=sliding_labels.index, dtype='int32')

seven_days  = np.timedelta64(7, 'D')
three_days  = np.timedelta64(3, 'D')
one_day     = np.timedelta64(1, 'D')

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue
    ts_vals = train_groups[uid]['ts'].values
    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index

    hi = np.searchsorted(ts_vals, cutoffs, side='right')
    lo7 = np.searchsorted(ts_vals, cutoffs - seven_days, side='right')
    lo3 = np.searchsorted(ts_vals, cutoffs - three_days, side='right')
    lo1 = np.searchsorted(ts_vals, cutoffs - one_day, side='right')

    events_last_7d.loc[sample_ids] = (hi - lo7).astype('int32')
    events_last_3d.loc[sample_ids] = (hi - lo3).astype('int32')
    events_last_1d.loc[sample_ids] = (hi - lo1).astype('int32')

# --- recent window counts: 7d / 3d / 1d (songs only)
songs_last_7d = pd.Series(0, index=sliding_labels.index, dtype='int32')
songs_last_3d = pd.Series(0, index=sliding_labels.index, dtype='int32')
songs_last_1d = pd.Series(0, index=sliding_labels.index, dtype='int32')

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue
    df_u = train_groups[uid]
    df_song = df_u[df_u['page'] == "NextSong"]
    if df_song.empty:
        continue
    ts_song = df_song['ts'].values

    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index

    hi = np.searchsorted(ts_song, cutoffs, side='right')
    lo7 = np.searchsorted(ts_song, cutoffs - seven_days, side='right')
    lo3 = np.searchsorted(ts_song, cutoffs - three_days, side='right')
    lo1 = np.searchsorted(ts_song, cutoffs - one_day, side='right')

    songs_last_7d.loc[sample_ids] = (hi - lo7).astype('int32')
    songs_last_3d.loc[sample_ids] = (hi - lo3).astype('int32')
    songs_last_1d.loc[sample_ids] = (hi - lo1).astype('int32')

# --- active_days: number of unique active days up to cutoff
active_days = pd.Series(0, index=sliding_labels.index, dtype='int32')
for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue
    df_u = train_groups[uid]
    unique_days = np.unique(df_u['ts'].dt.normalize().values)  # sorted
    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index
    hi = np.searchsorted(unique_days, cutoffs, side='right')
    active_days.loc[sample_ids] = hi.astype('int32')

# --- total_listen_time: cumulative listening seconds up to cutoff
total_listen_time = pd.Series(0.0, index=sliding_labels.index, dtype='float32')
for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue
    df_u = train_groups[uid]
    df_song = df_u[df_u['page'] == "NextSong"]
    if df_song.empty:
        continue

    ts_song = df_song['ts'].values
    len_song = df_song['length'].values
    cum_len = np.cumsum(len_song)

    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index
    hi = np.searchsorted(ts_song, cutoffs, side='right')
    for j, sid in enumerate(sample_ids):
        h = hi[j]
        total_listen_time.loc[sid] = 0.0 if h == 0 else float(cum_len[h - 1])


  recency_hours.loc[sid] = (cutoffs[j] - last_ts) / np.timedelta64(1, 'h')
  total_listen_time.loc[sid] = 0.0 if h == 0 else float(cum_len[h - 1])


## 7. level_at_cutoff (last known level)

In [8]:
level_at_cutoff = pd.Series("unknown", index=sliding_labels.index, dtype=object)

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue
    df_u = train_groups[uid]
    ts_vals  = df_u['ts'].values
    lvl_vals = df_u['level'].astype(str).values

    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index

    hi = np.searchsorted(ts_vals, cutoffs, side='right')
    out = []
    for j, h in enumerate(hi):
        out.append("unknown" if h == 0 else lvl_vals[h - 1])
    level_at_cutoff.loc[sample_ids] = out

sliding_labels['level'] = level_at_cutoff


## 8. Session features (keep your original logic)

In [9]:
session_count = pd.Series(0, index=sliding_labels.index, dtype='int32')
mean_session_duration = pd.Series(0.0, index=sliding_labels.index, dtype='float32')
max_session_duration  = pd.Series(0.0, index=sliding_labels.index, dtype='float32')
min_session_duration  = pd.Series(0.0, index=sliding_labels.index, dtype='float32')
std_session_duration  = pd.Series(0.0, index=sliding_labels.index, dtype='float32')

mean_event_count_per_session = pd.Series(0.0, index=sliding_labels.index, dtype='float32')
max_event_count_per_session  = pd.Series(0.0, index=sliding_labels.index, dtype='float32')
min_event_count_per_session  = pd.Series(0.0, index=sliding_labels.index, dtype='float32')
std_event_count_per_session  = pd.Series(0.0, index=sliding_labels.index, dtype='float32')

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue

    df_u = train_groups[uid]
    sess_u = (
        df_u.groupby('sessionId')
        .agg(
            session_start=('ts', 'min'),
            session_end=('ts', 'max'),
            session_event_count=('ts', 'count'),
        )
        .sort_values('session_start')
    )
    if sess_u.empty:
        continue

    sess_start = sess_u['session_start'].values
    sess_end   = sess_u['session_end'].values
    sess_cnt   = sess_u['session_event_count'].values.astype('int32')
    sess_dur   = (sess_end - sess_start) / np.timedelta64(1, 's')

    # Ensure cutoffs are increasing (safe)
    lbl_u_sorted = lbl_u.sort_values('cutoff_date')
    cutoffs = lbl_u_sorted['cutoff_date'].values
    sample_ids = lbl_u_sorted.index

    hi_sess = np.searchsorted(sess_start, cutoffs, side='right')
    for j, sid in enumerate(sample_ids):
        h = hi_sess[j]
        if h == 0:
            session_count.loc[sid] = 0
            continue
        d = sess_dur[:h]
        c = sess_cnt[:h]

        session_count.loc[sid] = h
        mean_session_duration.loc[sid] = d.mean()
        max_session_duration.loc[sid]  = d.max()
        min_session_duration.loc[sid]  = d.min()
        std_session_duration.loc[sid]  = d.std(ddof=0)

        mean_event_count_per_session.loc[sid] = c.mean()
        max_event_count_per_session.loc[sid]  = c.max()
        min_event_count_per_session.loc[sid]  = c.min()
        std_event_count_per_session.loc[sid]  = c.std(ddof=0)


  mean_session_duration.loc[sid] = d.mean()
  std_session_duration.loc[sid]  = d.std(ddof=0)
  mean_event_count_per_session.loc[sid] = c.mean()
  std_event_count_per_session.loc[sid]  = c.std(ddof=0)


## 9. Page count + page ratio (train/test share the same all_pages universe; train is strictly per-cutoff to avoid leakage)

In [10]:
# Define all_pages as the union of train and test pages
all_pages = pd.concat([train_df['page'], test_df['page']]).astype(str).unique()
all_pages = np.sort(all_pages)  # stable column order
page2idx = {p: i for i, p in enumerate(all_pages)}
P = len(all_pages)
print("Total distinct pages (train ∪ test):", P)

# Initialize page count table for training samples
page_cnt_df = pd.DataFrame(
    0,
    index=sliding_labels.index,
    columns=[f"cnt_page_{p.replace(' ', '_').replace('/', '_')}" for p in all_pages],
    dtype='int32'
)

# For each user, one pass accumulation over events; snapshot at each cutoff
for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue

    df_u = train_groups[uid]
    ts_vals = df_u['ts'].values
    page_idx = df_u['page'].map(page2idx).values  # integer codes

    # Sort cutoffs so we can scan in O(events + cutoffs)
    lbl_u_sorted = lbl_u.sort_values('cutoff_date')
    cutoffs = lbl_u_sorted['cutoff_date'].values
    sample_ids = lbl_u_sorted.index

    counts = np.zeros(P, dtype=np.int32)
    k = 0
    m = len(ts_vals)

    for j, sid in enumerate(sample_ids):
        cutoff = cutoffs[j]
        while k < m and ts_vals[k] <= cutoff:
            counts[page_idx[k]] += 1
            k += 1

        # Write the full vector of page counts for this sample
        page_cnt_df.loc[sid, :] = counts

# Page ratios: divide by n_events (more stable than summing page_cnt_df)
eps = 1e-6
page_ratio_df = (page_cnt_df.div(n_events.astype(float) + eps, axis=0)).astype('float32')
page_ratio_df.columns = [c.replace("cnt_page_", "ratio_page_") for c in page_ratio_df.columns]

print("Page cnt/ratio built:", page_cnt_df.shape, page_ratio_df.shape)


Total distinct pages (train ∪ test): 22
Page cnt/ratio built: (400774, 22) (400774, 22)


## 10. Level features: n_level_change + ever_paid (no leakage: per-cutoff)

In [11]:
# n_level_change: number of free/paid switches before cutoff
n_level_change = pd.Series(0, index=sliding_labels.index, dtype='int32')

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue
    df_u = train_groups[uid]
    lvl_vals = df_u['level'].astype(str).values
    ts_vals  = df_u['ts'].values
    if len(lvl_vals) <= 1:
        continue

    change_ts = ts_vals[1:][lvl_vals[1:] != lvl_vals[:-1]]
    if len(change_ts) == 0:
        continue

    lbl_u_sorted = lbl_u.sort_values('cutoff_date')
    cutoffs = lbl_u_sorted['cutoff_date'].values
    sample_ids = lbl_u_sorted.index

    hi_change = np.searchsorted(change_ts, cutoffs, side='right')
    n_level_change.loc[sample_ids] = hi_change.astype('int32')

# ever_paid: whether paid has appeared before cutoff
ever_paid = pd.Series(0, index=sliding_labels.index, dtype='int8')
for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue
    df_u = train_groups[uid]
    paid_ts = df_u.loc[df_u['level'].astype(str).values == "paid", 'ts'].values
    if len(paid_ts) == 0:
        continue

    lbl_u_sorted = lbl_u.sort_values('cutoff_date')
    cutoffs = lbl_u_sorted['cutoff_date'].values
    sample_ids = lbl_u_sorted.index

    ever_paid.loc[sample_ids] = (np.searchsorted(paid_ts, cutoffs, side='right') > 0).astype('int8')


## 11. Status code features (keep your original logic)

In [12]:
status_codes_of_interest = [200, 404, 307]
status_count_series = {code: pd.Series(0, index=sliding_labels.index, dtype='int32') for code in status_codes_of_interest}

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue
    df_u = train_groups[uid]

    lbl_u_sorted = lbl_u.sort_values('cutoff_date')
    cutoffs = lbl_u_sorted['cutoff_date'].values
    sample_ids = lbl_u_sorted.index

    for code in status_codes_of_interest:
        df_code = df_u[df_u['status'] == code]
        if df_code.empty:
            continue
        ts_code = df_code['ts'].values
        hi_code = np.searchsorted(ts_code, cutoffs, side='right')
        status_count_series[code].loc[sample_ids] = hi_code.astype('int32')


## 12. Merge all training features X_all (numeric + page cnt/ratio + status + level + session)

In [13]:
X_all = pd.DataFrame(index=sliding_labels.index)

X_all['days_since_registration'] = days_since_registration
X_all['n_events'] = n_events.astype('int32')
X_all['recency_hours'] = recency_hours.astype('float32')

X_all['events_last_7d'] = events_last_7d.astype('int32')
X_all['events_last_3d'] = events_last_3d.astype('int32')
X_all['events_last_1d'] = events_last_1d.astype('int32')

X_all['songs_last_7d'] = songs_last_7d.astype('int32')
X_all['songs_last_3d'] = songs_last_3d.astype('int32')
X_all['songs_last_1d'] = songs_last_1d.astype('int32')

X_all['active_days'] = active_days.astype('int32')
X_all['total_listen_time'] = total_listen_time.astype('float32')

X_all['session_count'] = session_count
X_all['mean_session_duration'] = mean_session_duration
X_all['max_session_duration']  = max_session_duration
X_all['min_session_duration']  = min_session_duration
X_all['std_session_duration']  = std_session_duration

X_all['mean_event_count_per_session'] = mean_event_count_per_session
X_all['max_event_count_per_session']  = max_event_count_per_session
X_all['min_event_count_per_session']  = min_event_count_per_session
X_all['std_event_count_per_session']  = std_event_count_per_session

# Page cnt + ratio
X_all = pd.concat([X_all, page_cnt_df, page_ratio_df], axis=1)

# Level
X_all['ever_paid'] = ever_paid
X_all['n_level_change'] = n_level_change

# Status
for code in status_codes_of_interest:
    X_all[f"n_status_{code}"] = status_count_series[code]

eps = 1e-6
X_all['frac_status_404'] = (X_all['n_status_404'] / (X_all['n_events'] + eps)).astype('float32')
X_all['frac_status_200'] = (X_all['n_status_200'] / (X_all['n_events'] + eps)).astype('float32')

print("X_all shape:", X_all.shape)


X_all shape: (400774, 71)


## 13. One-hot encode categorical features (train)

In [14]:
cat_cols = ['gender', 'state', 'level']
sliding_labels[cat_cols] = sliding_labels[cat_cols].fillna("missing")

cat_ohe = pd.get_dummies(sliding_labels[cat_cols], columns=cat_cols, prefix=cat_cols)
X_train = pd.concat([X_all, cat_ohe], axis=1)
print("X_train shape:", X_train.shape)


X_train shape: (400774, 124)


## 14. Build test features (aligned to train column space: all_pages + one-hot)

In [15]:
test_df_sorted = test_df.sort_values(['userId', 'ts']).copy()
test_groups = dict(tuple(test_df_sorted.groupby('userId')))
test_users = sorted(test_groups.keys())

global_cutoff_test = test_df_sorted['ts'].max().normalize()
print("Test global cutoff_date:", global_cutoff_test)

X_test = pd.DataFrame(index=test_users)

# days_since_registration
uid_registration_test = test_df_sorted.groupby('userId')['registration'].first()
X_test['days_since_registration'] = (
    (global_cutoff_test - uid_registration_test) / np.timedelta64(1, 'D')
).astype('float32').clip(lower=0).reindex(test_users)

# n_events / recency / active_days
n_events_test = pd.Series(0, index=test_users, dtype='int32')
recency_hours_test = pd.Series(0.0, index=test_users, dtype='float32')
active_days_test = pd.Series(0, index=test_users, dtype='int32')

for uid, df_u in test_groups.items():
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    if df_before.empty:
        n_events_test.loc[uid] = 0
        recency_hours_test.loc[uid] = 9999.0
        active_days_test.loc[uid] = 0
        continue

    n_events_test.loc[uid] = len(df_before)
    last_ts = df_before['ts'].iloc[-1]
    recency_hours_test.loc[uid] = float((global_cutoff_test - last_ts) / np.timedelta64(1, 'h'))
    active_days_test.loc[uid] = df_before['ts'].dt.normalize().nunique()

X_test['n_events'] = n_events_test
X_test['recency_hours'] = recency_hours_test
X_test['active_days'] = active_days_test

# recent window counts: events
events_last_7d_test = pd.Series(0, index=test_users, dtype='int32')
events_last_3d_test = pd.Series(0, index=test_users, dtype='int32')
events_last_1d_test = pd.Series(0, index=test_users, dtype='int32')

for uid, df_u in test_groups.items():
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    if df_before.empty:
        continue
    ts_vals = df_before['ts'].values
    events_last_7d_test.loc[uid] = int((ts_vals > (global_cutoff_test - seven_days)).sum())
    events_last_3d_test.loc[uid] = int((ts_vals > (global_cutoff_test - three_days)).sum())
    events_last_1d_test.loc[uid] = int((ts_vals > (global_cutoff_test - one_day)).sum())

X_test['events_last_7d'] = events_last_7d_test
X_test['events_last_3d'] = events_last_3d_test
X_test['events_last_1d'] = events_last_1d_test

# songs + total_listen_time
songs_last_7d_test = pd.Series(0, index=test_users, dtype='int32')
songs_last_3d_test = pd.Series(0, index=test_users, dtype='int32')
songs_last_1d_test = pd.Series(0, index=test_users, dtype='int32')
total_listen_time_test = pd.Series(0.0, index=test_users, dtype='float32')

for uid, df_u in test_groups.items():
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    if df_before.empty:
        continue
    df_song = df_before[df_before['page'] == "NextSong"]
    if df_song.empty:
        continue
    ts_song = df_song['ts'].values
    len_song = df_song['length'].values

    total_listen_time_test.loc[uid] = float(len_song.sum())
    songs_last_7d_test.loc[uid] = int((ts_song > (global_cutoff_test - seven_days)).sum())
    songs_last_3d_test.loc[uid] = int((ts_song > (global_cutoff_test - three_days)).sum())
    songs_last_1d_test.loc[uid] = int((ts_song > (global_cutoff_test - one_day)).sum())

X_test['songs_last_7d'] = songs_last_7d_test
X_test['songs_last_3d'] = songs_last_3d_test
X_test['songs_last_1d'] = songs_last_1d_test
X_test['total_listen_time'] = total_listen_time_test

# level_at_cutoff_test
level_at_cutoff_test = pd.Series("unknown", index=test_users, dtype=object)
for uid, df_u in test_groups.items():
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    if df_before.empty:
        continue
    level_at_cutoff_test.loc[uid] = str(df_before['level'].iloc[-1])

# Page cnt/ratio on test (same all_pages column space)
page_cnt_test = pd.DataFrame(
    0,
    index=test_users,
    columns=page_cnt_df.columns,
    dtype='int32'
)

for uid, df_u in test_groups.items():
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    if df_before.empty:
        continue
    idx = df_before['page'].map(page2idx).values
    bc = np.bincount(idx, minlength=P).astype(np.int32)
    page_cnt_test.loc[uid, :] = bc

page_ratio_test = (page_cnt_test.div(n_events_test.astype(float) + eps, axis=0)).astype('float32')
page_ratio_test.columns = [c.replace("cnt_page_", "ratio_page_") for c in page_ratio_test.columns]

# Session features (test)
X_test['session_count'] = 0
X_test['mean_session_duration'] = 0.0
X_test['max_session_duration']  = 0.0
X_test['min_session_duration']  = 0.0
X_test['std_session_duration']  = 0.0

X_test['mean_event_count_per_session'] = 0.0
X_test['max_event_count_per_session']  = 0.0
X_test['min_event_count_per_session']  = 0.0
X_test['std_event_count_per_session']  = 0.0

for uid, df_u in test_groups.items():
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    if df_before.empty:
        continue

    sess_u = (
        df_before.groupby('sessionId')
        .agg(
            session_start=('ts', 'min'),
            session_end=('ts', 'max'),
            session_event_count=('ts', 'count'),
        )
    )
    if sess_u.empty:
        continue

    dur = ((sess_u['session_end'] - sess_u['session_start']) / np.timedelta64(1, 's')).values
    cnt = sess_u['session_event_count'].values.astype(np.int32)

    X_test.loc[uid, 'session_count'] = len(sess_u)
    X_test.loc[uid, 'mean_session_duration'] = dur.mean()
    X_test.loc[uid, 'max_session_duration']  = dur.max()
    X_test.loc[uid, 'min_session_duration']  = dur.min()
    X_test.loc[uid, 'std_session_duration']  = dur.std(ddof=0)

    X_test.loc[uid, 'mean_event_count_per_session'] = cnt.mean()
    X_test.loc[uid, 'max_event_count_per_session']  = cnt.max()
    X_test.loc[uid, 'min_event_count_per_session']  = cnt.min()
    X_test.loc[uid, 'std_event_count_per_session']  = cnt.std(ddof=0)

# Level features (test; only one global cutoff)
X_test['ever_paid'] = 0
X_test['n_level_change'] = 0

for uid, df_u in test_groups.items():
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    if df_before.empty:
        continue
    lv = df_before['level'].astype(str).values
    X_test.loc[uid, 'ever_paid'] = 1 if (lv == 'paid').any() else 0
    if len(lv) > 1:
        X_test.loc[uid, 'n_level_change'] = int(np.sum(lv[1:] != lv[:-1]))

# Status features (test; same status_codes_of_interest)
for code in status_codes_of_interest:
    X_test[f"n_status_{code}"] = 0

for uid, df_u in test_groups.items():
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    if df_before.empty:
        continue
    for code in status_codes_of_interest:
        X_test.loc[uid, f"n_status_{code}"] = int((df_before['status'] == code).sum())

X_test['frac_status_404'] = (X_test['n_status_404'] / (X_test['n_events'] + eps)).astype('float32')
X_test['frac_status_200'] = (X_test['n_status_200'] / (X_test['n_events'] + eps)).astype('float32')

# Merge page cnt/ratio into X_test
X_test = pd.concat([X_test, page_cnt_test, page_ratio_test], axis=1)

# Static categorical features (test): gender/state/level
test_user_static = (
    test_df_sorted.groupby('userId')
    .agg({'gender': 'first', 'state': 'first'})
)

cat_test = pd.DataFrame(index=test_users)
cat_test['gender'] = test_user_static['gender']
cat_test['state']  = test_user_static['state']
cat_test['level']  = level_at_cutoff_test
cat_test = cat_test.fillna("missing")

cat_test_ohe = pd.get_dummies(
    cat_test,
    columns=['gender', 'state', 'level'],
    prefix=['gender', 'state', 'level']
)
cat_test_ohe = cat_test_ohe.reindex(columns=cat_ohe.columns, fill_value=0)

X_test_full = pd.concat([X_test, cat_test_ohe], axis=1)

# Align test columns to train columns
X_test_full = X_test_full.reindex(columns=X_train.columns, fill_value=0)

print("X_test_full shape:", X_test_full.shape)


Test global cutoff_date: 2018-11-20 00:00:00


  recency_hours_test.loc[uid] = float((global_cutoff_test - last_ts) / np.timedelta64(1, 'h'))
  total_listen_time_test.loc[uid] = float(len_song.sum())


X_test_full shape: (2904, 124)


## 15. Heavy-tail feature transforms: log1p + ratio normalization (train/test consistent)

In [16]:
def add_tail_transforms(X: pd.DataFrame) -> pd.DataFrame:
    X = X.copy()
    eps = 1e-6

    # 1) log1p for heavy-tailed features (counts/cumulative quantities)
    log_cols = [
        'n_events',
        'events_last_7d', 'events_last_3d', 'events_last_1d',
        'songs_last_7d', 'songs_last_3d', 'songs_last_1d',
        'total_listen_time',
        'session_count',
        'mean_session_duration', 'max_session_duration', 'min_session_duration', 'std_session_duration',
        'mean_event_count_per_session', 'max_event_count_per_session', 'min_event_count_per_session', 'std_event_count_per_session',
        'recency_hours',
    ]

    # Log-transform page/status count columns; do NOT log-transform ratio columns
    log_cols += [c for c in X.columns if c.startswith('cnt_page_')]
    log_cols += [c for c in X.columns if c.startswith('n_status_')]
    log_cols += [c for c in X.columns if c.startswith('cnt_status_')]

    for c in log_cols:
        if c in X.columns:
            X[c] = np.log1p(np.clip(X[c].astype(float), a_min=0, a_max=None)).astype('float32')

    # 2) Ratio-style normalization to control for user scale
    if 'active_days' in X.columns and 'n_events' in X.columns:
        X['events_per_active_day'] = (X['n_events'] / (X['active_days'] + eps)).astype('float32')
    if 'active_days' in X.columns and 'songs_last_7d' in X.columns:
        X['songs7_per_active_day'] = (X['songs_last_7d'] / (X['active_days'] + eps)).astype('float32')
    if 'active_days' in X.columns and 'total_listen_time' in X.columns:
        X['listen_time_per_active_day'] = (X['total_listen_time'] / (X['active_days'] + eps)).astype('float32')

    if 'days_since_registration' in X.columns and 'n_events' in X.columns:
        X['events_per_day_since_reg'] = (X['n_events'] / (X['days_since_registration'] + 1.0)).astype('float32')

    if 'total_listen_time' in X.columns and 'n_events' in X.columns:
        X['listen_time_per_event'] = (X['total_listen_time'] / (X['n_events'] + eps)).astype('float32')

    # If cnt_page_NextSong exists, add average listen time per song
    if 'total_listen_time' in X.columns and 'cnt_page_NextSong' in X.columns:
        X['listen_time_per_song'] = (X['total_listen_time'] / (X['cnt_page_NextSong'] + eps)).astype('float32')

    if 'session_count' in X.columns and 'active_days' in X.columns:
        X['sessions_per_active_day'] = (X['session_count'] / (X['active_days'] + eps)).astype('float32')

    return X

X_train = add_tail_transforms(X_train)
X_test_full = add_tail_transforms(X_test_full)
X_test_full = X_test_full.reindex(columns=X_train.columns, fill_value=0)

print("After tail transforms:")
print("X_train shape:", X_train.shape, "X_test_full shape:", X_test_full.shape)


After tail transforms:
X_train shape: (400774, 131) X_test_full shape: (2904, 131)


## 16. Model training (your original logic)

In [17]:
def oversample(X, y):
    """
    Simple oversampling: repeat minority class to roughly match majority size.
    """
    X = X.copy()
    X['target'] = y
    major = X[X['target'] == 0]
    minor = X[X['target'] == 1]
    if len(minor) == 0:
        raise ValueError("No positive samples.")
    ratio = max(1, len(major) // len(minor))
    minor_ov = pd.concat([minor] * ratio, ignore_index=True)
    df_new = pd.concat([major, minor_ov], axis=0).sample(frac=1.0, random_state=42)
    y_new = df_new['target'].values
    X_new = df_new.drop(columns=['target'])
    print("Positive rate after oversample:", y_new.mean())
    return X_new, y_new


### 16.1 LightGBM

In [18]:
import lightgbm as lgb

lgb_clf = lgb.LGBMClassifier(
    n_estimators=800,
    learning_rate=0.03,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    max_depth=-1,
    random_state=42
)
lgb_clf.fit(X_train, y_train)
pred_lgb = lgb_clf.predict_proba(X_test_full)[:, 1]

example_sub = pd.read_csv("example_submission.csv")
user_ids = example_sub['id'].astype(str)

proba_lgb_aligned = pd.Series(pred_lgb, index=X_test_full.index).loc[user_ids].values
threshold = np.quantile(proba_lgb_aligned, 0.5)
pred_label = (proba_lgb_aligned >= threshold).astype(int)

submission = pd.DataFrame({"id": example_sub["id"], "target": pred_label})
submission.to_csv("submission_LightGBM.csv", index=False)
print("Saved submission_LightGBM.csv")


[LightGBM] [Info] Number of positive: 20143, number of negative: 380631
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017586 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13516
[LightGBM] [Info] Number of data points in the train set: 400774, number of used features: 121
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.050260 -> initscore=-2.938974
[LightGBM] [Info] Start training from score -2.938974
Saved submission_LightGBM.csv


### 16.2 Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

X_lr, y_lr = oversample(X_train, y_train.values)
scaler_lr = StandardScaler()
X_lr_scaled = scaler_lr.fit_transform(X_lr)
X_test_lr_scaled = scaler_lr.transform(X_test_full)

lr_clf = LogisticRegression(max_iter=2000, solver="liblinear")
lr_clf.fit(X_lr_scaled, y_lr)
pred_lr = lr_clf.predict_proba(X_test_lr_scaled)[:, 1]

proba_lr_aligned = pd.Series(pred_lr, index=X_test_full.index).loc[user_ids].values
threshold = np.quantile(proba_lr_aligned, 0.5)
pred_label = (proba_lr_aligned >= threshold).astype(int)

submission = pd.DataFrame({"id": example_sub["id"], "target": pred_label})
submission.to_csv("submission_LR.csv", index=False)
print("Saved submission_LR.csv")


Positive rate after oversample: 0.48785193856338427
Saved submission_LR.csv


### 16.4 RandomForest

In [20]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(
    n_estimators=400,
    max_depth=12,
    min_samples_split=20,
    min_samples_leaf=10,
    max_features='sqrt',
    n_jobs=-1,
    random_state=42
)
rf_clf.fit(X_train, y_train)
pred_rf = rf_clf.predict_proba(X_test_full)[:, 1]

proba_rf_aligned = pd.Series(pred_rf, index=X_test_full.index).loc[user_ids].values
threshold = np.quantile(proba_rf_aligned, 0.5)
pred_label = (proba_rf_aligned >= threshold).astype(int)

submission = pd.DataFrame({"id": example_sub["id"], "target": pred_label})
submission.to_csv("submission_RF.csv", index=False)
print("Saved submission_RF.csv")


Saved submission_RF.csv


## 17. Ensemble (examples: soft voting with Top-50% rule)

In [21]:
# Ensemble 1: LR + LGBM soft voting (Top 50%)
example_sub = pd.read_csv("example_submission.csv")
user_ids = example_sub["id"].astype(str)

proba_lr_aligned = (
    pd.Series(pred_lr, index=X_test_full.index.astype(str))
    .loc[user_ids]
    .values
)

proba_lgb_aligned = (
    pd.Series(pred_lgb, index=X_test_full.index.astype(str))
    .loc[user_ids]
    .values
)

w_lr, w_lgb = 1.0, 1.0  # you can try (2,1), (3,1), (4,1), etc.
voting_proba = (w_lr * proba_lr_aligned + w_lgb * proba_lgb_aligned) / (w_lr + w_lgb)

threshold = np.quantile(voting_proba, 0.5)
print("LR + LGBM voting threshold (top50) =", threshold)

pred_label = (voting_proba >= threshold).astype(int)

submission = pd.DataFrame({
    "id": example_sub["id"],
    "target": pred_label
})
print(submission["target"].value_counts(normalize=True))
submission.to_csv("submission_LR_LGBM_voting.csv", index=False)
print("Saved submission_LR_LGBM_voting.csv")


LR + LGBM voting threshold (top50) = 0.3189396564927801
target
0    0.5
1    0.5
Name: proportion, dtype: float64
Saved submission_LR_LGBM_voting.csv


In [22]:
# Ensemble 2: RF + LGBM + LR weighted soft voting (Top 50%)
w_rf, w_lgb, w_lr = 1.0, 3.0, 1.0

example_sub = pd.read_csv("example_submission.csv")
user_ids = example_sub["id"].astype(str)

proba_rf_aligned = (
    pd.Series(pred_rf, index=X_test_full.index.astype(str))
    .loc[user_ids]
    .values
)

proba_lgb_aligned = (
    pd.Series(pred_lgb, index=X_test_full.index.astype(str))
    .loc[user_ids]
    .values
)

proba_lr_aligned = (
    pd.Series(pred_lr, index=X_test_full.index.astype(str))
    .loc[user_ids]
    .values
)

voting_proba = (
    w_rf  * proba_rf_aligned +
    w_lgb * proba_lgb_aligned +
    w_lr  * proba_lr_aligned
) / (w_rf + w_lgb + w_lr)

threshold = np.quantile(voting_proba, 0.5)
print("RF + LGBM + LR voting threshold =", threshold)

pred_label = (voting_proba >= threshold).astype(int)

submission = pd.DataFrame({
    "id": example_sub["id"],
    "target": pred_label
})

print(submission["target"].value_counts(normalize=True))
submission.to_csv("submission_RF_LGBM_LR_voting_weighted.csv", index=False)
print("Saved submission_RF_LGBM_LR_voting_weighted.csv")


RF + LGBM + LR voting threshold = 0.16635179788828938
target
0    0.5
1    0.5
Name: proportion, dtype: float64
Saved submission_RF_LGBM_LR_voting_weighted.csv


In [23]:
# Ensemble 3: Sweep LR:LGBM weights (Top 50%)
weight_grid = [
    (1.0, 6.1),
    (1.0, 6.2),
    (1.0, 6.3),
    (1.0, 6.4),
    (1.0, 6.5),
    (1.0, 6.6),
    (1.0, 6.7),
    (1.0, 6.8),
    (1.0, 6.9),
]

example_sub = pd.read_csv("example_submission.csv")
user_ids = example_sub["id"].astype(str)

proba_lr_aligned = (
    pd.Series(pred_lr, index=X_test_full.index.astype(str))
    .loc[user_ids]
    .values
)

proba_lgb_aligned = (
    pd.Series(pred_lgb, index=X_test_full.index.astype(str))
    .loc[user_ids]
    .values
)

for w_lr, w_lgb in weight_grid:
    print(f"\n=== Voting: LR:LGBM = {w_lr}:{w_lgb} ===")
    voting_proba = (w_lr * proba_lr_aligned + w_lgb * proba_lgb_aligned) / (w_lr + w_lgb)

    threshold = np.quantile(voting_proba, 0.5)
    print("threshold =", threshold)

    pred_label = (voting_proba >= threshold).astype(int)

    submission = pd.DataFrame({
        "id": example_sub["id"],
        "target": pred_label
    })

    print("positive ratio:", submission["target"].mean())

    fname = f"submission_LR_LGBM_voting_w{w_lr}_w{w_lgb}.csv"
    submission.to_csv(fname, index=False)
    print("Saved:", fname)



=== Voting: LR:LGBM = 1.0:6.1 ===
threshold = 0.13123549313526348
positive ratio: 0.5
Saved: submission_LR_LGBM_voting_w1.0_w6.1.csv

=== Voting: LR:LGBM = 1.0:6.2 ===
threshold = 0.13017550453965482
positive ratio: 0.5
Saved: submission_LR_LGBM_voting_w1.0_w6.2.csv

=== Voting: LR:LGBM = 1.0:6.3 ===
threshold = 0.12917160851810322
positive ratio: 0.5
Saved: submission_LR_LGBM_voting_w1.0_w6.3.csv

=== Voting: LR:LGBM = 1.0:6.4 ===
threshold = 0.12831287384940493
positive ratio: 0.5
Saved: submission_LR_LGBM_voting_w1.0_w6.4.csv

=== Voting: LR:LGBM = 1.0:6.5 ===
threshold = 0.12738394395856076
positive ratio: 0.5
Saved: submission_LR_LGBM_voting_w1.0_w6.5.csv

=== Voting: LR:LGBM = 1.0:6.6 ===
threshold = 0.12642738982385338
positive ratio: 0.5
Saved: submission_LR_LGBM_voting_w1.0_w6.6.csv

=== Voting: LR:LGBM = 1.0:6.7 ===
threshold = 0.12558149356130538
positive ratio: 0.5
Saved: submission_LR_LGBM_voting_w1.0_w6.7.csv

=== Voting: LR:LGBM = 1.0:6.8 ===
threshold = 0.1247703523720

In [24]:
# Ensemble 4: Custom weight grid for (RF, LGBM, LR)
example_sub = pd.read_csv("example_submission.csv")

# (w_rf, w_lgb, w_lr)
weight_grid = [
    (1, 3, 1),
]

for w_rf, w_lgb, w_lr in weight_grid:
    voting_proba = (
        w_rf  * proba_rf_aligned +
        w_lgb * proba_lgb_aligned +
        w_lr  * proba_lr_aligned
    ) / (w_rf + w_lgb + w_lr)

    threshold = np.quantile(voting_proba, 0.5)
    pred = (voting_proba >= threshold).astype(int)

    submission = pd.DataFrame({
        "id": example_sub["id"],
        "target": pred
    })

    fname = f"submission_RF_LGBM_LR_{w_rf}_{w_lgb}_{w_lr}.csv"
    submission.to_csv(fname, index=False)

    print(
        f"{fname} | "
        f"thr={threshold:.6f} | "
        f"pos_rate={pred.mean():.4f}"
    )

submission_RF_LGBM_LR_1_3_1.csv | thr=0.166352 | pos_rate=0.5000
