In [None]:
import pandas as pd
import numpy as np

## 1. Preprocess

In [None]:
def preprocess_events(events):
    """Clean one raw event log and return it sorted by (userId, ts).

    The same steps are applied to the train and the test log so the two
    frames cannot drift apart:

    * ``ts`` is parsed as epoch milliseconds; ``registration`` and ``time``
      are parsed with pandas' default datetime inference.
    * ``page`` and ``location`` are cast to str and stripped.
    * ``state`` is the last two characters of ``location``
      (e.g. "New York, NY" -> "NY").

    Rows whose ``location`` is missing get ``state = NaN``. Casting NaN to str
    yields the literal "nan", and slicing that would produce the bogus state
    code "an"; masking with the original null flags avoids it. The
    ``location`` column itself keeps its stringified value.
    """
    # 1. Convert time fields
    events['ts'] = pd.to_datetime(events['ts'], unit='ms')
    events['registration'] = pd.to_datetime(events['registration'])
    events['time'] = pd.to_datetime(events['time'])

    # 2. Clean the 'page' field (strip spaces)
    events['page'] = events['page'].astype(str).str.strip()

    # 3. Extract state from location (last 2 chars) as 'state'
    location_known = events['location'].notna()  # captured before the str cast
    events['location'] = events['location'].astype(str).str.strip()
    events['state'] = events['location'].str[-2:].where(location_known)

    # 4. Sort by userId + ts to ensure chronological order per user
    return events.sort_values(['userId', 'ts'])


# Read the raw parquet files and run the shared preprocessing on each
train_df = preprocess_events(pd.read_parquet("train.parquet"))
test_df = preprocess_events(pd.read_parquet("test.parquet"))

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

## 2. Window label

In [None]:
# 1. Sliding-window configuration
horizon_days = 10  # prediction horizon length = 10 days (competition setting)

# Earliest cutoff, chosen by hand
cutoff_start = pd.Timestamp("2018-10-15")

# Latest cutoff: take the last event date in train (truncated to midnight) and
# step back horizon_days + 1 days. That is one day earlier than the bound
# "cutoff + horizon_days <= max_ts_date" strictly requires.
max_ts_date = train_df['ts'].max().normalize()
cutoff_end = max_ts_date - pd.Timedelta(horizon_days + 1, unit='D')

for label, stamp in (
    ("cutoff_start:", cutoff_start),
    ("max_ts_date :", max_ts_date),
    ("cutoff_end  :", cutoff_end),
):
    print(label, stamp.date())

In [None]:
# One cutoff per calendar day, from cutoff_start to cutoff_end (both inclusive)
cutoff_dates = pd.date_range(cutoff_start, cutoff_end, freq="D")

print("Number of cutoff_dates:", len(cutoff_dates))
print("cutoff_dates preview:", cutoff_dates[:5].tolist(), "...", cutoff_dates[-5:].tolist())

In [None]:
# 2. For each cutoff_date, generate samples (userId, cutoff_date, target)

# Earliest 'Cancellation Confirmation' timestamp per user; users who never
# hit that page are simply absent from the resulting Series.
is_cancellation = train_df['page'].eq("Cancellation Confirmation")
first_churn_ts = train_df.loc[is_cancellation].groupby('userId')['ts'].min()

print("Number of users who churned at least once:", len(first_churn_ts))

In [None]:
all_samples_list = []

# Pieces that do not depend on the cutoff are computed once up front
event_ts = train_df['ts']
is_cancel_event = train_df['page'] == "Cancellation Confirmation"
horizon = pd.Timedelta(days=horizon_days)

for cutoff_date in cutoff_dates:
    print("\n==============================")
    print("Current cutoff_date =", cutoff_date.date())

    # ---- 2.1 Observation window & prediction window ----
    # Observation window: ts <= cutoff_date
    # Prediction window : cutoff_date < ts <= cutoff_date + horizon_days
    future_end = cutoff_date + horizon
    obs_mask = event_ts.le(cutoff_date)
    fut_mask = event_ts.gt(cutoff_date) & event_ts.le(future_end)

    # ---- 2.2 Users with at least one event in the observation window ----
    users_obs = np.sort(train_df.loc[obs_mask, 'userId'].unique())
    print("Users in observation window (including already churned):", len(users_obs))

    if users_obs.size == 0:
        print("No observed users for this cutoff, skipping")
        continue

    # ---- 2.2.1 Drop users whose first churn is at or before the cutoff ----
    # reindex yields NaT for users that never churned; those are kept,
    # as are users whose churn lies strictly after the cutoff.
    churn_ts_sub = first_churn_ts.reindex(users_obs)
    already_churned = churn_ts_sub.notna() & (churn_ts_sub <= cutoff_date)
    alive_users = users_obs[~already_churned.values]

    print("Alive users after filtering churned:", len(alive_users))

    if alive_users.size == 0:
        print("No alive users for this cutoff, skipping")
        continue

    # ---- 2.3 Alive users who hit 'Cancellation Confirmation' in the prediction window ----
    churners_in_window = train_df.loc[fut_mask & is_cancel_event, 'userId'].unique()
    cc_future_users = np.intersect1d(churners_in_window, alive_users)
    print("Alive users with churn(=1) in prediction window:", len(cc_future_users))

    # ---- 2.4 / 2.5 One row per alive user: target is 1 for future churners, else 0 ----
    cutoff_samples = pd.DataFrame({
        "userId": alive_users,
        "cutoff_date": cutoff_date,   # scalar, broadcast to the whole batch
        "target": np.isin(alive_users, cc_future_users).astype(int),
    })

    print("Samples for this cutoff:", len(cutoff_samples),
          "  Positives:", cutoff_samples['target'].sum(),
          "  Positive rate:", cutoff_samples['target'].mean())

    all_samples_list.append(cutoff_samples)

In [None]:
# 3. Stack the per-cutoff sample frames into one sliding-window label table
sliding_labels = pd.concat(all_samples_list, axis=0, ignore_index=True)

target_col = sliding_labels['target']

print("\n==============================")
print("Overall after merging all cutoffs:")
print("Total samples:", len(sliding_labels))
print("Total positives:", target_col.sum())
print("Overall positive rate:", target_col.mean())
print("\nTarget distribution:")
print(target_col.value_counts(normalize=True))

## 3. Training set features

### 3.1 Category features

In [None]:
# Static user attributes: one row per user, taken from the earliest event in
# train that has a non-null value (groupby 'first' skips nulls).
user_static = (
    train_df
    .sort_values('ts')
    .groupby('userId')[['gender', 'state']]
    .first()
)

# Attach both attributes to every (user, cutoff) sample via userId lookup
for static_col in ('gender', 'state'):
    sliding_labels[static_col] = sliding_labels['userId'].map(user_static[static_col])

print("sliding_labels with gender/state:")
print(sliding_labels[['userId', 'cutoff_date', 'target', 'gender', 'state']].head())

### 3.2 Multi-Index

In [None]:
# Index every sample by the string key "<userId>_<cutoff_date>".
# (This is a flat string index, not a pandas MultiIndex.)
sample_id = sliding_labels['userId'].astype(str).str.cat(
    sliding_labels['cutoff_date'].astype(str), sep="_"
)
sliding_labels = sliding_labels.assign(sample_id=sample_id).set_index('sample_id')

# Target variable and cutoff timestamps, both indexed by sample_id
y_all = sliding_labels['target']
cutoff_ts_all = sliding_labels['cutoff_date']

print("First rows of sliding_labels:")
print(sliding_labels.head())
print("First rows of y_all:")
print(y_all.head())
print("First rows of cutoff_ts_all:")
print(cutoff_ts_all.head())

### 3.3 Lifetime

In [None]:
# Registration timestamp per user (first non-null value in time order)
events_by_time = train_df.sort_values('ts')
uid_registration = events_by_time.groupby('userId')['registration'].first()

# Attach to each sample
sliding_labels['registration_ts'] = sliding_labels['userId'].map(uid_registration)

# Account age at the cutoff, in (fractional) days. Negative ages, i.e. a
# registration timestamp after the cutoff, are clipped to 0.
account_age = sliding_labels['cutoff_date'] - sliding_labels['registration_ts']
days_since_registration = (
    (account_age / pd.Timedelta(days=1))
    .astype('float32')
    .clip(lower=0)
)

print("days_since_registration examples:")
print(days_since_registration.head())

### 3.4 Behaviors

In [None]:
# Ensure cutoff_date is datetime: the feature cells below read
# lbl_u['cutoff_date'].values and pass it to np.searchsorted against the
# event timestamp arrays, so both sides must be datetime-typed.
sliding_labels['cutoff_date'] = pd.to_datetime(sliding_labels['cutoff_date'])

In [None]:
# 1) Per-user chronological order (the searchsorted-based features rely on it)
train_df_sorted = train_df.sort_values(['userId', 'ts']).copy()

# 2) userId -> that user's event rows
train_groups = {uid: events for uid, events in train_df_sorted.groupby('userId')}

# 3) userId -> that user's sample rows (indexed by sample_id)
label_groups = {uid: samples for uid, samples in sliding_labels.groupby('userId')}

print("Number of users (train):", len(train_groups))
# Users whose first event is after the last cutoff have no samples
print("Number of users (sliding_labels):", len(label_groups))

In [None]:
# Cumulative event count per sample: number of the user's events with ts <= cutoff.
# Starts at 0 for every sample_id and is filled user by user.
n_events = pd.Series(0, index=sliding_labels.index, dtype='int32')

for uid, lbl_u in label_groups.items():
    df_u = train_groups.get(uid)
    if df_u is None:
        # user has samples but no events in train
        continue

    # ts is sorted per user, so the right-side insertion point of each cutoff
    # equals the number of events at or before it.
    counts = np.searchsorted(
        df_u['ts'].values,
        lbl_u['cutoff_date'].values,
        side='right',
    )
    n_events.loc[lbl_u.index] = counts.astype('int32')

print("n_events computed")
print(n_events.head())

### 3.5 recency_hours

In [None]:
# recency_hours = cutoff_date - timestamp of the user's last event at or
# before the cutoff, expressed in hours.
#
# The whole per-user batch is computed with array operations and written back
# in a single float32 assignment. This avoids one label-based scalar write per
# sample, each of which pushes a float64 value into a float32 Series.

# Sentinel for "no event at or before the cutoff" (9999 hours ~ 416 days)
NO_EVENT_RECENCY_HOURS = 9999.0
one_hour = np.timedelta64(1, 'h')

recency_hours = pd.Series(np.nan, index=sliding_labels.index, dtype='float32')

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue

    ts_vals = train_groups[uid]['ts'].values   # sorted datetime64 array
    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index

    # hi[j] = number of events <= cutoff_j, so the last such event is hi[j] - 1
    hi = np.searchsorted(ts_vals, cutoffs, side='right')
    has_event = hi > 0

    # Clamp the index to 0 where hi == 0 so the lookup stays in bounds; those
    # positions are overwritten with the sentinel below.
    last_ts = ts_vals[np.maximum(hi - 1, 0)]
    hours = (cutoffs - last_ts) / one_hour

    recency_hours.loc[sample_ids] = np.where(
        has_event, hours, NO_EVENT_RECENCY_HOURS
    ).astype('float32')

print("recency_hours computed")
print(recency_hours.head())

### 3.6 events_last_7d

In [None]:
# Events in the 7 days up to the cutoff: cutoff - 7d < ts <= cutoff
events_last_7d = pd.Series(0, index=sliding_labels.index, dtype='int32')

seven_days = np.timedelta64(7, 'D')

for uid, lbl_u in label_groups.items():
    df_u = train_groups.get(uid)
    if df_u is None:
        continue

    ts_vals = df_u['ts'].values             # sorted datetime64 array
    cutoffs = lbl_u['cutoff_date'].values

    # Difference of two cumulative counts gives the count inside the window
    upto_cutoff = np.searchsorted(ts_vals, cutoffs, side='right')
    upto_window_start = np.searchsorted(ts_vals, cutoffs - seven_days, side='right')

    events_last_7d.loc[lbl_u.index] = (upto_cutoff - upto_window_start).astype('int32')

print("events_last_7d computed")
print(events_last_7d.head())

### 3.7 songs_last_7d

In [None]:
# NextSong events in the 7 days up to the cutoff: cutoff - 7d < ts <= cutoff
songs_last_7d = pd.Series(0, index=sliding_labels.index, dtype='int32')

seven_days = np.timedelta64(7, 'D')

for uid, lbl_u in label_groups.items():
    df_u = train_groups.get(uid)
    if df_u is None:
        continue

    # Timestamps of song plays only (still sorted); users without any play keep 0
    ts_song_vals = df_u.loc[df_u['page'] == "NextSong", 'ts'].values
    if ts_song_vals.size == 0:
        continue

    cutoffs = lbl_u['cutoff_date'].values

    # Difference of two cumulative counts gives the count inside the window
    upto_cutoff = np.searchsorted(ts_song_vals, cutoffs, side='right')
    upto_window_start = np.searchsorted(ts_song_vals, cutoffs - seven_days, side='right')

    songs_last_7d.loc[lbl_u.index] = (upto_cutoff - upto_window_start).astype('int32')

print("songs_last_7d computed")
print(songs_last_7d.head())

### 3.8 active_days

In [None]:
# active_days = number of distinct calendar days on which the user has at
# least one event with ts <= cutoff.
#
# The count must be driven by real event timestamps, not by normalized dates.
# Cutoffs sit at midnight (they come from a daily pd.date_range), so a
# normalized date equal to the cutoff would compare as "<= cutoff" even though
# every event of that day happens after the cutoff. That would leak
# prediction-window activity into the feature and disagree with the test-side
# definition (filter ts <= cutoff first, then count distinct dates).
active_days = pd.Series(0, index=sliding_labels.index, dtype='int32')

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue

    df_u = train_groups[uid]
    ts_vals = df_u['ts'].values                   # sorted datetime64 array
    day_vals = df_u['ts'].dt.normalize().values   # calendar day of each event

    # Because events are sorted, a new day starts wherever the day differs
    # from the previous row; the first row always starts a day.
    starts_new_day = np.ones(len(day_vals), dtype=bool)
    starts_new_day[1:] = day_vals[1:] != day_vals[:-1]

    # Timestamp of the first event of each active day
    first_ts_of_day = ts_vals[starts_new_day]

    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index

    # A day counts as active by the cutoff iff its first event is <= cutoff
    hi = np.searchsorted(first_ts_of_day, cutoffs, side='right')

    active_days.loc[sample_ids] = hi.astype('int32')

print("active_days computed")
print(active_days.head())

### 3.9 total_listen_time

In [None]:
# total_listen_time = summed 'length' of all NextSong events with ts <= cutoff.
#
# A zero-padded prefix sum turns "sum of the first h songs" into a plain array
# lookup. The whole per-user batch is written back in one float32 assignment
# instead of one label-based scalar write per sample.
total_listen_time = pd.Series(0.0, index=sliding_labels.index, dtype='float32')

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue

    df_u = train_groups[uid]

    # Song plays only; users with no play keep the initial 0.0
    df_song = df_u[df_u['page'] == "NextSong"]
    if df_song.empty:
        continue

    ts_song_vals = df_song['ts'].values         # song timestamps (sorted)
    len_song_vals = df_song['length'].values    # song durations

    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index

    # hi[j] = number of songs with ts <= cutoff_j
    hi = np.searchsorted(ts_song_vals, cutoffs, side='right')

    # cum_len[h] = total length of the first h songs; cum_len[0] = 0 covers
    # the "no song before the cutoff" case without a branch.
    cum_len = np.concatenate(([0.0], np.cumsum(len_song_vals)))

    total_listen_time.loc[sample_ids] = cum_len[hi].astype('float32')

print("total_listen_time computed")
print(total_listen_time.head())

### 3.10 events_last_1d/3d

In [None]:
# Event counts in the last 1 and 3 days before the cutoff
# (window: cutoff - N days < ts <= cutoff)
events_last_1d = pd.Series(0, index=sliding_labels.index, dtype='int32')
events_last_3d = pd.Series(0, index=sliding_labels.index, dtype='int32')

one_day   = np.timedelta64(1, 'D')
three_days = np.timedelta64(3, 'D')

event_windows = ((events_last_1d, one_day), (events_last_3d, three_days))

for uid, lbl_u in label_groups.items():
    df_u = train_groups.get(uid)
    if df_u is None:
        continue

    ts_vals = df_u['ts'].values
    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index

    # Cumulative count up to the cutoff, shared by both windows
    hi = np.searchsorted(ts_vals, cutoffs, side='right')

    for window_counts, window in event_windows:
        lo = np.searchsorted(ts_vals, cutoffs - window, side='right')
        window_counts.loc[sample_ids] = (hi - lo).astype('int32')

print("events_last_1d / 3d computed")
print(events_last_1d.head())
print(events_last_3d.head())

### 3.11 songs_last_1d/3d

In [None]:
# NextSong counts in the last 1 and 3 days before the cutoff
# (window: cutoff - N days < ts <= cutoff)
songs_last_1d = pd.Series(0, index=sliding_labels.index, dtype='int32')
songs_last_3d = pd.Series(0, index=sliding_labels.index, dtype='int32')

one_day    = np.timedelta64(1, 'D')
three_days = np.timedelta64(3, 'D')

song_windows = ((songs_last_1d, one_day), (songs_last_3d, three_days))

for uid, lbl_u in label_groups.items():
    df_u = train_groups.get(uid)
    if df_u is None:
        continue

    # Song-play timestamps only; users without any play keep 0
    ts_song_vals = df_u.loc[df_u['page'] == "NextSong", 'ts'].values
    if ts_song_vals.size == 0:
        continue

    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index

    # Cumulative song count up to the cutoff, shared by both windows
    hi = np.searchsorted(ts_song_vals, cutoffs, side='right')

    for window_counts, window in song_windows:
        lo = np.searchsorted(ts_song_vals, cutoffs - window, side='right')
        window_counts.loc[sample_ids] = (hi - lo).astype('int32')

print("songs_last_1d / 3d computed")
print(songs_last_1d.head())
print(songs_last_3d.head())

### 3.12 Level-at-cutoff

In [None]:
# Subscription level carried by the user's last event at or before the cutoff;
# "unknown" when the user has no such event.
level_at_cutoff = pd.Series("unknown", index=sliding_labels.index, dtype=object)

for uid, lbl_u in label_groups.items():
    df_u = train_groups.get(uid)
    if df_u is None:
        continue

    ts_vals = df_u['ts'].values
    lvl_vals = df_u['level'].astype(str).values   # level of each event, as str

    # n_before[j] = number of events <= cutoff_j, so the last one is n_before[j] - 1
    n_before = np.searchsorted(ts_vals, lbl_u['cutoff_date'].values, side='right')

    level_at_cutoff.loc[lbl_u.index] = [
        lvl_vals[n - 1] if n != 0 else "unknown"
        for n in n_before
    ]

print("Level-at-cutoff computed")
print(level_at_cutoff.head())

In [None]:
# Write level_at_cutoff back to sliding_labels for one-hot later.
# Both objects are indexed by sample_id, so the assignment aligns row by row;
# the one-hot cell reads this column as 'level'.
sliding_labels['level'] = level_at_cutoff

### 3.13 Sessions

In [None]:
# Session features per sample: number of sessions started by the cutoff, plus
# mean / max / min / std (population, ddof=0) of session duration in seconds
# and of events per session.
#
# Only events with ts <= cutoff may contribute. A session that starts before
# the cutoff but is still running at the cutoff is therefore truncated to the
# events seen so far. Using its full end time and event count would leak
# prediction-window activity into the feature.

# Initialize series (index is still sample_id)
session_count = pd.Series(0, index=sliding_labels.index, dtype='int32')
mean_session_duration = pd.Series(0.0, index=sliding_labels.index, dtype='float32')
max_session_duration = pd.Series(0.0, index=sliding_labels.index, dtype='float32')
min_session_duration = pd.Series(0.0, index=sliding_labels.index, dtype='float32')
std_session_duration = pd.Series(0.0, index=sliding_labels.index, dtype='float32')

mean_event_count_per_session = pd.Series(0.0, index=sliding_labels.index, dtype='float32')
max_event_count_per_session = pd.Series(0.0, index=sliding_labels.index, dtype='float32')
min_event_count_per_session = pd.Series(0.0, index=sliding_labels.index, dtype='float32')
std_event_count_per_session = pd.Series(0.0, index=sliding_labels.index, dtype='float32')

one_second = np.timedelta64(1, 's')

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue

    df_u = train_groups[uid]

    # One row per sessionId: first/last event time and event count over the
    # user's full history, ordered by session start.
    sess_u = (
        df_u
        .groupby('sessionId')
        .agg(
            session_start=('ts', 'min'),
            session_end=('ts', 'max'),
            session_event_count=('ts', 'count'),
        )
        .sort_values('session_start')
    )

    if sess_u.empty:
        continue

    sess_id_vals = sess_u.index.values
    sess_start_vals = sess_u['session_start'].values
    sess_end_vals = sess_u['session_end'].values
    sess_event_vals = sess_u['session_event_count'].values.astype('int32')
    sess_dur_vals = (sess_end_vals - sess_start_vals) / one_second  # seconds

    # Raw events, needed to truncate sessions that straddle a cutoff
    ev_ts_vals = df_u['ts'].values
    ev_sess_vals = df_u['sessionId'].values

    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index

    # hi_sess[j] = number of sessions that started <= cutoff_j
    hi_sess = np.searchsorted(sess_start_vals, cutoffs, side='right')

    # Per-sample stats collected in arrays (columns: mean, max, min, std) and
    # written back once per user; rows stay 0 where no session has started.
    dur_stats = np.zeros((len(sample_ids), 4), dtype='float64')
    cnt_stats = np.zeros((len(sample_ids), 4), dtype='float64')

    for j, h in enumerate(hi_sess):
        if h == 0:
            continue

        cutoff = cutoffs[j]
        dur_subset = sess_dur_vals[:h]
        cnt_subset = sess_event_vals[:h]

        # Sessions started by the cutoff whose last event lies after it
        open_idx = np.flatnonzero(sess_end_vals[:h] > cutoff)
        if open_idx.size:
            dur_subset = dur_subset.copy()
            cnt_subset = cnt_subset.copy()
            seen = ev_ts_vals <= cutoff
            for k in open_idx:
                # Never empty: the session's first event is <= cutoff
                in_sess = seen & (ev_sess_vals == sess_id_vals[k])
                cnt_subset[k] = in_sess.sum()
                dur_subset[k] = (ev_ts_vals[in_sess].max() - sess_start_vals[k]) / one_second

        dur_stats[j] = (
            dur_subset.mean(), dur_subset.max(), dur_subset.min(), dur_subset.std(ddof=0)
        )
        cnt_stats[j] = (
            cnt_subset.mean(), cnt_subset.max(), cnt_subset.min(), cnt_subset.std(ddof=0)
        )

    session_count.loc[sample_ids] = hi_sess.astype('int32')

    mean_session_duration.loc[sample_ids] = dur_stats[:, 0].astype('float32')
    max_session_duration.loc[sample_ids] = dur_stats[:, 1].astype('float32')
    min_session_duration.loc[sample_ids] = dur_stats[:, 2].astype('float32')
    std_session_duration.loc[sample_ids] = dur_stats[:, 3].astype('float32')

    mean_event_count_per_session.loc[sample_ids] = cnt_stats[:, 0].astype('float32')
    max_event_count_per_session.loc[sample_ids] = cnt_stats[:, 1].astype('float32')
    min_event_count_per_session.loc[sample_ids] = cnt_stats[:, 2].astype('float32')
    std_event_count_per_session.loc[sample_ids] = cnt_stats[:, 3].astype('float32')

print("Session features computed")
print(session_count.head(), mean_session_duration.head())

### 3.14 Page Ratio

In [None]:
# Pages whose cumulative counts (and later ratios) become features
pages_of_interest = [
    "NextSong",
    "Thumbs Up",
    "Thumbs Down",
    "Add to Playlist",
    "Roll Advert",
    "Help",
    "Error",
    "Submit Upgrade",
    "Submit Downgrade",
]

# page -> Series of counts per sample_id, all starting at 0
page_count_series = {}
for p in pages_of_interest:
    page_count_series[p] = pd.Series(0, index=sliding_labels.index, dtype='int32')

# Small constant used by the ratio features when dividing by n_events
eps = 1e-6

for uid, lbl_u in label_groups.items():
    df_u = train_groups.get(uid)
    if df_u is None:
        continue

    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index

    for p in pages_of_interest:
        # Sorted timestamps of this page for this user; skip pages never visited
        ts_p = df_u.loc[df_u['page'] == p, 'ts'].values
        if ts_p.size == 0:
            continue

        # Number of visits to this page with ts <= cutoff
        visits = np.searchsorted(ts_p, cutoffs, side='right')
        page_count_series[p].loc[sample_ids] = visits.astype('int32')

print("Page count features computed")

### 3.15 Other level features

In [None]:
# Per-user "ever had a paid event" flag over the WHOLE train history.
# It is no longer used for the training feature below, because it also sees
# events after the cutoff; it is still computed here so the name stays
# available to any later cell that expects it.
ever_paid_uid = (
    train_df
    .groupby('userId')['level']
    .apply(lambda s: int((s.astype(str) == "paid").any()))
)

# 1) ever_paid: 1 if the user has at least one 'paid' event with ts <= cutoff.
#    Computed per cutoff so that an upgrade happening after the cutoff does
#    not leak into the sample.
ever_paid = pd.Series(0, index=sliding_labels.index, dtype='int8')

# 2) n_level_change: number of level switches with ts <= cutoff
n_level_change = pd.Series(0, index=sliding_labels.index, dtype='int32')

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue

    df_u = train_groups[uid]
    # level may have NaN; cast to string
    lvl_vals = df_u['level'].astype(str).values
    ts_vals = df_u['ts'].values   # sorted datetime64 array

    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index

    # ---- ever_paid up to each cutoff ----
    paid_ts = ts_vals[lvl_vals == "paid"]
    if len(paid_ts) > 0:
        n_paid = np.searchsorted(paid_ts, cutoffs, side='right')
        ever_paid.loc[sample_ids] = (n_paid > 0).astype('int8')

    # ---- level changes up to each cutoff ----
    if len(lvl_vals) <= 1:
        continue

    # A change is stamped with the time of the first event at the new level
    change_mask = lvl_vals[1:] != lvl_vals[:-1]
    change_ts = ts_vals[1:][change_mask]

    if len(change_ts) == 0:
        continue

    # hi_change[j] = number of level changes <= cutoff_j
    hi_change = np.searchsorted(change_ts, cutoffs, side='right')
    n_level_change.loc[sample_ids] = hi_change.astype('int32')

print("Level features computed")
print(ever_paid.head(), n_level_change.head())

### 3.16 Status

In [None]:
# Status codes whose cumulative counts become features
status_codes_of_interest = [200, 404, 307]

# code -> Series of counts per sample_id, all starting at 0
status_count_series = {}
for code in status_codes_of_interest:
    status_count_series[code] = pd.Series(0, index=sliding_labels.index, dtype='int32')

# Small constant used by the fraction features when dividing by n_events
eps = 1e-6

for uid, lbl_u in label_groups.items():
    df_u = train_groups.get(uid)
    if df_u is None:
        continue

    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index

    for code in status_codes_of_interest:
        # Sorted timestamps of events with this status; skip codes never seen
        ts_code = df_u.loc[df_u['status'] == code, 'ts'].values
        if ts_code.size == 0:
            continue

        # Number of events with this status and ts <= cutoff
        seen = np.searchsorted(ts_code, cutoffs, side='right')
        status_count_series[code].loc[sample_ids] = seen.astype('int32')

print("Status count features computed")

### 3.17 Combining the features

In [None]:
# Assemble every numeric feature into X_all, indexed by sample_id.
# Columns are added in a fixed order: lifecycle, activity counters, session
# stats, page counts/ratios, paid-related, status counts/fractions.
X_all = pd.DataFrame(index=sliding_labels.index)

# 1) Lifecycle
X_all['days_since_registration'] = days_since_registration

# 2) Activity counters and recency, each cast to its storage dtype
activity_features = [
    ('n_events',          n_events,          'int32'),    # cumulative event count
    ('recency_hours',     recency_hours,     'float32'),  # hours since last event
    ('events_last_7d',    events_last_7d,    'int32'),
    ('songs_last_7d',     songs_last_7d,     'int32'),
    ('active_days',       active_days,       'int32'),
    ('total_listen_time', total_listen_time, 'float32'),
    ('events_last_1d',    events_last_1d,    'int32'),
    ('events_last_3d',    events_last_3d,    'int32'),
    ('songs_last_1d',     songs_last_1d,     'int32'),
    ('songs_last_3d',     songs_last_3d,     'int32'),
]
for feature_name, feature_values, feature_dtype in activity_features:
    X_all[feature_name] = feature_values.astype(feature_dtype)

# 3) Session-related features (stored as computed)
session_features = {
    'session_count':                session_count,
    'mean_session_duration':        mean_session_duration,
    'max_session_duration':         max_session_duration,
    'min_session_duration':         min_session_duration,
    'std_session_duration':         std_session_duration,
    'mean_event_count_per_session': mean_event_count_per_session,
    'max_event_count_per_session':  max_event_count_per_session,
    'min_event_count_per_session':  min_event_count_per_session,
    'std_event_count_per_session':  std_event_count_per_session,
}
for feature_name, feature_values in session_features.items():
    X_all[feature_name] = feature_values

# 4) Page counts and their share of all events.
#    eps keeps the division finite for samples with n_events == 0.
to_safe_name = str.maketrans(" /", "__")
for p in pages_of_interest:
    safe_name = p.translate(to_safe_name)
    col_cnt   = f"cnt_page_{safe_name}"
    col_ratio = f"ratio_page_{safe_name}"

    X_all[col_cnt] = page_count_series[p]
    X_all[col_ratio] = X_all[col_cnt].div(X_all['n_events'] + eps).astype('float32')

print("Page count + ratio features added to X_all")

# 5) Paid-related
X_all['ever_paid']      = ever_paid
X_all['n_level_change'] = n_level_change

# 6) Status counts
for code in status_codes_of_interest:
    col_cnt = f"n_status_{code}"
    X_all[col_cnt] = status_count_series[code]

# 7) Status fractions of all events (404 first, then 200)
for frac_code in (404, 200):
    X_all[f'frac_status_{frac_code}'] = (
        X_all[f'n_status_{frac_code}'].div(X_all['n_events'] + eps)
    ).astype('float32')

print("Status features added to X_all")
print(X_all[['n_status_404', 'frac_status_404']].head())


print("Numeric features merged into X_all")
print("X_all shape:", X_all.shape)
print(X_all.head())
print("\nX_all dtypes:")
print(X_all.dtypes)

### 3.18 Category + One-Hot

In [None]:
# 4. One-hot encode the categorical columns of sliding_labels
cat_cols = ['gender', 'state', 'level']

# Missing categories become their own explicit "missing" level
for cat_col in cat_cols:
    sliding_labels[cat_col] = sliding_labels[cat_col].fillna("missing")

cat_ohe = pd.get_dummies(
    sliding_labels[cat_cols],
    columns=cat_cols,
    prefix=dict(zip(cat_cols, cat_cols)),
)

print("Categorical one-hot feature example:")
print(cat_ohe.head())

# Numeric features and one-hot columns share the sample_id index; put them side by side
X_train = pd.concat([X_all, cat_ohe], axis='columns')

print("X_train shape:", X_train.shape)
print("X_train example:")
print(X_train.head())

# Training target, indexed by sample_id like X_train
y_train = sliding_labels['target']

In [None]:
# Preview the first 100 rows of the training feature matrix
X_train.head(100)

In [None]:
# Preview the first 100 training targets (same sample_id order as X_train)
y_train.head(100)

## 4. Test set features

In [None]:
# Per-user chronological order, then userId -> that user's event rows
test_df_sorted = test_df.sort_values(['userId', 'ts']).copy()
test_groups = {uid: events for uid, events in test_df_sorted.groupby('userId')}
test_users = sorted(test_groups)

print("Number of test users:", len(test_users))

# Single observation cutoff for all test users: the latest test timestamp
# truncated to midnight, so events on that final calendar day fall after it.
global_cutoff_test = test_df_sorted['ts'].max().normalize()
print("Test global cutoff_date:", global_cutoff_test)

# Empty feature table with one row per test user
X_test = pd.DataFrame(index=test_users)


### 4.1 Lifetime

In [None]:

# Registration timestamp per test user (first non-null value in time order)
uid_registration_test = test_df_sorted.groupby('userId')['registration'].first()

# Account age at the global cutoff in (fractional) days; negatives clipped to 0
account_age_test = global_cutoff_test - uid_registration_test
days_since_registration_test = (
    (account_age_test / pd.Timedelta(days=1))
    .astype('float32')
    .clip(lower=0)
)

X_test['days_since_registration'] = days_since_registration_test.reindex(test_users)

print("days_since_registration_test example:")
print(X_test['days_since_registration'].head())


### 4.2 n_events / recency_hours / active_days

In [None]:

# Per test user, over events with ts <= global cutoff:
#   n_events      - number of events
#   recency_hours - hours between the last such event and the cutoff
#   active_days   - number of distinct calendar days with an event
n_events_test = pd.Series(0, index=test_users, dtype='int32')
recency_hours_test = pd.Series(0.0, index=test_users, dtype='float32')
active_days_test = pd.Series(0, index=test_users, dtype='int32')

for uid, df_u in test_groups.items():
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    ts_before = df_before['ts']

    if len(ts_before) == 0:
        # Nothing observed before the cutoff: zero counts, sentinel recency
        n_events_test.loc[uid] = 0
        recency_hours_test.loc[uid] = 9999.0
        active_days_test.loc[uid] = 0
    else:
        n_events_test.loc[uid] = len(ts_before)

        # Rows are time-sorted, so the last row is the most recent event
        since_last = global_cutoff_test - ts_before.iloc[-1]
        recency_hours_test.loc[uid] = float(since_last / np.timedelta64(1, 'h'))

        active_days_test.loc[uid] = ts_before.dt.normalize().nunique()

X_test['n_events'] = n_events_test
X_test['recency_hours'] = recency_hours_test
X_test['active_days'] = active_days_test

print("n_events / recency_hours / active_days examples:")
print(X_test[['n_events', 'recency_hours', 'active_days']].head())


### 4.3 recent 7 / 3 / 1 behavior

In [None]:

# Event counts in the last 7 / 3 / 1 days before the global cutoff
# (window: cutoff - N days < ts <= cutoff)
events_last_7d_test = pd.Series(0, index=test_users, dtype='int32')
events_last_3d_test = pd.Series(0, index=test_users, dtype='int32')
events_last_1d_test = pd.Series(0, index=test_users, dtype='int32')

seven_days  = np.timedelta64(7, 'D')
three_days  = np.timedelta64(3, 'D')
one_day     = np.timedelta64(1, 'D')

recent_event_windows = (
    (events_last_7d_test, seven_days),
    (events_last_3d_test, three_days),
    (events_last_1d_test, one_day),
)

for uid, df_u in test_groups.items():
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    ts_vals = df_before['ts'].values
    if ts_vals.size == 0:
        # No events before the cutoff: all three counts stay 0
        continue

    for window_counts, window in recent_event_windows:
        is_recent = ts_vals > (global_cutoff_test - window)
        window_counts.loc[uid] = int(is_recent.sum())

X_test['events_last_7d'] = events_last_7d_test
X_test['events_last_3d'] = events_last_3d_test
X_test['events_last_1d'] = events_last_1d_test

print("events_last_*_test examples:")
print(X_test[['events_last_7d', 'events_last_3d', 'events_last_1d']].head())


### 4.4 recent 7 / 3 / 1 songs

In [None]:

# NextSong counts in the last 7 / 3 / 1 days before the global cutoff, plus the
# summed 'length' of all NextSong events up to the cutoff.
# Reuses seven_days / three_days / one_day from the events_last_* cell.
songs_last_7d_test = pd.Series(0, index=test_users, dtype='int32')
songs_last_3d_test = pd.Series(0, index=test_users, dtype='int32')
songs_last_1d_test = pd.Series(0, index=test_users, dtype='int32')
total_listen_time_test = pd.Series(0.0, index=test_users, dtype='float32')

recent_song_windows = (
    (songs_last_7d_test, seven_days),
    (songs_last_3d_test, three_days),
    (songs_last_1d_test, one_day),
)

for uid, df_u in test_groups.items():
    # Song plays at or before the cutoff; users with none keep all zeros
    is_song_before = (df_u['ts'] <= global_cutoff_test) & (df_u['page'] == "NextSong")
    df_song = df_u[is_song_before]
    if df_song.empty:
        continue

    ts_song = df_song['ts'].values
    len_song = df_song['length'].values

    # Total listening time (all songs <= cutoff)
    total_listen_time_test.loc[uid] = float(len_song.sum())

    for window_counts, window in recent_song_windows:
        is_recent = ts_song > (global_cutoff_test - window)
        window_counts.loc[uid] = int(is_recent.sum())

X_test['songs_last_7d']     = songs_last_7d_test
X_test['songs_last_3d']     = songs_last_3d_test
X_test['songs_last_1d']     = songs_last_1d_test
X_test['total_listen_time'] = total_listen_time_test

print("songs_last_*_test & total_listen_time_test examples:")
print(X_test[['songs_last_7d', 'songs_last_3d', 'songs_last_1d', 'total_listen_time']].head())


### 4.5 Level-at-cutoff

In [None]:
# Subscription level of each test user at the cutoff; "unknown" when the user
# has no event with ts <= cutoff.
level_at_cutoff_test = pd.Series("unknown", index=test_users, dtype=object)

for uid, df_u in test_groups.items():
    levels_before = df_u.loc[df_u['ts'] <= global_cutoff_test, 'level']
    if not levels_before.empty:
        # Last row at or before the cutoff (rows are taken in their stored order)
        level_at_cutoff_test.loc[uid] = str(levels_before.iloc[-1])

print("level_at_cutoff_test example:")
print(level_at_cutoff_test.head())


### 4.6 Page Count + Page Ratio

In [None]:

# Universe of pages (union of train and test)
all_pages = pd.concat([train_df['page'], test_df['page']]).unique()

# Initialize two features per page, all cnt_page_* columns first and then all
# ratio_page_* columns (spaces in the page name become underscores).
for col_prefix, init_value in (("cnt_page_", 0), ("ratio_page_", 0.0)):
    for p in all_pages:
        X_test[f"{col_prefix}{p.replace(' ', '_')}"] = init_value

for uid, df_u in test_groups.items():
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    total_events = len(df_before)
    if total_events == 0:
        continue

    # Per-page event count and its share of the user's events up to the cutoff.
    # Only columns created above are written, so no new column is ever added here.
    for p, cnt in df_before['page'].value_counts().items():
        page_key = p.replace(' ', '_')
        cname = f"cnt_page_{page_key}"
        rname = f"ratio_page_{page_key}"
        if cname in X_test.columns:
            X_test.loc[uid, cname] = cnt
        if rname in X_test.columns:
            X_test.loc[uid, rname] = cnt / total_events

### 4.7 Status Count + Status Ratio

In [None]:

# Universe of status codes (non-null values seen in train or test)
all_status = pd.concat([train_df['status'], test_df['status']]).dropna().unique()

# Initialize one count column and one ratio column per status code
for s in all_status:
    X_test[f"cnt_status_{s}"] = 0
    X_test[f"ratio_status_{s}"] = 0.0

for uid, df_u in test_groups.items():
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    total_events = len(df_before)
    if total_events == 0:
        continue

    # Count of each status and its share of all of the user's events up to the cutoff
    for s, cnt in df_before['status'].value_counts().items():
        X_test.loc[uid, f"cnt_status_{s}"] = cnt
        X_test.loc[uid, f"ratio_status_{s}"] = cnt / total_events

### 4.8 Level Features

In [None]:
# Level-history features, defaulting to 0 for users without events up to the cutoff
for level_col in ('is_paid_last', 'ever_paid', 'n_level_change'):
    X_test[level_col] = 0

for uid, df_u in test_groups.items():
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    if df_before.empty:
        continue

    levels = df_before['level'].astype(str).values

    # 1 when the last row up to the cutoff is 'paid'
    X_test.loc[uid, 'is_paid_last'] = int(levels[-1] == 'paid')

    # 1 when 'paid' appears anywhere in the history up to the cutoff
    X_test.loc[uid, 'ever_paid'] = int('paid' in levels)

    # Number of consecutive-row pairs whose level differs
    if len(levels) > 1:
        X_test.loc[uid, 'n_level_change'] = (levels[1:] != levels[:-1]).sum()

### 4.9 Session Duration Features

In [None]:
# Session-duration statistics (seconds), defaulting to 0.0
for stat in ('mean', 'max', 'min', 'std'):
    X_test[f'session_{stat}_duration'] = 0.0

for uid, df_u in test_groups.items():
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    if df_before.empty:
        continue

    # Duration of a session = last ts - first ts within that sessionId, in seconds
    durations = np.array([
        (g['ts'].max() - g['ts'].min()) / np.timedelta64(1, 's')
        for _, g in df_before.groupby('sessionId')
    ])
    if durations.size == 0:
        continue

    # ndarray.std() is the population standard deviation (ddof=0)
    for stat in ('mean', 'max', 'min', 'std'):
        X_test.loc[uid, f'session_{stat}_duration'] = getattr(durations, stat)()

### 4.10 Session Event Count Features

In [None]:

# Events-per-session statistics, defaulting to 0.0
for stat in ('mean', 'max', 'min', 'std'):
    X_test[f'session_{stat}_events'] = 0.0

for uid, df_u in test_groups.items():
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    if df_before.empty:
        continue

    # Number of rows in each of the user's sessions up to the cutoff
    event_counts = np.array(df_before.groupby('sessionId').size().values)
    if event_counts.size == 0:
        continue

    # ndarray.std() is the population standard deviation (ddof=0)
    for stat in ('mean', 'max', 'min', 'std'):
        X_test.loc[uid, f'session_{stat}_events'] = getattr(event_counts, stat)()

### 4.11 Category + One-Hot

In [None]:
# Static gender/state for test: first non-null value per user
test_user_static = test_df_sorted.groupby('userId')[['gender', 'state']].first()

# One row per test user; categories missing for a user become the literal "missing"
cat_test = (
    pd.DataFrame(index=test_users)
    .assign(
        gender=test_user_static['gender'],
        state=test_user_static['state'],
        level=level_at_cutoff_test,
    )
    .fillna("missing")
)

cat_test_ohe = pd.get_dummies(
    cat_test,
    columns=['gender', 'state', 'level'],
    prefix=['gender', 'state', 'level'],
)

print("Test categorical one-hot example:")
print(cat_test_ohe.head())

# Align to train categorical columns: categories absent from test are added as
# all-zero columns, and columns not present in cat_ohe are dropped.
cat_test_ohe = cat_test_ohe.reindex(columns=cat_ohe.columns, fill_value=0)

print("cat_test_ohe shape after aligning to train:", cat_test_ohe.shape)

### 4.12 Combining features

In [None]:
# Final test features: numeric block + one-hot block, then forced into the exact
# column set and order of X_train (columns missing from test are filled with 0).
X_test_full = (
    pd.concat([X_test, cat_test_ohe], axis=1)
    .reindex(columns=X_train.columns, fill_value=0)
)

print("X_test_full shape:", X_test_full.shape)
print(X_test_full.head())

In [None]:
# Display the first 100 rows of the aligned test feature matrix for a visual check.
X_test_full.head(100)

## 5. Models

In [None]:
# Simple oversampling helper
def oversample(X, y):
    """
    Balance a binary dataset by replicating the positive-class rows.

    Rows labelled 1 are repeated ``max(1, n_negative // n_positive)`` times,
    appended to the rows labelled 0, and the result is shuffled with a fixed
    seed (42) so repeated calls on the same input give the same output.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. It is copied, so the caller's frame is not modified.
    y : array-like of shape (len(X),)
        Binary labels (0 / 1) in the same row order as ``X``. Labels are
        matched to rows by position, not by index, so a pandas Series with an
        arbitrary index is accepted.

    Returns
    -------
    X_new : pandas.DataFrame
        Shuffled features after oversampling (no 'target' column).
    y_new : numpy.ndarray
        Labels aligned row-by-row with ``X_new``.

    Raises
    ------
    ValueError
        If there is no row labelled 1.
    """
    X = X.copy()
    # Assign positionally: a pandas Series would otherwise be aligned on its
    # index, turning labels into NaN whenever that index differs from X's.
    X['target'] = np.asarray(y)
    major = X[X['target'] == 0]
    minor = X[X['target'] == 1]

    if len(minor) == 0:
        raise ValueError("No positive samples")

    # Integer replication factor; stays 1 when positives are not the minority
    ratio = max(1, len(major) // len(minor))
    minor_ov = pd.concat([minor] * ratio, ignore_index=True)

    # Shuffle so replicated positives are not grouped at the end
    df_new = pd.concat([major, minor_ov], axis=0).sample(frac=1.0, random_state=42)
    y_new = df_new['target'].values
    X_new = df_new.drop(columns=['target'])

    print("Positive rate after oversampling:", y_new.mean())
    return X_new, y_new

## 6. Grid Search 

In [None]:
### A. Grid Search for Logistic Regression
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score


def cv_auc_lr(params, X, y, n_splits=3, random_state=42):
    """
    Mean validation ROC AUC of a Logistic Regression configuration.

    Runs shuffled stratified K-fold cross-validation. In every fold the
    training part is oversampled, a StandardScaler is fitted on the oversampled
    training part only, and the untouched validation part is scored.

    params       : extra keyword arguments forwarded to LogisticRegression
    X, y         : features and labels, indexed positionally via .iloc
    n_splits     : number of folds
    random_state : seed for the fold shuffling
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_aucs = []

    for fit_pos, hold_pos in splitter.split(X, y):
        X_fit, y_fit = X.iloc[fit_pos], y.iloc[fit_pos]
        X_hold, y_hold = X.iloc[hold_pos], y.iloc[hold_pos]

        # Oversample the training fold only, so validation rows are never duplicated into it
        X_fit_os, y_fit_os = oversample(X_fit, y_fit.values)

        # Scaling statistics come from the oversampled training fold
        scaler = StandardScaler().fit(X_fit_os)

        model = LogisticRegression(max_iter=2000, solver="liblinear", **params)
        model.fit(scaler.transform(X_fit_os), y_fit_os)

        hold_proba = model.predict_proba(scaler.transform(X_hold))[:, 1]
        fold_aucs.append(roc_auc_score(y_hold, hold_proba))

    return np.mean(fold_aucs)


# A relatively small parameter grid to keep computation time reasonable:
# four C values without class weighting, then three with class_weight="balanced".
lr_param_grid = (
    [{"C": c, "class_weight": None} for c in (0.1, 0.3, 1.0, 3.0)]
    + [{"C": c, "class_weight": "balanced"} for c in (0.3, 1.0, 3.0)]
)

best_lr_auc, best_lr_params = -1, None

for candidate in lr_param_grid:
    candidate_auc = cv_auc_lr(candidate, X_train, y_train)
    print(f"LR params={candidate}, mean AUC={candidate_auc:.6f}")
    # Strict '>' keeps the earliest candidate when scores tie
    if candidate_auc > best_lr_auc:
        best_lr_auc, best_lr_params = candidate_auc, candidate

print("\n>>> Best LR params:", best_lr_params)
print(">>> Best LR CV AUC:", best_lr_auc)


In [None]:
### B. Grid Search for Random Forest
from sklearn.ensemble import RandomForestClassifier


def cv_auc_rf(params, X, y, n_splits=3, random_state=42):
    """
    Mean validation ROC AUC of a Random Forest configuration.

    Runs shuffled stratified K-fold cross-validation. In every fold the
    training part is oversampled before fitting; the validation part is
    scored as-is.

    params       : extra keyword arguments forwarded to RandomForestClassifier
    X, y         : features and labels, indexed positionally via .iloc
    n_splits     : number of folds
    random_state : seed for the fold shuffling (the forest itself always uses 42)
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_aucs = []

    for fit_pos, hold_pos in splitter.split(X, y):
        X_fit, y_fit = X.iloc[fit_pos], y.iloc[fit_pos]
        X_hold, y_hold = X.iloc[hold_pos], y.iloc[hold_pos]

        # Oversampling on the training fold for RF (optional)
        X_fit_os, y_fit_os = oversample(X_fit, y_fit.values)

        forest = RandomForestClassifier(
            n_estimators=300,  # Fixed to a moderate value to control complexity
            n_jobs=-1,
            random_state=42,
            **params
        )
        forest.fit(X_fit_os, y_fit_os)

        hold_proba = forest.predict_proba(X_hold)[:, 1]
        fold_aucs.append(roc_auc_score(y_hold, hold_proba))

    return np.mean(fold_aucs)


# Candidate forests as (max_depth, min_samples_split, min_samples_leaf);
# max_features is "sqrt" for all of them.
rf_param_grid = [
    {
        "max_depth": depth,
        "min_samples_split": split_min,
        "min_samples_leaf": leaf_min,
        "max_features": "sqrt",
    }
    for depth, split_min, leaf_min in [
        (8, 10, 5),
        (12, 10, 5),
        (12, 20, 10),
        (None, 20, 10),
    ]
]

best_rf_auc, best_rf_params = -1, None

for candidate in rf_param_grid:
    candidate_auc = cv_auc_rf(candidate, X_train, y_train)
    print(f"RF params={candidate}, mean AUC={candidate_auc:.6f}")
    # Strict '>' keeps the earliest candidate when scores tie
    if candidate_auc > best_rf_auc:
        best_rf_auc, best_rf_params = candidate_auc, candidate

print("\n>>> Best RF params:", best_rf_params)
print(">>> Best RF CV AUC:", best_rf_auc)


In [None]:
### C. Refit Best LR & RF on Full Training Set + Voting

# 1) Logistic Regression: oversample the full training set, fit the scaler on
#    the oversampled data, and apply the same scaling to the test features.
X_lr_os, y_lr_os = oversample(X_train, y_train.values)

scaler_lr = StandardScaler().fit(X_lr_os)
X_lr_os_scaled = scaler_lr.transform(X_lr_os)
X_test_lr_scaled = scaler_lr.transform(X_test_full)

best_lr_clf = LogisticRegression(max_iter=2000, solver="liblinear", **best_lr_params)
best_lr_clf.fit(X_lr_os_scaled, y_lr_os)

# Probability of the positive class for every test user
pred_lr = best_lr_clf.predict_proba(X_test_lr_scaled)[:, 1]

# 2) Random Forest: trained on the oversampled full training set, unscaled
X_rf_os, y_rf_os = oversample(X_train, y_train.values)

best_rf_clf = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42, **best_rf_params)
best_rf_clf.fit(X_rf_os, y_rf_os)

pred_rf = best_rf_clf.predict_proba(X_test_full)[:, 1]


In [None]:
### D. LR + RF Soft Voting
# Voting weights: the Random Forest counts twice as much as the Logistic Regression
w_lr, w_rf = 1.0, 2.0

example_sub = pd.read_csv("example_submission.csv")
user_ids = example_sub["id"].astype(str)

# Reorder each model's probabilities to the row order of the example submission
proba_lr_aligned, proba_rf_aligned = (
    pd.Series(model_pred, index=X_test_full.index).loc[user_ids].values
    for model_pred in (pred_lr, pred_rf)
)

# Weighted average of the two probability vectors
voting_proba = (
    w_lr * proba_lr_aligned +
    w_rf * proba_rf_aligned
) / (w_lr + w_rf)

# Cut at the median so that scores at or above it are labelled positive
threshold = np.quantile(voting_proba, 0.5)
print("Best LR+RF voting top-50% threshold =", threshold)

pred_label = (voting_proba >= threshold).astype(int)

submission = pd.DataFrame({"id": example_sub["id"], "target": pred_label})
print(submission["target"].value_counts(normalize=True))

submission.to_csv("submission_LR_RF_voting_gridsearch.csv", index=False)
print("Saved submission_LR_RF_voting_gridsearch.csv")