In [1]:
import pandas as pd
import numpy as np

## 1. Preprocess

In [2]:
# 1. Read raw parquet files
train_df = pd.read_parquet("train.parquet")
test_df = pd.read_parquet("test.parquet")

# 3. Convert time-related columns
train_df['ts'] = pd.to_datetime(train_df['ts'], unit='ms')
test_df['ts'] = pd.to_datetime(test_df['ts'], unit='ms')

train_df['registration'] = pd.to_datetime(train_df['registration'])
test_df['registration'] = pd.to_datetime(test_df['registration'])

train_df['time'] = pd.to_datetime(train_df['time'])
test_df['time'] = pd.to_datetime(test_df['time'])

# 4. Clean the 'page' field (strip spaces)
train_df['page'] = train_df['page'].astype(str).str.strip()
test_df['page'] = test_df['page'].astype(str).str.strip()

# 5. Extract state from location (last two characters) as 'state'
#    e.g., "New York, NY" -> "NY"
train_df['location'] = train_df['location'].astype(str).str.strip()
test_df['location'] = test_df['location'].astype(str).str.strip()

train_df['state'] = train_df['location'].str[-2:]
test_df['state'] = test_df['location'].str[-2:]

# 6. Sort by userId + ts to ensure chronological order
train_df = train_df.sort_values(['userId', 'ts'])
test_df = test_df.sort_values(['userId', 'ts'])

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (17499636, 20)
Test shape: (4393179, 20)


## 2. Window label

In [3]:
# 1. Set sliding-window parameters
horizon_days = 10  # prediction window length = 10 days (competition setting)

# Manually set the earliest cutoff start date
cutoff_start = pd.to_datetime("2018-10-01")

# Cutoff end must satisfy: cutoff + 10 days <= train_df['ts'].max().normalize()
# Take max date (daily), then subtract horizon_days
max_ts_date = train_df['ts'].max().normalize()          # usually 2018-11-20
cutoff_end = max_ts_date - pd.Timedelta(days=horizon_days + 1)

print("cutoff_start:", cutoff_start.date())
print("max_ts_date :", max_ts_date.date())
print("cutoff_end  :", cutoff_end.date())

cutoff_start: 2018-10-01
max_ts_date : 2018-11-20
cutoff_end  : 2018-11-09


In [4]:
# Generate a list of cutoffs: one cutoff per day
cutoff_dates = pd.date_range(start=cutoff_start, end=cutoff_end, freq="D")
print("cutoff_dates count:", len(cutoff_dates))
print("cutoff_dates preview:", list(cutoff_dates[:5]), "...", list(cutoff_dates[-5:]))

cutoff_dates count: 40
cutoff_dates preview: [Timestamp('2018-10-01 00:00:00'), Timestamp('2018-10-02 00:00:00'), Timestamp('2018-10-03 00:00:00'), Timestamp('2018-10-04 00:00:00'), Timestamp('2018-10-05 00:00:00')] ... [Timestamp('2018-11-05 00:00:00'), Timestamp('2018-11-06 00:00:00'), Timestamp('2018-11-07 00:00:00'), Timestamp('2018-11-08 00:00:00'), Timestamp('2018-11-09 00:00:00')]


In [5]:
# 2. For each cutoff_date, generate samples (userId, cutoff_date, target)

# First compute each user's first churn timestamp (first time they hit Cancellation Confirmation)
first_churn_ts = (
    train_df[train_df['page'] == "Cancellation Confirmation"]
    .groupby('userId')['ts']
    .min()
)

print("Number of users who churned at least once:", len(first_churn_ts))

Number of users who churned at least once: 4271


In [6]:
all_samples_list = []

for cutoff_date in cutoff_dates:
    print("\n==============================")
    print("Current cutoff_date =", cutoff_date.date())
    
    # ---- 2.1 Observation window & prediction window ----
    # Observation window: ts <= cutoff_date
    obs_mask = (train_df['ts'] <= cutoff_date)
    
    # Prediction window: cutoff_date < ts <= cutoff_date + horizon_days
    future_end = cutoff_date + pd.Timedelta(days=horizon_days)
    fut_mask = (
        (train_df['ts'] > cutoff_date) &
        (train_df['ts'] <= future_end)
    )
    
    # ---- 2.2 Users observed within the observation window ----
    users_obs = train_df.loc[obs_mask, 'userId'].unique()
    users_obs = np.sort(users_obs)
    print("Users in observation window (including churned):", len(users_obs))
    
    if len(users_obs) == 0:
        print("No observed users for this cutoff; skip")
        continue

    # ---- 2.2.1 Filter out users who already churned ----
    # Reindex first_churn_ts by users_obs to align:
    #   index = users_obs, value = first churn ts or NaT
    churn_ts_sub = first_churn_ts.reindex(users_obs)
    
    # Keep conditions:
    #   - churn_ts is NaT -> never churned -> keep
    #   - churn_ts > cutoff_date -> churn happens in the future -> keep
    #   - churn_ts <= cutoff_date -> already churned (or churned on cutoff day) -> drop
    alive_mask = (churn_ts_sub.isna()) | (churn_ts_sub > cutoff_date)
    alive_users = users_obs[alive_mask.values]
    
    print("Alive users after filtering already-churned:", len(alive_users))
    
    if len(alive_users) == 0:
        print("No alive users for this cutoff; skip")
        continue
    
    # ---- 2.3 Users who hit 'Cancellation Confirmation' in the prediction window ----
    cc_future_users = (
        train_df.loc[fut_mask & (train_df['page'] == "Cancellation Confirmation"), 'userId']
        .unique()
    )
    
    # Keep only users who are alive and observed
    cc_future_users = np.intersect1d(cc_future_users, alive_users)
    print("Alive users who churn in prediction window (target=1):", len(cc_future_users))
    
    # ---- 2.4 Build labels for current cutoff ----
    # alive_users defines the sample user list for this cutoff
    # Default target=0; set target=1 for users in cc_future_users
    y_array = np.zeros(len(alive_users), dtype=int)
    pos_mask = np.isin(alive_users, cc_future_users)
    y_array[pos_mask] = 1
    
    # ---- 2.5 Create a DataFrame for this cutoff ----
    tmp = pd.DataFrame({
        "userId": alive_users,
        "cutoff_date": cutoff_date,   # same value for all rows in this batch
        "target": y_array,
    })
    
    print("Samples at this cutoff:", len(tmp),
          "  Positives:", tmp['target'].sum(),
          "  Positive rate:", tmp['target'].mean())
    
    all_samples_list.append(tmp)


Current cutoff_date = 2018-10-01
Users in observation window (including churned): 0
No observed users for this cutoff; skip

Current cutoff_date = 2018-10-02
Users in observation window (including churned): 5261
Alive users after filtering already-churned: 5144
Alive users who churn in prediction window (target=1): 494
Samples at this cutoff: 5144   Positives: 494   Positive rate: 0.09603421461897356

Current cutoff_date = 2018-10-03
Users in observation window (including churned): 8150
Alive users after filtering already-churned: 7904
Alive users who churn in prediction window (target=1): 660
Samples at this cutoff: 7904   Positives: 660   Positive rate: 0.08350202429149797

Current cutoff_date = 2018-10-04
Users in observation window (including churned): 10115
Alive users after filtering already-churned: 9740
Alive users who churn in prediction window (target=1): 715
Samples at this cutoff: 9740   Positives: 715   Positive rate: 0.07340862422997947

Current cutoff_date = 2018-10-05


In [7]:
# 3. Concatenate all cutoff samples into the full sliding-window label table

sliding_labels = pd.concat(all_samples_list, ignore_index=True)

print("\n==============================")
print("Overall after merging all cutoffs:")
print("Total samples:", len(sliding_labels))
print("Total positives:", sliding_labels['target'].sum())
print("Overall positive rate:", sliding_labels['target'].mean())
print("\nTarget distribution:")
print(sliding_labels['target'].value_counts(normalize=True))


Overall after merging all cutoffs:
Total samples: 556930
Total positives: 30052
Overall positive rate: 0.05396010270590559

Target distribution:
target
0    0.94604
1    0.05396
Name: proportion, dtype: float64


## 3. Training set features

### 3.1 Category features

In [8]:
# Static user-level info (train_df is sufficient)
user_static = (
    train_df
    .sort_values('ts')
    .groupby('userId')
    .agg({
        'gender': 'first',   # or 'last' is fine; gender does not change
        'state':  'first',   # extracted from location
    })
)

# Map into sliding_labels
sliding_labels['gender'] = sliding_labels['userId'].map(user_static['gender'])
sliding_labels['state']  = sliding_labels['userId'].map(user_static['state'])

print("sliding_labels with gender/state:")
print(sliding_labels[['userId', 'cutoff_date', 'target', 'gender', 'state']].head())

sliding_labels with gender/state:
    userId cutoff_date  target gender state
0  1000083  2018-10-02       0      M    IN
1  1000164  2018-10-02       0      F    AZ
2  1000280  2018-10-02       0      M    OH
3  1000353  2018-10-02       0      F    TX
4  1000407  2018-10-02       0      M    CO


### 3.2 Multi-Index

In [9]:
# Use userId_cutoff-date as MultiIndex (required)
sliding_labels['sample_id'] = (
    sliding_labels['userId'].astype(str)
    + "_" +
    sliding_labels['cutoff_date'].astype(str)
)

# Set index
sliding_labels = sliding_labels.set_index('sample_id')

# Target y_all
y_all = sliding_labels['target']

# Cutoff timestamps (direct reference)
cutoff_ts_all = sliding_labels['cutoff_date']

print("sliding_labels head:")
print(sliding_labels.head())
print("y_all head:")
print(y_all.head())
print("cutoff_ts_all head:")
print(cutoff_ts_all.head())

sliding_labels head:
                     userId cutoff_date  target gender state
sample_id                                                   
1000083_2018-10-02  1000083  2018-10-02       0      M    IN
1000164_2018-10-02  1000164  2018-10-02       0      F    AZ
1000280_2018-10-02  1000280  2018-10-02       0      M    OH
1000353_2018-10-02  1000353  2018-10-02       0      F    TX
1000407_2018-10-02  1000407  2018-10-02       0      M    CO
y_all head:
sample_id
1000083_2018-10-02    0
1000164_2018-10-02    0
1000280_2018-10-02    0
1000353_2018-10-02    0
1000407_2018-10-02    0
Name: target, dtype: int64
cutoff_ts_all head:
sample_id
1000083_2018-10-02   2018-10-02
1000164_2018-10-02   2018-10-02
1000280_2018-10-02   2018-10-02
1000353_2018-10-02   2018-10-02
1000407_2018-10-02   2018-10-02
Name: cutoff_date, dtype: datetime64[ns]


### 3.3 Lifetime

In [10]:
# Each user's registration timestamp
uid_registration = (
    train_df
    .sort_values('ts')
    .groupby('userId')['registration']
    .first()
)

# Map into sliding_labels
sliding_labels['registration_ts'] = sliding_labels['userId'].map(uid_registration)

# Days from registration to cutoff
days_since_registration = (
    (sliding_labels['cutoff_date'] - sliding_labels['registration_ts'])
    / np.timedelta64(1, 'D')
).astype('float32')

# Clip negative values (rare: abnormal registration timestamp)
days_since_registration = days_since_registration.clip(lower=0)

print("days_since_registration example:")
print(days_since_registration.head())

days_since_registration example:
sample_id
1000083_2018-10-02     24.248739
1000164_2018-10-02     50.602768
1000280_2018-10-02     34.345612
1000353_2018-10-02    152.515518
1000407_2018-10-02     12.492084
dtype: float32


### 3.4 Behaviors

In [11]:
# Ensure cutoff_date is datetime
sliding_labels['cutoff_date'] = pd.to_datetime(sliding_labels['cutoff_date'])

In [12]:
# 1) Sort train_df by userId + ts to ensure per-user order
train_df_sorted = train_df.sort_values(['userId', 'ts']).copy()

# 2) Group behavior data by user (train side)
train_groups = dict(tuple(train_df_sorted.groupby('userId')))

# 3) Group label samples by user (sliding_labels index is sample_id)
label_groups = dict(tuple(sliding_labels.groupby('userId')))  # user -> label table

print("Number of users (train):", len(train_groups))
print("Number of users (sliding_labels):", len(label_groups))  # users appearing only after 2018-11-10 may be missing

Number of users (train): 19140
Number of users (sliding_labels): 18442


In [13]:
# Initialize series for behavior counts

# Initialize: index = sample_id (sliding_labels.index), all zeros
n_events = pd.Series(0, index=sliding_labels.index, dtype='int32')

for uid, lbl_u in label_groups.items():
    # Skip if user has no behavior data in train
    if uid not in train_groups:
        continue

    # All events for this user (already sorted by ts)
    df_u = train_groups[uid]
    ts_vals = df_u['ts'].values  # datetime64[ns] array

    # All samples for this user in sliding_labels
    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index

    # searchsorted: for each cutoff, find number of events <= cutoff
    pos = np.searchsorted(ts_vals, cutoffs, side='right')

    # Write back into n_events (aligned by sample_id)
    n_events.loc[sample_ids] = pos.astype('int32')

print("n_events computed")
print(n_events.head())

n_events computed
sample_id
1000083_2018-10-02    14
1000164_2018-10-02     2
1000280_2018-10-02    37
1000353_2018-10-02    97
1000407_2018-10-02     5
dtype: int32


### 3.5 recency_hours

In [14]:
# Formula: cutoff_date - last event timestamp

# Initialize recency_hours (hours)
recency_hours = pd.Series(np.nan, index=sliding_labels.index, dtype='float32')

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue
    
    df_u = train_groups[uid]
    ts_vals = df_u['ts'].values  # sorted datetime64 array
    
    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index
    
    # hi[j] = number of events <= cutoff
    hi = np.searchsorted(ts_vals, cutoffs, side='right')
    
    # hi==0 means no events <= cutoff; set recency to a large value (e.g., 9999 hours)
    # hi>0 means last event index is hi-1
    for idx_in_uid, sample_id in enumerate(sample_ids):
        h = hi[idx_in_uid]
        
        if h == 0:
            # No event before cutoff -> very large recency (9999 hours ~ 416 days)
            recency_hours.loc[sample_id] = 9999.0
        else:
            last_ts = ts_vals[h-1]
            delta = (cutoffs[idx_in_uid] - last_ts)
            recency_hours.loc[sample_id] = delta / np.timedelta64(1, 'h')

print("recency_hours computed")
print(recency_hours.head())

  recency_hours.loc[sample_id] = delta / np.timedelta64(1, 'h')


recency_hours computed
sample_id
1000083_2018-10-02    12.570556
1000164_2018-10-02     6.302222
1000280_2018-10-02     1.177222
1000353_2018-10-02    18.474167
1000407_2018-10-02    13.384167
dtype: float64


### 3.6 events_last_7d

In [15]:
events_last_7d = pd.Series(0, index=sliding_labels.index, dtype='int32')

seven_days = np.timedelta64(7, 'D')

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue
    
    df_u = train_groups[uid]
    ts_vals = df_u['ts'].values
    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index
    
    # hi = number of events <= cutoff
    hi = np.searchsorted(ts_vals, cutoffs, side='right')

    # lo = number of events <= cutoff - 7 days
    window_starts = cutoffs - seven_days
    lo = np.searchsorted(ts_vals, window_starts, side='right')
    
    # Events in last 7 days = hi - lo
    cnt_7d = hi - lo
    events_last_7d.loc[sample_ids] = cnt_7d.astype('int32')

print("events_last_7d computed")
print(events_last_7d.head())

events_last_7d computed
sample_id
1000083_2018-10-02    14
1000164_2018-10-02     2
1000280_2018-10-02    37
1000353_2018-10-02    97
1000407_2018-10-02     5
dtype: int32


### 3.7 songs_last_7d

In [16]:
songs_last_7d = pd.Series(0, index=sliding_labels.index, dtype='int32')

seven_days = np.timedelta64(7, 'D')

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue
    
    df_u = train_groups[uid]

    # Filter song-play events (NextSong)
    df_song = df_u[df_u['page'] == "NextSong"]
    if df_song.empty:
        # User never listened to songs -> all zeros
        continue
    
    ts_song_vals = df_song['ts'].values  # sorted datetime array

    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index

    # hi = number of songs <= cutoff
    hi = np.searchsorted(ts_song_vals, cutoffs, side='right')

    # lo = number of songs <= cutoff - 7 days
    window_starts = cutoffs - seven_days
    lo = np.searchsorted(ts_song_vals, window_starts, side='right')

    cnt_7d = hi - lo
    songs_last_7d.loc[sample_ids] = cnt_7d.astype('int32')

print("songs_last_7d computed")
print(songs_last_7d.head())

songs_last_7d computed
sample_id
1000083_2018-10-02    11
1000164_2018-10-02     1
1000280_2018-10-02    24
1000353_2018-10-02    75
1000407_2018-10-02     3
dtype: int32


### 3.8 active_days

In [17]:
active_days = pd.Series(0, index=sliding_labels.index, dtype='int32')

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue
    
    df_u = train_groups[uid]

    # Extract dates only (sorted by ts)
    dates_u = df_u['ts'].dt.normalize().values

    # Unique dates (sorted)
    unique_days = np.unique(dates_u)
    
    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index
    
    # searchsorted on date array: count dates <= cutoff
    hi = np.searchsorted(unique_days, cutoffs, side='right')

    # hi[j] is the active day count for this sample
    active_days.loc[sample_ids] = hi.astype('int32')

print("active_days computed")
print(active_days.head())

active_days computed
sample_id
1000083_2018-10-02    2
1000164_2018-10-02    1
1000280_2018-10-02    1
1000353_2018-10-02    2
1000407_2018-10-02    2
dtype: int32


### 3.9 total_listen_time

In [18]:
total_listen_time = pd.Series(0.0, index=sliding_labels.index, dtype='float32')

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue
    
    df_u = train_groups[uid]

    # Filter NextSong events
    df_song = df_u[df_u['page'] == "NextSong"]
    if df_song.empty:
        continue
    
    ts_song_vals = df_song['ts'].values
    len_song_vals = df_song['length'].values

    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index

    # hi = number of songs <= cutoff
    hi = np.searchsorted(ts_song_vals, cutoffs, side='right')

    # Prefix sum of song lengths for fast cumulative lookup
    cum_len = np.cumsum(len_song_vals)

    for j, sample_id in enumerate(sample_ids):
        h = hi[j]
        if h == 0:
            total_listen_time.loc[sample_id] = 0.0
        else:
            total_listen_time.loc[sample_id] = float(cum_len[h-1])

print("total_listen_time computed")
print(total_listen_time.head())

  total_listen_time.loc[sample_id] = float(cum_len[h-1])


total_listen_time computed
sample_id
1000083_2018-10-02     2312.40644
1000164_2018-10-02      227.42159
1000280_2018-10-02     6200.23076
1000353_2018-10-02    18886.05248
1000407_2018-10-02      804.98803
dtype: float64


### 3.10 events_last_1d/3d

In [19]:
events_last_1d = pd.Series(0, index=sliding_labels.index, dtype='int32')
events_last_3d = pd.Series(0, index=sliding_labels.index, dtype='int32')

one_day   = np.timedelta64(1, 'D')
three_days = np.timedelta64(3, 'D')

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue
    
    df_u = train_groups[uid]
    ts_vals = df_u['ts'].values
    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index

    # hi: number of events <= cutoff (same as n_events)
    hi = np.searchsorted(ts_vals, cutoffs, side='right')

    # ---- last 1 day ----
    window_start_1d = cutoffs - one_day
    lo_1d = np.searchsorted(ts_vals, window_start_1d, side='right')
    cnt_1d = hi - lo_1d
    events_last_1d.loc[sample_ids] = cnt_1d.astype('int32')

    # ---- last 3 days ----
    window_start_3d = cutoffs - three_days
    lo_3d = np.searchsorted(ts_vals, window_start_3d, side='right')
    cnt_3d = hi - lo_3d
    events_last_3d.loc[sample_ids] = cnt_3d.astype('int32')

print("events_last_1d / 3d computed")
print(events_last_1d.head())
print(events_last_3d.head())

events_last_1d / 3d computed
sample_id
1000083_2018-10-02    14
1000164_2018-10-02     2
1000280_2018-10-02    37
1000353_2018-10-02    97
1000407_2018-10-02     5
dtype: int32
sample_id
1000083_2018-10-02    14
1000164_2018-10-02     2
1000280_2018-10-02    37
1000353_2018-10-02    97
1000407_2018-10-02     5
dtype: int32


### 3.11 songs_last_1d/3d

In [20]:
songs_last_1d = pd.Series(0, index=sliding_labels.index, dtype='int32')
songs_last_3d = pd.Series(0, index=sliding_labels.index, dtype='int32')

one_day    = np.timedelta64(1, 'D')
three_days = np.timedelta64(3, 'D')

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue
    
    df_u = train_groups[uid]

    df_song = df_u[df_u['page'] == "NextSong"]
    if df_song.empty:
        continue
    
    ts_song_vals = df_song['ts'].values
    cutoffs = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index

    # hi: number of songs <= cutoff
    hi = np.searchsorted(ts_song_vals, cutoffs, side='right')

    # ---- last 1 day ----
    window_start_1d = cutoffs - one_day
    lo_1d = np.searchsorted(ts_song_vals, window_start_1d, side='right')
    cnt_1d = hi - lo_1d
    songs_last_1d.loc[sample_ids] = cnt_1d.astype('int32')

    # ---- last 3 days ----
    window_start_3d = cutoffs - three_days
    lo_3d = np.searchsorted(ts_song_vals, window_start_3d, side='right')
    cnt_3d = hi - lo_3d
    songs_last_3d.loc[sample_ids] = cnt_3d.astype('int32')

print("songs_last_1d / 3d computed")
print(songs_last_1d.head())
print(songs_last_3d.head())

songs_last_1d / 3d computed
sample_id
1000083_2018-10-02    11
1000164_2018-10-02     1
1000280_2018-10-02    24
1000353_2018-10-02    75
1000407_2018-10-02     3
dtype: int32
sample_id
1000083_2018-10-02    11
1000164_2018-10-02     1
1000280_2018-10-02    24
1000353_2018-10-02    75
1000407_2018-10-02     3
dtype: int32


### 3.12 Level-at-cutoff

In [21]:
level_at_cutoff = pd.Series("unknown", index=sliding_labels.index, dtype=object)

for uid, lbl_u in label_groups.items():
    if uid not in train_groups:
        continue
    
    df_u = train_groups[uid]
    ts_vals   = df_u['ts'].values
    lvl_vals  = df_u['level'].astype(str).values   # level sequence (free/paid)
    
    cutoffs    = lbl_u['cutoff_date'].values
    sample_ids = lbl_u.index
    
    # hi[j] = number of events <= cutoff
    hi = np.searchsorted(ts_vals, cutoffs, side='right')
    
    # If hi[j] == 0, user has no event before cutoff -> unknown
    # Otherwise, the last level is lvl_vals[hi[j] - 1]
    lvl_for_samples = []
    for j, h in enumerate(hi):
        if h == 0:
            lvl_for_samples.append("unknown")
        else:
            lvl_for_samples.append(lvl_vals[h-1])
    
    level_at_cutoff.loc[sample_ids] = lvl_for_samples
    
print("Level-at-cutoff computed")
print(level_at_cutoff.head())

Level-at-cutoff computed
sample_id
1000083_2018-10-02    free
1000164_2018-10-02    free
1000280_2018-10-02    free
1000353_2018-10-02    paid
1000407_2018-10-02    free
dtype: object


In [22]:
# Write back to sliding_labels for later one-hot encoding
sliding_labels['level'] = level_at_cutoff

### 3.13 Combining features

In [23]:
# Empty feature table; ensure X_all index is sample_id
X_all = pd.DataFrame(index=sliding_labels.index)

# 1) Lifetime features
X_all['days_since_registration'] = days_since_registration

# 2) Cumulative event count
X_all['n_events'] = n_events.astype('int32')

# 3) Time since last event (hours)
X_all['recency_hours'] = recency_hours.astype('float32')

# 4) Events / songs in last 7 days
X_all['events_last_7d'] = events_last_7d.astype('int32')
X_all['songs_last_7d']  = songs_last_7d.astype('int32')

# 5) Active days (cumulative)
X_all['active_days'] = active_days.astype('int32')

# 6) Total listening time (cumulative)
X_all['total_listen_time'] = total_listen_time.astype('float32')

# 7) Events in last 1 day / 3 days
X_all['events_last_1d'] = events_last_1d.astype('int32')
X_all['events_last_3d'] = events_last_3d.astype('int32')

# 8) Songs in last 1 day / 3 days
X_all['songs_last_1d'] = songs_last_1d.astype('int32')
X_all['songs_last_3d'] = songs_last_3d.astype('int32')

print("Numeric features merged into X_all")
print("X_all shape:", X_all.shape)
print(X_all.head())
print("\nX_all dtypes:")
print(X_all.dtypes)

Numeric features merged into X_all
X_all shape: (556930, 11)
                    days_since_registration  n_events  recency_hours  \
sample_id                                                              
1000083_2018-10-02                24.248739        14      12.570556   
1000164_2018-10-02                50.602768         2       6.302222   
1000280_2018-10-02                34.345612        37       1.177222   
1000353_2018-10-02               152.515518        97      18.474167   
1000407_2018-10-02                12.492084         5      13.384167   

                    events_last_7d  songs_last_7d  active_days  \
sample_id                                                        
1000083_2018-10-02              14             11            2   
1000164_2018-10-02               2              1            1   
1000280_2018-10-02              37             24            1   
1000353_2018-10-02              97             75            2   
1000407_2018-10-02               5    

### 3.14 One-hot

In [24]:
# One-hot encode categorical features directly from sliding_labels
cat_cols = ['gender', 'state', 'level']
sliding_labels[cat_cols] = sliding_labels[cat_cols].fillna("missing")

cat_ohe = pd.get_dummies(
    sliding_labels[cat_cols],
    columns=cat_cols,
    prefix=cat_cols
)

print("One-hot categorical feature example:")
print(cat_ohe.head())

# Merge numeric + categorical
X_train_raw = pd.concat([X_all, cat_ohe], axis=1)

print("X_train shape:", X_train_raw.shape)
print("X_train example:")
print(X_train_raw.head())

# y_train
y_train_raw = sliding_labels['target']

One-hot categorical feature example:
                    gender_F  gender_M  state_AK  state_AL  state_AR  \
sample_id                                                              
1000083_2018-10-02     False      True     False     False     False   
1000164_2018-10-02      True     False     False     False     False   
1000280_2018-10-02     False      True     False     False     False   
1000353_2018-10-02      True     False     False     False     False   
1000407_2018-10-02     False      True     False     False     False   

                    state_AZ  state_CA  state_CO  state_CT  state_DE  ...  \
sample_id                                                             ...   
1000083_2018-10-02     False     False     False     False     False  ...   
1000164_2018-10-02      True     False     False     False     False  ...   
1000280_2018-10-02     False     False     False     False     False  ...   
1000353_2018-10-02     False     False     False     False     False  ...

### 3.15 drop_duplicates

In [25]:
train_for_dedup = X_train_raw.copy()
train_for_dedup['target'] = y_train_raw.values

train_dedup = train_for_dedup.drop_duplicates()

X_train = train_dedup.drop(columns=['target'])
y_train = train_dedup['target']

In [26]:
X_train.head(100)

Unnamed: 0_level_0,days_since_registration,n_events,recency_hours,events_last_7d,songs_last_7d,active_days,total_listen_time,events_last_1d,events_last_3d,songs_last_1d,...,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,level_free,level_paid
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000083_2018-10-02,24.248739,14,12.570556,14,11,2,2312.406494,14,14,11,...,False,False,False,False,False,False,False,False,True,False
1000164_2018-10-02,50.602768,2,6.302222,2,1,1,227.421585,2,2,1,...,False,False,False,False,False,False,False,False,True,False
1000280_2018-10-02,34.345612,37,1.177222,37,24,1,6200.230957,37,37,24,...,False,False,False,False,False,False,False,False,True,False
1000353_2018-10-02,152.515518,97,18.474167,97,75,2,18886.052734,97,97,75,...,True,False,False,False,False,False,False,False,False,True
1000407_2018-10-02,12.492084,5,13.384167,5,3,2,804.988037,5,5,3,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1014528_2018-10-02,23.091064,21,18.725555,21,16,1,3484.074463,21,21,16,...,False,False,False,False,False,False,False,False,False,True
1015404_2018-10-02,81.659561,17,4.081944,17,10,1,2076.051025,17,17,10,...,False,False,False,False,False,False,False,False,True,False
1015508_2018-10-02,3.146308,16,12.596945,16,13,2,3862.067627,16,16,13,...,False,False,False,False,False,False,False,False,True,False
1015557_2018-10-02,23.717754,80,18.695278,80,62,2,14330.617188,80,80,62,...,False,False,False,False,False,False,False,False,True,False


In [27]:
y_train.head(100)

sample_id
1000083_2018-10-02    0
1000164_2018-10-02    0
1000280_2018-10-02    0
1000353_2018-10-02    0
1000407_2018-10-02    0
                     ..
1014528_2018-10-02    0
1015404_2018-10-02    0
1015508_2018-10-02    1
1015557_2018-10-02    0
1015607_2018-10-02    0
Name: target, Length: 100, dtype: int64

## 4. Test set features

In [28]:
# 4.1 Sort by userId + ts and group
test_df_sorted = test_df.sort_values(['userId', 'ts']).copy()
test_groups = dict(tuple(test_df_sorted.groupby('userId')))
test_users = sorted(test_groups.keys())

print("Number of test users:", len(test_users))

# 4.2 Global cutoff (observation end): last day in test
global_cutoff_test = test_df_sorted['ts'].max().normalize()
print("Test global cutoff_date:", global_cutoff_test)

# 4.3 Initialize test feature table: one row per user
X_test = pd.DataFrame(index=test_users)


Number of test users: 2904
Test global cutoff_date: 2018-11-20 00:00:00


### 4.1 Lifetime

In [29]:
# Registration time per user (from test)
uid_registration_test = (
    test_df_sorted
    .groupby('userId')['registration']
    .first()
)

# Days from registration to cutoff
days_since_registration_test = (
    (global_cutoff_test - uid_registration_test) / np.timedelta64(1, 'D')
).astype('float32')
days_since_registration_test = days_since_registration_test.clip(lower=0)

X_test['days_since_registration'] = days_since_registration_test.reindex(test_users)

print("days_since_registration_test example:")
print(X_test['days_since_registration'].head())


days_since_registration_test example:
1000655    66.504227
1000963    73.911598
1001129    86.574089
1001963    40.715820
1002283    54.104652
Name: days_since_registration, dtype: float32


### 4.2 n_events / recency_hours / active_days

In [30]:
n_events_test = pd.Series(0, index=test_users, dtype='int32')
recency_hours_test = pd.Series(0.0, index=test_users, dtype='float32')
active_days_test = pd.Series(0, index=test_users, dtype='int32')

for uid, df_u in test_groups.items():
    # Only consider events before cutoff
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    
    if df_before.empty:
        n_events_test.loc[uid] = 0
        recency_hours_test.loc[uid] = 9999.0
        active_days_test.loc[uid] = 0
        continue
    
    # Total event count
    n_events_test.loc[uid] = len(df_before)
    
    # recency_hours: cutoff - last event time
    last_ts = df_before['ts'].iloc[-1]
    delta_h = (global_cutoff_test - last_ts) / np.timedelta64(1, 'h')
    recency_hours_test.loc[uid] = float(delta_h)
    
    # Active days: number of unique dates
    active_days_test.loc[uid] = df_before['ts'].dt.normalize().nunique()

X_test['n_events'] = n_events_test
X_test['recency_hours'] = recency_hours_test
X_test['active_days'] = active_days_test

print("n_events / recency_hours / active_days example:")
print(X_test[['n_events', 'recency_hours', 'active_days']].head())


  recency_hours_test.loc[uid] = float(delta_h)


n_events / recency_hours / active_days example:
         n_events  recency_hours  active_days
1000655       346     101.752500           11
1000963      2539      47.861389           26
1001129       668      25.063889           10
1001963       718       3.341389           16
1002283      3837       5.711667           25


### 4.3 recent 7 / 3 / 1 behavior

In [31]:
events_last_7d_test = pd.Series(0, index=test_users, dtype='int32')
events_last_3d_test = pd.Series(0, index=test_users, dtype='int32')
events_last_1d_test = pd.Series(0, index=test_users, dtype='int32')

seven_days  = np.timedelta64(7, 'D')
three_days  = np.timedelta64(3, 'D')
one_day     = np.timedelta64(1, 'D')

for uid, df_u in test_groups.items():
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    if df_before.empty:
        continue
    
    ts_vals = df_before['ts'].values
    
    # Last 7 days
    mask_7 = ts_vals > (global_cutoff_test - seven_days)
    events_last_7d_test.loc[uid] = int(mask_7.sum())
    
    # Last 3 days
    mask_3 = ts_vals > (global_cutoff_test - three_days)
    events_last_3d_test.loc[uid] = int(mask_3.sum())
    
    # Last 1 day
    mask_1 = ts_vals > (global_cutoff_test - one_day)
    events_last_1d_test.loc[uid] = int(mask_1.sum())

X_test['events_last_7d'] = events_last_7d_test
X_test['events_last_3d'] = events_last_3d_test
X_test['events_last_1d'] = events_last_1d_test

print("events_last_*_test example:")
print(X_test[['events_last_7d', 'events_last_3d', 'events_last_1d']].head())


events_last_*_test example:
         events_last_7d  events_last_3d  events_last_1d
1000655              33               0               0
1000963             472             259               0
1001129              62               2               0
1001963             326             159             159
1002283             303              89              89


### 4.4 recent 7 / 3 / 1 songs

In [32]:
songs_last_7d_test = pd.Series(0, index=test_users, dtype='int32')
songs_last_3d_test = pd.Series(0, index=test_users, dtype='int32')
songs_last_1d_test = pd.Series(0, index=test_users, dtype='int32')
total_listen_time_test = pd.Series(0.0, index=test_users, dtype='float32')

for uid, df_u in test_groups.items():
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    if df_before.empty:
        continue
    
    df_song = df_before[df_before['page'] == "NextSong"]
    if df_song.empty:
        continue
    
    ts_song = df_song['ts'].values
    len_song = df_song['length'].values
    
    # Total listening time (all songs <= cutoff)
    total_listen_time_test.loc[uid] = float(len_song.sum())
    
    # Songs in last 7 days
    mask_7 = ts_song > (global_cutoff_test - seven_days)
    songs_last_7d_test.loc[uid] = int(mask_7.sum())
    
    # Last 3 days
    mask_3 = ts_song > (global_cutoff_test - three_days)
    songs_last_3d_test.loc[uid] = int(mask_3.sum())
    
    # Last 1 day
    mask_1 = ts_song > (global_cutoff_test - one_day)
    songs_last_1d_test.loc[uid] = int(mask_1.sum())

X_test['songs_last_7d']     = songs_last_7d_test
X_test['songs_last_3d']     = songs_last_3d_test
X_test['songs_last_1d']     = songs_last_1d_test
X_test['total_listen_time'] = total_listen_time_test

print("songs_last_*_test & total_listen_time_test example:")
print(X_test[['songs_last_7d', 'songs_last_3d', 'songs_last_1d', 'total_listen_time']].head())


  total_listen_time_test.loc[uid] = float(len_song.sum())


songs_last_*_test & total_listen_time_test example:
         songs_last_7d  songs_last_3d  songs_last_1d  total_listen_time
1000655             24              0              0        65479.87470
1000963            402            221              0       526127.55185
1001129             46              1              0       139026.22059
1001963            250            123            123       134683.68053
1002283            250             77             77       789821.80947


### 4.5 Level-at-cutoff

In [33]:
level_at_cutoff_test = pd.Series("unknown", index=test_users, dtype=object)

for uid, df_u in test_groups.items():
    df_before = df_u[df_u['ts'] <= global_cutoff_test]
    if df_before.empty:
        continue
    level_at_cutoff_test.loc[uid] = str(df_before['level'].iloc[-1])

print("level_at_cutoff_test example:")
print(level_at_cutoff_test.head())


level_at_cutoff_test example:
1000655    free
1000963    paid
1001129    free
1001963    free
1002283    paid
dtype: object


### 4.6 Category + One-Hot

In [34]:
# Static gender/state in test
test_user_static = (
    test_df_sorted
    .groupby('userId')
    .agg({
        'gender': 'first',
        'state':  'first',
    })
)

cat_test = pd.DataFrame(index=test_users)
cat_test['gender'] = test_user_static['gender']
cat_test['state']  = test_user_static['state']
cat_test['level']  = level_at_cutoff_test

cat_test = cat_test.fillna("missing")

cat_test_ohe = pd.get_dummies(
    cat_test,
    columns=['gender', 'state', 'level'],
    prefix=['gender', 'state', 'level']
)

print("Test categorical one-hot example:")
print(cat_test_ohe.head())

# Align to train categorical columns
cat_test_ohe = cat_test_ohe.reindex(columns=cat_ohe.columns, fill_value=0)

print("Aligned cat_test_ohe shape:", cat_test_ohe.shape)

Test categorical one-hot example:
         gender_F  gender_M  gender_missing  state_AK  state_AL  state_AR  \
1000655      True     False           False     False     False     False   
1000963     False      True           False     False     False     False   
1001129     False      True           False     False     False     False   
1001963      True     False           False     False     False     False   
1002283     False      True           False     False     False     False   

         state_AZ  state_CA  state_CO  state_CT  ...  state_UT  state_VA  \
1000655     False     False     False     False  ...     False      True   
1000963     False     False     False     False  ...     False     False   
1001129     False     False     False     False  ...     False     False   
1001963     False     False     False     False  ...     False     False   
1002283     False     False     False     False  ...     False     False   

         state_VT  state_WA  state_WI  state_W

### 4.7 Combining features

In [35]:
# Final test features (align column order)
X_test_full = pd.concat([X_test, cat_test_ohe], axis=1)
X_test_full = X_test_full.reindex(columns=X_train.columns, fill_value=0)

print("X_test_full shape:", X_test_full.shape)
print(X_test_full.head())

X_test_full shape: (2904, 64)
         days_since_registration  n_events  recency_hours  events_last_7d  \
1000655                66.504227       346     101.752500              33   
1000963                73.911598      2539      47.861389             472   
1001129                86.574089       668      25.063889              62   
1001963                40.715820       718       3.341389             326   
1002283                54.104652      3837       5.711667             303   

         songs_last_7d  active_days  total_listen_time  events_last_1d  \
1000655             24           11        65479.87470               0   
1000963            402           26       526127.55185               0   
1001129             46           10       139026.22059               0   
1001963            250           16       134683.68053             159   
1002283            250           25       789821.80947              89   

         events_last_3d  songs_last_1d  ...  state_TX  state_U

In [36]:
X_test_full.head(100)

Unnamed: 0,days_since_registration,n_events,recency_hours,events_last_7d,songs_last_7d,active_days,total_listen_time,events_last_1d,events_last_3d,songs_last_1d,...,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,level_free,level_paid
1000655,66.504227,346,101.752500,33,24,11,65479.87470,0,0,0,...,False,False,True,False,False,False,False,False,True,False
1000963,73.911598,2539,47.861389,472,402,26,526127.55185,0,259,0,...,False,False,False,False,False,False,False,False,False,True
1001129,86.574089,668,25.063889,62,46,10,139026.22059,0,2,0,...,False,False,False,False,False,False,True,False,True,False
1001963,40.715820,718,3.341389,326,250,16,134683.68053,159,159,123,...,False,False,False,False,False,False,False,False,True,False
1002283,54.104652,3837,5.711667,303,250,25,789821.80947,89,89,77,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1037874,56.995277,1614,80.578333,224,155,22,317178.30474,0,0,0,...,False,False,False,False,False,False,False,False,True,False
1038109,59.933205,2437,174.817222,0,0,15,516133.09474,0,0,0,...,False,False,False,False,False,False,False,False,False,True
1038258,67.642838,1043,104.052778,60,50,13,210747.90598,0,0,0,...,True,False,False,False,False,False,False,False,False,True
1038741,79.969872,39,656.347500,0,0,1,5773.78177,0,0,0,...,False,False,False,False,False,False,False,False,True,False


## 5. Models and submission

In [37]:
# Simple oversampling utility
def oversample(X, y):
    X = X.copy()
    X['target'] = y
    major = X[X['target'] == 0]
    minor = X[X['target'] == 1]

    if len(minor) == 0:
        raise ValueError("No positive samples")

    ratio = max(1, len(major) // len(minor))
    minor_ov = pd.concat([minor] * ratio, ignore_index=True)

    df_new = pd.concat([major, minor_ov], axis=0).sample(frac=1.0, random_state=42)
    y_new = df_new['target'].values
    X_new = df_new.drop(columns=['target'])

    print("Positive rate after oversampling:", y_new.mean())
    return X_new, y_new

### 5.1 Light GBM (0.621)

In [38]:
import lightgbm as lgb

# 1) No oversampling
X_lgb, y_lgb = X_train, y_train

# 2) Model
lgb_clf = lgb.LGBMClassifier(
    n_estimators=800,
    learning_rate=0.03,
    # num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    # max_depth=-1,
    random_state=42,
    max_depth=7,          # added
    num_leaves=32,        # added
    min_data_in_leaf=50,  # added
    feature_fraction=0.8, # added
    bagging_fraction=0.8, # added
    bagging_freq=5,       # added
)

lgb_clf.fit(X_lgb, y_lgb)

# 3) Predict probabilities
pred_lgb = lgb_clf.predict_proba(X_test_full)[:, 1]

# 4) Align order with example_submission
example_sub = pd.read_csv("example_submission.csv")
user_ids = example_sub['id'].astype(str)

proba_aligned = pd.Series(pred_lgb, index=X_test_full.index).loc[user_ids].values

# 5) Top 50% rule
threshold = np.quantile(proba_aligned, 0.5)
pred_label = (proba_aligned >= threshold).astype(int)

submission = pd.DataFrame({
    "id": example_sub["id"],
    "target": pred_label
})

print(submission['target'].value_counts(normalize=True))
submission.to_csv("submission_LightGBM.csv", index=False)
print("Saved submission_LightGBM.csv")

[LightGBM] [Info] Number of positive: 30052, number of negative: 526878
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008599 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2697
[LightGBM] [Info] Number of data points in the train set: 556930, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.053960 -> initscore=-2.864040
[LightGBM] [Info] Start training from score -2.864040
target
0    0.5
1    0.5
Name: proportion, dtype: float64
Saved submission_LightGBM.csv


### 5.2 Logistic Regression (0.624)

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# 1) Oversample
X_lr, y_lr = oversample(X_train, y_train)

# 2) Standardize
scaler_lr = StandardScaler()
X_lr_scaled = scaler_lr.fit_transform(X_lr)
X_test_lr_scaled = scaler_lr.transform(X_test_full)

# 3) Train model
lr_clf = LogisticRegression(
    C=0.1,          # added: smaller than default 1.0 to reduce overfitting
    penalty='l2',   # added
    solver='liblinear',
    class_weight='balanced',
    max_iter=2000   # added
)
lr_clf.fit(X_lr_scaled, y_lr)

# 4) Predict probabilities
pred_lr = lr_clf.predict_proba(X_test_lr_scaled)[:, 1]

# ========== Top 50% submission ==========
example_sub = pd.read_csv("example_submission.csv")
user_ids = example_sub['id'].astype(str)

proba_align = pd.Series(pred_lr, index=X_test_full.index).loc[user_ids].values

threshold = np.quantile(proba_align, 0.5)
print("LR top50 threshold =", threshold)

pred_label = (proba_align >= threshold).astype(int)

submission = pd.DataFrame({
    "id": example_sub['id'],
    "target": pred_label
})
print(submission['target'].value_counts(normalize=True))

submission.to_csv("submission_LR.csv", index=False)
print("Saved submission_LR.csv")

Positive rate after oversampling: 0.4922939941913464
LR top50 threshold = 0.5464098247171293
target
0    0.5
1    0.5
Name: proportion, dtype: float64
Saved submission_LR.csv


### 5.3 ExtraTrees (0.610)

In [40]:
from sklearn.ensemble import ExtraTreesClassifier

# 1) No oversampling for now
X_et, y_et = X_train, y_train

# 2) ExtraTrees model
et_clf = ExtraTreesClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='sqrt',
    n_jobs=-1,
    random_state=42
)
et_clf.fit(X_et, y_et)

# 3) Predict probabilities
pred_et = et_clf.predict_proba(X_test_full)[:, 1]

# ========== Top 50% ==========
example_sub = pd.read_csv("example_submission.csv")
user_ids = example_sub['id'].astype(str)

proba_align = pd.Series(pred_et, index=X_test_full.index).loc[user_ids].values
threshold = np.quantile(proba_align, 0.5)
print("ET top50 threshold =", threshold)

pred_label = (proba_align >= threshold).astype(int)

submission = pd.DataFrame({
    "id": example_sub['id'],
    "target": pred_label
})
print(submission['target'].value_counts(normalize=True))

submission.to_csv("submission_ET.csv", index=False)
print("Saved submission_ET.csv")

ET top50 threshold = 0.07022361197032842
target
0    0.5
1    0.5
Name: proportion, dtype: float64
Saved submission_ET.csv


### 5.4 KNN (0.56)

In [41]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# 1) Oversample
X_knn, y_knn = oversample(X_train, y_train)

# 2) Standardize
scaler_knn = StandardScaler()
X_knn_scaled = scaler_knn.fit_transform(X_knn)
X_test_knn_scaled = scaler_knn.transform(X_test_full)

# 3) KNN
knn_clf = KNeighborsClassifier(
    n_neighbors=100,
    weights='distance',
    p=2,
    n_jobs=-1
)
knn_clf.fit(X_knn_scaled, y_knn)

# 4) Predict probabilities
pred_knn = knn_clf.predict_proba(X_test_knn_scaled)[:, 1]

# ========== Top 50% ==========
example_sub = pd.read_csv("example_submission.csv")
user_ids = example_sub['id'].astype(str)

proba_align = pd.Series(pred_knn, index=X_test_full.index).loc[user_ids].values
threshold = np.quantile(proba_align, 0.5)
print("KNN top50 threshold =", threshold)

pred_label = (proba_align >= threshold).astype(int)

submission = pd.DataFrame({
    "id": example_sub['id'],
    "target": pred_label
})
print(submission['target'].value_counts(normalize=True))

submission.to_csv("submission_KNN.csv", index=False)
print("Saved submission_KNN.csv")

Positive rate after oversampling: 0.4922939941913464
KNN top50 threshold = 0.4883727717931538
target
0    0.5
1    0.5
Name: proportion, dtype: float64
Saved submission_KNN.csv


### 5.5 RF (0.610)

In [42]:
from sklearn.ensemble import RandomForestClassifier

# 1) No oversampling for now
X_rf, y_rf = X_train, y_train

# 2) Train RF
rf_clf = RandomForestClassifier(
    n_estimators=400,
    max_depth=7,  # modified
    min_samples_split=20,
    min_samples_leaf=10,
    max_features='sqrt',
    n_jobs=-1,
    random_state=42
)
rf_clf.fit(X_rf, y_rf)

# 3) Predict probabilities
pred_rf = rf_clf.predict_proba(X_test_full)[:, 1]

# ========== Top 50% ==========
example_sub = pd.read_csv("example_submission.csv")
user_ids = example_sub['id'].astype(str)

proba_align = pd.Series(pred_rf, index=X_test_full.index).loc[user_ids].values
threshold = np.quantile(proba_align, 0.5)
print("RF top50 threshold =", threshold)

pred_label = (proba_align >= threshold).astype(int)

submission = pd.DataFrame({
    "id": example_sub['id'],
    "target": pred_label
})
print(submission['target'].value_counts(normalize=True))

submission.to_csv("submission_RF.csv", index=False)
print("Saved submission_RF.csv")

RF top50 threshold = 0.0700059917151174
target
0    0.5
1    0.5
Name: proportion, dtype: float64
Saved submission_RF.csv
