In [1]:
import numpy as np
import pandas as pd
import sys
import os

# Add the parent directory (project root) to the import path so the `src`
# package imported below can be resolved.
# Guarded so that re-running this cell does not append duplicate entries.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

# Import custom modules
from src import (
    load_data, 
    clean_data,
    label_churn,
    extract_seasonality,
    extract_user_attributes,
    extract_behavioral_flags,
    aggregate_session_metrics,
    aggregate_user_features
)

# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None

## 1. Load & Clean Data

In [2]:
# Read the raw parquet file and pass it straight through the project's
# cleaning step (per the original note: casting, dropping leakage/redundant cols).
df = clean_data(load_data('../data/train.parquet'))

# Report the cleaned shape, then preview the first rows
print(f"Shape after cleaning: {df.shape}")
df.head()

Shape after cleaning: (17499636, 15)


Unnamed: 0,status,gender,level,userId,ts,page,sessionId,location,itemInSession,userAgent,method,length,song,artist,registration
0,200,M,paid,1749042,2018-10-01 00:00:01,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",278,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,524.32934,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,2018-08-08 13:22:21
992,200,M,paid,1749042,2018-10-01 00:08:45,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",279,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,178.02404,Monster (Album Version),Skillet,2018-08-08 13:22:21
1360,200,M,paid,1749042,2018-10-01 00:11:43,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",280,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,232.61995,Seven Nation Army,The White Stripes,2018-08-08 13:22:21
1825,200,M,paid,1749042,2018-10-01 00:15:35,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",281,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,265.50812,Under The Bridge (Album Version),Red Hot Chili Peppers,2018-08-08 13:22:21
2366,200,M,paid,1749042,2018-10-01 00:20:00,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",282,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,471.69261,Circlesong 6,Bobby McFerrin,2018-08-08 13:22:21


## 2. Define Target (Churn)

In [3]:
# Attach the event-level churn label using a 10-day window.
# Per the original note, aggregate_user_features creates the user-level target
# internally; the label is kept here only to inspect the event-level distribution.
df = label_churn(df, window_days=10)

# Relative frequency of each churn value across the rows of the event-level frame
churn_share = df['churn'].value_counts(normalize=True)
print(churn_share)

churn
0    0.896537
1    0.103463
Name: proportion, dtype: float64


## 3. Feature Extraction (Event-Level)

In [4]:
# Event-level feature steps, applied in order; each takes the frame and
# returns the frame that is fed to the next step:
#   1. Seasonality
#   2. User Attributes
#   3. Behavioral Flags
#   4. Session Metrics (Errors, Redirects)
feature_steps = (
    extract_seasonality,
    extract_user_attributes,
    extract_behavioral_flags,
    aggregate_session_metrics,
)
for apply_step in feature_steps:
    df = apply_step(df)

print("Features extracted.")
df.head()

Features extracted.


Unnamed: 0,status,gender,level,userId,ts,page,sessionId,location,itemInSession,userAgent,method,length,song,artist,registration,churn_ts,churn,hour,dayofweek,is_weekend,account_age_days,platform,state,thumbs_up,thumbs_down,roll_advert,downgrade,is_error,is_redirect
0,200,M,paid,1749042,2018-10-01 00:00:01,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",278,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,524.32934,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,2018-08-08 13:22:21,2018-10-21 01:16:24,0,0,0,0,53.442824,Windows,TX,0,0,0,0,0,0
1,200,M,paid,1749042,2018-10-01 00:08:45,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",279,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,178.02404,Monster (Album Version),Skillet,2018-08-08 13:22:21,2018-10-21 01:16:24,0,0,0,0,53.448889,Windows,TX,0,0,0,0,0,0
2,200,M,paid,1749042,2018-10-01 00:11:43,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",280,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,232.61995,Seven Nation Army,The White Stripes,2018-08-08 13:22:21,2018-10-21 01:16:24,0,0,0,0,53.450949,Windows,TX,0,0,0,0,0,0
3,200,M,paid,1749042,2018-10-01 00:15:35,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",281,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,265.50812,Under The Bridge (Album Version),Red Hot Chili Peppers,2018-08-08 13:22:21,2018-10-21 01:16:24,0,0,0,0,53.453634,Windows,TX,0,0,0,0,0,0
4,200,M,paid,1749042,2018-10-01 00:20:00,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",282,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,471.69261,Circlesong 6,Bobby McFerrin,2018-08-08 13:22:21,2018-10-21 01:16:24,0,0,0,0,53.456701,Windows,TX,0,0,0,0,0,0


## 4. Aggregation: Event-Level to User-Level

We aggregate the event-level data down to **one row per user**. The user-level features include:
- **Rolling Window Features** (Last 1, 3, 7, 14, 30 days)
- **Trend Features** (7d vs 30d activity)
- **Session Quality** (Avg songs/session, Avg duration)
- **Diversity** (Unique artists/songs)

In [5]:
# Collapse the event-level frame into the user-level feature table
user_df = aggregate_user_features(df)

# Summarise the result: dimensions first, then the full list of column names
print(f"User-level dataset shape: {user_df.shape}")
column_names = user_df.columns.tolist()
print("Columns:", column_names)
user_df.head()

User-level dataset shape: (19140, 47)
Columns: ['gender', 'level', 'platform', 'is_thumbs_up', 'is_thumbs_down', 'is_ad', 'is_error', 'is_song', 'length', 'downgrade', 'songs_last_1d', 'errors_last_1d', 'listen_time_last_1d', 'unique_artists_last_1d', 'unique_songs_last_1d', 'songs_last_3d', 'errors_last_3d', 'listen_time_last_3d', 'unique_artists_last_3d', 'unique_songs_last_3d', 'songs_last_7d', 'errors_last_7d', 'listen_time_last_7d', 'unique_artists_last_7d', 'unique_songs_last_7d', 'songs_last_14d', 'errors_last_14d', 'listen_time_last_14d', 'unique_artists_last_14d', 'unique_songs_last_14d', 'songs_last_30d', 'errors_last_30d', 'listen_time_last_30d', 'unique_artists_last_30d', 'unique_songs_last_30d', 'account_lifetime', 'avg_songs_per_day', 'thumbs_ratio', 'errors_per_song', 'trend_songs_7d_vs_30d', 'trend_listen_time_7d_vs_30d', 'total_sessions', 'avg_days_between_sessions', 'avg_songs_per_session', 'avg_session_duration', 'target', 'state_freq']


Unnamed: 0_level_0,gender,level,platform,is_thumbs_up,is_thumbs_down,is_ad,is_error,is_song,length,downgrade,songs_last_1d,errors_last_1d,listen_time_last_1d,unique_artists_last_1d,unique_songs_last_1d,songs_last_3d,errors_last_3d,listen_time_last_3d,unique_artists_last_3d,unique_songs_last_3d,songs_last_7d,errors_last_7d,listen_time_last_7d,unique_artists_last_7d,unique_songs_last_7d,songs_last_14d,errors_last_14d,listen_time_last_14d,unique_artists_last_14d,unique_songs_last_14d,songs_last_30d,errors_last_30d,listen_time_last_30d,unique_artists_last_30d,unique_songs_last_30d,account_lifetime,avg_songs_per_day,thumbs_ratio,errors_per_song,trend_songs_7d_vs_30d,trend_listen_time_7d_vs_30d,total_sessions,avg_days_between_sessions,avg_songs_per_session,avg_session_duration,target,state_freq
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1
1000025,M,paid,Windows,94,13,7,1,1662,417296.59169,0,212,0,52648.4459,197,208,535,1,132961.53374,440,502,687,1,173744.74643,552,645,1049,1,265573.95039,803,965,1662,1,417296.59169,1162,1468,100.460382,16.380778,0.878505,0.000602,1.653032,1.66543,17,5.909434,97.764706,24546.858335,1,0.012487
1000035,F,paid,Linux,117,15,6,1,1266,310364.8659,0,170,0,41213.98983,160,168,170,0,41213.98983,160,168,405,1,99705.547,347,388,574,1,140551.62871,472,544,1133,1,278412.10335,835,1042,63.350567,19.673486,0.886364,0.00079,1.429328,1.432487,21,3.016694,60.285714,14779.279329,0,0.010972
1000083,M,paid,Windows,21,2,8,0,501,122606.27093,0,213,0,52170.90103,203,211,250,0,61312.53977,236,247,406,0,100331.33604,358,391,501,0,122606.27093,427,478,501,0,122606.27093,427,478,34.668854,14.045868,0.913043,0.0,3.238931,3.273275,11,3.151714,45.545455,11146.02463,1,0.007524
1000103,F,paid,Linux,2,1,3,0,57,13554.73009,0,5,0,984.08263,5,5,5,0,984.08263,5,5,5,0,984.08263,5,5,18,0,3785.68327,18,18,18,0,3785.68327,18,18,47.459201,1.176247,0.666667,0.0,1.086957,1.039684,3,15.819734,19.0,4518.243363,0,0.031714
1000164,F,paid,Windows,38,6,20,1,847,209060.65753,0,184,0,42443.54099,173,183,216,0,50847.03993,202,215,313,0,75785.07918,286,306,479,1,117207.44962,400,450,513,1,126008.87092,426,480,99.1475,8.457525,0.863636,0.001181,2.438644,2.405699,15,6.609833,56.466667,13937.377169,0,0.021003


## 5. Final Cleanup & Save

The `aggregate_user_features` function automatically handles:
- Dropping raw timestamps (`registration`, `last_active`)
- **Frequency Encoding** the `state` column (replacing it with `state_freq`)
- Setting the target variable

In [6]:
# Fill NaNs (e.g. from aggregations) in numeric columns only.
# A blanket `fillna(0)` would also target the non-numeric columns
# (`gender`, `level`, `platform`): a missing value there would become the
# integer 0 inside a string column, which can break the Parquet write or
# raise on categorical dtypes. Missing values in those columns are left as-is.
# NOTE(review): if `target` is numeric it is still filled with 0 here, exactly
# as before — confirm that a missing label should really mean "not churned".
numeric_cols = user_df.select_dtypes(include="number").columns
user_df = user_df.fillna({col: 0 for col in numeric_cols})

# Verify columns
print("Final Columns:", user_df.columns.tolist())

# Save the user-level feature table for downstream use
output_path = '../data/user_features.parquet'
user_df.to_parquet(output_path)
print(f"Saved processed data to {output_path}")

Final Columns: ['gender', 'level', 'platform', 'is_thumbs_up', 'is_thumbs_down', 'is_ad', 'is_error', 'is_song', 'length', 'downgrade', 'songs_last_1d', 'errors_last_1d', 'listen_time_last_1d', 'unique_artists_last_1d', 'unique_songs_last_1d', 'songs_last_3d', 'errors_last_3d', 'listen_time_last_3d', 'unique_artists_last_3d', 'unique_songs_last_3d', 'songs_last_7d', 'errors_last_7d', 'listen_time_last_7d', 'unique_artists_last_7d', 'unique_songs_last_7d', 'songs_last_14d', 'errors_last_14d', 'listen_time_last_14d', 'unique_artists_last_14d', 'unique_songs_last_14d', 'songs_last_30d', 'errors_last_30d', 'listen_time_last_30d', 'unique_artists_last_30d', 'unique_songs_last_30d', 'account_lifetime', 'avg_songs_per_day', 'thumbs_ratio', 'errors_per_song', 'trend_songs_7d_vs_30d', 'trend_listen_time_7d_vs_30d', 'total_sessions', 'avg_days_between_sessions', 'avg_songs_per_session', 'avg_session_duration', 'target', 'state_freq']
Saved processed data to ../data/user_features.parquet
