In [1]:
import numpy as np
import pandas as pd
import sys
import os

# Add project root to path so the project-local `src` package imported below can be found.
# The membership check keeps the cell idempotent: re-running it (without a kernel
# restart) no longer appends the same directory to sys.path again.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

# Import custom modules
from src import (
    load_data, 
    clean_data,
    label_churn,
    extract_seasonality,
    extract_user_attributes,
    extract_behavioral_flags,
    aggregate_session_metrics
)

# Do not truncate wide frames when they are displayed: None removes the column limit.
pd.options.display.max_columns = None

## 1. Load & Clean Data

In [2]:
# Read the raw training events and pass them straight through the project cleaner.
# `load_data` / `clean_data` are project helpers from `src`; per the original note,
# cleaning covers dtype casting and dropping leakage/redundant columns.
df = clean_data(load_data('../data/train.parquet'))

# Quick sanity check: dimensions first, then a preview of the first rows.
print(f"Shape after cleaning: {df.shape}")
df.head()

Shape after cleaning: (17499636, 15)


Unnamed: 0,status,gender,level,userId,ts,page,sessionId,location,itemInSession,userAgent,method,length,song,artist,registration
0,200,M,paid,1749042,2018-10-01 00:00:01,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",278,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,524.32934,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,2018-08-08 13:22:21
992,200,M,paid,1749042,2018-10-01 00:08:45,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",279,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,178.02404,Monster (Album Version),Skillet,2018-08-08 13:22:21
1360,200,M,paid,1749042,2018-10-01 00:11:43,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",280,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,232.61995,Seven Nation Army,The White Stripes,2018-08-08 13:22:21
1825,200,M,paid,1749042,2018-10-01 00:15:35,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",281,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,265.50812,Under The Bridge (Album Version),Red Hot Chili Peppers,2018-08-08 13:22:21
2366,200,M,paid,1749042,2018-10-01 00:20:00,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",282,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,471.69261,Circlesong 6,Bobby McFerrin,2018-08-08 13:22:21


## 2. Define Target (Churn)

In [3]:
# Add the churn target using a 10-day window.
# (The exact window semantics live in the project helper `label_churn`.)
df = label_churn(df, window_days=10)

# Class balance of the label. `df` is still event-level here, so these are
# proportions of event rows, not proportions of users.
churn_share = df['churn'].value_counts(normalize=True)
print(churn_share)

churn
0    0.896537
1    0.103463
Name: proportion, dtype: float64


## 3. Feature Extraction (Event-Level)

In [4]:
# Event-level feature extractors, applied in this fixed order.
# Each step receives the current frame and returns the frame for the next step.
feature_steps = (
    extract_seasonality,        # 1. Seasonality
    extract_user_attributes,    # 2. User Attributes
    extract_behavioral_flags,   # 3. Behavioral Flags
    aggregate_session_metrics,  # 4. Session Metrics (Errors, Redirects)
)
for apply_step in feature_steps:
    df = apply_step(df)

print("Features extracted.")
df.head()

Features extracted.


Unnamed: 0,status,gender,level,userId,ts,page,sessionId,location,itemInSession,userAgent,method,length,song,artist,registration,churn_ts,churn,hour,dayofweek,is_weekend,account_age_days,platform,state,thumbs_up,thumbs_down,roll_advert,downgrade,is_error,is_redirect
0,200,M,paid,1749042,2018-10-01 00:00:01,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",278,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,524.32934,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,2018-08-08 13:22:21,2018-10-21 01:16:24,0,0,0,0,53.442824,Windows,TX,0,0,0,0,0,0
1,200,M,paid,1749042,2018-10-01 00:08:45,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",279,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,178.02404,Monster (Album Version),Skillet,2018-08-08 13:22:21,2018-10-21 01:16:24,0,0,0,0,53.448889,Windows,TX,0,0,0,0,0,0
2,200,M,paid,1749042,2018-10-01 00:11:43,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",280,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,232.61995,Seven Nation Army,The White Stripes,2018-08-08 13:22:21,2018-10-21 01:16:24,0,0,0,0,53.450949,Windows,TX,0,0,0,0,0,0
3,200,M,paid,1749042,2018-10-01 00:15:35,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",281,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,265.50812,Under The Bridge (Album Version),Red Hot Chili Peppers,2018-08-08 13:22:21,2018-10-21 01:16:24,0,0,0,0,53.453634,Windows,TX,0,0,0,0,0,0
4,200,M,paid,1749042,2018-10-01 00:20:00,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",282,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,471.69261,Circlesong 6,Bobby McFerrin,2018-08-08 13:22:21,2018-10-21 01:16:24,0,0,0,0,53.456701,Windows,TX,0,0,0,0,0,0


## 4. Aggregation: Event-Level to User-Level

We need to aggregate the data so that we have **one row per user**.

In [5]:
# Collapse the event-level frame `df` into `user_df`: one row per userId.

# Define aggregation dictionary (source column -> aggregation or list of aggregations)
agg_dict = {
    # Target
    'churn': 'max',  # 1 if any of the user's rows carries the churn label

    # Static attributes: first non-null value per user (this is NOT a mode)
    'gender': 'first',
    'level': 'last',  # Current level = value on the user's latest row (requires time order)
    'platform': 'first',
    'state': 'first',
    'registration': 'first',
    'ts': 'max',  # Last active timestamp
    # NOTE(review): 'ts' max (and 'account_age_days' max below) are taken over all of a
    # user's events - confirm downstream that they do not leak the churn label.

    # Numerical aggregations
    'length': ['mean', 'sum'],  # Avg/total of `length` (presumably seconds of audio - confirm)
    'itemInSession': 'mean',
    'account_age_days': 'max',

    # Behavioral counts
    'thumbs_up': 'sum',
    'thumbs_down': 'sum',
    'roll_advert': 'sum',
    'downgrade': 'max',      # Has ever downgraded
    'is_error': 'mean',      # Error rate
    'is_redirect': 'mean',   # Redirect rate

    # Seasonality
    'is_weekend': 'mean'     # Ratio of weekend activity
}

# 'first' / 'last' are positional within each group, so they only mean "earliest" /
# "latest" when every user's rows are in chronological order. Nothing in this notebook
# guarantees that explicitly, so verify it and sort only when necessary (this avoids
# copying the full event frame when it is already ordered). The stable sort keeps the
# existing order of rows that share a timestamp.
events = df
if not events.groupby('userId')['ts'].is_monotonic_increasing.all():
    events = events.sort_values(['userId', 'ts'], kind='stable')

# Group by userId
user_df = events.groupby('userId').agg(agg_dict)
del events  # drop the extra reference (and the sorted copy, if one was made)

# Flatten MultiIndex columns: ('length', 'mean') -> 'length_mean'.
# Because one entry of agg_dict is a list, every column comes back as a
# (source, aggregation) tuple; the isinstance guard keeps plain names intact.
user_df.columns = [
    '_'.join(col).strip() if isinstance(col, tuple) else col
    for col in user_df.columns
]

# Turn userId back into a column and rename some columns for clarity.
# Chained (non-inplace) calls keep the cell safe to re-run.
user_df = user_df.reset_index().rename(columns={
    'churn_max': 'churn',
    'gender_first': 'gender',
    'level_last': 'level',
    'platform_first': 'platform',
    'state_first': 'state',
    'registration_first': 'registration',
    'ts_max': 'last_active',
    'account_age_days_max': 'account_age_days',
    'downgrade_max': 'has_downgraded',
    'is_error_mean': 'error_rate',
    'is_redirect_mean': 'redirect_rate',
    'is_weekend_mean': 'weekend_ratio'
})

print(f"User-level dataset shape: {user_df.shape}")
user_df.head()

User-level dataset shape: (19140, 19)


Unnamed: 0,userId,churn,gender,level,platform,state,registration,last_active,length_mean,length_sum,itemInSession_mean,account_age_days,thumbs_up_sum,thumbs_down_sum,roll_advert_sum,has_downgraded,error_rate,redirect_rate,weekend_ratio
0,1000025,1,M,paid,Windows,CT,2018-07-10 09:30:08,2018-10-18 20:33:05,251.080982,417296.59169,122.931172,100.460382,94,13,7,0,0.000499,0.083791,0.068828
1,1000035,0,F,paid,Linux,SC,2018-09-12 19:28:22,2018-11-15 03:53:11,245.153923,310364.8659,68.43509,63.350567,117,15,6,0,0.000643,0.113111,0.392674
2,1000083,1,M,paid,Windows,OH-KY-IN,2018-09-07 18:01:49,2018-10-12 10:04:58,244.723096,122606.27093,54.45302,34.668854,21,2,8,0,0.0,0.075503,0.0
3,1000103,0,F,paid,Linux,OH,2018-09-22 07:27:25,2018-11-08 18:28:40,237.802282,13554.73009,20.866667,47.459201,2,1,3,0,0.0,0.08,0.0
4,1000164,0,F,paid,Windows,AZ,2018-08-12 09:32:01,2018-11-19 13:04:25,246.824861,209060.65753,76.555447,99.1475,38,6,20,0,0.000973,0.071984,0.051556


## 5. Final Cleanup & Save

In [6]:
# Fill NaNs left by the aggregations (e.g. the mean of an all-NaN `length` group),
# but only in numeric columns. A blanket fillna(0) would also write the integer 0 into
# string columns (gender, level, platform, state) and datetime columns (registration,
# last_active) whenever they contain nulls, producing mixed-type columns that cannot be
# written to parquet cleanly. Nulls in those columns are kept as nulls instead.
numeric_cols = user_df.select_dtypes(include='number').columns
user_df = user_df.fillna({col: 0 for col in numeric_cols})

# Save
output_path = '../data/user_features.parquet'
user_df.to_parquet(output_path)
print(f"Saved processed data to {output_path}")

Saved processed data to ../data/user_features.parquet
