In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import balanced_accuracy_score


def process_data_smart(df):
    features = pd.crosstab(df['userId'], df['page'])
    features.columns.name = None
    features.reset_index(inplace=True)
    
    songs = features.get('NextSong', 0).replace(0, 1)
    ads = features.get('Roll Advert', 0)
    features['ad_pressure'] = ads / songs
    
    features['net_satisfaction'] = features.get('Thumbs Up', 0) - features.get('Thumbs Down', 0)

    features['account_volatility'] = (
        features.get('Downgrade', 0) + 
        features.get('Submit Upgrade', 0) + 
        features.get('Settings', 0)
    )

    features['total_actions'] = features.drop(columns=['userId'], errors='ignore').sum(axis=1).replace(0, 1)
    features['error_rate'] = features.get('Error', 0) / features['total_actions']
    
    meta = df.sort_values('ts').groupby('userId')[['level', 'gender']].last()
    features['is_paid'] = (meta['level'] == 'paid').astype(int).values
    features['is_male'] = (meta['gender'] == 'M').astype(int).values

    return features



df_train = pd.read_parquet('train.parquet')
df_test = pd.read_parquet('test.parquet')

train_features = process_data_smart(df_train)
test_features = process_data_smart(df_test)


churn_users = df_train[df_train['page'] == 'Cancellation Confirmation']['userId'].unique()
train_features['target'] = train_features['userId'].isin(churn_users).astype(int)


cols_to_drop = ['userId', 'target', 'Cancellation Confirmation', 'Cancel']
feature_cols = [c for c in train_features.columns if c not in cols_to_drop]

for col in feature_cols:
    if col not in test_features.columns: test_features[col] = 0

X = train_features[feature_cols].fillna(0)
y = train_features['target']
X_test_final = test_features[feature_cols].fillna(0)


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test_final)




base_lr = LogisticRegression(
    C=0.1, 
    class_weight='balanced', 
    solver='liblinear', 
    max_iter=2000
)


model = BaggingClassifier(
    estimator=base_lr,
    n_estimators=15,    
    max_samples=0.8,    
    bootstrap=True,
    random_state=42,
    n_jobs=1            
)


y_scores = cross_val_predict(model, X_scaled, y, cv=5, method='predict_proba', n_jobs=1)[:, 1]

best_thresh = 0.5
best_score = 0
thresholds = np.arange(0.35, 0.65, 0.005) 

for t in thresholds:
    score = balanced_accuracy_score(y, (y_scores >= t).astype(int))
    if score > best_score:
        best_score = score
        best_thresh = t

print('best thresh: {}'.format(best_thresh))
print('score: {} '.format(best_score))

model.fit(X_scaled, y)
test_probs = model.predict_proba(X_test_scaled)[:, 1]
final_preds = (test_probs >= best_thresh).astype(int)

submission = pd.DataFrame({
    'id': test_features['userId'],
    'target': final_preds
})

filename = 'submission_final.csv'
submission.to_csv(filename, index=False)
print('total nb of chunrers predicted by the model: {}'.format(sum(final_preds)))

best thresh: 0.5100000000000001
score: 0.6558314186303772 
total nb of chunrers predicted by the model: 1090
