In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Read the cleaned and imputed dataset; the first CSV column is used as the index.
df = pd.read_csv(
    '/Users/jakobbullinger/Documents/Coding Projects/DSBA/Intro Machine Learning/kaggle_competition/data/cleaned/df_imputed.csv',
    index_col=0,
)

banner = "=" * 70
print(banner)
print("STARTING FRESH: UNDERSTANDING OUR DATA")
print(banner)

# Overall shape and the complete column list.
print(f"\nDataset shape: {df.shape}")
print("\nColumns we have:")
print(list(df.columns))

# Class balance of the target: raw counts first, then proportions.
target = df['Purchase']
print("\nüìä Target Distribution:")
for as_fraction in (False, True):
    print(target.value_counts(normalize=as_fraction))

print("\nüîç Data Types:")
print(df.dtypes.value_counts())

# Split columns by dtype, leaving out the target and the identifier columns.
non_feature_numeric = ('Purchase', 'id')
numeric_cols = [
    name
    for name in df.select_dtypes(include=[np.number]).columns.tolist()
    if name not in non_feature_numeric
]
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
if 'Session_ID' in categorical_cols:
    categorical_cols.remove('Session_ID')

# Print each feature group as a heading followed by one bullet per column.
feature_groups = (
    (f"\nüìà Numeric features ({len(numeric_cols)}):", numeric_cols),
    (f"\nüìã Categorical features ({len(categorical_cols)}):", categorical_cols),
)
for heading, names in feature_groups:
    print(heading)
    for name in names:
        print(f"  - {name}")

print("\n‚úÖ Data loaded successfully")

STARTING FRESH: UNDERSTANDING OUR DATA

Dataset shape: (13735, 53)

Columns we have:
['id', 'Age', 'Gender', 'Reviews_Read', 'Price', 'Discount', 'Category', 'Items_In_Cart', 'Email_Interaction', 'Socioeconomic_Status_Score', 'Engagement_Score', 'AB_Bucket', 'Price_Sine', 'Day', 'Purchase', 'Time_of_Day_afternoon', 'Time_of_Day_evening', 'Time_of_Day_morning', 'Device_Type_desktop', 'Device_Type_mobile', 'Device_Type_tablet', 'Payment_Method_bank', 'Payment_Method_cash', 'Payment_Method_credit', 'Payment_Method_paypal', 'Referral_Source_ads', 'Referral_Source_direct', 'Referral_Source_email', 'Referral_Source_search_engine', 'Referral_Source_social_media', 'PM_RS_Combo_bank:ads', 'PM_RS_Combo_bank:direct', 'PM_RS_Combo_bank:email', 'PM_RS_Combo_bank:search_engine', 'PM_RS_Combo_bank:social_media', 'PM_RS_Combo_cash:ads', 'PM_RS_Combo_cash:direct', 'PM_RS_Combo_cash:email', 'PM_RS_Combo_cash:search_engine', 'PM_RS_Combo_cash:social_media', 'PM_RS_Combo_credit:ads', 'PM_RS_Combo_credit:d

In [3]:
import pandas as pd
import numpy as np

# Reload the imputed dataset (first CSV column is the index).
df = pd.read_csv(
    '/Users/jakobbullinger/Documents/Coding Projects/DSBA/Intro Machine Learning/kaggle_competition/data/cleaned/df_imputed.csv',
    index_col=0,
)

rule = "=" * 70
print(rule)
print("QUICK FIX: Convert problematic features to binary")
print(rule)

# --- Email_Interaction -> Email_Engaged -------------------------------------
# Show how many distinct values the raw column has and which are most common.
email_raw = df['Email_Interaction']
print("\nEmail_Interaction unique values:", email_raw.nunique())
print("Sample values:", email_raw.value_counts().head(10))

# Binarise with a 0.5 cut-off: values above 0.5 become 1, everything else 0.
df['Email_Engaged'] = email_raw.gt(0.5).astype(int)
email_engaged_counts = df['Email_Engaged'].value_counts()
email_engaged_rate = df.groupby('Email_Engaged')['Purchase'].mean()
print(f"\nEmail_Engaged distribution:\n{email_engaged_counts}")
print(f"Purchase rate by Email_Engaged:\n{email_engaged_rate}")

# --- Category -> Category_Clean ---------------------------------------------
# Round to the nearest whole number and store as int.
category_raw = df['Category']
print("\nCategory unique values:", category_raw.nunique())
df['Category_Clean'] = category_raw.round().astype(int)
category_counts = df['Category_Clean'].value_counts()
category_rate = df.groupby('Category_Clean')['Purchase'].mean()
print(f"Category_Clean distribution:\n{category_counts}")
print(f"Purchase rate by Category:\n{category_rate}")

# The raw columns are replaced by their cleaned counterparts.
df = df.drop(columns=['Email_Interaction', 'Category'])

# Persist the cleaned frame to the current working directory.
df.to_csv('df_imputed_fixed.csv')
print("\n‚úÖ Saved: df_imputed_fixed.csv")

QUICK FIX: Convert problematic features to binary

Email_Interaction unique values: 250
Sample values: Email_Interaction
0.000000    7352
1.000000    6135
0.433172       1
0.664845       1
0.203583       1
0.609830       1
0.367239       1
0.394678       1
0.419264       1
0.228377       1
Name: count, dtype: int64

Email_Engaged distribution:
Email_Engaged
0    7508
1    6227
Name: count, dtype: int64
Purchase rate by Email_Engaged:
Email_Engaged
0    0.317528
1    0.429099
Name: Purchase, dtype: float64

Category unique values: 292
Category_Clean distribution:
Category_Clean
1    2845
4    2772
2    2763
0    2690
3    2665
Name: count, dtype: int64
Purchase rate by Category:
Category_Clean
0    0.426022
1    0.400351
2    0.410785
3    0.308818
4    0.293290
Name: Purchase, dtype: float64

‚úÖ Saved: df_imputed_fixed.csv


In [4]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report

# Feature matrix: every column except the target and identifier columns
# (errors='ignore' tolerates identifier columns that are not present).
X = df.drop(columns=['Purchase', 'id', 'Session_ID'], errors='ignore')
y = df['Purchase']

n_samples, n_features = len(X), X.shape[1]
print(f"\nFeatures: {n_features}")
print(f"Samples: {n_samples}")

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Baseline XGBoost classifier.
baseline_params = dict(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    scale_pos_weight=1.72,
    random_state=42,
)
model = XGBClassifier(**baseline_params)
model.fit(X_train, y_train)

# Second column of predict_proba, used below as the score to threshold.
y_pred_proba = model.predict_proba(X_val)[:, 1]

# Sweep decision thresholds and keep the first one with the highest F1.
# NOTE(review): the threshold is tuned on the same validation split that the
# reported F1 is computed on, so the figure is likely optimistic.
best_f1, best_thresh = 0, 0.5
for candidate in np.arange(0.3, 0.7, 0.01):
    candidate_f1 = f1_score(y_val, (y_pred_proba >= candidate).astype(int))
    if candidate_f1 > best_f1:
        best_f1, best_thresh = candidate_f1, candidate

print("\nüìä BASELINE PERFORMANCE")
print(f"Best F1: {best_f1:.4f} at threshold {best_thresh:.2f}")

# Full per-class report at the selected threshold.
y_pred_final = (y_pred_proba >= best_thresh).astype(int)
print("\n", classification_report(y_val, y_pred_final, digits=3))


Features: 50
Samples: 13735

üìä BASELINE PERFORMANCE
Best F1: 0.8117 at threshold 0.43

               precision    recall  f1-score   support

         0.0      0.945     0.797     0.865      1736
         1.0      0.726     0.921     0.812      1011

    accuracy                          0.843      2747
   macro avg      0.835     0.859     0.838      2747
weighted avg      0.864     0.843     0.845      2747



In [5]:
# Interaction flag: 1 when the row is both email-engaged and in the campaign
# period. Skipped entirely when the campaign column is absent.
if 'Campaign_Period_true' in df.columns:
    engaged_during_campaign = df['Email_Engaged'].eq(1) & df['Campaign_Period_true'].eq(1)
    df['Email_Campaign_Interaction'] = engaged_during_campaign.astype(int)

    print("\nEmail_Campaign_Interaction distribution:")
    print(df['Email_Campaign_Interaction'].value_counts())
    flagged_rate = df.loc[df['Email_Campaign_Interaction'] == 1, 'Purchase'].mean()
    print(f"Purchase rate: {flagged_rate:.3f}")

# Rebuild the feature matrix (now including the new column, if it was added).
X_enhanced = df.drop(columns=['Purchase', 'id', 'Session_ID'], errors='ignore')
y = df['Purchase']

# Same split settings and seed as the baseline cell.
X_train_enh, X_val_enh, y_train_enh, y_val_enh = train_test_split(
    X_enhanced,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Same hyper-parameters as the baseline so only the feature set differs.
enhanced_params = dict(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    scale_pos_weight=1.72,
    random_state=42,
)
model_enh = XGBClassifier(**enhanced_params)
model_enh.fit(X_train_enh, y_train_enh)

y_pred_proba_enh = model_enh.predict_proba(X_val_enh)[:, 1]

# Threshold sweep: keep the first threshold that reaches the highest F1.
best_f1_enh, best_thresh_enh = 0, 0.5
for candidate in np.arange(0.3, 0.7, 0.01):
    candidate_f1 = f1_score(y_val_enh, (y_pred_proba_enh >= candidate).astype(int))
    if candidate_f1 > best_f1_enh:
        best_f1_enh, best_thresh_enh = candidate_f1, candidate

# Compare against `best_f1`, which must already be bound by the baseline cell.
print("\nüìä WITH EMAIL√óCAMPAIGN FEATURE")
print(f"Best F1: {best_f1_enh:.4f} (vs baseline {best_f1:.4f})")
print(f"Improvement: {best_f1_enh - best_f1:+.4f}")

print(
    "‚úÖ Feature helped!"
    if best_f1_enh > best_f1
    else "‚ùå Feature didn't help - stick with baseline"
)


Email_Campaign_Interaction distribution:
Email_Campaign_Interaction
0    11472
1     2263
Name: count, dtype: int64
Purchase rate: 0.494

üìä WITH EMAIL√óCAMPAIGN FEATURE
Best F1: 0.8175 (vs baseline 0.8117)
Improvement: +0.0058
‚úÖ Feature helped!


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report

# Load the cleaned dataset written by the earlier "quick fix" cell.
df = pd.read_csv('df_imputed_fixed.csv', index_col=0)

rule = "=" * 70
print(rule)
print("TESTING ADDITIONAL VALIDATED FEATURES")
print(rule)

# Column count of the frame as loaded (this includes non-feature columns).
print(f"Current features: {df.shape[1]}")

# ============================================================================
# Feature 1: Reviews_Read_Any - 1 when at least one review was read
# NOTE(review): this header previously claimed "+27% lift validated", but the
# recorded output of this cell reports +2.7% - confirm which figure is right.
# ============================================================================
df['Reviews_Read_Any'] = df['Reviews_Read'].gt(0).astype(int)

# Overall purchase rate, used as the reference for every lift printed below.
baseline = df['Purchase'].mean()
read_rate = df.loc[df['Reviews_Read_Any'] == 1, 'Purchase'].mean()

print("\n1Ô∏è‚É£ Reviews_Read_Any:")
print(f"   Distribution: {df['Reviews_Read_Any'].value_counts().to_dict()}")
print(f"   Purchase rate: {read_rate:.3f} (baseline: {baseline:.3f})")
print(f"   Lift: {(read_rate/baseline - 1)*100:+.1f}%")

# ============================================================================
# Feature 2: Is_Tablet_User - a plain copy of the Device_Type_tablet column
# ============================================================================
if 'Device_Type_tablet' in df.columns:
    df['Is_Tablet_User'] = df['Device_Type_tablet']
    tablet_rate = df.loc[df['Is_Tablet_User'] == 1, 'Purchase'].mean()
    print("\n2Ô∏è‚É£ Is_Tablet_User:")
    print(f"   Distribution: {df['Is_Tablet_User'].value_counts().to_dict()}")
    print(f"   Purchase rate: {tablet_rate:.3f}")
    print(f"   Lift: {(tablet_rate/baseline - 1)*100:+.1f}%")

# ============================================================================
# Feature 3: High_Engagement_Shopper - read reviews AND has items in the cart
# ============================================================================
reads_reviews = df['Reviews_Read'].gt(0)
has_cart_items = df['Items_In_Cart'].gt(0)
df['High_Engagement_Shopper'] = (reads_reviews & has_cart_items).astype(int)

engaged_rate = df.loc[df['High_Engagement_Shopper'] == 1, 'Purchase'].mean()
print("\n3Ô∏è‚É£ High_Engagement_Shopper (Reviews + Cart):")
print(f"   Distribution: {df['High_Engagement_Shopper'].value_counts().to_dict()}")
print(f"   Purchase rate: {engaged_rate:.3f}")
print(f"   Lift: {(engaged_rate/baseline - 1)*100:+.1f}%")

# ===============

TESTING ADDITIONAL VALIDATED FEATURES
Current features: 53

1Ô∏è‚É£ Reviews_Read_Any:
   Distribution: {1: 13048, 0: 687}
   Purchase rate: 0.378 (baseline: 0.368)
   Lift: +2.7%

2Ô∏è‚É£ Is_Tablet_User:
   Distribution: {0.0: 11730, 1.0: 2005}
   Purchase rate: 0.431
   Lift: +17.2%

3Ô∏è‚É£ High_Engagement_Shopper (Reviews + Cart):
   Distribution: {1: 11889, 0: 1846}
   Purchase rate: 0.331
   Lift: -10.1%


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report

# Load the cleaned dataset (absolute path to the file saved by the fix-up cell).
df = pd.read_csv(
    '/Users/jakobbullinger/Documents/Coding Projects/DSBA/Intro Machine Learning/kaggle_competition/notebooks/Jakob/df_imputed_fixed.csv',
    index_col=0,
)

rule = "=" * 70
print(rule)
print("TRAINING WITH ONLY GOOD FEATURES")
print(rule)

# Only engineered feature kept: email-engaged AND inside the campaign period.
# Unlike the exploratory cell, this assumes Campaign_Period_true exists.
engaged_during_campaign = df['Email_Engaged'].eq(1) & df['Campaign_Period_true'].eq(1)
df['Email_Campaign_Interaction'] = engaged_during_campaign.astype(int)

# Tablet usage is already available as Device_Type_tablet, so nothing is added for it.

print("‚úÖ Added 1 engineered feature: Email_Campaign_Interaction")
print("‚úÖ Using existing feature: Device_Type_tablet (+17% lift)")

# Every column except the target and identifiers is a model input.
excluded_columns = ('Purchase', 'id', 'Session_ID')
feature_cols = [name for name in df.columns if name not in excluded_columns]

X = df[feature_cols]
y = df['Purchase']

print(f"\nTotal features: {len(feature_cols)}")

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# More trees than the baseline, plus row/column subsampling and a minimum child weight.
final_params = dict(
    n_estimators=150,
    max_depth=5,
    learning_rate=0.1,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1.72,
    random_state=42,
)
model = XGBClassifier(**final_params)
model.fit(X_train, y_train)

# Second column of predict_proba, used below as the score to threshold.
y_pred_proba = model.predict_proba(X_val)[:, 1]

# Threshold sweep: keep the first threshold that reaches the highest F1.
best_f1, best_thresh = 0, 0.5
for candidate in np.arange(0.3, 0.7, 0.01):
    candidate_f1 = f1_score(y_val, (y_pred_proba >= candidate).astype(int))
    if candidate_f1 > best_f1:
        best_f1, best_thresh = candidate_f1, candidate

print("\nüìä PERFORMANCE")
print(f"Best F1: {best_f1:.4f} at threshold {best_thresh:.2f}")

# Full per-class report at the selected threshold.
y_pred_final = (y_pred_proba >= best_thresh).astype(int)
print("\n", classification_report(y_val, y_pred_final, digits=3))

# Feature importance: pair each input column with the importance score reported
# by the fitted model and sort from most to least important.
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\n" + "="*70)
print("TOP 20 FEATURES")
print("="*70)
print(feature_importance.head(20).to_string(index=False))

# Check our engineered feature
email_campaign_importance = feature_importance[
    feature_importance['feature'] == 'Email_Campaign_Interaction'
]
if not email_campaign_importance.empty:
    # sort_values keeps the original row labels (the feature's position in
    # feature_cols), so the index label is NOT the rank. The rank is the row's
    # position in the sorted table.
    ranked_names = feature_importance['feature'].tolist()
    rank = ranked_names.index('Email_Campaign_Interaction') + 1
    print(f"\nüìä Email_Campaign_Interaction:")
    print(f"   Importance: {email_campaign_importance['importance'].values[0]:.6f}")
    print(f"   Rank: #{rank} out of {len(feature_cols)}")

# Side-by-side summary. The first three scores are hard-coded from earlier
# runs; only the last line reflects the model trained in this cell.
print("\n" + "=" * 70)
print("FINAL COMPARISON")
print("=" * 70)
summary_lines = (
    "Your original best XGBoost:     F1 = 0.8185",
    "Today's baseline:               F1 = 0.8117",
    "With Email√óCampaign feature:    F1 = 0.8175",
    f"Final model (optimized):        F1 = {best_f1:.4f}",
)
for line in summary_lines:
    print(line)

# Map the score onto a verdict, checking the stricter bar first.
# NOTE(review): the last branch says "baseline" but the cut-off is 0.8175
# (the Email×Campaign score), not the 0.8117 baseline - confirm the intent.
if best_f1 >= 0.8185:
    verdict = f"\nüéâ NEW BEST! (+{best_f1 - 0.8185:.4f} improvement)"
    decision = "SUBMIT THIS MODEL"
elif best_f1 >= 0.8175:
    verdict = "\n‚úÖ Solid performance, close to best"
    decision = "SUBMIT THIS MODEL OR TRY CLUSTERING"
else:
    verdict = "\n‚ö†Ô∏è Not better than baseline"
    decision = "REVERT TO ORIGINAL MODEL (0.8185)"
print(verdict)

print(f"\nüí° RECOMMENDATION: {decision}")

# Persist the fitted model (pickle) and the feature-augmented frame (CSV) to
# the current working directory.
import pickle

with open('xgboost_final.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

df.to_csv('df_final_features.csv')

for saved_message in (
    "\n‚úÖ Saved: xgboost_final.pkl",
    "‚úÖ Saved: df_final_features.csv",
):
    print(saved_message)

TRAINING WITH ONLY GOOD FEATURES
‚úÖ Added 1 engineered feature: Email_Campaign_Interaction
‚úÖ Using existing feature: Device_Type_tablet (+17% lift)

Total features: 51

üìä PERFORMANCE
Best F1: 0.8178 at threshold 0.49

               precision    recall  f1-score   support

         0.0      0.930     0.831     0.878      1736
         1.0      0.755     0.892     0.818      1011

    accuracy                          0.854      2747
   macro avg      0.842     0.862     0.848      2747
weighted avg      0.865     0.854     0.856      2747


TOP 20 FEATURES
                       feature  importance
                 Items_In_Cart    0.195499
                 Email_Engaged    0.047328
          Campaign_Period_true    0.039422
            Device_Type_mobile    0.035994
    Email_Campaign_Interaction    0.035575
                  Reviews_Read    0.034864
                Category_Clean    0.033629
         Campaign_Period_false    0.033613
                         Price    0.033124
 

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report

rule = "=" * 70
print(rule)
print("ADDING CLUSTERING (PAOLA'S APPROACH) TO YOUR MODEL")
print(rule)

# Load the feature-augmented frame saved by the previous cell.
df = pd.read_csv('df_final_features.csv', index_col=0)

# Feature matrix: everything except the target and identifier columns.
X = df.drop(columns=['Purchase', 'id', 'Session_ID'], errors='ignore')
y = df['Purchase']

print(f"Starting features: {X.shape[1]}")

# Split BEFORE clustering so the scaler and K-Means are fitted on training rows only.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Train: {len(X_train)}, Val: {len(X_val)}")

# ============================================================================
# STEP 1: Create Clusters (K-Means on Training Data)
# ============================================================================
dash_rule = "-" * 70
print("\n" + dash_rule)
print("STEP 1: Training K-Means Clustering")
print(dash_rule)

# Standardise features; the scaler is fitted on the training split and then
# applied unchanged to the validation split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# K-Means with Paola's choice of four clusters.
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)

# Training labels come from the fit itself; validation rows are assigned by predict.
train_clusters = kmeans.fit(X_train_scaled).labels_
val_clusters = kmeans.predict(X_val_scaled)

print(f"‚úÖ Created {n_clusters} clusters")
print("\nTraining cluster distribution:")
print(pd.Series(train_clusters).value_counts().sort_index())

# Per-cluster size, purchase rate, and lift relative to the training-set rate.
print("\nCluster purchase rates (training):")
cluster_analysis = pd.DataFrame({
    'Cluster': train_clusters,
    'Purchase': y_train,
})
cluster_rates = cluster_analysis.groupby('Cluster')['Purchase'].agg(['count', 'mean'])
cluster_rates['lift_%'] = (cluster_rates['mean'] / y_train.mean() - 1) * 100
print(cluster_rates)

# ============================================================================
# STEP 2: Add Cluster as Features
# ============================================================================
print("\n" + "-" * 70)
print("STEP 2: Adding Cluster Features")
print("-" * 70)

# Attach the cluster label to a copy of each split (the originals stay untouched).
X_train_with_cluster = X_train.assign(Cluster=train_clusters)
X_val_with_cluster = X_val.assign(Cluster=val_clusters)

# One-hot encode the label into Cluster_<k> indicator columns.
X_train_final = pd.get_dummies(X_train_with_cluster, columns=['Cluster'], prefix='Cluster')
X_val_final = pd.get_dummies(X_val_with_cluster, columns=['Cluster'], prefix='Cluster')

# Give validation exactly the training columns, in the same order; a cluster
# indicator missing from validation is created and filled with 0.
X_val_final = X_val_final.reindex(columns=X_train_final.columns, fill_value=0)

print(f"‚úÖ Features after adding clusters: {X_train_final.shape[1]}")
print(f"   Added {n_clusters} cluster features")

# ============================================================================
# STEP 3: Train XGBoost with Clusters
# ============================================================================
print("\n" + "-" * 70)
print("STEP 3: Training XGBoost with Cluster Features")
print("-" * 70)

# Same hyper-parameters as the no-cluster model, so only the feature set differs.
cluster_model_params = dict(
    n_estimators=150,
    max_depth=5,
    learning_rate=0.1,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1.72,
    random_state=42,
)
model_with_clusters = XGBClassifier(**cluster_model_params)
model_with_clusters.fit(X_train_final, y_train)

# Second column of predict_proba, used below as the score to threshold.
y_pred_proba = model_with_clusters.predict_proba(X_val_final)[:, 1]

# Threshold sweep: keep the first threshold that reaches the highest F1.
best_f1, best_thresh = 0, 0.5
for candidate in np.arange(0.3, 0.7, 0.01):
    candidate_f1 = f1_score(y_val, (y_pred_proba >= candidate).astype(int))
    if candidate_f1 > best_f1:
        best_f1, best_thresh = candidate_f1, candidate

print("\nüìä PERFORMANCE WITH CLUSTERS")
print(f"Best F1: {best_f1:.4f} at threshold {best_thresh:.2f}")

# Full per-class report at the selected threshold.
y_pred_final = (y_pred_proba >= best_thresh).astype(int)
print("\n", classification_report(y_val, y_pred_final, digits=3))

# ============================================================================
# STEP 4: Feature Importance
# ============================================================================
print("\n" + "-" * 70)
print("STEP 4: Feature Importance Analysis")
print("-" * 70)

# Pair each training column with its importance score, most important first.
importance_columns = {
    'feature': X_train_final.columns,
    'importance': model_with_clusters.feature_importances_,
}
feature_importance = pd.DataFrame(importance_columns).sort_values(
    'importance', ascending=False
)

print("\nTop 20 Features:")
print(feature_importance.head(20).to_string(index=False))

# Rows whose feature name contains the one-hot prefix 'Cluster_'.
cluster_features = [name for name in feature_importance['feature'] if 'Cluster_' in name]
is_cluster_row = feature_importance['feature'].isin(cluster_features)
cluster_importance_df = feature_importance[is_cluster_row]

if cluster_importance_df.empty:
    print("\n‚ö†Ô∏è No cluster features in importance (might be too weak)")
else:
    print("\nüìä Cluster Feature Importance:")
    print(cluster_importance_df.to_string(index=False))
    total_cluster_importance = cluster_importance_df['importance'].sum()
    print(f"   Total cluster importance: {total_cluster_importance:.6f}")

# ============================================================================
# FINAL COMPARISON
# ============================================================================
# Reference scores copied by hand from earlier runs; named once here so the
# table, the improvement figure and the decision logic cannot drift apart.
ORIGINAL_BEST_F1 = 0.8185   # labelled 'Original best XGBoost' below
NO_CLUSTER_F1 = 0.8178      # labelled 'Without clusters' below

print("\n" + "="*70)
print("FINAL DECISION")
print("="*70)

results = {
    'Original best XGBoost': ORIGINAL_BEST_F1,
    'Without clusters': NO_CLUSTER_F1,
    'WITH clusters': best_f1
}

for name, score in results.items():
    print(f"{name:.<35} F1 = {score:.4f}")

improvement = best_f1 - NO_CLUSTER_F1
print(f"\nCluster improvement: {improvement:+.4f}")

# Decide which model to carry forward. Both best_model_to_use and
# best_data_to_use are assigned on every path so that later code never sees an
# undefined name or a stale value left over from a previous run of this cell.
if best_f1 > ORIGINAL_BEST_F1:
    print("\nüéâ NEW BEST MODEL! Clusters helped!")
    print("‚úÖ SUBMIT THIS MODEL")
    best_model_to_use = model_with_clusters
    best_data_to_use = (X_train_final, X_val_final)
elif best_f1 >= NO_CLUSTER_F1:
    print("\n‚úÖ Clusters helped slightly or maintained performance")
    print("üí° SUBMIT MODEL WITH CLUSTERS (more interpretable)")
    best_model_to_use = model_with_clusters
    best_data_to_use = (X_train_final, X_val_final)
else:
    print("\n‚ö†Ô∏è Clusters didn't help")
    print("üí° SUBMIT ORIGINAL 0.8185 MODEL")
    best_model_to_use = None
    best_data_to_use = None

# Save the classifier together with the scaler and K-Means model needed to
# rebuild its cluster features. A model is selected exactly when
# best_f1 >= NO_CLUSTER_F1, i.e. the same condition the save used before.
if best_model_to_use is not None:
    import pickle
    with open('xgboost_with_clusters.pkl', 'wb') as f:
        pickle.dump(model_with_clusters, f)
    with open('kmeans_model.pkl', 'wb') as f:
        pickle.dump(kmeans, f)
    with open('scaler_for_kmeans.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    print("\n‚úÖ Saved: xgboost_with_clusters.pkl")
    print("‚úÖ Saved: kmeans_model.pkl")
    print("‚úÖ Saved: scaler_for_kmeans.pkl")

ADDING CLUSTERING (PAOLA'S APPROACH) TO YOUR MODEL
Starting features: 51
Train: 10988, Val: 2747

----------------------------------------------------------------------
STEP 1: Training K-Means Clustering
----------------------------------------------------------------------
‚úÖ Created 4 clusters

Training cluster distribution:
0    4014
1    2617
2    2138
3    2219
Name: count, dtype: int64

Cluster purchase rates (training):
         count      mean    lift_%
Cluster                           
0         4014  0.375436  1.984932
1         2617  0.369507  0.374380
2         2138  0.357343 -2.929832
3         2219  0.363677 -1.209233

----------------------------------------------------------------------
STEP 2: Adding Cluster Features
----------------------------------------------------------------------
‚úÖ Features after adding clusters: 55
   Added 4 cluster features

----------------------------------------------------------------------
STEP 3: Training XGBoost with Cluster Featu