In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Read the cleaned, imputed competition data; the first CSV column becomes the index.
raw_csv_path = '/Users/jakobbullinger/Documents/Coding Projects/DSBA/Intro Machine Learning/kaggle_competition/data/cleaned/df_imputed.csv'
df = pd.read_csv(raw_csv_path, index_col=0)


def _show_features(heading, names):
    """Print a heading with the number of names, then one bullet line per name."""
    print(f"\n{heading} ({len(names)}):")
    for name in names:
        print(f"  - {name}")


rule = "=" * 70
print(rule)
print("STARTING FRESH: UNDERSTANDING OUR DATA")
print(rule)

# Shape and full column listing
print(f"\nDataset shape: {df.shape}")
print("\nColumns we have:")
print(list(df.columns))

# Class balance of the target, as absolute counts and as proportions
target = df['Purchase']
print("\nüìä Target Distribution:")
print(target.value_counts())
print(target.value_counts(normalize=True))

print("\nüîç Data Types:")
print(df.dtypes.value_counts())

# Split columns by dtype, leaving out the target and the identifier columns
non_feature_numeric = ('Purchase', 'id')
numeric_cols = [
    name
    for name in df.select_dtypes(include=[np.number]).columns
    if name not in non_feature_numeric
]
categorical_cols = [
    name
    for name in df.select_dtypes(include=['object']).columns
    if name != 'Session_ID'
]

_show_features("üìà Numeric features", numeric_cols)
_show_features("üìã Categorical features", categorical_cols)

print("\n‚úÖ Data loaded successfully")

STARTING FRESH: UNDERSTANDING OUR DATA

Dataset shape: (13735, 53)

Columns we have:
['id', 'Age', 'Gender', 'Reviews_Read', 'Price', 'Discount', 'Category', 'Items_In_Cart', 'Email_Interaction', 'Socioeconomic_Status_Score', 'Engagement_Score', 'AB_Bucket', 'Price_Sine', 'Day', 'Purchase', 'Time_of_Day_afternoon', 'Time_of_Day_evening', 'Time_of_Day_morning', 'Device_Type_desktop', 'Device_Type_mobile', 'Device_Type_tablet', 'Payment_Method_bank', 'Payment_Method_cash', 'Payment_Method_credit', 'Payment_Method_paypal', 'Referral_Source_ads', 'Referral_Source_direct', 'Referral_Source_email', 'Referral_Source_search_engine', 'Referral_Source_social_media', 'PM_RS_Combo_bank:ads', 'PM_RS_Combo_bank:direct', 'PM_RS_Combo_bank:email', 'PM_RS_Combo_bank:search_engine', 'PM_RS_Combo_bank:social_media', 'PM_RS_Combo_cash:ads', 'PM_RS_Combo_cash:direct', 'PM_RS_Combo_cash:email', 'PM_RS_Combo_cash:search_engine', 'PM_RS_Combo_cash:social_media', 'PM_RS_Combo_credit:ads', 'PM_RS_Combo_credit:d

In [None]:
# ============================================================================
# STEP 1: re-check the lifts claimed by the earlier EDA against the loaded data.
# "Lift" = (segment purchase rate / overall purchase rate - 1) * 100.
# ============================================================================

# Email_Interaction contains fractional values besides 0 and 1 (visible in its
# value counts); values above this cut-off are treated as "interacted".
EMAIL_ENGAGED_THRESHOLD = 0.5


def _lift_table(frame, by, baseline):
    """Return count, mean purchase rate and % lift vs ``baseline`` per group.

    Parameters
    ----------
    frame : DataFrame containing a 'Purchase' column.
    by : column name, or a Series aligned with ``frame``, to group on.
    baseline : overall purchase rate the lift is measured against.

    ``observed=False`` is passed explicitly so empty categorical bins stay in
    the table and pandas does not warn about relying on the default.
    """
    table = frame.groupby(by, observed=False)['Purchase'].agg(['count', 'mean'])
    table['lift'] = (table['mean'] / baseline - 1) * 100
    return table


print("\n" + "="*70)
print("STEP 1: VALIDATING EDA FINDINGS")
print("="*70)

# Calculate baseline purchase rate
baseline_rate = df['Purchase'].mean()
print(f"\nüìä Baseline Purchase Rate: {baseline_rate:.3f} ({baseline_rate*100:.1f}%)")

# ============================================================================
# Finding 1: Reviews_Read (+163% lift)
# ============================================================================
print("\n" + "-"*70)
print("Finding 1: Reviews_Read Impact")
print("-"*70)

# Bin Reviews_Read into a standalone Series instead of adding a column to df,
# so the shared frame does not pick up a categorical helper column.
# The last bin is open-ended so very large values are not dropped.
reviews_binned = pd.cut(
    df['Reviews_Read'],
    bins=[-0.1, 0, 1, 5, np.inf],
    labels=['None', 'Low(1)', 'Medium(2-5)', 'High(5+)'],
).rename('Reviews_Read_Binned')

reviews_analysis = _lift_table(df, reviews_binned, baseline_rate)

print(reviews_analysis)
print(f"\nEDA claimed: +163% lift")
print(f"We observe: Max lift = {reviews_analysis['lift'].max():.1f}%")

# ============================================================================
# Finding 2: Email_Interaction (+36% lift)
# ============================================================================
print("\n" + "-"*70)
print("Finding 2: Email_Interaction Impact")
print("-"*70)

# Check if Email_Interaction is encoded or text
email_raw = df['Email_Interaction']
if email_raw.dtype == 'object':
    print("Email_Interaction is categorical (text)")
    print(email_raw.value_counts())
    email_flag = email_raw
else:
    print("Email_Interaction is numeric (already encoded)")
    # Grouping on the raw column would create one group per fractional value,
    # so collapse it to a 0/1 flag first.
    n_fractional = int((~email_raw.isin([0, 1])).sum())
    if n_fractional:
        print(f"  {n_fractional} non-0/1 values binarized at > {EMAIL_ENGAGED_THRESHOLD}")
    email_flag = (email_raw > EMAIL_ENGAGED_THRESHOLD).astype(int)
email_flag = email_flag.rename('Email_Interaction')

email_analysis = _lift_table(df, email_flag, baseline_rate)

print(email_analysis)
print(f"\nEDA claimed: +36% lift")
if len(email_analysis) == 2:
    print(f"We observe: {email_analysis['lift'].iloc[1]:.1f}% lift")

# ============================================================================
# Finding 3: Device_Type (+27% lift)
# ============================================================================
print("\n" + "-"*70)
print("Finding 3: Device_Type Impact")
print("-"*70)

# Check which device type columns exist
device_cols = [col for col in df.columns if 'Device_Type' in col]
print(f"Device columns found: {device_cols}")

if device_cols:
    # Already one-hot encoded: report the purchase rate where each flag is set
    for device_col in device_cols:
        device_rate = df.loc[df[device_col] == 1, 'Purchase'].mean()
        lift = (device_rate / baseline_rate - 1) * 100
        print(f"{device_col}: {device_rate:.3f} ({lift:+.1f}% lift)")
else:
    # Not encoded yet
    device_analysis = _lift_table(df, 'Device_Type', baseline_rate)
    print(device_analysis)

# ============================================================================
# Finding 4: Email √ó Campaign (+78% combined lift)
# ============================================================================
print("\n" + "-"*70)
print("Finding 4: Email √ó Campaign Interaction")
print("-"*70)

# Check campaign columns
campaign_cols = [col for col in df.columns if 'Campaign' in col]
print(f"Campaign columns found: {campaign_cols}")

# Prefer the one-hot column, fall back to the raw one
if 'Campaign_Period_true' in df.columns:
    campaign_indicator = df['Campaign_Period_true']
elif 'Campaign_Period' in df.columns:
    campaign_indicator = df['Campaign_Period']
else:
    print("‚ö†Ô∏è No campaign column found!")
    campaign_indicator = None

if campaign_indicator is not None:
    # Build each mask once; the email masks use the binarized flag so rows with
    # fractional Email_Interaction values are assigned to a segment.
    email_on, email_off = email_flag == 1, email_flag == 0
    campaign_on, campaign_off = campaign_indicator == 1, campaign_indicator == 0
    email_campaign = email_on & campaign_on

    segment_masks = {
        'No Email, No Campaign': email_off & campaign_off,
        'Email Only': email_on & campaign_off,
        'Campaign Only': email_off & campaign_on,
        'Email √ó Campaign': email_campaign,
    }

    interaction_analysis = pd.DataFrame({
        'Segment': list(segment_masks),
        'Count': [mask.sum() for mask in segment_masks.values()],
        'Purchase_Rate': [df.loc[mask, 'Purchase'].mean() for mask in segment_masks.values()],
    })

    interaction_analysis['Lift_%'] = (interaction_analysis['Purchase_Rate'] / baseline_rate - 1) * 100
    print(interaction_analysis.to_string(index=False))
    print(f"\nEDA claimed: +78% combined lift")
    print(f"We observe: {interaction_analysis['Lift_%'].iloc[3]:.1f}% lift")

# ============================================================================
# Finding 5: Category (+14% lift)
# ============================================================================
print("\n" + "-"*70)
print("Finding 5: Category Impact")
print("-"*70)

if 'Category' in df.columns:
    category_key = df['Category']
    # A numeric Category may hold fractional values; round to the nearest code
    # so the table has one row per category rather than one per distinct value.
    if pd.api.types.is_numeric_dtype(category_key):
        category_key = category_key.round()
    category_analysis = _lift_table(df, category_key, baseline_rate)
    print(category_analysis)

# ============================================================================
# Finding 6 & 7: Age and AB_Bucket (NO signal)
# ============================================================================
print("\n" + "-"*70)
print("Finding 6 & 7: Variables with NO Signal")
print("-"*70)

if 'Age' in df.columns:
    age_bins = pd.cut(df['Age'], bins=[0, 30, 40, 50, 100], labels=['<30', '30-40', '40-50', '50+'])
    age_analysis = df.groupby(age_bins, observed=False)['Purchase'].mean()
    print(f"\nAge purchase rates:\n{age_analysis}")
    print(f"Std dev: {age_analysis.std():.4f} (low = flat signal)")

if 'AB_Bucket' in df.columns:
    ab_analysis = df.groupby('AB_Bucket')['Purchase'].mean()
    print(f"\nAB_Bucket purchase rates:\n{ab_analysis}")
    print(f"Std dev: {ab_analysis.std():.4f} (low = flat signal)")

print("\n" + "="*70)
print("VALIDATION COMPLETE")
print("="*70)


STEP 1: VALIDATING EDA FINDINGS

üìä Baseline Purchase Rate: 0.368 (36.8%)

----------------------------------------------------------------------
Finding 1: Reviews_Read Impact
----------------------------------------------------------------------
                     count      mean       lift
Reviews_Read_Binned                            
None                   687  0.177584 -51.758068
Low(1)                2049  0.266471 -27.611049
Medium(2-5)           9902  0.391436   6.336520
High(5+)              1097  0.466727  26.789980

EDA claimed: +163% lift
We observe: Max lift = 26.8%

----------------------------------------------------------------------
Finding 2: Email_Interaction Impact
----------------------------------------------------------------------
Email_Interaction is numeric (already encoded)
                   count      mean        lift
Email_Interaction                             
0.000000            7352  0.315424  -14.312623
0.145578               1  1.000000  171.

  reviews_analysis = df.groupby('Reviews_Read_Binned')['Purchase'].agg(['count', 'mean'])
  age_analysis = df.groupby(age_bins)['Purchase'].mean()


In [None]:
import pandas as pd
import numpy as np

# Reload the imputed data so this cell starts from the file on disk
source_csv = '/Users/jakobbullinger/Documents/Coding Projects/DSBA/Intro Machine Learning/kaggle_competition/data/cleaned/df_imputed.csv'
df = pd.read_csv(source_csv, index_col=0)

rule = "=" * 70
print(rule)
print("QUICK FIX: Convert problematic features to binary")
print(rule)

# --- Email_Interaction -> Email_Engaged -------------------------------------
email_raw = df['Email_Interaction']
print("\nEmail_Interaction unique values:", email_raw.nunique())
print("Sample values:", email_raw.value_counts().head(10))

# Binary flag: 1 when the value exceeds 0.5, otherwise 0
df['Email_Engaged'] = email_raw.gt(0.5).astype(int)
engaged_counts = df['Email_Engaged'].value_counts()
engaged_rates = df.groupby('Email_Engaged')['Purchase'].mean()
print(f"\nEmail_Engaged distribution:\n{engaged_counts}")
print(f"Purchase rate by Email_Engaged:\n{engaged_rates}")

# --- Category -> Category_Clean ---------------------------------------------
category_raw = df['Category']
print("\nCategory unique values:", category_raw.nunique())

# Snap each value to the nearest integer code
df['Category_Clean'] = category_raw.round().astype(int)
category_counts = df['Category_Clean'].value_counts()
category_rates = df.groupby('Category_Clean')['Purchase'].mean()
print(f"Category_Clean distribution:\n{category_counts}")
print(f"Purchase rate by Category:\n{category_rates}")

# The raw columns are replaced by their cleaned versions
df = df.drop(columns=['Email_Interaction', 'Category'])

# Persist the result next to the notebook (relative path)
output_csv = 'df_imputed_fixed.csv'
df.to_csv(output_csv)
print("\n‚úÖ Saved: df_imputed_fixed.csv")

QUICK FIX: Convert problematic features to binary

Email_Interaction unique values: 250
Sample values: Email_Interaction
0.000000    7352
1.000000    6135
0.433172       1
0.664845       1
0.203583       1
0.609830       1
0.367239       1
0.394678       1
0.419264       1
0.228377       1
Name: count, dtype: int64

Email_Engaged distribution:
Email_Engaged
0    7508
1    6227
Name: count, dtype: int64
Purchase rate by Email_Engaged:
Email_Engaged
0    0.317528
1    0.429099
Name: Purchase, dtype: float64

Category unique values: 292
Category_Clean distribution:
Category_Clean
1    2845
4    2772
2    2763
0    2690
3    2665
Name: count, dtype: int64
Purchase rate by Category:
Category_Clean
0    0.426022
1    0.400351
2    0.410785
3    0.308818
4    0.293290
Name: Purchase, dtype: float64

‚úÖ Saved: df_imputed_fixed.csv


In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report

# Feature matrix = every column except the target and identifier columns
# (identifier columns that are absent are simply skipped)
X = df.drop(columns=['Purchase', 'id', 'Session_ID'], errors='ignore')
y = df['Purchase']

print(f"\nFeatures: {X.shape[1]}")
print(f"Samples: {len(X)}")

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Plain XGBoost baseline
baseline_params = dict(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    scale_pos_weight=1.72,
    random_state=42,
)
model = XGBClassifier(**baseline_params)
model.fit(X_train, y_train)

# Probability of the positive class on the validation rows
y_pred_proba = model.predict_proba(X_val)[:, 1]

# Score every candidate cut-off, then keep the first one with the highest F1.
# If no cut-off scores above 0, the defaults (F1 0 at threshold 0.5) are kept.
threshold_grid = np.arange(0.3, 0.7, 0.01)
f1_by_threshold = [
    f1_score(y_val, (y_pred_proba >= cut).astype(int))
    for cut in threshold_grid
]

best_f1, best_thresh = 0, 0.5
top_position = int(np.argmax(f1_by_threshold))
if f1_by_threshold[top_position] > best_f1:
    best_f1 = f1_by_threshold[top_position]
    best_thresh = threshold_grid[top_position]

print(f"\nüìä BASELINE PERFORMANCE")
print(f"Best F1: {best_f1:.4f} at threshold {best_thresh:.2f}")

# Per-class report at the selected threshold
y_pred_final = (y_pred_proba >= best_thresh).astype(int)
print("\n", classification_report(y_val, y_pred_final, digits=3))


Features: 50
Samples: 13735

üìä BASELINE PERFORMANCE
Best F1: 0.8117 at threshold 0.43

               precision    recall  f1-score   support

         0.0      0.945     0.797     0.865      1736
         1.0      0.726     0.921     0.812      1011

    accuracy                          0.843      2747
   macro avg      0.835     0.859     0.838      2747
weighted avg      0.864     0.843     0.845      2747



In [None]:
# Test whether an Email x Campaign interaction flag improves on the baseline
# model. Relies on `df` (with Email_Engaged) and `best_f1` from earlier cells.
CAMPAIGN_FLAG_COL = 'Campaign_Period_true'

if CAMPAIGN_FLAG_COL not in df.columns:
    # Without the campaign flag the interaction cannot be built. Retraining
    # anyway would refit the unchanged feature set and print a verdict about a
    # feature that was never created, so skip the comparison instead.
    print(f"\nSkipping Email x Campaign test: column '{CAMPAIGN_FLAG_COL}' not found")
else:
    # Create interaction feature: engaged with email AND inside a campaign period
    df['Email_Campaign_Interaction'] = (
        (df['Email_Engaged'] == 1) &
        (df[CAMPAIGN_FLAG_COL] == 1)
    ).astype(int)

    print(f"\nEmail_Campaign_Interaction distribution:")
    print(df['Email_Campaign_Interaction'].value_counts())
    print(f"Purchase rate: {df.loc[df['Email_Campaign_Interaction'] == 1, 'Purchase'].mean():.3f}")

    # Retrain with new feature (same split seed and model settings as the baseline)
    X_enhanced = df.drop(['Purchase', 'id', 'Session_ID'], axis=1, errors='ignore')
    y = df['Purchase']

    X_train_enh, X_val_enh, y_train_enh, y_val_enh = train_test_split(
        X_enhanced, y, test_size=0.2, random_state=42, stratify=y
    )

    model_enh = XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        scale_pos_weight=1.72,
        random_state=42
    )

    model_enh.fit(X_train_enh, y_train_enh)

    y_pred_proba_enh = model_enh.predict_proba(X_val_enh)[:, 1]

    # Find best threshold.
    # NOTE(review): the threshold is tuned on the same validation rows the F1
    # is reported on, so small differences between models may not generalise.
    best_f1_enh = 0
    best_thresh_enh = 0.5

    for thresh in np.arange(0.3, 0.7, 0.01):
        y_pred = (y_pred_proba_enh >= thresh).astype(int)
        f1 = f1_score(y_val_enh, y_pred)
        if f1 > best_f1_enh:
            best_f1_enh = f1
            best_thresh_enh = thresh

    print(f"\nüìä WITH EMAIL√óCAMPAIGN FEATURE")
    print(f"Best F1: {best_f1_enh:.4f} (vs baseline {best_f1:.4f})")
    print(f"Improvement: {best_f1_enh - best_f1:+.4f}")

    if best_f1_enh > best_f1:
        print("‚úÖ Feature helped!")
    else:
        print("‚ùå Feature didn't help - stick with baseline")


Email_Campaign_Interaction distribution:
Email_Campaign_Interaction
0    11472
1     2263
Name: count, dtype: int64
Purchase rate: 0.494

üìä WITH EMAIL√óCAMPAIGN FEATURE
Best F1: 0.8175 (vs baseline 0.8117)
Improvement: +0.0058
‚úÖ Feature helped!


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report

# Load the fixed dataset (relative path; same file name the quick-fix cell saves)
df = pd.read_csv('df_imputed_fixed.csv', index_col=0)

print("="*70)
print("TESTING ADDITIONAL VALIDATED FEATURES")
print("="*70)

# Number of columns in the loaded file. This counts every column, including
# the target ('Purchase') and 'id', not just model features.
print(f"Current features: {df.shape[1]}")

# ============================================================================
# Feature 1: Reviews_Read_Any - 1 when Reviews_Read > 0, else 0
# NOTE(review): the ~27% lift seen in the validation step was for the highest
# Reviews_Read bin, not for this any-vs-none flag; the lift printed below is
# the figure that applies to this feature.
# ============================================================================
df['Reviews_Read_Any'] = (df['Reviews_Read'] > 0).astype(int)

print(f"\n1Ô∏è‚É£ Reviews_Read_Any:")
print(f"   Distribution: {df['Reviews_Read_Any'].value_counts().to_dict()}")
read_rate = df[df['Reviews_Read_Any'] == 1]['Purchase'].mean()
# Overall purchase rate; reused as the reference for every lift in this cell
baseline = df['Purchase'].mean()
print(f"   Purchase rate: {read_rate:.3f} (baseline: {baseline:.3f})")
print(f"   Lift: {(read_rate/baseline - 1)*100:+.1f}%")

# ============================================================================
# Feature 2: Is_Tablet_User
# This is a direct copy of the existing Device_Type_tablet column, so it
# carries no information beyond that column.
# ============================================================================
if 'Device_Type_tablet' in df.columns:
    df['Is_Tablet_User'] = df['Device_Type_tablet']
    tablet_rate = df[df['Is_Tablet_User'] == 1]['Purchase'].mean()
    print(f"\n2Ô∏è‚É£ Is_Tablet_User:")
    print(f"   Distribution: {df['Is_Tablet_User'].value_counts().to_dict()}")
    print(f"   Purchase rate: {tablet_rate:.3f}")
    print(f"   Lift: {(tablet_rate/baseline - 1)*100:+.1f}%")

# ============================================================================
# Feature 3: High_Engagement_Shopper (Reviews + Items in Cart)
# ============================================================================
# 1 only when BOTH conditions hold: Reviews_Read > 0 and Items_In_Cart > 0
df['High_Engagement_Shopper'] = (
    (df['Reviews_Read'] > 0) & 
    (df['Items_In_Cart'] > 0)
).astype(int)

print(f"\n3Ô∏è‚É£ High_Engagement_Shopper (Reviews + Cart):")
print(f"   Distribution: {df['High_Engagement_Shopper'].value_counts().to_dict()}")
engaged_rate = df[df['High_Engagement_Shopper'] == 1]['Purchase'].mean()
print(f"   Purchase rate: {engaged_rate:.3f}")
print(f"   Lift: {(engaged_rate/baseline - 1)*100:+.1f}%")

# ===============

TESTING ADDITIONAL VALIDATED FEATURES
Current features: 53

1Ô∏è‚É£ Reviews_Read_Any:
   Distribution: {1: 13048, 0: 687}
   Purchase rate: 0.378 (baseline: 0.368)
   Lift: +2.7%

2Ô∏è‚É£ Is_Tablet_User:
   Distribution: {0.0: 11730, 1.0: 2005}
   Purchase rate: 0.431
   Lift: +17.2%

3Ô∏è‚É£ High_Engagement_Shopper (Reviews + Cart):
   Distribution: {1: 11889, 0: 1846}
   Purchase rate: 0.331
   Lift: -10.1%


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report

import pickle

# Reference F1 scores from earlier runs, used only in the comparison printout
# and decision below (previously repeated as literals).
ORIGINAL_BEST_F1 = 0.8185
BASELINE_F1 = 0.8117
EMAIL_CAMPAIGN_F1 = 0.8175

# Load data. Uses the same relative file name the quick-fix cell writes and
# the feature-testing cell reads, so this cell always sees that output.
df = pd.read_csv('df_imputed_fixed.csv', index_col=0)


print("="*70)
print("TRAINING WITH ONLY GOOD FEATURES")
print("="*70)

# Add only features that showed positive lift
df['Email_Campaign_Interaction'] = (
    (df['Email_Engaged'] == 1) &
    (df['Campaign_Period_true'] == 1)
).astype(int)

# Tablet user is already encoded as Device_Type_tablet, no need to add

print(f"‚úÖ Added 1 engineered feature: Email_Campaign_Interaction")
print(f"‚úÖ Using existing feature: Device_Type_tablet (+17% lift)")

# Prepare data: everything except the target and identifier columns
feature_cols = [col for col in df.columns
                if col not in ['Purchase', 'id', 'Session_ID']]

X = df[feature_cols]
y = df['Purchase']

print(f"\nTotal features: {len(feature_cols)}")

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train with regularization
model = XGBClassifier(
    n_estimators=150,
    max_depth=5,
    learning_rate=0.1,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1.72,
    random_state=42
)

model.fit(X_train, y_train)

# Probability of the positive class on the validation rows
y_pred_proba = model.predict_proba(X_val)[:, 1]

# Find optimal threshold (first cut-off with the highest validation F1)
best_f1 = 0
best_thresh = 0.5

for thresh in np.arange(0.3, 0.7, 0.01):
    y_pred = (y_pred_proba >= thresh).astype(int)
    f1 = f1_score(y_val, y_pred)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = thresh

print(f"\nüìä PERFORMANCE")
print(f"Best F1: {best_f1:.4f} at threshold {best_thresh:.2f}")

y_pred_final = (y_pred_proba >= best_thresh).astype(int)
print("\n", classification_report(y_val, y_pred_final, digits=3))

# Feature importance, most important first. The index is reset after sorting
# so that index label + 1 is the rank; without the reset the labels are the
# original column positions and the reported "rank" would be wrong.
feature_importance = (
    pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_
    })
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)

print("\n" + "="*70)
print("TOP 20 FEATURES")
print("="*70)
print(feature_importance.head(20).to_string(index=False))

# Check our engineered feature
email_campaign_importance = feature_importance[
    feature_importance['feature'] == 'Email_Campaign_Interaction'
]
if not email_campaign_importance.empty:
    rank = int(email_campaign_importance.index[0]) + 1
    print(f"\nüìä Email_Campaign_Interaction:")
    print(f"   Importance: {email_campaign_importance['importance'].values[0]:.6f}")
    print(f"   Rank: #{rank} out of {len(feature_cols)}")

# Comparison
print("\n" + "="*70)
print("FINAL COMPARISON")
print("="*70)
print(f"Your original best XGBoost:     F1 = {ORIGINAL_BEST_F1:.4f}")
print(f"Today's baseline:               F1 = {BASELINE_F1:.4f}")
print(f"With Email√óCampaign feature:    F1 = {EMAIL_CAMPAIGN_F1:.4f}")
print(f"Final model (optimized):        F1 = {best_f1:.4f}")

if best_f1 >= ORIGINAL_BEST_F1:
    print(f"\nüéâ NEW BEST! (+{best_f1 - ORIGINAL_BEST_F1:.4f} improvement)")
    decision = "SUBMIT THIS MODEL"
elif best_f1 >= EMAIL_CAMPAIGN_F1:
    print(f"\n‚úÖ Solid performance, close to best")
    decision = "SUBMIT THIS MODEL OR TRY CLUSTERING"
elif best_f1 > BASELINE_F1:
    # Scores in this band beat today's baseline, so "not better than baseline"
    # would be the wrong message; the recommendation is still to revert.
    print(f"\n‚ö†Ô∏è Better than baseline, but below the earlier models")
    decision = f"REVERT TO ORIGINAL MODEL ({ORIGINAL_BEST_F1:.4f})"
else:
    print(f"\n‚ö†Ô∏è Not better than baseline")
    decision = f"REVERT TO ORIGINAL MODEL ({ORIGINAL_BEST_F1:.4f})"

print(f"\nüí° RECOMMENDATION: {decision}")

# Save the fitted model and the feature frame next to the notebook
with open('xgboost_final.pkl', 'wb') as f:
    pickle.dump(model, f)

df.to_csv('df_final_features.csv')
print("\n‚úÖ Saved: xgboost_final.pkl")
print("‚úÖ Saved: df_final_features.csv")

SyntaxError: invalid syntax (1880423804.py, line 124)