In [1]:
# ==========================================
# 0) Toggles and Setup
# ==========================================
RUN_CLASSIFICATION = True
RUN_UPLIFT = True

RANDOM_STATE = 42

import pandas as pd

CSV_URL = "https://raw.githubusercontent.com/SurajChouhan14/eda-to-action-conversion-ml/main/digital_marketing_campaign_dataset.csv"

try:
    df = pd.read_csv(CSV_URL)
    print("Loaded from GitHub raw:", CSV_URL, "| Rows:", len(df))
except Exception as e:
    print("Failed to load from GitHub raw. Error:", e)
    print("Fallback: upload the CSV or mount Drive as described below.")

import numpy as np
import pandas as pd
np.random.seed(RANDOM_STATE)

# Plots
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn utilities
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    average_precision_score, precision_recall_curve, roc_curve, auc
)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

Loaded from GitHub raw: https://raw.githubusercontent.com/SurajChouhan14/eda-to-action-conversion-ml/main/digital_marketing_campaign_dataset.csv | Rows: 8000


In [2]:
# Models
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

In [3]:


# Drop single-value columns (no signal)
for c in ['AdvertisingPlatform', 'AdvertisingTool']:
    if c in df.columns:
        df = df.drop(columns=c)

# Keep ID separate (not used as feature)
id_col = 'CustomerID' if 'CustomerID' in df.columns else None

# Basic checks
print("Shape:", df.shape)
print("Nulls:\n", df.isnull().sum())


Shape: (8000, 18)
Nulls:
 CustomerID           0
Age                  0
Gender               0
Income               0
CampaignChannel      0
CampaignType         0
AdSpend              0
ClickThroughRate     0
ConversionRate       0
WebsiteVisits        0
PagesPerVisit        0
TimeOnSite           0
SocialShares         0
EmailOpens           0
EmailClicks          0
PreviousPurchases    0
LoyaltyPoints        0
Conversion           0
dtype: int64


In [4]:
#  Quick KPIs and interactive EDA (Plotly)
print("Conversion rate (1s):", df['Conversion'].mean())
print("CTR (mean of ClickThroughRate):", df['ClickThroughRate'].mean())
print("CPA (total AdSpend / #conversions):", df['AdSpend'].sum() / max(1, df['Conversion'].sum()))

# Conversion rate by CampaignChannel (observed 0/1)
by_chan = df.groupby('CampaignChannel')['Conversion'].mean().reset_index()
fig = px.bar(by_chan, x='CampaignChannel', y='Conversion',
             title='Observed Conversion by Campaign Channel', color='CampaignChannel', text_auto='.2f')
fig.update_layout(showlegend=False, xaxis_title='Campaign Channel', yaxis_title='Conversion Rate')
fig.show()

# Age and Income distributions
fig = px.histogram(df, x='Age', nbins=20, marginal='box', title='Age Distribution')
fig.show()
fig = px.histogram(df, x='Income', nbins=20, marginal='box', title='Income Distribution')
fig.show()

# Ad Spend vs Conversion Rate (column in dataset)
fig = px.scatter(df, x='AdSpend', y='ConversionRate', title='Ad Spend vs Conversion Rate',
                 trendline='ols', opacity=0.6)
fig.show()

# Cost breakdown by channel
fig = px.bar(df, x='CampaignChannel', y='AdSpend', title='Ad Spend by Campaign Channel',
             color='CampaignChannel', text_auto='.2s')
fig.update_layout(showlegend=False)
fig.show()


Conversion rate (1s): 0.8765
CTR (mean of ClickThroughRate): 0.15482864915095626
CPA (total AdSpend / #conversions): 5705.584518454024


In [5]:
# Classification
if RUN_CLASSIFICATION:
    print("\n=== Classification Section ===")

    target_col = 'Conversion'
    assert target_col in df.columns, "Target column 'Conversion' not found."

    # Prepare X, y
    y = df[target_col].astype(int)
    drop_cols = [target_col] + ([id_col] if id_col in df.columns else [])
    X = df.drop(columns=[c for c in drop_cols if c in df.columns], errors='ignore')

    # Identify types
    numeric_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object','category']).columns.tolist()

    # Stratified split to preserve class ratio
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
    )

    # Preprocess: scale numerics, OHE categoricals
    preprocess = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ],
        remainder='drop'
    )

    # Model: XGBoost as a strong tabular baseline
    xgb_model = XGBClassifier(
        random_state=RANDOM_STATE,
        n_estimators=300,
        max_depth=4,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        n_jobs=-1,
        eval_metric='logloss'
    )

    # Leakage-safe pipeline: preprocess -> SMOTE -> model
    clf = ImbPipeline(steps=[
        ('preprocess', preprocess),
        ('smote', SMOTE(random_state=RANDOM_STATE)),
        ('model', xgb_model)
    ])

    # Cross-validation with PR-AUC (for imbalance)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    cv_pr_auc = cross_val_score(clf, X_train, y_train, scoring='average_precision', cv=cv, n_jobs=-1)
    cv_roc_auc = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
    print(f"CV PR-AUC: {cv_pr_auc.mean():.3f} ± {cv_pr_auc.std():.3f}")
    print(f"CV ROC-AUC: {cv_roc_auc.mean():.3f} ± {cv_roc_auc.std():.3f}")

    # Fit on training and evaluate on untouched test
    clf.fit(X_train, y_train)
    y_proba = clf.predict_proba(X_test)[:, 1]

    # Default threshold 0.5
    y_pred_default = (y_proba >= 0.5).astype(int)
    print("\n-- Default threshold (0.5) --")
    print("ROC-AUC:", roc_auc_score(y_test, y_proba))
    print("PR-AUC:", average_precision_score(y_test, y_proba))
    print(classification_report(y_test, y_pred_default, digits=3))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_default))

    # Interactive PR curve
    precision, recall, thr_pr = precision_recall_curve(y_test, y_proba)
    fig = go.Figure(go.Scatter(x=recall, y=precision, mode='lines', name='PR'))
    fig.update_layout(title='Precision-Recall Curve', xaxis_title='Recall', yaxis_title='Precision')
    fig.show()

    # Interactive ROC curve
    fpr, tpr, thr_roc = roc_curve(y_test, y_proba)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC'))
    fig.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1)
    fig.update_layout(title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})', xaxis_title='FPR', yaxis_title='TPR')
    fig.show()

    # Threshold tuning (maximize F1; also show recall-oriented threshold)
    eps = 1e-12
    f1_scores = 2 * (precision * recall) / (precision + recall + eps)
    best_idx = f1_scores.argmax()
    best_threshold = thr_pr[max(best_idx - 1, 0)] if best_idx < len(thr_pr) else 0.5

    target_recall = 0.95
    recall_mask = recall >= target_recall
    recall_threshold = thr_pr[recall_mask.argmax()] if recall_mask.any() else best_threshold

    def evaluate_threshold(th):
        y_pred = (y_proba >= th).astype(int)
        print(f"\n-- Threshold={th:.3f} --")
        print(classification_report(y_test, y_pred, digits=3))
        print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

    print(f"\nBest F1 threshold: {best_threshold:.3f}")
    evaluate_threshold(best_threshold)

    print(f"\nRecall-oriented threshold (≥{target_recall:.2f} recall target): {recall_threshold:.3f}")
    evaluate_threshold(recall_threshold)

    # Feature importance (global)
    def get_feature_names(ct, num_cols, cat_cols):
        num_feats = ct.named_transformers_['num'].get_feature_names_out(num_cols)
        cat_feats = ct.named_transformers_['cat'].get_feature_names_out(cat_cols)
        return np.concatenate([num_feats, cat_feats])

    trained_pre = clf.named_steps['preprocess']
    trained_model = clf.named_steps['model']
    feat_names = get_feature_names(trained_pre, numeric_cols, categorical_cols)
    importances = trained_model.feature_importances_

    imp_df = pd.DataFrame({'feature': feat_names, 'importance': importances}).sort_values('importance', ascending=False)
    print("\nTop 15 features:")
    print(imp_df.head(15))

    fig = px.bar(imp_df.head(20).sort_values('importance'),
                 x='importance', y='feature', orientation='h',
                 title='Top 20 Feature Importances (XGBoost)')
    fig.update_layout(yaxis={'categoryorder':'total ascending'})
    fig.show()



=== Classification Section ===
CV PR-AUC: 0.956 ± 0.011
CV ROC-AUC: 0.824 ± 0.038

-- Default threshold (0.5) --
ROC-AUC: 0.8069784867217107
PR-AUC: 0.9441452538526781
              precision    recall  f1-score   support

           0      0.804     0.414     0.547       198
           1      0.923     0.986     0.953      1402

    accuracy                          0.915      1600
   macro avg      0.863     0.700     0.750      1600
weighted avg      0.908     0.915     0.903      1600

Confusion matrix:
 [[  82  116]
 [  20 1382]]



Best F1 threshold: 0.552

-- Threshold=0.552 --
              precision    recall  f1-score   support

           0      0.779     0.480     0.594       198
           1      0.930     0.981     0.955      1402

    accuracy                          0.919      1600
   macro avg      0.854     0.730     0.774      1600
weighted avg      0.912     0.919     0.910      1600

Confusion matrix:
 [[  95  103]
 [  27 1375]]

Recall-oriented threshold (≥0.95 recall target): 0.004

-- Threshold=0.004 --
              precision    recall  f1-score   support

           0      0.000     0.000     0.000       198
           1      0.876     1.000     0.934      1402

    accuracy                          0.876      1600
   macro avg      0.438     0.500     0.467      1600
weighted avg      0.768     0.876     0.818      1600

Confusion matrix:
 [[   0  198]
 [   0 1402]]

Top 15 features:
                         feature  importance
22       CampaignType_Conversion    0.133126
17      CampaignCh


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [6]:
#  Uplift Modeling
if RUN_UPLIFT:
    print("\n=== Uplift Modeling Section ===")

    # Try to find a treatment-like column by common names
    candidates = [c for c in df.columns if c.lower() in
                  ['treatment','treated','exposed','sawad','assignedgroup','variant','email_sent','campaign_assigned']]
    print("Possible treatment-like columns:", candidates)

    # Choose real treatment if available; else simulate randomized treatment
    treatment_col = None  # set to actual name if exists, e.g., 'Exposed'
    if treatment_col is not None and treatment_col in df.columns:
        df['Treatment'] = df[treatment_col].astype(int)
        print(f"Using real treatment column: {treatment_col}")
    else:
        df['Treatment'] = np.random.binomial(1, 0.5, size=len(df))
        print("No real treatment column provided; simulated Treatment (50/50) for demo.")

    # Prepare data
    y_u = df['Conversion'].astype(int)
    T_u = df['Treatment'].astype(int)
    drop_cols_u = ['Conversion', 'Treatment'] + ([id_col] if id_col in df.columns else [])
    X_u = df.drop(columns=[c for c in drop_cols_u if c in df.columns], errors='ignore')

    num_cols_u = X_u.select_dtypes(include=['int64','float64']).columns.tolist()
    cat_cols_u = X_u.select_dtypes(include=['object','category']).columns.tolist()

    print("Treatment rate (1s):", T_u.mean())

    # Split stratified by Treatment to keep balance (uplift is causal only if real randomization)
    X_tr, X_te, y_tr, y_te, T_tr, T_te = train_test_split(
        X_u, y_u, T_u, test_size=0.2, random_state=RANDOM_STATE, stratify=T_u
    )

    # Shared preprocessing
    preprocess_u = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_cols_u),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols_u),
        ],
        remainder='drop'
    )

    def make_model_u():
        return XGBClassifier(
            random_state=RANDOM_STATE,
            n_estimators=300,
            max_depth=4,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            n_jobs=-1,
            eval_metric='logloss'
        )

    pipe_treat = Pipeline(steps=[('prep', preprocess_u), ('model', make_model_u())])
    pipe_ctrl = Pipeline(steps=[('prep', preprocess_u), ('model', make_model_u())])

    # Fit per arm
    mask_t = (T_tr == 1)
    mask_c = (T_tr == 0)
    pipe_treat.fit(X_tr[mask_t], y_tr[mask_t])
    pipe_ctrl.fit(X_tr[mask_c], y_tr[mask_c])

    # Predict potential outcomes and compute uplift
    p1 = pipe_treat.predict_proba(X_te)[:, 1]
    p0 = pipe_ctrl.predict_proba(X_te)[:, 1]
    uplift = p1 - p0

    res = X_te.copy()
    if id_col in df.columns:
        res[id_col] = df.loc[X_te.index, id_col]
    res['Treatment'] = T_te.values
    res['y_true'] = y_te.values
    res['p_treat'] = p1
    res['p_control'] = p0
    res['uplift'] = uplift

    print("\nUplift summary:")
    print(res['uplift'].describe())

    # Histogram of uplift
    fig = px.histogram(res, x='uplift', nbins=50, title='Uplift score distribution (p_treat - p_control)')
    fig.update_layout(bargap=0.02)
    fig.show()

    # Rank and cumulative uplift
    res_sorted = res.sort_values('uplift', ascending=False).reset_index(drop=True)
    res_sorted['rank'] = np.arange(1, len(res_sorted)+1)
    res_sorted['pct'] = res_sorted['rank'] / len(res_sorted)
    res_sorted['cum_uplift'] = res_sorted['uplift'].cumsum()

    fig2 = go.Figure()
    fig2.add_trace(go.Scatter(x=res_sorted['pct'], y=res_sorted['cum_uplift'],
                              mode='lines', name='Cumulative uplift'))
    fig2.update_layout(title='Cumulative uplift curve (ranked by predicted uplift)',
                       xaxis_title='Top proportion of population',
                       yaxis_title='Cumulative predicted uplift')
    fig2.show()

    # Simple targeting policy: top 20% by uplift
    top_k = 0.2
    k = max(1, int(len(res_sorted) * top_k))
    top_group = res_sorted.head(k)
    bottom_group = res_sorted.tail(k)
    print(f"Avg uplift top {int(top_k*100)}%:", top_group['uplift'].mean())
    print(f"Avg uplift bottom {int(top_k*100)}%:", bottom_group['uplift'].mean())

    # Save ranked list for action
    res_sorted.to_csv('uplift_ranked_users.csv', index=False)
    print("Saved: uplift_ranked_users.csv")

print("\nAll done. Use the toggles at the top to run classification and/or uplift.")


=== Uplift Modeling Section ===
Possible treatment-like columns: []
No real treatment column provided; simulated Treatment (50/50) for demo.
Treatment rate (1s): 0.49425

Uplift summary:
count    1600.000000
mean        0.003877
std         0.131503
min        -0.727185
25%        -0.016456
50%         0.002784
75%         0.023343
max         0.768665
Name: uplift, dtype: float64


Avg uplift top 20%: 0.15889284
Avg uplift bottom 20%: -0.14983171
Saved: uplift_ranked_users.csv

All done. Use the toggles at the top to run classification and/or uplift.


CONCLUSIONS


*   Built an end-to-end conversion classifier with strong test performance (PR-AUC ≈ 0.94, ROC-AUC ≈ 0.81) and threshold tuning (best F1 ≈ 0.552) for actionable operating points.

*   Key drivers identified (e.g., CampaignType_Conversion, CampaignChannel, Gender), informing channel and creative strategy.

*   Uplift demo (simulated treatment) shows clear segmentation: target top uplift users, avoid bottom segment to improve ROI.

*   Next: replace simulated treatment with real exposure, validate uplift (Qini/AUUC), calibrate probabilities, and deploy the saved pipeline for batch scoring.



