In [1]:
# Setup and load balanced dataset

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

path = "/Users/rohityadav/Desktop/Git Projects/ml-uplift-modeling-criteo/data/criteo-uplift-v2.1-100K-balanced.csv"
df = pd.read_csv(path)

X = df.drop(['treatment', 'visit', 'conversion', 'exposure'], axis = 1)
T = df['treatment']
Y = df['visit']

In [2]:
# Split treated and control

X_treated = X[T == 1]
Y_treated = Y[T == 1]

X_control = X[T == 0]
Y_control = Y[T == 0]

print("X shape: ", X_treated.shape, X_control.shape)
print("Y shape: ", Y_treated.shape, Y_control.shape)

X shape:  (50000, 12) (50000, 12)
Y shape:  (50000,) (50000,)


In [5]:
# Train the two models

model_treated = LogisticRegression(max_iter = 5000)
model_control = LogisticRegression(max_iter = 5000)

model_treated.fit(X_treated, Y_treated)
model_control.fit(X_control, Y_control)

In [6]:
# Predict probablities for everyone

p_treated = model_treated.predict_proba(X)[:,1]
p_control = model_control.predict_proba(X)[:,1]
uplift_scores = p_treated - p_control

In [8]:
# Sanity Checks

print('Mean uplift:', uplift_scores.mean())
print('Min uplift', uplift_scores.min())
print('Max uplift', uplift_scores.max())

Mean uplift: 0.007771289418850309
Min uplift -0.5745141743632811
Max uplift 0.6959824113632934


In [9]:
# Verification - Are we really doing uplift modeling? Part 1

np.mean(uplift_scores > 0), np.mean(uplift_scores < 0)

(np.float64(0.77751), np.float64(0.22249))

In [17]:
# Verification - Are we really doing uplift modeling? Part 2

print('AUC treated:', roc_auc_score(Y_treated, model_treated.predict_proba(X_treated)[:,1]))
print('AUC control:', roc_auc_score(Y_control, model_control.predict_proba(X_control)[:,1]))

AUC treated: 0.9320066594321109
AUC control: 0.9273860661645743


The roc_auc_score should be in the range of 0.52 to 0.72. The roc_auc_score is coming up to 0.93 in my case, which suggests that there is something wrong. The issue lies that I evaluated on the same data that I trained the model on. Let me correct that going further:

In [20]:
# Stratified split by treatment

from sklearn.model_selection import train_test_split

X_train, X_test, T_train, T_test, Y_train, Y_test = train_test_split(
    X, T, Y,
    test_size = 0.3,
    stratify = T,
    random_state = 42
)

print("\nTrain shapes:", X_train.shape, T_train.shape, Y_train.shape)
print("Test shapes:", X_test.shape, T_test.shape, Y_test.shape)

print("\nTreatment ratio train:", T_train.mean())
print("\nTreatment ratio test:", T_test.mean())


Train shapes: (70000, 12) (70000,) (70000,)
Test shapes: (30000, 12) (30000,) (30000,)

Treatment ratio train: 0.5

Treatment ratio test: 0.5


In [21]:
# Let us run the tests that we ran on train dataset on the test dataset

# Split train by treatment

X_tr_treated = X_train[T_train == 1]
Y_tr_treated = Y_train[T_train == 1]

X_tr_control = X_train[T_train == 0]
Y_tr_control = Y_train[T_train == 0]

In [24]:
# Train the model

model_treated = LogisticRegression(max_iter = 5000)
model_control = LogisticRegression(max_iter = 5000)

model_treated.fit(X_tr_treated, Y_tr_treated)
model_control.fit(X_tr_control, Y_tr_control)

In [33]:
# Predict only on the test dataset

p_treated_test = model_treated.predict_proba(X_test)[:, 1]
p_control_test = model_control.predict_proba(X_test)[:, 1]

uplift_scores_test = p_treated_test - p_control_test

In [34]:
# Sanity Checks for the new scores

print('Uplift mean:', uplift_scores.mean())
print('Uplift min:', uplift_scores.min())
print('Uplift max:', uplift_scores.max())

Uplift mean: 0.007771289418850309
Uplift min: -0.5745141743632811
Uplift max: 0.6959824113632934


In [35]:
# Validation Checks - Are we really doing uplift modeling? Part 1

np.mean(uplift_scores > 0), np.mean(uplift_scores < 0)

(np.float64(0.77751), np.float64(0.22249))

In [38]:
# Validation Checks - Are we really doing uplift modeling? Part 2

print('AUC treated test:', roc_auc_score(Y_test[T_test == 1], model_treated.predict_proba(X_test[T_test == 1])[:, 1]))
print('AUC control test:', roc_auc_score(Y_test[T_test == 0], model_control.predict_proba(X_test[T_test == 0])[:, 1]))

AUC treated test: 0.9307333570818851
AUC control test: 0.9190918459316884


The roc_auc_score is in the similar range for the test dataset as well. So, I can rule out the leakage suspicion.

**This is a large-scale ad-tech dataset with highly engineered behavioral features.
High AUC reflects strong ranking power for click prediction, but uplift estimation remains challenging because it depends on estimating differences between treated and control outcomes. That’s why we evaluate uplift using Qini curves rather than classification metrics.**