In [19]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


In [20]:
# Load the CSV and take a quick look at its size, column names and first rows.
df = pd.read_csv("data/online_shoppers_intention.csv")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
df.head()


Shape: (12330, 18)
Columns: ['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend', 'Revenue']


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [21]:
# Revenue is True/False -> 1/0.
# Cast directly instead of mapping {True: 1, False: 0}: the cast gives the same
# result on the boolean column and stays valid once the column already holds
# 0/1 (i.e. when this cell is re-run), whereas the dict lookup relies on the
# values still being booleans and can produce NaN (and then fail) otherwise.
df["Revenue"] = df["Revenue"].astype(int)
assert df["Revenue"].isin([0, 1]).all(), "Revenue must be binary (0/1)"

print("Base conversion rate (Revenue=1):", round(df["Revenue"].mean(), 4))
df["Revenue"].value_counts(normalize=True)


Base conversion rate (Revenue=1): 0.1547


Revenue
0    0.845255
1    0.154745
Name: proportion, dtype: float64

In [22]:
# Split the frame into the feature matrix and the binary target.
X = df.drop(columns=["Revenue"])
y = df["Revenue"]

# OperatingSystems, Browser, Region and TrafficType are integer-coded IDs, not
# quantities. Cast them to strings so they are handled as categories instead of
# being given a fake numeric ordering.
id_like = ["OperatingSystems", "Browser", "Region", "TrafficType"]
for c in id_like:
    # astype(str) can turn a missing value into the literal string "nan", which
    # would then look like a genuine category. Put the missing entries back so
    # that imputation further down still sees them as missing.
    X[c] = X[c].astype(str).where(X[c].notna())

# Everything that is still int64/float64 is numeric; all remaining columns
# (strings, booleans, the ID-like columns above) are treated as categorical.
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

print("Numeric cols:", len(num_cols), num_cols)
print("Categorical cols:", len(cat_cols), cat_cols)


Numeric cols: 10 ['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']
Categorical cols: 7 ['Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend']


In [23]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Train size: {X_train.shape} Test size: {X_test.shape}")


Train size: (9864, 17) Test size: (2466, 17)


In [24]:
# Categorical features: fill gaps with the most frequent level, then one-hot
# encode; levels not seen during fit are ignored at transform time.
categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric features: median imputation followed by standardisation.
numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Route each column group through its own preprocessing branch.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipe, num_cols),
    ("cat", categorical_pipe, cat_cols),
])

# Preprocessing and classifier in one estimator, so the transforms are fitted
# only on whatever data is passed to model.fit().
model = Pipeline(steps=[
    ("prep", preprocessor),
    ("clf", LogisticRegression(class_weight="balanced", max_iter=3000)),
])

model


In [25]:
# Fit the whole pipeline on the training split, then score the held-out rows.
# Column 1 of predict_proba is kept as the score for the Revenue=1 class.
probs = model.fit(X_train, y_train).predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_true=y_test, y_score=probs)

print(f"Baseline ROC-AUC: {round(auc, 4)}")


Baseline ROC-AUC: 0.8932


In [26]:
# Pair every test-set score with its true label and rank the rows from the
# highest to the lowest predicted probability.
results = (
    pd.DataFrame({"prob": probs, "y": y_test.values})
    .sort_values(by="prob", ascending=False)
)
# Share of positives in the test split; used as the denominator for lift.
base_rate = results["y"].mean()

def precision_at_k_frac(df, frac):
    """Return the mean of ``df["y"]`` over the top ``frac`` share of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an outcome column ``"y"``. Rows are taken in their existing
        order via ``head(k)``, so the caller must already have sorted the frame
        from best to worst score.
    frac : float
        Fraction of rows to keep, e.g. ``0.05`` for the top 5%.

    Returns
    -------
    float
        Mean of ``y`` over the first ``k`` rows, where ``k`` is ``len(df) * frac``
        rounded down but never less than 1.
    """
    # The small epsilon stops float error from dropping a row when len(df) * frac
    # is mathematically an integer (e.g. 100 * 0.29 evaluates to 28.999...).
    # max(1, ...) avoids an empty selection, whose mean would be NaN.
    k = max(1, int(len(df) * frac + 1e-9))
    return df.head(k)["y"].mean()

def lift(p_at_k, base):
    """Return ``p_at_k / base``, or NaN when ``base`` is not strictly positive.

    Parameters
    ----------
    p_at_k : float
        Precision measured on the targeted subset.
    base : float
        Overall positive rate used as the reference.
    """
    # "not base > 0" (rather than "base <= 0") also sends a NaN base to the
    # NaN branch, matching a plain "base > 0" test.
    if not base > 0:
        return np.nan
    return p_at_k / base

# Print precision and lift for the top 5%, 10% and 20% of ranked rows.
for frac in (0.05, 0.10, 0.20):
    p = precision_at_k_frac(results, frac)
    lift_at_k = lift(p, base_rate)
    print(f"Precision@{int(frac*100)}%:", round(p, 4), " | Lift:", round(lift_at_k, 2))

Precision@5%: 0.7967  | Lift: 5.14
Precision@10%: 0.7073  | Lift: 4.57
Precision@20%: 0.5416  | Lift: 3.5


In [27]:
def targeting_table(df, fracs=(0.01, 0.05, 0.10, 0.20, 0.30)):
    """Summarise precision, lift and captured positives at several top-K cutoffs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an outcome column ``"y"``. Rows are taken in their existing
        order via ``head(k)``, so the caller must already have sorted the frame
        from best to worst score.
    fracs : iterable of float
        Fractions of rows to target, e.g. ``0.05`` for the top 5%.

    Returns
    -------
    pandas.DataFrame
        One row per fraction with the columns ``Top_%``, ``K``, ``BaseRate``,
        ``Precision@K``, ``Lift@K`` and ``Expected_Conversions``. The last column
        is ``Precision@K * K``, i.e. the number of positives among the first K rows.
    """
    rows = []
    base = df["y"].mean()
    n = len(df)
    for f in fracs:
        # Epsilon: keep float error from dropping a row when n * f is mathematically
        # an integer (100 * 0.29 evaluates to 28.999...). min(n, ...) stops K from
        # exceeding the rows that exist when f > 1; max(1, ...) keeps at least one row.
        k = max(1, min(n, int(n * f + 1e-9)))
        p = df.head(k)["y"].mean()
        # Same guard as a plain "base > 0" check: with no positives at all the
        # lift is undefined, so report NaN instead of dividing by zero.
        lift_at_k = p / base if base > 0 else float("nan")
        rows.append({
            # round() rather than int(): int(0.29 * 100) truncates to 28.
            "Top_%": int(round(f * 100)),
            "K": k,
            "BaseRate": round(base, 4),
            "Precision@K": round(p, 4),
            "Lift@K": round(lift_at_k, 2),
            "Expected_Conversions": round(p * k, 1)
        })
    return pd.DataFrame(rows)

# Build the targeting summary from the ranked test-set scores (default cutoffs)
# and display it as the cell output.
tbl = targeting_table(df=results)
tbl

Unnamed: 0,Top_%,K,BaseRate,Precision@K,Lift@K,Expected_Conversions
0,1,24,0.1549,0.8333,5.38,20.0
1,5,123,0.1549,0.7967,5.14,98.0
2,10,246,0.1549,0.7073,4.57,174.0
3,20,493,0.1549,0.5416,3.5,267.0
4,30,739,0.1549,0.4235,2.73,313.0
