In [1]:
# Core
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# ML
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, log_loss

# Models
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")


In [2]:
np.random.seed(42)

n = 50000  # rows

data = pd.DataFrame({
    "user_id": np.random.randint(1, 10000, n),
    "restaurant_id": np.random.randint(1, 2000, n),
    "distance_km": np.random.uniform(0.5, 10, n),
    "rating": np.random.uniform(3.0, 5.0, n),
    "price_range": np.random.choice(["Low", "Medium", "High"], n),
    "time_of_day": np.random.choice(["Breakfast", "Lunch", "Dinner"], n),
    "cuisine_match": np.random.choice([0, 1], n, p=[0.4, 0.6]),
    "discount": np.random.uniform(0, 50, n),
    "bid_amount": np.random.uniform(5, 50, n)
})


In [3]:
# Click probability logic
click_prob = (
    0.3 * data["cuisine_match"] +
    0.2 * (data["rating"] / 5) +
    0.2 * (data["discount"] / 50) -
    0.1 * (data["distance_km"] / 10)
)

click_prob = np.clip(click_prob, 0, 1)

data["clicked"] = np.random.binomial(1, click_prob)


In [4]:
data.head()

Unnamed: 0,user_id,restaurant_id,distance_km,rating,price_range,time_of_day,cuisine_match,discount,bid_amount,clicked
0,7271,588,1.435299,4.186243,Low,Breakfast,0,8.660616,36.793735,0
1,861,7,5.362091,4.550685,Medium,Lunch,0,5.035726,35.566438,0
2,5391,698,2.126229,3.014766,High,Lunch,1,30.781301,27.59202,1
3,5192,1300,1.192843,4.440043,Medium,Breakfast,1,49.219836,42.624852,0
4,5735,1615,9.666484,3.448164,Low,Dinner,0,7.528558,15.89847,0


In [5]:
le = LabelEncoder()
data["price_range"] = le.fit_transform(data["price_range"])
data["time_of_day"] = le.fit_transform(data["time_of_day"])

features = [
    "distance_km", "rating", "price_range",
    "time_of_day", "cuisine_match", "discount", "bid_amount"
]

X = data[features]
y = data["clicked"]


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [7]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

lr_preds = lr.predict_proba(X_test)[:,1]

print("Logistic AUC:", roc_auc_score(y_test, lr_preds))


Logistic AUC: 0.691371538306044


In [8]:
xgb = XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    eval_metric="logloss"
)

xgb.fit(X_train, y_train)

xgb_preds = xgb.predict_proba(X_test)[:,1]

print("XGBoost AUC:", roc_auc_score(y_test, xgb_preds))


XGBoost AUC: 0.6791823748399705


In [9]:
test_data = X_test.copy()
test_data["predicted_ctr"] = xgb_preds
test_data["bid_amount"] = X_test["bid_amount"]

test_data["ranking_score"] = (
    test_data["predicted_ctr"] * test_data["bid_amount"]
)

ranked_ads = test_data.sort_values(
    "ranking_score", ascending=False
)
ranked_ads.head()


Unnamed: 0,distance_km,rating,price_range,time_of_day,cuisine_match,discount,bid_amount,predicted_ctr,ranking_score
15951,4.071947,3.414892,0,0,1,49.963504,46.850451,0.803116,37.626357
44659,1.413564,3.311836,2,2,1,23.015606,49.652089,0.738939,36.689877
48425,0.954401,4.950613,1,1,1,40.045077,49.29556,0.744082,36.67995
15041,9.126973,4.917148,1,1,1,48.318904,47.355103,0.75344,35.679247
45940,3.467041,3.430193,1,0,1,49.50776,47.236009,0.753496,35.592158


In [10]:
def precision_at_k(df, y_true, k=5):
    top_k = df.sort_values("ranking_score", ascending=False).head(k)
    return y_true.loc[top_k.index].mean()

precision_at_5 = precision_at_k(ranked_ads, y_test, 5)
precision_at_5


np.float64(0.6)

In [11]:
# Control: random ranking
random_rank = test_data.sample(frac=1)

control_ctr = y_test.loc[random_rank.head(5).index].mean()
treatment_ctr = y_test.loc[ranked_ads.head(5).index].mean()

print("Control CTR:", control_ctr)
print("Treatment CTR:", treatment_ctr)


Control CTR: 0.0
Treatment CTR: 0.6


In [13]:
data.to_csv("swiggy_ads_dataset.csv", index=False)
