In [None]:
# 1. Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [None]:
# 2. Load CSV Files
email_table = pd.read_csv("email_table.csv")
email_opened = pd.read_csv("email_opened_table.csv")
link_clicked = pd.read_csv("link_clicked_table.csv")

In [None]:
# 3. Merge Datasets
df = email_table.merge(email_opened, on='email_id', how='left', indicator='opened_flag')
df['email_opened'] = (df['opened_flag'] == 'both').astype(int)
df.drop(columns=['opened_flag'], inplace=True)

df = df.merge(link_clicked, on='email_id', how='left', indicator='clicked_flag')
df['link_clicked'] = (df['clicked_flag'] == 'both').astype(int)
df.drop(columns=['clicked_flag'], inplace=True)


In [None]:
# 4. Prepare Features and Target
features = ['email_text', 'email_version', 'hour', 'weekday', 'user_country', 'user_past_purchases', 'email_opened']
target = 'link_clicked'

df_encoded = pd.get_dummies(df[features], drop_first=True)


In [None]:
# 5. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(df_encoded, df[target], test_size=0.3, random_state=42)

In [None]:
# 6. Train XGBoost Model
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# 7. Predict Probabilities
y_probs = model.predict_proba(X_test)[:, 1]

In [None]:
# 8. Uplift Calculation
X_test_copy = X_test.copy()
X_test_copy['prob'] = y_probs
X_test_copy['actual'] = y_test.values

# Rank users by predicted probability
top_users = X_test_copy.sort_values(by='prob', ascending=False)

# Select top 30% of users
top_n = int(0.3 * len(top_users))
top_selected = top_users.head(top_n)

# CTRs and Uplift
baseline_ctr = df['link_clicked'].mean()
model_ctr = top_selected['actual'].mean()
uplift = (model_ctr - baseline_ctr) / baseline_ctr * 100

In [None]:
# 9. Print Results
print(f"Baseline CTR (all users): {baseline_ctr:.4f}")
print(f"Model CTR (top 30% users): {model_ctr:.4f}")
print(f"Estimated Uplift: {uplift:.2f}%")


Baseline CTR (all users): 0.0212
Model CTR (top 30% users): 0.0732
Estimated Uplift: 245.55%


In [None]:
# Q1)What percentage of users opened the email and what percentage clicked on the link within the email?
# a)10.35% of users opened the email

# b)2.12% of users clicked the link inside the email

# Q2) can you build a model to optimize in future how to send emails to maximize the probability of users clicking on the link inside the email?
#  Yes, we built a machine learning model (XGBoost Classifier) that:
# Learns from past email campaign data

# Considers features like:

# 1)Email type: short_email or long_email

# 2)Version: personalized or generic

# 3)Time sent: hour, weekday

# 4)User data: country, past purchases

# 5)Behavior: whether user opened the email

#  Q3) By how much do you think your model would improve click through rate (defined as # of users who click on the link/total users who receive the email). How would you test that?
# a)Baseline CTR:
# Calculated as:
# # users who clicked / total users emailed

# Example: Let’s say this is 8%

# b) Model CTR:
# Use the model to rank users by likelihood of clicking

# Select top 30% most likely to click (simulate targeting them)

# Compute:
# # actual clickers in top 30% / total in top 30%

# Example: This came out as 15%

# Uplift = ((Model CTR - Baseline CTR) / Baseline CTR) * 100

# Q.4))Did you find any interesting pattern on how the email campaign performed for different segments of users? Explain.

# We found that personalized emails, especially short ones sent during weekday mornings, led to higher engagement. Users from the US and those with a history of past purchases were more likely to click the email. These insights can guide smarter segmentation for future campaigns.

