In [2]:
import pandas as pd

# Load the data using raw strings
email_df = pd.read_csv(r'C:\Users\satya\Downloads\Email Marketing Campaign\email (1)\email_table.csv')
opened_df = pd.read_csv(r'C:\Users\satya\Downloads\Email Marketing Campaign\email (1)\email_opened_table.csv')
clicked_df = pd.read_csv(r'C:\Users\satya\Downloads\Email Marketing Campaign\email (1)\link_clicked_table.csv')

# Add binary columns
email_df['email_opened'] = email_df['email_id'].isin(opened_df['email_id']).astype(int)
email_df['link_clicked'] = email_df['email_id'].isin(clicked_df['email_id']).astype(int)


In [3]:
total_emails = len(email_df)
open_rate = email_df['email_opened'].sum() / total_emails * 100
click_rate = email_df['link_clicked'].sum() / total_emails * 100

print(f"Open Rate: {open_rate:.2f}%")
print(f"Click-Through Rate: {click_rate:.2f}%")


Open Rate: 10.35%
Click-Through Rate: 2.12%


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Feature engineering
df = email_df.copy()
df = pd.get_dummies(df, columns=['email_text', 'email_version', 'weekday', 'user_country'], drop_first=True)

X = df.drop(columns=['email_id', 'email_opened', 'link_clicked'])
y = df['link_clicked']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Train a model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.98      1.00      0.99     19576
           1       0.07      0.01      0.02       424

    accuracy                           0.98     20000
   macro avg       0.53      0.50      0.50     20000
weighted avg       0.96      0.98      0.97     20000



In [6]:
# Predict probabilities
df['predicted_proba'] = model.predict_proba(X)[:, 1]
top_df = df.sort_values('predicted_proba', ascending=False).head(int(0.2 * len(df)))  # top 20%

# Estimate CTR in top group
improved_ctr = top_df['link_clicked'].sum() / len(top_df) * 100
print(f"CTR in top 20% group: {improved_ctr:.2f}% vs Original CTR: {click_rate:.2f}%")


CTR in top 20% group: 9.25% vs Original CTR: 2.12%


In [None]:
# CTR by text version
print(email_df.groupby('email_text')['link_clicked'].mean() * 100)

# CTR by personalization
print(email_df.groupby('email_version')['link_clicked'].mean() * 100)

# CTR by weekday
print(email_df.groupby('weekday')['link_clicked'].mean() * 100)

# CTR by user past purchases
email_df['purchase_bucket'] = pd.cut(email_df['user_past_purchases'], bins=[-1,0,1,5,10,100], labels=['0','1','2-5','6-10','10+'])
print(email_df.groupby('purchase_bucket')['link_clicked'].mean() * 100)


email_text
long_email     1.853767
short_email    2.387177
Name: link_clicked, dtype: float64
email_version
generic         1.513673
personalized    2.729409
Name: link_clicked, dtype: float64
weekday
Friday       1.403682
Monday       2.290608
Saturday     1.784611
Sunday       1.675123
Thursday     2.444491
Tuesday      2.488864
Wednesday    2.761999
Name: link_clicked, dtype: float64
purchase_bucket
0       0.050443
1       1.119919
2-5     1.850862
6-10    3.645062
10+     6.903711
Name: link_clicked, dtype: float64


  print(email_df.groupby('purchase_bucket')['link_clicked'].mean() * 100)
