In [1]:
import pandas as pd

# Load the datasets
email_df = pd.read_csv("C:/Users/satyam kamal/Downloads/email_table.csv")
opened_df = pd.read_csv("C:/Users/satyam kamal/Downloads/email_opened_table.csv")
clicked_df = pd.read_csv("C:/Users/satyam kamal/Downloads/link_clicked_table.csv")

# Display the first few rows of each dataframe
email_df.head(), opened_df.head(), clicked_df.head()

(   email_id   email_text email_version  hour    weekday user_country  \
 0     85120  short_email  personalized     2     Sunday           US   
 1    966622   long_email  personalized    12     Sunday           UK   
 2    777221   long_email  personalized    11  Wednesday           US   
 3    493711  short_email       generic     6     Monday           UK   
 4    106887   long_email       generic    14     Monday           US   
 
    user_past_purchases  
 0                    5  
 1                    2  
 2                    2  
 3                    1  
 4                    6  ,
    email_id
 0    284534
 1    609056
 2    220820
 3    905936
 4    164034,
    email_id
 0    609056
 1    870980
 2    935124
 3    158501
 4    177561)

In [3]:
# Merge opened and clicked flags with the main email data
email_df["opened"] = email_df["email_id"].isin(opened_df["email_id"]).astype(int)
email_df["clicked"] = email_df["email_id"].isin(clicked_df["email_id"]).astype(int)

# Calculate performance metrics
total_emails = len(email_df)
opened_percentage = email_df["opened"].mean() * 100
clicked_percentage = email_df["clicked"].mean() * 100

opened_percentage, clicked_percentage

(10.345, 2.119)

In [5]:


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder

# Encode categorical variables
df = email_df.copy()
label_encoders = {}
for col in ["email_text", "email_version", "weekday", "user_country"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Define features and target
features = ["email_text", "email_version", "hour", "weekday", "user_country", "user_past_purchases"]
X = df[features]
y = df["clicked"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

report = classification_report(y_test, y_pred, output_dict=True)
roc_auc = roc_auc_score(y_test, y_prob)

report, roc_auc

({'0': {'precision': 0.9789304705528243,
   'recall': 0.9968328565590518,
   'f1-score': 0.9878005568210579,
   'support': 19576.0},
  '1': {'precision': 0.06060606060606061,
   'recall': 0.009433962264150943,
   'f1-score': 0.0163265306122449,
   'support': 424.0},
  'accuracy': 0.9759,
  'macro avg': {'precision': 0.5197682655794424,
   'recall': 0.5031334094116015,
   'f1-score': 0.5020635437166514,
   'support': 20000.0},
  'weighted avg': {'precision': 0.9594619930619529,
   'recall': 0.9759,
   'f1-score': 0.9672053074654312,
   'support': 20000.0}},
 0.5775243535596148)

In [7]:
# Add predicted probabilities to the full dataset
df["predicted_click_proba"] = model.predict_proba(X)[:, 1]

# Simulate sending emails to top 20% most likely to click
top_20_cutoff = df["predicted_click_proba"].quantile(0.80)
df["model_selected"] = df["predicted_click_proba"] >= top_20_cutoff

# Compare actual click-through rates
original_ctr = df["clicked"].mean()
model_ctr = df[df["model_selected"]]["clicked"].mean()

original_ctr, model_ctr

(0.02119, 0.09120971324218943)

In [13]:
from nbformat import v4 as nbf
from pathlib import Path

# Create notebook cells
cells = []

# Intro markdown
cells.append(nbf.new_markdown_cell("# 📧 Email Marketing Campaign Analysis\nThis notebook analyzes an email marketing campaign and builds a model to optimize link click-through rates."))

# Load libraries and data
cells.append(nbf.new_code_cell("""\
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder

# Load data
email_df = pd.read_csv("email_table.csv")
opened_df = pd.read_csv("email_opened_table.csv")
clicked_df = pd.read_csv("link_clicked_table.csv")

# Merge open and click labels
email_df["opened"] = email_df["email_id"].isin(opened_df["email_id"]).astype(int)
email_df["clicked"] = email_df["email_id"].isin(clicked_df["email_id"]).astype(int)

# Basic performance
opened_percentage = email_df["opened"].mean() * 100
clicked_percentage = email_df["clicked"].mean() * 100
print(f"Open Rate: {opened_percentage:.2f}%\\nClick Rate: {clicked_percentage:.2f}%")
"""))

# Modeling
cells.append(nbf.new_code_cell("""\
# Encode categorical variables
df = email_df.copy()
label_encoders = {}
for col in ["email_text", "email_version", "weekday", "user_country"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Feature selection
features = ["email_text", "email_version", "hour", "weekday", "user_country", "user_past_purchases"]
X = df[features]
y = df["clicked"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Model training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluation
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

report = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)
print(report)
print(f"ROC AUC Score: {roc_auc:.3f}")
"""))

# Uplift estimation
cells.append(nbf.new_code_cell("""\
# Predict on full set
df["predicted_click_proba"] = model.predict_proba(X)[:, 1]

# Select top 20% likely clickers
top_20_cutoff = df["predicted_click_proba"].quantile(0.80)
df["model_selected"] = df["predicted_click_proba"] >= top_20_cutoff

# Compare CTR
original_ctr = df["clicked"].mean()
model_ctr = df[df["model_selected"]]["clicked"].mean()

print(f"Original CTR: {original_ctr:.4f}\\nCTR using model targeting (top 20%): {model_ctr:.4f}")
"""))

# Create and save notebook
nb = nbf.new_notebook(cells=cells)
output_path = "C:/Users/satyam kamal/Documents/EmailMarketingCompaign.ipynb"
with open(output_path, "w", encoding="utf-8") as f:
    f.write(nbf.writes(nb))

output_path


'C:/Users/satyam kamal/Documents/EmailMarketingCompaign.ipynb'