In [2]:
# 📧 A/B Testing Analysis: Email Subject Line Impact on Subscription

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.proportion import proportions_ztest
from scipy import stats

# Upload the raw dataset
from google.colab import files
uploaded = files.upload()

Saving email_ab_test.csv to email_ab_test.csv


In [3]:
# Step 1: Load the dataset
df = pd.read_csv("email_ab_test.csv")
print("Data shape:", df.shape)
df.head()



Data shape: (10000, 9)


Unnamed: 0,user_id,group,device,age_group,source,location,opened,clicked,subscribed
0,1,control,mobile,25-34,organic,US,False,False,False
1,2,treatment,mobile,35-44,organic,UK,False,False,False
2,3,control,desktop,35-44,ad,UK,False,False,False
3,4,control,mobile,18-24,referral,US,False,False,False
4,5,control,desktop,18-24,ad,India,True,False,False


In [4]:
# Step 2: Conversion Funnel Breakdown

# Overall metrics
total_users = df.shape[0]
opened_rate = df['opened'].mean()
clicked_rate = df['clicked'].mean()
subscribed_rate = df['subscribed'].mean()

print(f"Opened Rate: {opened_rate:.2%}")
print(f"Click-Through Rate: {clicked_rate:.2%}")
print(f"Subscription Rate: {subscribed_rate:.2%}")

# Grouped funnel comparison
funnel_metrics = df.groupby('group')[['opened', 'clicked', 'subscribed']].mean()
print("\nFunnel Metrics by Group:")
print(funnel_metrics)



Opened Rate: 33.65%
Click-Through Rate: 7.40%
Subscription Rate: 0.92%

Funnel Metrics by Group:
             opened   clicked  subscribed
group                                    
control    0.301815  0.060243    0.006184
treatment  0.371366  0.087828    0.012232


In [5]:
# Step 3: Z-Test for Group Differences

def run_ztest(metric):
    successes = df.groupby('group')[metric].sum().values
    n_obs = df.groupby('group')[metric].count().values
    zstat, pval = proportions_ztest(successes, n_obs)
    print(f"\nZ-Test for {metric.title()}:")
    print(f"Z-Statistic = {zstat:.4f}, P-Value = {pval:.4f}")

for metric in ['opened', 'clicked', 'subscribed']:
    run_ztest(metric)




Z-Test for Opened:
Z-Statistic = -7.3596, P-Value = 0.0000

Z-Test for Clicked:
Z-Statistic = -5.2689, P-Value = 0.0000

Z-Test for Subscribed:
Z-Statistic = -3.1673, P-Value = 0.0015


In [6]:
# Step 4: Confidence Intervals

def proportion_confint(success, total, alpha=0.05):
    p = success / total
    z = stats.norm.ppf(1 - alpha / 2)
    se = np.sqrt(p * (1 - p) / total)
    return (p - z * se, p + z * se)

print("\n95% Confidence Intervals:")
for metric in ['opened', 'clicked', 'subscribed']:
    for group in ['control', 'treatment']:
        total = df[df['group'] == group].shape[0]
        success = df[df['group'] == group][metric].sum()
        ci_low, ci_high = proportion_confint(success, total)
        print(f"{group.title()} {metric.title()} CI: ({ci_low:.3f}, {ci_high:.3f})")



95% Confidence Intervals:
Control Opened CI: (0.289, 0.315)
Treatment Opened CI: (0.358, 0.385)
Control Clicked CI: (0.054, 0.067)
Treatment Clicked CI: (0.080, 0.096)
Control Subscribed CI: (0.004, 0.008)
Treatment Subscribed CI: (0.009, 0.015)


In [10]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Step 5: Logistic Regression for Subscription Likelihood

# Encode categorical variables
df_encoded = pd.get_dummies(df, columns=['group', 'device', 'age_group', 'source'], drop_first=True)

# Ensure all predictors are valid names for Patsy
df_encoded.columns = df_encoded.columns.str.replace('-', '_')
df_encoded.columns = df_encoded.columns.str.replace('+', '_plus_')

# Convert dependent variable to integer
df_encoded['subscribed'] = df_encoded['subscribed'].astype(int)

# Define logistic regression formula
predictors = [col for col in df_encoded.columns if col not in ['user_id', 'location', 'subscribed']]
formula = 'subscribed ~ ' + ' + '.join(predictors)

# Fit logistic regression model
logit_model = smf.logit(formula=formula, data=df_encoded).fit()
print("\nLogistic Regression Results:\n")
print(logit_model.summary())


         Current function value: 0.027411
         Iterations: 35

Logistic Regression Results:

                           Logit Regression Results                           
Dep. Variable:             subscribed   No. Observations:                10000
Model:                          Logit   Df Residuals:                     9988
Method:                           MLE   Df Model:                           11
Date:                Thu, 03 Apr 2025   Pseudo R-squ.:                  0.4758
Time:                        06:37:20   Log-Likelihood:                -274.11
converged:                      False   LL-Null:                       -522.92
Covariance Type:            nonrobust   LLR p-value:                 1.026e-99
                                 coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------
Intercept                    -26.0380   4797.595     -0.005      0.996   -9429.15



In [11]:
# Step 6: Segmented Funnel Analysis
print("\nSegmented Funnel by Device:")
print(df.groupby(['device', 'group'])[['opened', 'clicked', 'subscribed']].mean())

print("\nSegmented Funnel by Age Group:")
print(df.groupby(['age_group', 'group'])[['opened', 'clicked', 'subscribed']].mean())


Segmented Funnel by Device:
                     opened   clicked  subscribed
device  group                                    
desktop control    0.291956  0.058946    0.007628
        treatment  0.354195  0.084594    0.010316
mobile  control    0.304107  0.061278    0.006193
        treatment  0.374056  0.090640    0.013465
tablet  control    0.316103  0.057654    0.001988
        treatment  0.405738  0.079918    0.010246

Segmented Funnel by Age Group:
                       opened   clicked  subscribed
age_group group                                    
18-24     control    0.282051  0.055385    0.007179
          treatment  0.348291  0.088675    0.009615
25-34     control    0.310321  0.065619    0.006835
          treatment  0.381178  0.100880    0.010156
35-44     control    0.302198  0.062794    0.004710
          treatment  0.384096  0.081020    0.013503
45-54     control    0.303665  0.049738    0.006545
          treatment  0.364512  0.075653    0.016506
55+       control  

In [12]:
# Step 7: Interaction Effects in Regression
interaction_formula = 'subscribed ~ group_treatment * device_mobile + group_treatment * age_group_25_34 + group_treatment * age_group_35_44'
interaction_model = smf.logit(formula=interaction_formula, data=df_encoded).fit()
print("\nLogistic Regression with Interaction Effects:\n")
print(interaction_model.summary())


Optimization terminated successfully.
         Current function value: 0.051657
         Iterations 9

Logistic Regression with Interaction Effects:

                           Logit Regression Results                           
Dep. Variable:             subscribed   No. Observations:                10000
Model:                          Logit   Df Residuals:                     9992
Method:                           MLE   Df Model:                            7
Date:                Thu, 03 Apr 2025   Pseudo R-squ.:                 0.01214
Time:                        06:37:48   Log-Likelihood:                -516.57
converged:                       True   LL-Null:                       -522.92
Covariance Type:            nonrobust   LLR p-value:                   0.07986
                                                      coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------