# A-B-Testing-in-Business-Contexts

## Exercise 1 – Sample Size Planning

In [1]:
# ==========================================
# EXERCISE 1 - Sample Size Planning
# ==========================================

import numpy as np
from scipy import stats
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportion_effectsize

# Baseline and expected improvement
p1 = 0.12          # current conversion
p2 = 0.135         # expected conversion
alpha = 0.05
power = 0.80

# ------------------------------------------
# 1) Required sample size per group
# ------------------------------------------

# Compute effect size for proportions
effect_size = proportion_effectsize(p1, p2)

analysis = NormalIndPower()
sample_size = analysis.solve_power(effect_size=effect_size,
                                   power=power,
                                   alpha=alpha,
                                   ratio=1)

sample_size = int(np.ceil(sample_size))
print("Required sample size per group:", sample_size)

# ------------------------------------------
# 2) Days needed with 2000 visitors/day
# 50/50 split → 1000 per group per day
# ------------------------------------------

visitors_per_group_per_day = 1000
days_needed = sample_size / visitors_per_group_per_day

print("Days required:", np.ceil(days_needed))

# ------------------------------------------
# 3) Minimum Detectable Effect in 14 days
# 14 days * 1000 per group
# ------------------------------------------

n_fixed = 14 * 1000

# Work backwards: find smallest detectable p2
# Iterate possible effects

possible_effects = np.linspace(0.005, 0.05, 100)
mde = None

for delta in possible_effects:
    p_test = p1 + delta
    eff = proportion_effectsize(p1, p_test)
    required_n = analysis.solve_power(effect_size=eff,
                                      power=power,
                                      alpha=alpha,
                                      ratio=1)
    if required_n <= n_fixed:
        mde = delta
        break

print("Minimum Detectable Effect (absolute):", mde)
print("New detectable rate:", p1 + mde)

Required sample size per group: 7756
Days required: 8.0
Minimum Detectable Effect (absolute): 0.011363636363636364
New detectable rate: 0.13136363636363635


## Exercise 2 – Analyze Email A/B Test

In [2]:
# ==========================================
# EXERCISE 2 - Email A/B Test
# ==========================================

import pandas as pd
from math import sqrt

email_test = pd.DataFrame({
    'variant': ['Subject A', 'Subject B'],
    'emails_sent': [15000, 15000],
    'opens': [2850, 3150]
})

email_test['open_rate'] = email_test['opens'] / email_test['emails_sent']
print(email_test)

pA = email_test.loc[0, 'open_rate']
pB = email_test.loc[1, 'open_rate']

# ------------------------------------------
# Absolute & Relative Difference
# ------------------------------------------

abs_diff = pB - pA
rel_diff = abs_diff / pA

print("Absolute difference:", abs_diff)
print("Relative difference:", rel_diff)

# ------------------------------------------
# Two-proportion z-test
# ------------------------------------------

nA = nB = 15000
xA = 2850
xB = 3150

p_pool = (xA + xB) / (nA + nB)
se = sqrt(p_pool * (1 - p_pool) * (1/nA + 1/nB))

z_stat = (pB - pA) / se
p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))

print("Z-statistic:", z_stat)
print("P-value:", p_value)

# ------------------------------------------
# 95% CI for difference
# ------------------------------------------

z_crit = stats.norm.ppf(0.975)
se_diff = sqrt(pA*(1-pA)/nA + pB*(1-pB)/nB)

ci = (abs_diff - z_crit*se_diff,
      abs_diff + z_crit*se_diff)

print("95% CI for difference:", ci)

# Recommendation
if p_value < 0.05 and abs_diff > 0:
    print("Recommendation: Switch to Subject B.")
else:
    print("Recommendation: No strong evidence to switch.")

     variant  emails_sent  opens  open_rate
0  Subject A        15000   2850       0.19
1  Subject B        15000   3150       0.21
Absolute difference: 0.01999999999999999
Relative difference: 0.1052631578947368
Z-statistic: 4.330127018922191
P-value: 1.490233579248823e-05
95% CI for difference: (np.float64(0.010950143533889671), np.float64(0.02904985646611031))
Recommendation: Switch to Subject B.


## Exercise 3 – Full Landing Page A/B Test

In [4]:
# ==========================================
# EXERCISE 3 - Full A/B Analysis (CORRECTED)
# ==========================================

import pandas as pd
import numpy as np
from scipy import stats   # <-- manquait dans ton code

# ------------------------------------------
# 1️⃣ Data Generation (IMPORTANT)
# ------------------------------------------

np.random.seed(123)

landing_test = pd.DataFrame({
    'user_id': range(1, 20001),
    'variant': np.random.choice(['original', 'new_design'], 20000),
    'device': np.random.choice(['mobile', 'desktop'], 20000, p=[0.6, 0.4]),
})

# Conversion logic
def get_conversion(row):
    if row['variant'] == 'original':
        rate = 0.05 if row['device'] == 'mobile' else 0.08
    else:
        rate = 0.055 if row['device'] == 'mobile' else 0.095
    return np.random.binomial(1, rate)

landing_test['converted'] = landing_test.apply(get_conversion, axis=1)

# ------------------------------------------
# 2️⃣ Overall Conversion Analysis
# ------------------------------------------

overall = landing_test.groupby('variant')['converted'].agg(['mean','sum','count'])
print("Overall Conversion Rates")
print(overall)

orig = overall.loc['original']
new = overall.loc['new_design']

p1 = orig['mean']
p2 = new['mean']
n1 = orig['count']
n2 = new['count']
x1 = orig['sum']
x2 = new['sum']

# Pooled proportion
p_pool = (x1 + x2) / (n1 + n2)

# Standard error
se = np.sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n2))

# Z-statistic
z_stat = (p2 - p1) / se

# One-tailed test (H1: new_design > original)
p_value = 1 - stats.norm.cdf(z_stat)

print("\nOverall z-stat:", z_stat)
print("Overall p-value:", p_value)

# ------------------------------------------
# 3️⃣ Breakdown by Device
# ------------------------------------------

device_breakdown = landing_test.groupby(['variant','device'])['converted'].mean()
print("\nConversion by Device")
print(device_breakdown)

# ------------------------------------------
# 4️⃣ Separate Significance Tests by Device
# ------------------------------------------

for device in ['mobile','desktop']:
    
    subset = landing_test[landing_test['device'] == device]
    group = subset.groupby('variant')['converted'].agg(['mean','sum','count'])
    
    p1 = group.loc['original','mean']
    p2 = group.loc['new_design','mean']
    n1 = group.loc['original','count']
    n2 = group.loc['new_design','count']
    x1 = group.loc['original','sum']
    x2 = group.loc['new_design','sum']
    
    p_pool = (x1 + x2) / (n1 + n2)
    se = np.sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n2))
    
    z_stat = (p2 - p1) / se
    p_val = 1 - stats.norm.cdf(z_stat)  # one-tailed
    
    print(f"\nDevice: {device}")
    print("Conversion Original:", p1)
    print("Conversion New:", p2)
    print("z-stat:", z_stat)
    print("p-value:", p_val)

Overall Conversion Rates
                mean  sum  count
variant                         
new_design  0.073220  727   9929
original    0.063152  636  10071

Overall z-stat: 2.825020306692341
Overall p-value: 0.002363880982977573

Conversion by Device
variant     device 
new_design  desktop    0.096936
            mobile     0.057340
original    desktop    0.080529
            mobile     0.051650
Name: converted, dtype: float64

Device: mobile
Conversion Original: 0.05165016501650165
Conversion New: 0.057339835211030775
z-stat: 1.373553046576855
p-value: 0.08479024107394617

Device: desktop
Conversion Original: 0.08052854649713288
Conversion New: 0.09693621295831241
z-stat: 2.5797082833831837
p-value: 0.004944190366478862


## Exercise 4 – Continuous Metric A/B Test

In [6]:
# ==========================================
# EXERCISE 4 - Continuous Metric A/B Test (CORRECTED)
# ==========================================

import numpy as np
from scipy import stats

# ------------------------------------------
# 1️⃣ Data Generation
# ------------------------------------------

np.random.seed(456)

# Control group
control_aov = np.random.normal(loc=85, scale=35, size=1000)
control_aov = np.clip(control_aov, 10, 300)

# Treatment group
treatment_aov = np.random.normal(loc=92, scale=38, size=1000)
treatment_aov = np.clip(treatment_aov, 10, 300)

print("Control Mean:", control_aov.mean())
print("Treatment Mean:", treatment_aov.mean())

# ------------------------------------------
# 2️⃣ Two-Sample T-Test (Welch version)
# ------------------------------------------
# Welch test is safer (does not assume equal variances)

t_stat, p_value = stats.ttest_ind(treatment_aov,
                                  control_aov,
                                  equal_var=False)

print("\nT-stat:", t_stat)
print("P-value:", p_value)

# ------------------------------------------
# 3️⃣ 95% Confidence Interval (Welch)
# ------------------------------------------

mean_diff = treatment_aov.mean() - control_aov.mean()

s1 = np.var(control_aov, ddof=1)
s2 = np.var(treatment_aov, ddof=1)

n1 = len(control_aov)
n2 = len(treatment_aov)

# Standard error (Welch)
se = np.sqrt(s1/n1 + s2/n2)

# Welch-Satterthwaite degrees of freedom
df = (s1/n1 + s2/n2)**2 / (
    (s1/n1)**2/(n1-1) +
    (s2/n2)**2/(n2-1)
)

t_crit = stats.t.ppf(0.975, df)

ci = (
    mean_diff - t_crit * se,
    mean_diff + t_crit * se
)

print("\nMean Difference:", mean_diff)
print("95% CI:", ci)

# ------------------------------------------
# 4️⃣ Cohen's d Effect Size
# ------------------------------------------

# Pooled standard deviation (for effect size only)
pooled_std = np.sqrt(
    ((n1-1)*s1 + (n2-1)*s2) / (n1+n2-2)
)

cohens_d = mean_diff / pooled_std

print("\nCohen's d:", cohens_d)

# Effect interpretation
if abs(cohens_d) < 0.2:
    interpretation = "Small"
elif abs(cohens_d) < 0.5:
    interpretation = "Medium"
else:
    interpretation = "Large"

print("Effect Size Interpretation:", interpretation)

# ------------------------------------------
# 5️⃣ Revenue Impact Estimation
# ------------------------------------------

monthly_orders = 5000

monthly_impact = mean_diff * monthly_orders
lower_bound = ci[0] * monthly_orders
upper_bound = ci[1] * monthly_orders

print("\nEstimated Monthly Revenue Lift:", monthly_impact)
print("Revenue Impact 95% CI:", (lower_bound, upper_bound))

Control Mean: 85.14062505496433
Treatment Mean: 93.24786578297595

T-stat: 5.1007891807442505
P-value: 3.703699934542496e-07

Mean Difference: 8.107240728011618
95% CI: (np.float64(4.990159461676418), np.float64(11.224321994346818))

Cohen's d: 0.22811422694079211
Effect Size Interpretation: Medium

Estimated Monthly Revenue Lift: 40536.20364005809
Revenue Impact 95% CI: (np.float64(24950.79730838209), np.float64(56121.60997173409))


## Exercise 5 – Multiple Variants (A/B/C Test)

In [7]:
# ==========================================
# EXERCISE 5 - Multiple Variants
# ==========================================

from scipy.stats import chi2_contingency

button_test = pd.DataFrame({
    'variant': ['Blue', 'Green', 'Orange'],
    'visitors': [10000, 10000, 10000],
    'clicks': [320, 380, 345]
})

button_test['non_clicks'] = button_test['visitors'] - button_test['clicks']

print(button_test)

# ------------------------------------------
# Chi-squared test
# ------------------------------------------

contingency = button_test[['clicks','non_clicks']].values
chi2, p_value, _, _ = chi2_contingency(contingency)

print("Chi-square p-value:", p_value)

# ------------------------------------------
# Green vs Blue (Bonferroni correction)
# ------------------------------------------

alpha_corrected = 0.05 / 2  # comparing 2 pairs

blue = button_test.iloc[0]
green = button_test.iloc[1]

p1 = blue['clicks']/blue['visitors']
p2 = green['clicks']/green['visitors']

p_pool = (blue['clicks']+green['clicks'])/(blue['visitors']+green['visitors'])
se = np.sqrt(p_pool*(1-p_pool)*(1/10000+1/10000))

z_stat = (p2-p1)/se
p_val = 2*(1-stats.norm.cdf(abs(z_stat)))

print("Green vs Blue p-value:", p_val)
print("Bonferroni alpha:", alpha_corrected)

if p_val < alpha_corrected:
    print("Green significantly better than Blue.")
else:
    print("No significant difference after correction.")

# Recommendation
print("\nRecommendation:")
print("If statistically significant and highest CTR → choose Green.")

  variant  visitors  clicks  non_clicks
0    Blue     10000     320        9680
1   Green     10000     380        9620
2  Orange     10000     345        9655
Chi-square p-value: 0.06708686177804175
Green vs Blue p-value: 0.020968775324901268
Bonferroni alpha: 0.025
Green significantly better than Blue.

Recommendation:
If statistically significant and highest CTR → choose Green.
