In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_ind, norm
# Silence library warnings so they do not clutter the tutorial output.
# NOTE: this applies to the whole session, so deprecation and runtime
# warnings raised by later cells are hidden as well.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

print("Libraries imported successfully!")
print("NumPy version:", np.__version__)
print("Pandas version:", pd.__version__)
# The version string lives on the top-level package: the scipy.stats
# submodule has no __version__ attribute, so stats.__version__ raises
# AttributeError and aborts this cell.
print("SciPy version:", scipy.__version__)

In [None]:
# Simulate an A/B test comparing two website designs on conversion.
# Every visitor is one Bernoulli trial: 1 = converted, 0 = did not convert.

# Experiment parameters
n_a = 1000                # visitors shown Design A (control, current design)
conversion_rate_a = 0.12  # conversion probability used to simulate Design A (12%)
n_b = 1000                # visitors shown Design B (treatment, new design)
conversion_rate_b = 0.15  # conversion probability used to simulate Design B (15%)

# Draw the outcomes. A is sampled before B so the global random stream is
# always consumed in the same order.
group_a = np.random.binomial(n=1, p=conversion_rate_a, size=n_a)
group_b = np.random.binomial(n=1, p=conversion_rate_b, size=n_b)

# One frame per arm; user ids run 1..n_a for A and continue from n_a + 1 for B
df_a = pd.DataFrame({'group': 'A', 'converted': group_a, 'user_id': range(1, n_a + 1)})
df_b = pd.DataFrame({'group': 'B', 'converted': group_b, 'user_id': range(n_a + 1, n_a + n_b + 1)})

# Stack both arms into a single frame with a fresh 0..N-1 index
ab_data = pd.concat([df_a, df_b], ignore_index=True)

print("Sample A/B testing data generated:")
for arm_label, arm_frame in (("Group A", df_a), ("Group B", df_b)):
    print(f"{arm_label} size: {len(arm_frame)}")
print(f"Total sample size: {len(ab_data)}")
print("\nFirst 10 rows:")
print(ab_data.head(10))

In [None]:
# Per-group sample size, number of conversions, conversion rate and spread,
# rounded to four decimals for display
summary_stats = (
    ab_data
    .groupby('group')['converted']
    .agg(['count', 'sum', 'mean', 'std'])
    .round(4)
    .rename(columns={
        'count': 'Sample_Size',
        'sum': 'Conversions',
        'mean': 'Conversion_Rate',
        'std': 'Std_Dev',
    })
)
print("A/B Testing Summary Statistics:")
print(summary_stats)

# Observed uplift of B over A, read back from the (rounded) summary table
rate_by_group = summary_stats['Conversion_Rate']
rate_a = rate_by_group['A']
rate_b = rate_by_group['B']
difference = rate_b - rate_a

print(f"\nConversion Rate Difference (B - A): {difference:.4f}")
print(f"Percentage Point Improvement: {difference * 100:.2f}%")

In [None]:
# Two side-by-side panels: conversion rate per group and raw outcome counts
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
rate_ax, count_ax = axes

# Left panel: one bar per group, annotated with the rate just above the bar
conversion_rates = ab_data.groupby('group')['converted'].mean()
rate_ax.bar(
    conversion_rates.index,
    conversion_rates.values,
    color=['skyblue', 'lightcoral'],
    alpha=0.7,
)
rate_ax.set(title='Conversion Rates by Group', ylabel='Conversion Rate', xlabel='Group')
for bar_position, group_rate in enumerate(conversion_rates.values):
    rate_ax.text(bar_position, group_rate + 0.005, f'{group_rate:.3f}',
                 ha='center', va='bottom')

# Right panel: grouped bars of converted vs. not-converted counts per group
conversion_counts = (
    ab_data
    .groupby(['group', 'converted'])
    .size()
    .unstack()
)
conversion_counts.plot(kind='bar', ax=count_ax, color=['lightgray', 'orange'])
count_ax.set(title='Conversion Counts by Group', ylabel='Count', xlabel='Group')
count_ax.legend(['Not Converted', 'Converted'])
count_ax.tick_params(axis='x', rotation=0)  # keep the group labels horizontal

plt.tight_layout()
plt.show()

In [None]:
# State the hypotheses, significance level and test used in this analysis.
# The lines are collected first and emitted with a single print call.
hypothesis_framework = [
    "A/B Testing Hypothesis Framework",
    "=" * 40,
    "Null Hypothesis (H0): μB - μA = 0",
    "  - There is no difference in conversion rates between designs A and B",
    "  - Design B does not perform better than Design A",
    "",
    "Alternative Hypothesis (H1): μB - μA > 0",
    "  - Design B has a higher conversion rate than Design A",
    "  - Design B significantly outperforms Design A",
    "",
    "Significance Level (α): 0.05",
    "Test Type: One-tailed (right-tailed) test",
    "Statistical Test: Two-sample t-test",
]
print("\n".join(hypothesis_framework))

In [None]:
# Pull the 0/1 outcome column of each arm out as a NumPy array
conversions_a = df_a['converted'].values
conversions_b = df_b['converted'].values

# Report shape, mean and sample standard deviation (ddof=1) for both arms
samples_by_group = (("A", conversions_a), ("B", conversions_b))

print("Data prepared for t-test:")
for group_name, outcomes in samples_by_group:
    print(f"Group {group_name} conversions shape: {outcomes.shape}")
for group_name, outcomes in samples_by_group:
    print(f"Group {group_name} mean: {np.mean(outcomes):.4f}")
for group_name, outcomes in samples_by_group:
    print(f"Group {group_name} std: {np.std(outcomes, ddof=1):.4f}")

In [None]:
# Perform two-sample t-test
# Welch's t-test is used, so equal variances are not assumed.
# Passing B first means a positive t-statistic corresponds to B > A.
t_statistic, p_value_two_tailed = stats.ttest_ind(
    conversions_b, conversions_a,
    equal_var=False  # Welch's t-test
)

# One-tailed p-value for H1: B > A.
# Halving the two-tailed p-value is only valid when the statistic points in
# the hypothesised direction (t > 0). When t <= 0 the upper-tail probability
# is 1 - p/2; halving there would make a result in the wrong direction look
# significant.
if t_statistic > 0:
    p_value_one_tailed = p_value_two_tailed / 2
else:
    p_value_one_tailed = 1 - p_value_two_tailed / 2

print("Two-Sample T-Test Results:")
print("=" * 30)
print(f"T-statistic: {t_statistic:.4f}")
print(f"P-value (two-tailed): {p_value_two_tailed:.6f}")
print(f"P-value (one-tailed): {p_value_one_tailed:.6f}")
print()

# Degrees of freedom calculation for Welch's t-test
n1, n2 = len(conversions_a), len(conversions_b)
s1, s2 = np.std(conversions_a, ddof=1), np.std(conversions_b, ddof=1)

# Welch-Satterthwaite degrees of freedom:
#   (v1 + v2)^2 / (v1^2 / (n1 - 1) + v2^2 / (n2 - 1)),  with v_i = s_i^2 / n_i
# NOTE: `df` holds the degrees of freedom here (not a DataFrame); later cells
# read it under this name.
v1, v2 = s1**2 / n1, s2**2 / n2
df = ((v1 + v2)**2) / (v1**2 / (n1 - 1) + v2**2 / (n2 - 1))
print(f"Degrees of freedom: {df:.2f}")

In [None]:
# Re-derive Welch's t-statistic by hand to show what scipy computes

# Sample means and sample standard deviations (ddof=1) of both arms
mean_a, mean_b = np.mean(conversions_a), np.mean(conversions_b)
std_a, std_b = np.std(conversions_a, ddof=1), np.std(conversions_b, ddof=1)

# Standard error of the difference: root of the summed per-arm variances of the mean
variance_of_diff = (std_a**2 / n_a) + (std_b**2 / n_b)
se_diff = np.sqrt(variance_of_diff)

# t = observed difference / its standard error
observed_diff = mean_b - mean_a
t_manual = observed_diff / se_diff

print("Manual T-Test Calculation:")
print("=" * 30)
manual_report = (
    ("Mean A", mean_a),
    ("Mean B", mean_b),
    ("Difference (B - A)", observed_diff),
    ("Standard Error of Difference", se_diff),
    ("T-statistic (manual)", t_manual),
    ("T-statistic (scipy)", t_statistic),
)
for report_label, report_value in manual_report:
    print(f"{report_label}: {report_value:.4f}")
# The hand calculation should agree with scipy to four decimals
print(f"Match: {abs(t_manual - t_statistic) < 0.0001}")

In [None]:
# Calculate critical t-value for α = 0.05 (right-tailed test, so the whole
# α sits in the upper tail)
alpha = 0.05
critical_t = stats.t.ppf(1 - alpha, df)

print("P-Value Analysis:")
print("=" * 20)
print(f"Significance level (α): {alpha}")
print(f"Critical t-value: {critical_t:.4f}")
print(f"Calculated t-statistic: {t_statistic:.4f}")
print(f"One-tailed p-value: {p_value_one_tailed:.6f}")
print()

# Decision making: reject H0 only when the one-tailed p-value is below α
if p_value_one_tailed < alpha:
    decision = "REJECT the null hypothesis"
    conclusion = "Design B significantly outperforms Design A"
else:
    decision = "FAIL TO REJECT the null hypothesis"
    conclusion = "No significant difference between designs"

print(f"Decision: {decision}")
print(f"Conclusion: {conclusion}")
print()

# Effect size calculation (Cohen's d): mean difference divided by the
# pooled sample standard deviation
pooled_std = np.sqrt(((n_a - 1) * std_a**2 + (n_b - 1) * std_b**2) / (n_a + n_b - 2))
cohens_d = (mean_b - mean_a) / pooled_std

print(f"Effect Size (Cohen's d): {cohens_d:.4f}")
# Cohen's conventional benchmarks: 0.2 = small, 0.5 = medium, 0.8 = large.
# Values below 0.2 are therefore smaller than "small"; labelling them
# "Small" (and 0.2-0.5 "Medium") overstates the effect by one category.
if abs(cohens_d) < 0.2:
    effect_interpretation = "Negligible effect"
elif abs(cohens_d) < 0.5:
    effect_interpretation = "Small effect"
elif abs(cohens_d) < 0.8:
    effect_interpretation = "Medium effect"
else:
    effect_interpretation = "Large effect"
print(f"Effect Size Interpretation: {effect_interpretation}")

In [None]:
# Create t-distribution visualization: density under H0, the right-tail
# rejection region, and where the observed statistic falls.

# Default view is [-4, 4]; widen it when the observed statistic lies outside
# so the curve and the shaded region still span the marked t-statistic.
x_limit = max(4.0, abs(t_statistic) + 0.5)
x = np.linspace(-x_limit, x_limit, 1000)
y = stats.t.pdf(x, df)

plt.figure(figsize=(10, 6))
plt.plot(x, y, 'b-', linewidth=2, label=f't-distribution (df={df:.1f})')

# Shade the critical region (everything to the right of the critical value)
x_critical = np.linspace(critical_t, x_limit, 100)
y_critical = stats.t.pdf(x_critical, df)
plt.fill_between(x_critical, y_critical, alpha=0.3, color='red',
                label=f'Critical region (α={alpha})')

# Mark the calculated t-statistic
plt.axvline(t_statistic, color='green', linestyle='--', linewidth=2,
           label=f'Calculated t-statistic = {t_statistic:.3f}')

# Mark the critical value
plt.axvline(critical_t, color='red', linestyle='--', linewidth=2,
           label=f'Critical value = {critical_t:.3f}')

plt.xlabel('t-value')
plt.ylabel('Probability Density')
plt.title('T-Distribution with Critical Region and Test Statistic')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Print interpretation. The comparison symbol reflects the actual relation
# between the statistic and the critical value, so the sentence stays true
# when the test fails to reject.
comparison = ">" if t_statistic > critical_t else "≤"
print(f"Since t-statistic ({t_statistic:.3f}) {comparison} critical value ({critical_t:.3f}),")
print(f"we {decision.lower()}")

In [None]:
# Two-sided 95% confidence interval for the difference in conversion rates
confidence_level = 0.95
alpha_ci = 1 - confidence_level
# Two-sided interval: half of alpha_ci goes in each tail
t_critical_ci = stats.t.ppf(1 - alpha_ci/2, df)

# Point estimate and half-width of the interval
diff_means = mean_b - mean_a
margin_error = t_critical_ci * se_diff
ci_lower, ci_upper = diff_means - margin_error, diff_means + margin_error

print("95% Confidence Interval for Difference in Conversion Rates:")
print("=" * 60)
interval_report = (
    ("Difference in means (B - A)", diff_means),
    ("Standard error", se_diff),
    ("t-critical value", t_critical_ci),
    ("Margin of error", margin_error),
)
for report_label, report_value in interval_report:
    print(f"{report_label}: {report_value:.4f}")
print(f"95% CI: [{ci_lower:.4f}, {ci_upper:.4f}]")
print()

# An interval entirely on one side of zero indicates a direction;
# an interval containing zero does not.
ci_interpretation = (
    "The entire CI is above 0, confirming B > A" if ci_lower > 0
    else "The entire CI is below 0, confirming A > B" if ci_upper < 0
    else "The CI includes 0, no significant difference"
)

print(f"Interpretation: {ci_interpretation}")

# Same interval expressed in percentage points
print(f"95% CI in percentage points: [{ci_lower*100:.2f}%, {ci_upper*100:.2f}%]")

In [None]:
# Confidence intervals for individual groups
def calculate_proportion_ci(successes, n, confidence_level=0.95):
    """Calculate a confidence interval for a proportion (normal approximation).

    Uses the Wald interval ``p ± z * sqrt(p * (1 - p) / n)``. The bounds are
    clipped to [0, 1] because a proportion cannot lie outside that range;
    without clipping the approximation can return a negative lower bound or
    an upper bound above 1 for small samples or extreme proportions.

    Parameters
    ----------
    successes : int
        Number of successes (e.g. conversions); must satisfy 0 <= successes <= n.
    n : int
        Number of trials (e.g. visitors); must be positive.
    confidence_level : float, default 0.95
        Two-sided confidence level, strictly between 0 and 1.

    Returns
    -------
    tuple
        ``(p, ci_lower, ci_upper, se)``: the observed proportion, the lower
        and upper interval bounds, and the standard error of ``p``.

    Raises
    ------
    ValueError
        If ``n`` is not positive, ``successes`` is outside ``[0, n]``, or
        ``confidence_level`` is not strictly between 0 and 1.
    """
    if n <= 0:
        raise ValueError("n must be a positive number of trials")
    if not 0 <= successes <= n:
        raise ValueError("successes must be between 0 and n")
    if not 0 < confidence_level < 1:
        raise ValueError("confidence_level must be strictly between 0 and 1")

    p = successes / n
    alpha = 1 - confidence_level
    # Two-sided interval: alpha/2 in each tail
    z_critical = stats.norm.ppf(1 - alpha/2)

    se = np.sqrt(p * (1 - p) / n)
    margin_error = z_critical * se

    # Keep the bounds inside the valid range of a proportion
    ci_lower = max(0.0, p - margin_error)
    ci_upper = min(1.0, p + margin_error)

    return p, ci_lower, ci_upper, se

# Conversions observed in each arm
successes_a = np.sum(conversions_a)
successes_b = np.sum(conversions_b)

# Observed rate, interval bounds and standard error for each arm
p_a, ci_a_lower, ci_a_upper, se_a = calculate_proportion_ci(successes_a, n_a)
p_b, ci_b_lower, ci_b_upper, se_b = calculate_proportion_ci(successes_b, n_b)

print("Individual Group Confidence Intervals:")
print("=" * 40)
group_intervals = (
    ("Group A:", p_a, ci_a_lower, ci_a_upper),
    ("Group B:", p_b, ci_b_lower, ci_b_upper),
)
for position, (heading, group_rate, lower_bound, upper_bound) in enumerate(group_intervals):
    if position > 0:
        print()  # blank line between the groups, none after the last one
    print(heading)
    print(f"  Conversion rate: {group_rate:.4f}")
    print(f"  95% CI: [{lower_bound:.4f}, {upper_bound:.4f}]")
    print(f"  95% CI (percentage): [{lower_bound*100:.2f}%, {upper_bound*100:.2f}%]")

In [None]:
# Plot each group's conversion rate with its 95% confidence interval
fig, ax = plt.subplots(figsize=(10, 6))

groups = ['Group A', 'Group B']
means = [p_a, p_b]
ci_lowers = [ci_a_lower, ci_b_lower]
ci_uppers = [ci_a_upper, ci_b_upper]
colors = ['skyblue', 'lightcoral']

# One marker per group; error bars are given as distances below/above the mean
for x_position in range(len(groups)):
    centre = means[x_position]
    distance_below = centre - ci_lowers[x_position]
    distance_above = ci_uppers[x_position] - centre
    ax.errorbar(
        x_position, centre,
        yerr=[[distance_below], [distance_above]],
        fmt='o', capsize=10, capthick=2, markersize=8,
        color=colors[x_position], ecolor='black',
        label=f'{groups[x_position]} (95% CI)',
    )

    # Value label just above the marker
    ax.text(x_position, centre + 0.01, f'{centre:.3f}',
            ha='center', va='bottom', fontweight='bold')

ax.set_xlim(-0.5, 1.5)
ax.set_xticks([0, 1])
ax.set_xticklabels(groups)
ax.set(ylabel='Conversion Rate', title='Conversion Rates with 95% Confidence Intervals')
ax.grid(True, alpha=0.3)
ax.legend()

plt.tight_layout()
plt.show()

# The intervals overlap unless one lies entirely below the other
intervals_disjoint = ci_a_upper < ci_b_lower or ci_b_upper < ci_a_lower
overlap = not intervals_disjoint
print(f"Confidence intervals overlap: {overlap}")
if overlap:
    print("Overlapping CIs suggest possible non-significant difference")
else:
    print("Non-overlapping CIs suggest significant difference")

In [None]:
# Translate the observed rates into business terms
print("Business Impact Analysis:")
print("=" * 30)

# Visitors and conversions per arm
total_visitors_a, total_visitors_b = n_a, n_b
conversions_a_count, conversions_b_count = successes_a, successes_b

arm_reports = (
    ("Group A (Control):", total_visitors_a, conversions_a_count, p_a),
    ("Group B (Treatment):", total_visitors_b, conversions_b_count, p_b),
)
for arm_heading, arm_visitors, arm_conversions, arm_rate in arm_reports:
    print(arm_heading)
    print(f"  Visitors: {arm_visitors:,}")
    print(f"  Conversions: {arm_conversions}")
    print(f"  Conversion Rate: {arm_rate:.2%}")
    print()

# Lift: relative to A's rate (percent) and absolute (percentage points)
rate_gain = p_b - p_a
lift = rate_gain / p_a * 100
print(f"Relative Lift: {lift:.1f}%")
print(f"Absolute Lift: {rate_gain*100:.2f} percentage points")

# Extrapolate the absolute gain to a month of traffic
monthly_visitors = 100000
additional_conversions = monthly_visitors * rate_gain
print(f"\nProjected Monthly Impact (100K visitors):")
print(f"Additional conversions: {additional_conversions:.0f}")

# Revenue impact (assuming $50 per conversion)
revenue_per_conversion = 50
additional_revenue = additional_conversions * revenue_per_conversion
print(f"Additional monthly revenue: ${additional_revenue:,.0f}")

In [None]:
# Statistical power calculation
from scipy.stats import norm

def calculate_power(n1, n2, p1, p2, alpha=0.05):
    """Approximate the power of a two-proportion z-test.

    The critical value is one-sided (``norm.ppf(1 - alpha)``) and the effect
    enters as ``abs(p2 - p1)``, so the result does not depend on which group
    has the higher rate.

    Parameters
    ----------
    n1, n2 : sample sizes of the two groups
    p1, p2 : conversion proportions of the two groups
    alpha : significance level (default 0.05)

    Returns
    -------
    Probability of rejecting H0 given the supplied proportions.
    """
    total_n = n1 + n2
    p_pooled = (n1 * p1 + n2 * p2) / total_n

    # Standard error under H0 (pooled) and under H1 (unpooled)
    se_null = np.sqrt(p_pooled * (1 - p_pooled) * (1/n1 + 1/n2))
    se_alt = np.sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2)

    # Smallest observed difference that would be declared significant
    rejection_threshold = norm.ppf(1 - alpha) * se_null

    # Distance of the true effect from that threshold, in units of se_alt
    return norm.cdf((abs(p2 - p1) - rejection_threshold) / se_alt)

# Power of the test at the observed sample sizes and conversion rates
power = calculate_power(n_a, n_b, p_a, p_b)
print("Statistical Power Analysis:")
print("=" * 30)
print(f"Statistical Power: {power:.3f} ({power*100:.1f}%)")

# Pick the first band whose threshold the power reaches; otherwise it is low
power_interpretation = "Low power (<70%)"
power_bands = (
    (0.8, "Good power (≥80%)"),
    (0.7, "Moderate power (70-80%)"),
)
for band_threshold, band_label in power_bands:
    if power >= band_threshold:
        power_interpretation = band_label
        break

print(f"Power Interpretation: {power_interpretation}")

In [None]:
# Sample size calculation for future tests
def calculate_sample_size(p1, p2, alpha=0.05, power=0.8):
    """Per-group sample size needed to detect p1 vs p2 in a two-proportion test.

    The critical value is two-sided (``norm.ppf(1 - alpha/2)``).

    Parameters
    ----------
    p1, p2 : the two proportions to tell apart
    alpha : significance level (default 0.05)
    power : desired power (default 0.8)

    Returns
    -------
    int
        Required number of observations per group, rounded up.
    """
    z_alpha = norm.ppf(1 - alpha/2)
    z_beta = norm.ppf(power)

    p_avg = (p1 + p2) / 2

    # Spread under H0 (both groups at the average rate) and under H1
    null_term = z_alpha * np.sqrt(2 * p_avg * (1 - p_avg))
    alt_term = z_beta * np.sqrt(p1 * (1 - p1) + p2 * (1 - p2))

    n = (null_term + alt_term)**2 / (p1 - p2)**2

    return int(np.ceil(n))

print("Sample Size Recommendations for Future Tests:")
print("=" * 50)

# Absolute improvements over the baseline rate to plan for
effect_sizes = [0.01, 0.02, 0.03, 0.05]  # 1%, 2%, 3%, 5% improvement
baseline_rate = 0.12

for effect in effect_sizes:
    new_rate = baseline_rate + effect
    required_n = calculate_sample_size(baseline_rate, new_rate)

    # Three report lines followed by a blank separator line
    print("\n".join([
        f"To detect {effect*100:.0f}% improvement ({baseline_rate:.1%} → {new_rate:.1%}):",
        f"  Required sample size per group: {required_n:,}",
        f"  Total sample size needed: {required_n*2:,}",
        "",
    ]))


In [None]:
# Consolidated report of setup, observed results, test statistics and decision
print("A/B TEST RESULTS SUMMARY")
print("=" * 50)
print()

# Sections 1-4 share one layout: heading, bulleted lines, blank line
summary_sections = [
    ("1. EXPERIMENTAL SETUP:", [
        f"Control Group (A): {n_a:,} users",
        f"Treatment Group (B): {n_b:,} users",
        f"Significance Level: {alpha:.2%}",
        "Test Type: One-tailed (B > A)",
    ]),
    ("2. OBSERVED RESULTS:", [
        f"Group A Conversion Rate: {p_a:.2%}",
        f"Group B Conversion Rate: {p_b:.2%}",
        f"Absolute Difference: {(p_b - p_a)*100:.2f} percentage points",
        f"Relative Lift: {lift:.1f}%",
    ]),
    ("3. STATISTICAL ANALYSIS:", [
        f"T-statistic: {t_statistic:.4f}",
        f"P-value (one-tailed): {p_value_one_tailed:.6f}",
        f"Critical t-value: {critical_t:.4f}",
        f"Statistical Power: {power:.1%}",
        f"Effect Size (Cohen's d): {cohens_d:.4f}",
    ]),
    ("4. CONFIDENCE INTERVALS:", [
        f"Difference 95% CI: [{ci_lower:.4f}, {ci_upper:.4f}]",
        f"Group A 95% CI: [{ci_a_lower:.4f}, {ci_a_upper:.4f}]",
        f"Group B 95% CI: [{ci_b_lower:.4f}, {ci_b_upper:.4f}]",
    ]),
]
for section_heading, section_items in summary_sections:
    print(section_heading)
    for section_item in section_items:
        print(f"   • {section_item}")
    print()

# Section 5: the decision depends on the one-tailed p-value versus α
print("5. STATISTICAL DECISION:")
if p_value_one_tailed < alpha:
    significance_status = "STATISTICALLY SIGNIFICANT"
    decision_items = [
        f"REJECT H0 (p = {p_value_one_tailed:.6f} < α = {alpha})",
        "Design B significantly outperforms Design A",
    ]
else:
    significance_status = "NOT STATISTICALLY SIGNIFICANT"
    decision_items = [
        f"FAIL TO REJECT H0 (p = {p_value_one_tailed:.6f} ≥ α = {alpha})",
        "No significant difference detected",
    ]
decision_items.append(f"Result: {significance_status}")
for decision_item in decision_items:
    print(f"   • {decision_item}")

In [None]:
print("\nBUSINESS RECOMMENDATIONS")
print("=" * 30)

# The recommendation follows the one-tailed test decision
design_b_wins = p_value_one_tailed < alpha
if design_b_wins:
    recommendation_lines = [
        "✅ RECOMMENDATION: IMPLEMENT DESIGN B",
        "",
        "JUSTIFICATION:",
        f"• Statistically significant improvement (p < {alpha})",
        f"• {lift:.1f}% relative increase in conversion rate",
        f"• Projected additional revenue: ${additional_revenue:,.0f}/month",
        f"• 95% confident the true improvement is between {ci_lower*100:.2f}% and {ci_upper*100:.2f}%",
        "",
        "IMPLEMENTATION PLAN:",
        "1. Gradually roll out Design B to all users",
        "2. Monitor key metrics for any unexpected changes",
        "3. Continue A/B testing for further optimizations",
        "4. Document learnings for future experiments",
    ]
else:
    recommendation_lines = [
        "⚠️  RECOMMENDATION: DO NOT IMPLEMENT DESIGN B",
        "",
        "JUSTIFICATION:",
        "• No statistically significant improvement detected",
        "• Risk of implementing a change that may not actually improve performance",
        "• Current evidence insufficient to justify the change",
        "",
        "NEXT STEPS:",
        "1. Investigate why the expected improvement wasn't observed",
        "2. Consider testing with larger sample sizes",
        "3. Explore alternative design variations",
        "4. Analyze user segments for differential effects",
    ]

# Considerations that apply whichever way the decision went
recommendation_lines += [
    "",
    "GENERAL CONSIDERATIONS:",
    "• Ensure technical implementation is error-free",
    "• Consider long-term effects beyond immediate conversion",
    "• Monitor for any negative impacts on other metrics",
    "• Plan for ongoing experimentation and optimization",
]
print("\n".join(recommendation_lines))

In [None]:
# Create comprehensive results visualization: a 2x2 dashboard with
# conversion rates, the significance test, per-group confidence intervals
# and the effect size.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# 1. Conversion rates comparison
groups = ['Group A\n(Control)', 'Group B\n(Treatment)']
rates = [p_a, p_b]
colors = ['lightblue', 'lightcoral']

bars = ax1.bar(groups, rates, color=colors, alpha=0.7, edgecolor='black')
ax1.set_ylabel('Conversion Rate')
ax1.set_title('Conversion Rate Comparison')
ax1.set_ylim(0, max(rates) * 1.2)

# Add value labels on bars
for bar, rate in zip(bars, rates):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + 0.002,
             f'{rate:.2%}', ha='center', va='bottom', fontweight='bold')

# 2. Statistical significance visualization
# The test is right-tailed (H1: B > A), so the signed t-statistic is compared
# with the critical value. Using abs() would colour a strongly negative
# statistic (B worse than A) as significant.
categories = ['T-statistic', 'Critical Value']
values = [t_statistic, critical_t]
colors_stat = ['green' if t_statistic > critical_t else 'red', 'red']

bars2 = ax2.bar(categories, values, color=colors_stat, alpha=0.7, edgecolor='black')
ax2.set_ylabel('T-value')
ax2.set_title('Statistical Significance Test')
ax2.axhline(y=critical_t, color='red', linestyle='--', alpha=0.7, label=f'Critical Value ({critical_t:.3f})')
ax2.legend()  # without this the axhline label is never shown

for bar, value in zip(bars2, values):
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height + 0.05,
             f'{value:.3f}', ha='center', va='bottom', fontweight='bold')

# 3. Confidence intervals (error bars are distances below/above each rate)
x_pos = [0, 1]
means = [p_a, p_b]
ci_errors = [[p_a - ci_a_lower, p_b - ci_b_lower],
             [ci_a_upper - p_a, ci_b_upper - p_b]]
ax3.errorbar(x_pos, means, yerr=ci_errors, fmt='o', color='black',
             ecolor='lightgray', elinewidth=3, capsize=5, label='Confidence Interval')
ax3.set_xticks(x_pos)
ax3.set_xticklabels(['Group A\n(Control)', 'Group B\n(Treatment)'])
ax3.set_ylabel('Conversion Rate')
ax3.set_title('Conversion Rate with 95% Confidence Interval')
ax3.legend()

for i, mean in enumerate(means):
    ax3.text(x_pos[i], mean + 0.01, f'{mean:.2%}', ha='center', va='bottom', fontweight='bold')

# 4. Effect Size Visualization
# Standardised difference B - A (positive when the treatment is better),
# using sample variances (ddof=1). The previous A - B form gave a negative
# value whenever B > A, which the fixed 0..1 y-range then hid completely.
effect_size = (p_b - p_a) / np.sqrt(
    (np.var(group_a, ddof=1) + np.var(group_b, ddof=1)) / 2
)
ax4.bar(['Effect Size'], [effect_size], color='purple', alpha=0.7, edgecolor='black')
ax4.set_ylabel('Effect Size')
ax4.set_title('Effect Size between Groups')
# Default range is 0..1; extend it when the value falls outside so the bar
# is always visible.
ax4.set_ylim(min(0.0, effect_size * 1.2), max(1.0, effect_size * 1.2))
ax4.axhline(0, color='black', linewidth=0.8)

# Annotate the effect size on the bar (below the bar when it is negative)
label_offset = 0.02 if effect_size >= 0 else -0.02
label_anchor = 'bottom' if effect_size >= 0 else 'top'
ax4.text(0, effect_size + label_offset, f'{effect_size:.2f}',
         ha='center', va=label_anchor, fontweight='bold')

plt.tight_layout()
plt.show()






