In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Silence every warning for the rest of the session to keep the notebook output clean.
# NOTE(review): this blanket filter also hides numerical and deprecation warnings
# raised by the libraries imported above — consider narrowing it while debugging.
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

# Confirmation messages; they only show that the statements above ran without raising.
print("Libraries imported successfully!")
print("Python environment ready for A/B testing analysis")
Subtask 1.3: Create Sample A/B Test Dataset
Since we're working with a controlled lab environment, we'll create a realistic eCommerce A/B test dataset:

# Set random seed for reproducibility
# This seeds NumPy's global np.random state, which the dataset generator below
# draws from, so re-running the notebook from this cell reproduces the same data.
np.random.seed(42)

# Create sample A/B test data for an eCommerce website
# Scenario: Testing a new checkout button design
def generate_ab_test_data(n_users=10000):
    """
    Simulate per-user results of an eCommerce checkout-button A/B test.

    Group A is the control (original checkout button) and group B is the
    test variant (new checkout button design). All randomness comes from
    NumPy's global ``np.random`` state, so seed it before calling when a
    reproducible dataset is needed.

    Args:
        n_users: Number of simulated users (rows) to generate.

    Returns:
        pandas.DataFrame with one row per user and the columns ``user_id``,
        ``group``, ``converted``, ``page_views``, ``time_on_site`` and
        ``revenue``.
    """
    # Per-group simulation profile:
    # (probabilities for converted = [0, 1], exclusive upper bound for page
    #  views, mean time on site in seconds, std of time on site in seconds)
    control_profile = ([0.88, 0.12], 8, 180, 60)  # 12% conversion, 1-7 views
    test_profile = ([0.85, 0.15], 9, 200, 50)     # 15% conversion, 1-8 views

    # Sequential, zero-padded identifiers: user_00001, user_00002, ...
    user_ids = [f"user_{i:05d}" for i in range(1, n_users + 1)]

    # Independent 50/50 assignment of every user to a variant
    groups = np.random.choice(['A', 'B'], size=n_users, p=[0.5, 0.5])

    simulated = []
    for assigned in groups:
        profile = control_profile if assigned == 'A' else test_profile
        conv_probs, views_high, time_mean, time_std = profile

        # The draw order (conversion, page views, time) is fixed per user so
        # that a given seed always yields the same dataset.
        did_convert = np.random.choice([0, 1], p=conv_probs)
        n_views = np.random.randint(1, views_high)
        seconds = np.random.normal(time_mean, time_std)

        # Time on site is floored at 30 seconds
        simulated.append((did_convert, n_views, max(30, seconds)))

    conversions = [row[0] for row in simulated]
    page_views = [row[1] for row in simulated]
    time_on_site = [row[2] for row in simulated]

    # Only converting users generate revenue (normal around 50 with std 15)
    revenue = []
    for did_convert in conversions:
        revenue.append(np.random.normal(50, 15) if did_convert else 0)

    return pd.DataFrame({
        'user_id': user_ids,
        'group': groups,
        'converted': conversions,
        'page_views': page_views,
        'time_on_site': time_on_site,
        'revenue': revenue,
    })

# Simulate the experiment for 10,000 users
ab_test_data = generate_ab_test_data(n_users=10000)

# Persist the raw rows (without the index column) so they can be reloaded later
ab_test_data.to_csv(path_or_buf='ecommerce_ab_test_data.csv', index=False)

# Confirmation, followed by the shape and a peek at the first rows
print(
    "A/B test dataset created successfully!",
    f"Dataset shape: {ab_test_data.shape}",
    sep="\n",
)
print("\nFirst 5 rows:", ab_test_data.head(), sep="\n")
Subtask 1.4: Explore the Dataset
# Headline facts about the dataset: row count, column names and dtypes
n_rows = len(ab_test_data)
column_names = list(ab_test_data.columns)

print("=== DATASET OVERVIEW ===")
print(f"Total users: {n_rows}")
print(f"Columns: {column_names}")
print(f"Data types:\n{ab_test_data.dtypes}")

# Remaining sections all follow the same "heading, then table" layout
overview_sections = (
    ("\n=== GROUP DISTRIBUTION ===", ab_test_data['group'].value_counts()),
    ("\n=== BASIC STATISTICS ===", ab_test_data.describe()),
    ("\n=== MISSING VALUES CHECK ===", ab_test_data.isnull().sum()),
)
for heading, content in overview_sections:
    print(heading)
    print(content)

# Quick sample size check
def minimum_sample_size(baseline_rate, minimum_detectable_effect, alpha=0.05, power=0.8):
    """Calculate the minimum sample size per group for an A/B test on conversion rates.

    Uses the normal-approximation formula for a two-sided comparison of two
    proportions with a pooled variance estimate:

        n = 2 * p_bar * (1 - p_bar) * (z_alpha + z_beta)^2 / (p1 - p2)^2

    where ``p1`` is the baseline rate, ``p2 = p1 * (1 + MDE)`` and
    ``p_bar`` is their average.

    Args:
        baseline_rate: Control conversion rate as a fraction, strictly between 0 and 1.
        minimum_detectable_effect: Relative change to detect (0.20 means a 20%
            relative lift). May be negative to size a test for a decrease, but
            must not be zero.
        alpha: Two-sided significance level, strictly between 0 and 1.
        power: Desired statistical power, strictly between 0 and 1.

    Returns:
        int: Required number of users in each group, rounded up.

    Raises:
        ValueError: If any argument is outside its valid range, or if the
            target rate implied by the effect is not strictly between 0 and 1.
    """
    if not 0 < baseline_rate < 1:
        raise ValueError("baseline_rate must be strictly between 0 and 1")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    if not 0 < power < 1:
        raise ValueError("power must be strictly between 0 and 1")

    p1 = baseline_rate
    p2 = baseline_rate * (1 + minimum_detectable_effect)

    # A zero effect would need an infinite sample (division by zero below)
    if p2 == p1:
        raise ValueError("minimum_detectable_effect must be non-zero")
    if not 0 < p2 < 1:
        raise ValueError(
            "baseline_rate * (1 + minimum_detectable_effect) must be strictly between 0 and 1"
        )

    z_alpha = stats.norm.ppf(1 - alpha / 2)  # two-sided critical value
    z_beta = stats.norm.ppf(power)

    pooled_p = (p1 + p2) / 2

    n = (2 * pooled_p * (1 - pooled_p) * (z_alpha + z_beta) ** 2) / (p1 - p2) ** 2

    return int(np.ceil(n))

# Worked example: per-group users needed to detect a 20% relative improvement
# over a 12% baseline conversion rate, using the function's default alpha and power
baseline, mde = 0.12, 0.20
min_sample = minimum_sample_size(
    baseline_rate=baseline,
    minimum_detectable_effect=mde,
)
print(f"Minimum sample size needed per group: {min_sample:,}")

# Partition users by assigned variant; the copies are independent of ab_test_data
is_control = ab_test_data['group'] == 'A'
is_test = ab_test_data['group'] == 'B'
group_a = ab_test_data.loc[is_control].copy()
group_b = ab_test_data.loc[is_test].copy()

n_total = len(ab_test_data)
n_control, n_test = len(group_a), len(group_b)

print("=== GROUP SPLIT SUMMARY ===")
print(f"Group A (Control) size: {n_control}")
print(f"Group B (Test) size: {n_test}")
print(f"Split ratio: {n_control/n_total:.1%} / {n_test/n_total:.1%}")

# Balance check: the split counts as 'Good' when the groups differ by fewer than 100 users
size_gap = abs(n_control - n_test)
if size_gap < 100:
    balance_label = 'Good'
else:
    balance_label = 'Needs attention'

print(f"\nGroup balance check:")
print(f"Difference in group sizes: {size_gap} users")
print(f"Balance quality: {balance_label}")
Subtask 2.2: Validate Group Characteristics
# Compare engagement characteristics between groups
print("=== GROUP CHARACTERISTICS COMPARISON ===")

characteristics = ['page_views', 'time_on_site']

for char in characteristics:
    group_a_mean = group_a[char].mean()
    group_b_mean = group_b[char].mean()

    print(f"\n{char.replace('_', ' ').title()}:")
    print(f"  Group A (Control): {group_a_mean:.2f}")
    print(f"  Group B (Test): {group_b_mean:.2f}")
    print(f"  Difference: {group_b_mean - group_a_mean:.2f}")

    # Statistical test for a difference in means.
    # Welch's t-test (equal_var=False) is used instead of Student's t-test:
    # the two groups are not guaranteed to have equal sizes or equal variances
    # (the simulated groups are generated with different spreads), and the
    # pooled-variance test can give misleading p-values in that situation.
    t_stat, p_value = stats.ttest_ind(group_a[char], group_b[char], equal_var=False)
    print(f"  P-value: {p_value:.4f} ({'Significant' if p_value < 0.05 else 'Not significant'})")
Task 3: Compute Conversion Rates and Lift
Subtask 3.1: Calculate Basic Conversion Metrics
def calculate_conversion_metrics(group_data, group_name, z_score=1.96):
    """Calculate conversion metrics for a group.

    Args:
        group_data: DataFrame-like object with a ``converted`` column holding
            1 for converted users and 0 otherwise; ``len(group_data)`` is the
            number of users in the group.
        group_name: Label stored under the ``group`` key of the result.
        z_score: Critical value for the normal-approximation (Wald) confidence
            interval. The default 1.96 corresponds to a 95% interval.

    Returns:
        dict with keys ``group``, ``total_users``, ``conversions``,
        ``conversion_rate`` (fraction), ``conversion_rate_pct``, ``ci_lower``
        and ``ci_upper`` (the last three in percent).

    Raises:
        ValueError: If the group contains no users, because the conversion
            rate is undefined in that case.
    """
    total_users = len(group_data)
    if total_users == 0:
        raise ValueError(
            f"Cannot calculate conversion metrics for empty group {group_name!r}"
        )

    conversions = group_data['converted'].sum()
    conversion_rate = conversions / total_users

    # Wald confidence interval for a proportion
    margin_of_error = z_score * np.sqrt((conversion_rate * (1 - conversion_rate)) / total_users)

    # The Wald interval can extend past the valid range of a proportion for
    # small samples or extreme rates, so clip it to [0, 1].
    ci_lower = max(0.0, conversion_rate - margin_of_error)
    ci_upper = min(1.0, conversion_rate + margin_of_error)

    return {
        'group': group_name,
        'total_users': total_users,
        'conversions': conversions,
        'conversion_rate': conversion_rate,
        'conversion_rate_pct': conversion_rate * 100,
        'ci_lower': ci_lower * 100,
        'ci_upper': ci_upper * 100
    }

# Conversion metrics for the control and test groups
metrics_a = calculate_conversion_metrics(group_a, 'A (Control)')
metrics_b = calculate_conversion_metrics(group_b, 'B (Test)')

print("=== CONVERSION RATE ANALYSIS ===")

# Both groups are reported with the same four-line layout
for heading, group_metrics in (
    ("Group A (Control)", metrics_a),
    ("Group B (Test)", metrics_b),
):
    print(f"\n{heading}:")
    print(f"  Users: {group_metrics['total_users']:,}")
    print(f"  Conversions: {group_metrics['conversions']:,}")
    print(f"  Conversion Rate: {group_metrics['conversion_rate_pct']:.2f}%")
    print(f"  95% CI: [{group_metrics['ci_lower']:.2f}%, {group_metrics['ci_upper']:.2f}%]")

def calculate_lift_and_significance(metrics_a, metrics_b):
    """Calculate lift and statistical significance of group B versus group A.

    Args:
        metrics_a: Control metrics; must provide ``conversion_rate`` (fraction),
            ``conversions`` and ``total_users``.
        metrics_b: Test metrics with the same keys.

    Returns:
        dict with keys:
            ``absolute_lift``: difference in conversion rate, in percentage points.
            ``relative_lift``: lift relative to the control rate, in percent.
                ``inf`` when the control rate is 0 and the test rate is higher,
                ``nan`` when both rates are 0.
            ``p_value``: p-value of the chi-square test of independence.
            ``is_significant``: True when ``p_value < 0.05``.
            ``chi2_statistic``: chi-square test statistic.
            ``cramers_v``: Cramér's V effect size.
        When the contingency table has an all-zero row or column (an empty
        group, or no conversions / no non-conversions at all) the test cannot
        be computed; the result then reports ``chi2_statistic = 0``,
        ``p_value = 1`` and ``cramers_v = 0`` (no detectable difference).
    """
    rate_a = metrics_a['conversion_rate']
    rate_b = metrics_b['conversion_rate']

    # Absolute and relative lift
    absolute_lift = rate_b - rate_a
    if rate_a > 0:
        relative_lift = (absolute_lift / rate_a) * 100
    elif absolute_lift > 0:
        # Any improvement over a zero baseline is an unbounded relative lift
        relative_lift = float('inf')
    else:
        relative_lift = float('nan')

    # Statistical significance test (Chi-square test)
    # Create contingency table: rows are groups, columns are converted / not converted
    conversions_a = metrics_a['conversions']
    conversions_b = metrics_b['conversions']
    non_conversions_a = metrics_a['total_users'] - conversions_a
    non_conversions_b = metrics_b['total_users'] - conversions_b

    contingency_table = np.array([
        [conversions_a, non_conversions_a],
        [conversions_b, non_conversions_b]
    ])

    # chi2_contingency raises ValueError when an expected frequency is zero,
    # which happens exactly when a row or column of the table sums to zero.
    row_totals = contingency_table.sum(axis=1)
    column_totals = contingency_table.sum(axis=0)
    if (row_totals == 0).any() or (column_totals == 0).any():
        chi2, p_value, cramers_v = 0.0, 1.0, 0.0
    else:
        # NOTE: for a 2x2 table SciPy applies Yates' continuity correction by default
        chi2, p_value, _dof, _expected = stats.chi2_contingency(contingency_table)

        # Effect size (Cramér's V)
        n = contingency_table.sum()
        cramers_v = np.sqrt(chi2 / (n * (min(contingency_table.shape) - 1)))

    return {
        'absolute_lift': absolute_lift * 100,
        'relative_lift': relative_lift,
        'p_value': p_value,
        'is_significant': bool(p_value < 0.05),
        'chi2_statistic': chi2,
        'cramers_v': cramers_v
    }

# Calculate lift and significance
lift_results = calculate_lift_and_significance(metrics_a, metrics_b)

# NOTE: the non-ASCII characters below (α, é, emoji) were previously stored as
# mis-decoded byte sequences and printed as garbage; they are restored here.
print("=== LIFT ANALYSIS ===")
print(f"Absolute Lift: {lift_results['absolute_lift']:.2f} percentage points")
print(f"Relative Lift: {lift_results['relative_lift']:.2f}%")
print(f"P-value: {lift_results['p_value']:.6f}")
print(f"Statistical Significance: {'YES' if lift_results['is_significant'] else 'NO'} (α = 0.05)")
print(f"Effect Size (Cramér's V): {lift_results['cramers_v']:.4f}")

# Interpretation
if lift_results['is_significant']:
    direction = "increase" if lift_results['relative_lift'] > 0 else "decrease"
    print(f"\n🎯 RESULT: The test shows a statistically significant {direction} in conversion rate!")
else:
    print(f"\n⚠️  RESULT: No statistically significant difference detected.")

# Create the headline dashboard: conversion rates (left) and sample sizes (right).
# A single row of two axes is used so the figure contains no empty panels;
# charts 3-5 are drawn as separate figures.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
fig.suptitle('A/B Test Results Dashboard', fontsize=16, fontweight='bold')

# 1. Conversion Rate Comparison Bar Chart
ax1 = axes[0]
groups = ['Group A\n(Control)', 'Group B\n(Test)']
conversion_rates = [metrics_a['conversion_rate_pct'], metrics_b['conversion_rate_pct']]
colors = ['#FF6B6B', '#4ECDC4']

bars = ax1.bar(groups, conversion_rates, color=colors, alpha=0.8, edgecolor='black', linewidth=1)
ax1.set_ylabel('Conversion Rate (%)')
ax1.set_title('Conversion Rate Comparison')
# Leave 20% headroom above the tallest bar for the value labels
ax1.set_ylim(0, max(conversion_rates) * 1.2)

# Add value labels on bars
for bar, rate in zip(bars, conversion_rates):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + 0.1,
             f'{rate:.2f}%', ha='center', va='bottom', fontweight='bold')

# Add confidence intervals.
# yerr is [lower distances, upper distances], each measured from the bar height.
ax1.errorbar(range(len(groups)), conversion_rates,
             yerr=[[metrics_a['conversion_rate_pct'] - metrics_a['ci_lower'],
                    metrics_b['conversion_rate_pct'] - metrics_b['ci_lower']],
                   [metrics_a['ci_upper'] - metrics_a['conversion_rate_pct'],
                    metrics_b['ci_upper'] - metrics_b['conversion_rate_pct']]],
             fmt='none', color='black', capsize=5, capthick=2)

# 2. Sample Size Comparison
ax2 = axes[1]
sample_sizes = [metrics_a['total_users'], metrics_b['total_users']]
bars2 = ax2.bar(groups, sample_sizes, color=colors, alpha=0.8, edgecolor='black', linewidth=1)
ax2.set_ylabel('Number of Users')
ax2.set_title('Sample Size Comparison')

for bar, size in zip(bars2, sample_sizes):
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height + 50,
             f'{size:,}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# 3. Conversion Distribution Pie Charts (one pie per group, side by side)
fig, (ax3, ax4) = plt.subplots(1, 2, figsize=(15, 6))

# [converted, not converted] counts for each group
group_a_counts = [metrics_a['conversions'], metrics_a['total_users'] - metrics_a['conversions']]
group_b_counts = [metrics_b['conversions'], metrics_b['total_users'] - metrics_b['conversions']]

# (axes, counts, wedge colours, group label, group metrics)
pie_specs = (
    (ax3, group_a_counts, ['#FF6B6B', '#FFE5E5'], 'Group A (Control)', metrics_a),
    (ax4, group_b_counts, ['#4ECDC4', '#E5F9F6'], 'Group B (Test)', metrics_b),
)
for pie_ax, counts, wedge_colors, group_label, group_metrics in pie_specs:
    pie_ax.pie(
        counts,
        labels=['Converted', 'Not Converted'],
        autopct='%1.1f%%',
        colors=wedge_colors,
        startangle=90,
    )
    pie_ax.set_title(
        f'{group_label}\nConversion Distribution\nTotal: {group_metrics["total_users"]:,} users'
    )

plt.tight_layout()
plt.show()

# 4. Statistical Summary Table Visualization
fig, ax = plt.subplots(figsize=(12, 8))
for axis_mode in ('tight', 'off'):
    ax.axis(axis_mode)

# Table contents: a header row followed by one row per metric
header = ['Metric', 'Group A (Control)', 'Group B (Test)', 'Difference']
size_a, size_b = metrics_a['total_users'], metrics_b['total_users']
conv_a, conv_b = metrics_a['conversions'], metrics_b['conversions']
verdict = 'YES' if lift_results['is_significant'] else 'NO'

summary_data = [
    header,
    ['Sample Size', f"{size_a:,}", f"{size_b:,}", f"{size_b - size_a:,}"],
    ['Conversions', f"{conv_a:,}", f"{conv_b:,}", f"{conv_b - conv_a:,}"],
    [
        'Conversion Rate',
        f"{metrics_a['conversion_rate_pct']:.2f}%",
        f"{metrics_b['conversion_rate_pct']:.2f}%",
        f"{lift_results['absolute_lift']:.2f}pp",
    ],
    ['Relative Lift', '-', f"{lift_results['relative_lift']:.2f}%", '-'],
    ['P-value', '-', f"{lift_results['p_value']:.6f}", '-'],
    ['Significant?', '-', verdict, '-'],
]

# Render the table so that it fills the whole axes area
table = ax.table(
    cellText=summary_data[1:],
    colLabels=summary_data[0],
    cellLoc='center',
    loc='center',
    bbox=[0, 0, 1, 1],
)

# Fixed font size and cell scaling
table.auto_set_font_size(False)
table.set_fontsize(11)
table.scale(1.2, 2)

# Header cells live in table row 0: teal background, bold white text
for col_idx, _ in enumerate(header):
    header_cell = table[(0, col_idx)]
    header_cell.set_facecolor('#4ECDC4')
    header_cell.set_text_props(weight='bold', color='white')

# The verdict sits in the last table row, Group B column:
# light green when significant, light red otherwise
verdict_cell = table[(len(summary_data) - 1, 2)]
verdict_cell.set_facecolor('#90EE90' if lift_results['is_significant'] else '#FFB6C1')

ax.set_title('A/B Test Statistical Summary', fontsize=16, fontweight='bold', pad=20)
plt.show()

# 5. Revenue Analysis (if applicable)
fig, (ax5, ax6) = plt.subplots(1, 2, figsize=(15, 6))

# Both panels share the same bar labels and bar styling
bar_labels = ['Group A\n(Control)', 'Group B\n(Test)']
bar_style = dict(color=colors, alpha=0.8, edgecolor='black', linewidth=1)

# Left panel: mean revenue among the users who converted
revenue_a = group_a.loc[group_a['converted'] == 1, 'revenue'].mean()
revenue_b = group_b.loc[group_b['converted'] == 1, 'revenue'].mean()
revenue_means = [revenue_a, revenue_b]

ax5.bar(bar_labels, revenue_means, **bar_style)
ax5.set_ylabel('Average Revenue per Conversion ($)')
ax5.set_title('Revenue per Conversion Comparison')

# Dollar amount just above each bar
for x_pos, amount in enumerate(revenue_means):
    ax5.text(x_pos, amount + 1, f'${amount:.2f}', ha='center', va='bottom', fontweight='bold')

# Right panel: mean time on site, converted from seconds to minutes
time_a = group_a['time_on_site'].mean()
time_b = group_b['time_on_site'].mean()
minutes_means = [time_a/60, time_b/60]

ax6.bar(bar_labels, minutes_means, **bar_style)
ax6.set_ylabel('Average Time on Site (minutes)')
ax6.set_title('User Engagement Comparison')

# Minutes just above each bar
for x_pos, minutes in enumerate(minutes_means):
    ax6.text(x_pos, minutes + 0.1, f'{minutes:.1f}m', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

def generate_test_insights(metrics_a, metrics_b, lift_results):
    """Generate automated insights from A/B test results.

    Args:
        metrics_a: Control metrics; reads ``total_users`` and ``conversion_rate_pct``.
        metrics_b: Test metrics with the same keys.
        lift_results: Lift/significance results; reads ``absolute_lift``,
            ``relative_lift``, ``p_value``, ``is_significant`` and ``cramers_v``.

    Returns:
        list[str]: Human-readable insight lines, in display order: sample size,
        conversion comparison, significance (two lines) and effect size.
    """

    insights = []

    # Sample size assessment
    total_sample = metrics_a['total_users'] + metrics_b['total_users']
    insights.append(f"📊 Sample Size: {total_sample:,} users tested ({metrics_a['total_users']:,} control, {metrics_b['total_users']:,} test)")

    # Conversion rate comparison
    if metrics_b['conversion_rate_pct'] > metrics_a['conversion_rate_pct']:
        insights.append(f"📈 Test group outperformed control by {lift_results['absolute_lift']:.2f} percentage points")
    else:
        insights.append(f"📉 Test group underperformed control by {abs(lift_results['absolute_lift']):.2f} percentage points")

    # Statistical significance
    if lift_results['is_significant']:
        insights.append(f"✅ Results are statistically significant (p = {lift_results['p_value']:.6f})")
        insights.append(f"🎯 Relative improvement: {lift_results['relative_lift']:.1f}%")
    else:
        insights.append(f"⚠️  Results are NOT statistically significant (p = {lift_results['p_value']:.6f})")
        insights.append("🔄 Consider running the test longer or with more users")

    # Effect size interpretation, following Cohen's conventional benchmarks
    # (0.1 = small, 0.3 = medium, 0.5 = large). Values that do not reach the
    # "small" benchmark are reported as negligible rather than small.
    cramers_v = lift_results['cramers_v']
    if cramers_v < 0.1:
        effect_size = "negligible"
    elif cramers_v < 0.3:
        effect_size = "small"
    elif cramers_v < 0.5:
        effect_size = "medium"
    else:
        effect_size = "large"

    insights.append(f"📏 Effect size: {effect_size} (Cramér's V = {lift_results['cramers_v']:.4f})")

    return insights

# Build the insight lines, then print them under a banner, one per line
insights = generate_test_insights(metrics_a, metrics_b, lift_results)

print("=== A/B TEST INSIGHTS ===")
if insights:
    print(*insights, sep="\n")

def generate_business_recommendations(metrics_a, metrics_b, lift_results,
                                      annual_users=100000, avg_revenue=50):
    """Generate business recommendations based on test results.

    Args:
        metrics_a: Control metrics (accepted for interface symmetry; not read).
        metrics_b: Test metrics (accepted for interface symmetry; not read).
        lift_results: Lift/significance results; reads ``is_significant``,
            ``relative_lift`` (percent) and ``absolute_lift`` (percentage points).
        annual_users: Assumed number of users per year used for the revenue
            projection. Defaults to 100,000.
        avg_revenue: Assumed average order value in dollars used for the
            revenue projection. Defaults to $50.

    Returns:
        list[str]: Recommendation lines followed by a fixed "next steps" section.
    """

    recommendations = []

    if lift_results['is_significant'] and lift_results['relative_lift'] > 0:
        recommendations.append("🚀 RECOMMENDATION: Implement the test variation")
        recommendations.append(f"💰 Expected impact: {lift_results['relative_lift']:.1f}% increase in conversions")

        # Projected revenue: extra conversions per year (absolute lift is in
        # percentage points, hence the division by 100) times the order value
        additional_conversions = annual_users * (lift_results['absolute_lift'] / 100)
        revenue_impact = additional_conversions * avg_revenue

        recommendations.append(f"💵 Estimated annual revenue impact: ${revenue_impact:,.0f}")

    elif lift_results['is_significant'] and lift_results['relative_lift'] < 0:
        recommendations.append("❌ RECOMMENDATION: Do NOT implement the test variation")
        recommendations.append("🔄 Consider testing alternative approaches")

    else:
        recommendations.append("🤔 RECOMMENDATION: Results are inconclusive")
        recommendations.append("📊 Options:")
        recommendations.append("   • Extend test duration")
        recommendations.append("   • Increase sample size")
        recommendations.append("   • Test more dramatic changes")

    # Additional recommendations
    recommendations.append("\n📋 NEXT STEPS:")
    recommendations.append("• Document test methodology and results")
    recommendations.append("• Share findings with stakeholders")
    recommendations.append("• Plan follow-up tests or implementation")

    return recommendations

# Build the recommendation lines, then print them under a banner, one per line
recommendations = generate_business_recommendations(metrics_a, metrics_b, lift_results)

print("\n=== BUSINESS RECOMMENDATIONS ===")
if recommendations:
    print(*recommendations, sep="\n")

# Create a comprehensive summary report
def create_summary_report(metrics_a, metrics_b, lift_results):
    """Create a formatted plain-text summary report.

    Args:
        metrics_a: Control metrics; reads ``total_users`` and ``conversion_rate_pct``.
        metrics_b: Test metrics with the same keys.
        lift_results: Lift/significance results; reads ``absolute_lift``,
            ``relative_lift``, ``p_value``, ``is_significant`` and ``cramers_v``.

    Returns:
        str: Multi-line report that starts with a newline and is stamped with
        the current local time.
    """
    users_a = metrics_a['total_users']
    users_b = metrics_b['total_users']
    total_users = users_a + users_b

    # Report the split that was actually observed instead of assuming 50/50
    if total_users > 0:
        split_ratio = f"{users_a / total_users * 100:.0f}/{users_b / total_users * 100:.0f}"
    else:
        split_ratio = "n/a"

    significant = lift_results['is_significant']
    relative_lift = lift_results['relative_lift']

    if not significant:
        conclusion = 'No significant improvement detected.'
    elif relative_lift > 0:
        conclusion = 'The test variation shows a statistically significant improvement.'
    else:
        conclusion = 'The test variation shows a statistically significant decrease.'

    if significant and relative_lift > 0:
        recommendation = 'Implement the test variation'
    elif significant and relative_lift < 0:
        recommendation = 'Do not implement'
    else:
        recommendation = 'Continue testing or increase sample size'

    report = f"""
A/B TEST SUMMARY REPORT
=======================
Test Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
Test Type: Conversion Rate Optimization
Hypothesis: New checkout button design will increase conversion rates

SAMPLE COMPOSITION:
• Total Users: {total_users:,}
• Control Group (A): {users_a:,} users
• Test Group (B): {users_b:,} users
• Split Ratio: {split_ratio}

KEY METRICS:
• Control Conversion Rate: {metrics_a['conversion_rate_pct']:.2f}%
• Test Conversion Rate: {metrics_b['conversion_rate_pct']:.2f}%
• Absolute Lift: {lift_results['absolute_lift']:.2f} percentage points
• Relative Lift: {relative_lift:.2f}%

STATISTICAL ANALYSIS:
• P-value: {lift_results['p_value']:.6f}
• Statistically Significant: {'Yes' if significant else 'No'}
• Confidence Level: 95%
• Effect Size: {lift_results['cramers_v']:.4f}

CONCLUSION:
{conclusion}

RECOMMENDATION:
{recommendation}
"""

    return report

# Generate and save report
summary_report = create_summary_report(metrics_a, metrics_b, lift_results)
print(summary_report)

# Save report to file. The encoding is pinned to UTF-8 because the report
# contains non-ASCII characters and the platform default encoding varies.
with open('ab_test_summary_report.txt', 'w', encoding='utf-8') as f:
    f.write(summary_report)

print("\n📄 Summary report saved to 'ab_test_summary_report.txt'")

# Export detailed results to CSV: one row per metric, one column per group
results_df = pd.DataFrame({
    'Metric': ['Sample Size', 'Conversions', 'Conversion Rate (%)', 'CI Lower (%)', 'CI Upper (%)'],
    'Group_A_Control': [
        metrics_a['total_users'],
        metrics_a['conversions'],
        round(metrics_a['conversion_rate_pct'], 2),
        round(metrics_a['ci_lower'], 2),
        round(metrics_a['ci_upper'], 2)
    ],
    'Group_B_Test': [
        metrics_b['total_users'],
        metrics_b['conversions'],
        round(metrics_b['conversion_rate_pct'], 2),
        round(metrics_b['ci_lower'], 2),
        round(metrics_b['ci_upper'], 2)
    ]
})

# Add statistical test results.
# The p-value is stored unrounded so that very small values are not flattened to 0.
statistical_results = pd.DataFrame({
    'Statistical_Measure': ['Absolute Lift (pp)', 'Relative Lift (%)', 'P-value', 'Significant', 'Effect Size'],
    'Value': [
        round(lift_results['absolute_lift'], 2),
        round(lift_results['relative_lift'], 2),
        lift_results['p_value'],
        'Yes' if lift_results['is_significant'] else 'No',
        round(lift_results['cramers_v'], 4)
    ]
})

# Save results
results_df.to_csv('ab_test_results.csv', index=False)
statistical_results.to_csv('ab_test_statistics.csv', index=False)

# NOTE: the emoji and bullet characters below were previously stored as
# mis-decoded byte sequences and printed as garbage; they are restored here.
print("📊 Results exported to:")
print("• ab_test_results.csv")
print("• ab_test_statistics.csv")
print("• ecommerce_ab_test_data.csv (original data)")

# Display final summary
print("\n=== FILES CREATED ===")
import os

# Report only the files this analysis writes. Scanning the working directory
# for every .csv/.txt file would also list unrelated files under this heading.
created_files = [
    'ecommerce_ab_test_data.csv',
    'ab_test_summary_report.txt',
    'ab_test_results.csv',
    'ab_test_statistics.csv',
]
files = [f for f in created_files if os.path.exists(f)]
for file in files:
    size = os.path.getsize(file)
    print(f"📁 {file} ({size:,} bytes)")