In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import random
from scipy import stats
import statsmodels.stats.api as sms
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
import json
import warnings
import dotenv
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Load environment variables
dotenv.load_dotenv()

# Set styling for visualizations.
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# newer releases no longer accept the old names, so pick whichever spelling
# this installation actually provides instead of hard-coding one.
_whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    None,
)
if _whitegrid_style is not None:
    plt.style.use(_whitegrid_style)
sns.set(style="whitegrid", palette="muted", color_codes=True)
plt.rcParams['figure.figsize'] = (12, 8)

print("Setup complete!")

# Add project root to path to import custom modules
sys.path.append("../../")

# Import PesaGuru custom modules
try:
    from utils.jupyter_helpers.data_loaders import load_test_data
    from utils.jupyter_helpers.visualization import plot_ab_test_results
    print("Custom PesaGuru modules imported successfully!")
except ImportError:
    print("Warning: Custom modules not found. Using fallback implementations.")


🔍 Environment Variables:
DATABASE_URL: None
API_KEY: None

✅ Available Matplotlib styles: ['Solarize_Light2', '_classic_test_patch', '_mpl-gallery', '_mpl-gallery-nogrid', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'petroff10', 'seaborn-v0_8', 'seaborn-v0_8-bright', 'seaborn-v0_8-colorblind', 'seaborn-v0_8-dark', 'seaborn-v0_8-dark-palette', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8-deep', 'seaborn-v0_8-muted', 'seaborn-v0_8-notebook', 'seaborn-v0_8-paper', 'seaborn-v0_8-pastel', 'seaborn-v0_8-poster', 'seaborn-v0_8-talk', 'seaborn-v0_8-ticks', 'seaborn-v0_8-white', 'seaborn-v0_8-whitegrid', 'tableau-colorblind10']


OSError: 'seaborn-white' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)

In [None]:
 # Fallback implementations
def load_test_data(source, start_date=None, end_date=None):
    """Return a mock A/B test DataFrame for the given data source.

    Parameters:
    - source: 'firebase' (1000 rows, 10 columns) or 'csv' (500 rows, 6 columns)
    - start_date, end_date: accepted for interface compatibility; not used by
      this mock implementation

    Returns:
    - DataFrame of randomly generated rows (drawn from numpy's global RNG)

    Raises:
    - ValueError: if `source` is neither 'firebase' nor 'csv'
    """
    if source == 'firebase':
        n_rows = 1000
    elif source == 'csv':
        n_rows = 500
    else:
        raise ValueError(f"Unknown data source: {source}")

    # Columns shared by both sources; the draw order matters for seeded runs.
    columns = {
        'user_id': [f'user_{i}' for i in range(n_rows)],
        'group': np.random.choice(['A', 'B'], size=n_rows),
        'engagement_time': np.random.normal(120, 60, n_rows),
        'messages_sent': np.random.poisson(8, n_rows),
        'task_completed': np.random.choice([0, 1], size=n_rows, p=[0.3, 0.7]),
        'satisfaction_rating': np.random.choice([1, 2, 3, 4, 5], size=n_rows),
    }

    # Only the firebase mock carries the extra descriptive columns.
    if source == 'firebase':
        columns['language'] = np.random.choice(
            ['English', 'Swahili'], size=n_rows, p=[0.7, 0.3])
        columns['device_type'] = np.random.choice(
            ['mobile', 'desktop', 'tablet'], size=n_rows, p=[0.6, 0.3, 0.1])
        columns['age_group'] = np.random.choice(
            ['18-24', '25-34', '35-44', '45+'], size=n_rows)
        columns['feature_used'] = np.random.choice(
            ['investment_recommendation', 'loan_comparison', 'budget_planning', 'market_analysis'],
            size=n_rows)

    return pd.DataFrame(columns)
    
def plot_ab_test_results(df, metric, group_col='group', title=None):
    """Draw a bar chart of `metric` per test group and show it.

    Parameters:
    - df: DataFrame holding the test data
    - metric: column plotted on the y axis
    - group_col: column plotted on the x axis (default: 'group')
    - title: optional chart title; a default is built from metric/group_col
    """
    chart_title = title or f'A/B Test Results: {metric} by {group_col}'

    plt.figure(figsize=(10, 6))
    # barplot draws on (and returns) the axes of the figure just created
    axis = sns.barplot(data=df, x=group_col, y=metric)
    axis.set_title(chart_title)
    axis.set_ylabel(metric)
    axis.set_xlabel(group_col)
    plt.show()

In [None]:
# PesaGuru chatbot features we want to test.
# Each entry describes one experiment:
#   test_name          - identifier; simulate_ab_test_data() branches on this value
#   description        - human-readable summary of the experiment
#   variant_a/_b       - what groups 'A' and 'B' are shown
#   primary_metric     - name of the metric the experiment is judged on
#   secondary_metrics  - additional metric names tracked alongside it
#   test_duration_days - length of the test window in days
#   min_sample_size    - configured sample size, compared against the computed
#                        per-group requirement in the sample-size cell
ab_test_configs = [
    {
        'test_name': 'language_preference',
        'description': 'Testing user engagement with English vs Swahili interface',
        'variant_a': 'English only interface',
        'variant_b': 'Bilingual interface with language toggle',
        'primary_metric': 'engagement_time',
        'secondary_metrics': ['messages_sent', 'task_completed', 'satisfaction_rating'],
        'test_duration_days': 14,
        'min_sample_size': 500
    },
    {
        'test_name': 'recommendation_algorithm',
        'description': 'Testing rule-based vs ML-based investment recommendations',
        'variant_a': 'Rule-based recommendations',
        'variant_b': 'ML-based personalized recommendations',
        'primary_metric': 'conversion_rate',
        'secondary_metrics': ['satisfaction_rating', 'recommendation_clicks'],
        'test_duration_days': 21,
        'min_sample_size': 800
    },
    {
        'test_name': 'response_style',
        'description': 'Testing formal vs conversational response styles',
        'variant_a': 'Formal, professional responses',
        'variant_b': 'Conversational, friendly responses',
        'primary_metric': 'satisfaction_rating',
        'secondary_metrics': ['engagement_time', 'messages_sent'],
        'test_duration_days': 14,
        'min_sample_size': 600
    },
    {
        'test_name': 'ui_design',
        'description': 'Testing different UI layouts for the chatbot',
        'variant_a': 'Text-focused UI with minimal graphics',
        'variant_b': 'Visual UI with charts and financial graphics',
        'primary_metric': 'task_completed',
        'secondary_metrics': ['time_to_completion', 'satisfaction_rating'],
        'test_duration_days': 14,
        'min_sample_size': 700
    }
]

# Summarise every configured experiment, numbered from 1
print("Defined A/B Test Configurations:")
for test_number, config in enumerate(ab_test_configs, start=1):
    summary_lines = [
        f"\nTest {test_number}: {config['test_name']}",
        f"Description: {config['description']}",
        f"Variant A: {config['variant_a']}",
        f"Variant B: {config['variant_b']}",
        f"Primary Metric: {config['primary_metric']}",
        f"Test Duration: {config['test_duration_days']} days",
    ]
    print("\n".join(summary_lines))

In [None]:
def calculate_sample_size(baseline_conversion=0.10, minimum_detectable_effect=0.05, 
                          alpha=0.05, power=0.8):
    """
    Calculate the required sample size per variant for a two-sided,
    two-proportion A/B test (normal approximation).

    Parameters:
    - baseline_conversion: Expected conversion rate for control group, in [0, 1]
    - minimum_detectable_effect: Smallest meaningful absolute difference to
      detect; may be negative (a decrease) but not zero, and
      baseline_conversion + minimum_detectable_effect must stay in [0, 1]
    - alpha: Significance level (Type I error probability), in (0, 1)
    - power: 1 - Type II error probability, in (0, 1)

    Returns:
    - sample_size_per_group: Required sample size per variant (rounded up,
      returned as a float as produced by np.ceil)

    Raises:
    - ValueError: if any argument is outside the ranges above. Without this
      check a zero effect yields an infinite size and out-of-range
      proportions yield NaN, which then fail later in confusing ways.
    """
    if not 0 < alpha < 1:
        raise ValueError(f"alpha must be in (0, 1), got {alpha}")
    if not 0 < power < 1:
        raise ValueError(f"power must be in (0, 1), got {power}")
    if not 0 <= baseline_conversion <= 1:
        raise ValueError(f"baseline_conversion must be in [0, 1], got {baseline_conversion}")
    if minimum_detectable_effect == 0:
        raise ValueError("minimum_detectable_effect must be non-zero")

    variant_conversion = baseline_conversion + minimum_detectable_effect
    if not 0 <= variant_conversion <= 1:
        raise ValueError(
            "baseline_conversion + minimum_detectable_effect must be in [0, 1], "
            f"got {variant_conversion}"
        )

    # Standard normal critical values for alpha (two-sided) and beta
    z_alpha = stats.norm.ppf(1 - alpha/2)
    z_beta = stats.norm.ppf(power)

    # Standard deviations under null (both groups at baseline) and alternative
    sd1 = np.sqrt(2 * baseline_conversion * (1 - baseline_conversion))
    sd2 = np.sqrt(baseline_conversion * (1 - baseline_conversion) + 
                  variant_conversion * (1 - variant_conversion))

    # Squaring makes the result independent of the sign of the effect
    sample_size_per_group = ((z_alpha * sd1 + z_beta * sd2) / minimum_detectable_effect) ** 2

    return np.ceil(sample_size_per_group)

# (baseline, MDE) assumptions per primary metric; every metric is treated as a
# proportion here (e.g. share of 4-5 star ratings, share engaging > 3 minutes)
metric_assumptions = {
    'conversion_rate': (0.10, 0.05),
    'satisfaction_rating': (0.40, 0.10),  # 40% give 4 or 5 stars
    'engagement_time': (0.30, 0.08),      # 30% engage longer than 3 minutes
}
default_assumption = (0.20, 0.07)

# Report the required per-group sample size for each configured test
print("\nSample Size Calculations:")
for config in ab_test_configs:
    baseline, mde = metric_assumptions.get(config['primary_metric'], default_assumption)
    sample_size = calculate_sample_size(baseline, mde)

    print(f"Test: {config['test_name']}")
    print(f"Required sample size per group: {int(sample_size)}")
    print(f"Minimum sample size set in config: {config['min_sample_size']}")

    if sample_size <= config['min_sample_size']:
        print("Configured sample size is sufficient.\n")
    else:
        print("WARNING: Configured sample size may be too small for desired statistical power!\n")

In [None]:
def simulate_ab_test_data(config, n_users=1000, seed=42):
    """
    Simulate data for an A/B test based on configuration

    The metric columns that get generated depend on config['test_name'];
    'language_preference', 'recommendation_algorithm', 'response_style' and
    'ui_design' are handled. Any other name matches no branch, so the returned
    frame then only holds the base and demographic columns.

    Parameters:
    - config: Dictionary with test configuration; reads 'test_name' and
      'test_duration_days'
    - n_users: Number of users to simulate
    - seed: Random seed for reproducibility (seeds both numpy's global RNG and
      the stdlib `random` module)

    Returns:
    - DataFrame with simulated test data, one row per user
    """
    np.random.seed(seed)
    random.seed(seed)
    
    # Create base dataframe
    # NOTE: timestamps are offsets from datetime.now(), so their absolute
    # values differ between runs even though the day offsets are seeded.
    data = {
        'user_id': [f'user_{i}' for i in range(n_users)],
        'group': np.random.choice(['A', 'B'], size=n_users),
        'test_name': config['test_name'],
        'timestamp': [datetime.now() - timedelta(days=random.randint(0, config['test_duration_days'])) 
                      for _ in range(n_users)]
    }
    
    df = pd.DataFrame(data)
    
    # Add demographic information
    df['age_group'] = np.random.choice(['18-24', '25-34', '35-44', '45+'], size=n_users)
    df['location'] = np.random.choice(['Nairobi', 'Mombasa', 'Kisumu', 'Nakuru', 'Other'], 
                                     size=n_users, p=[0.4, 0.2, 0.1, 0.1, 0.2])
    df['device'] = np.random.choice(['mobile', 'desktop', 'tablet'], 
                                   size=n_users, p=[0.7, 0.2, 0.1])
    
    # Simulate effect based on test type.
    # Pattern used throughout: every np.where branch draws a full n_users-long
    # sample, and np.where then picks one value per row. The order of these
    # draws determines the seeded output, so do not reorder them.
    if config['test_name'] == 'language_preference':
        # Group A: English only - lower engagement for Swahili speakers
        # Group B: Bilingual - higher engagement overall
        
        # First determine language preference
        df['preferred_language'] = np.random.choice(['English', 'Swahili'], size=n_users, p=[0.65, 0.35])
        
        # Engagement time (minutes)
        # Drawn from unclipped normal distributions, so occasional negative
        # values are possible.
        df['engagement_time'] = np.where(
            df['group'] == 'A',
            # Group A: Lower engagement time for Swahili speakers
            np.where(df['preferred_language'] == 'English',
                    np.random.normal(5.2, 2.0, n_users),  # English speakers
                    np.random.normal(2.8, 1.5, n_users)),  # Swahili speakers
            # Group B: Higher engagement overall, especially for Swahili speakers
            np.where(df['preferred_language'] == 'English',
                    np.random.normal(5.5, 2.0, n_users),  # English speakers
                    np.random.normal(5.0, 2.0, n_users))   # Swahili speakers
        )
        
        # Number of messages
        df['messages_sent'] = np.where(
            df['group'] == 'A',
            np.where(df['preferred_language'] == 'English',
                    np.random.poisson(8, n_users),
                    np.random.poisson(5, n_users)),
            np.where(df['preferred_language'] == 'English',
                    np.random.poisson(9, n_users),
                    np.random.poisson(8, n_users))
        )
        
        # Task completion (1 = completed, 0 = not completed)
        # Completion probabilities per (group, preferred language) cell
        p_complete_a_english = 0.70
        p_complete_a_swahili = 0.40
        p_complete_b_english = 0.72
        p_complete_b_swahili = 0.65
        
        df['task_completed'] = np.where(
            df['group'] == 'A',
            np.where(df['preferred_language'] == 'English',
                    np.random.binomial(1, p_complete_a_english, n_users),
                    np.random.binomial(1, p_complete_a_swahili, n_users)),
            np.where(df['preferred_language'] == 'English',
                    np.random.binomial(1, p_complete_b_english, n_users),
                    np.random.binomial(1, p_complete_b_swahili, n_users))
        )
        
        # Satisfaction rating (1-5)
        # One candidate rating vector per (group, language) cell; p gives the
        # probability of each rating 1..5.
        satisfaction_a_english = np.random.choice([1, 2, 3, 4, 5], n_users, 
                                                p=[0.05, 0.10, 0.25, 0.40, 0.20])
        satisfaction_a_swahili = np.random.choice([1, 2, 3, 4, 5], n_users, 
                                                p=[0.15, 0.25, 0.35, 0.20, 0.05])
        satisfaction_b_english = np.random.choice([1, 2, 3, 4, 5], n_users, 
                                                p=[0.03, 0.07, 0.20, 0.45, 0.25])
        satisfaction_b_swahili = np.random.choice([1, 2, 3, 4, 5], n_users, 
                                                p=[0.05, 0.10, 0.25, 0.40, 0.20])
        
        df['satisfaction_rating'] = np.where(
            df['group'] == 'A',
            np.where(df['preferred_language'] == 'English',
                    satisfaction_a_english,
                    satisfaction_a_swahili),
            np.where(df['preferred_language'] == 'English',
                    satisfaction_b_english,
                    satisfaction_b_swahili)
        )
        
    elif config['test_name'] == 'recommendation_algorithm':
        # Group A: Rule-based recommendations
        # Group B: ML-based personalized recommendations
        
        # Conversion rate (1 = converted, 0 = not converted)
        # Despite the column name this is a per-user 0/1 flag; its mean is the rate.
        p_convert_a = 0.10  # Rule-based
        p_convert_b = 0.15  # ML-based
        
        df['conversion_rate'] = np.where(
            df['group'] == 'A',
            np.random.binomial(1, p_convert_a, n_users),
            np.random.binomial(1, p_convert_b, n_users)
        )
        
        # Recommendation clicks
        df['recommendation_clicks'] = np.where(
            df['group'] == 'A',
            np.random.poisson(2, n_users),
            np.random.poisson(3, n_users)
        )
        
        # Satisfaction rating (1-5)
        df['satisfaction_rating'] = np.where(
            df['group'] == 'A',
            np.random.choice([1, 2, 3, 4, 5], n_users, p=[0.05, 0.15, 0.30, 0.35, 0.15]),
            np.random.choice([1, 2, 3, 4, 5], n_users, p=[0.03, 0.12, 0.25, 0.40, 0.20])
        )
        
        # Amount invested (KES)
        # Non-converted users get 0; converted users get an unclipped normal draw.
        df['amount_invested'] = np.where(
            (df['conversion_rate'] == 1) & (df['group'] == 'A'),
            np.random.normal(15000, 5000, n_users),
            np.where(
                (df['conversion_rate'] == 1) & (df['group'] == 'B'),
                np.random.normal(18000, 6000, n_users),
                0
            )
        )
        
    elif config['test_name'] == 'response_style':
        # Group A: Formal responses
        # Group B: Conversational responses
        
        # Satisfaction rating (1-5)
        df['satisfaction_rating'] = np.where(
            df['group'] == 'A',
            np.random.choice([1, 2, 3, 4, 5], n_users, p=[0.05, 0.15, 0.40, 0.30, 0.10]),
            np.random.choice([1, 2, 3, 4, 5], n_users, p=[0.03, 0.10, 0.27, 0.40, 0.20])
        )
        
        # Engagement time (minutes)
        df['engagement_time'] = np.where(
            df['group'] == 'A',
            np.random.normal(4.5, 2.0, n_users),
            np.random.normal(6.2, 2.5, n_users)
        )
        
        # Number of messages
        df['messages_sent'] = np.where(
            df['group'] == 'A',
            np.random.poisson(6, n_users),
            np.random.poisson(9, n_users)
        )
        
        # Return visit within 7 days (1 = yes, 0 = no)
        df['return_visit'] = np.where(
            df['group'] == 'A',
            np.random.binomial(1, 0.30, n_users),
            np.random.binomial(1, 0.45, n_users)
        )
    
    elif config['test_name'] == 'ui_design':
        # Group A: Text-focused UI
        # Group B: Visual UI with charts and graphics
        
        # Task completion (1 = completed, 0 = not completed)
        df['task_completed'] = np.where(
            df['group'] == 'A',
            np.random.binomial(1, 0.65, n_users),
            np.random.binomial(1, 0.75, n_users)
        )
        
        # Time to completion (minutes) - only for those who completed the task
        completion_time_a = np.random.gamma(6, 0.5, n_users)  # Shape, scale
        completion_time_b = np.random.gamma(4, 0.6, n_users)  # Shape, scale
        
        # Users who did not complete the task get NaN here
        df['time_to_completion'] = np.where(
            df['task_completed'] == 1,
            np.where(df['group'] == 'A', completion_time_a, completion_time_b),
            np.nan
        )
        
        # Satisfaction rating (1-5)
        df['satisfaction_rating'] = np.where(
            df['group'] == 'A',
            np.random.choice([1, 2, 3, 4, 5], n_users, p=[0.05, 0.15, 0.35, 0.35, 0.10]),
            np.random.choice([1, 2, 3, 4, 5], n_users, p=[0.03, 0.10, 0.25, 0.37, 0.25])
        )
        
        # Information retention (percentage of info retained in follow-up quiz)
        # Beta draws lie in [0, 1]; scaled to a 0-100 percentage.
        df['info_retention'] = np.where(
            df['group'] == 'A',
            np.random.beta(5, 3, n_users) * 100,  # More cognitive processing with text
            np.random.beta(7, 3, n_users) * 100   # Better retention with visuals
        )
    
    return df

# Generate the dataset for the first configured experiment (language preference)
language_test_config = ab_test_configs[0]
language_test_data = simulate_ab_test_data(language_test_config, n_users=1500)

print(f"Simulated data for {language_test_config['test_name']} test:")
print(language_test_data.head())
print(f"Shape: {language_test_data.shape}")

In [None]:
def analyze_ab_test(df, metric, group_col='group', alpha=0.05):
    """
    Analyze A/B test results for a specific metric

    Compares group 'B' against group 'A' (rows with a missing metric are
    dropped) and reports descriptive statistics, a significance test, a
    confidence interval for the difference in means and an effect size.

    Parameters:
    - df: DataFrame with test data
    - metric: Column name of the metric to analyze
    - group_col: Column name for the group assignment (default: 'group')
    - alpha: Significance level (default: 0.05); also sets the confidence
      level (1 - alpha) of the reported interval

    Returns:
    - Dictionary with analysis results. 'ci_low'/'ci_high' bound the Welch
      confidence interval for mean_b - mean_a and are NaN when either group
      has fewer than two observations.

    Raises:
    - ValueError: if either group has no non-missing observations
    """
    # Get data for each group
    group_a = df.loc[df[group_col] == 'A', metric].dropna()
    group_b = df.loc[df[group_col] == 'B', metric].dropna()
    n_a, n_b = len(group_a), len(group_b)

    if n_a == 0 or n_b == 0:
        raise ValueError(
            f"Both groups need data for metric '{metric}' "
            f"(got {n_a} rows in A, {n_b} rows in B)"
        )

    # Basic statistics
    mean_a = group_a.mean()
    mean_b = group_b.mean()
    median_a = group_a.median()
    median_b = group_b.median()
    std_a = group_a.std()
    std_b = group_b.std()

    # Absolute difference and relative lift
    abs_diff = mean_b - mean_a
    rel_lift = (mean_b - mean_a) / mean_a * 100 if mean_a != 0 else float('inf')

    # Statistical significance test
    if n_a > 30 and n_b > 30:  # Large sample condition for t-test
        _, p_value = stats.ttest_ind(group_a, group_b, equal_var=False)
        test_name = "Welch's t-test"
    else:
        # Small samples: fall back to a non-parametric, two-sided test
        _, p_value = stats.mannwhitneyu(group_a, group_b, alternative='two-sided')
        test_name = "Mann-Whitney U test"

    # Confidence interval for the difference in means (B - A).
    # The interval of a difference is NOT the difference of the two per-group
    # intervals; use the Welch standard error with Welch-Satterthwaite
    # degrees of freedom, matching the unequal-variance t-test above.
    if n_a > 1 and n_b > 1:
        se_sq_a = std_a ** 2 / n_a
        se_sq_b = std_b ** 2 / n_b
        std_err = np.sqrt(se_sq_a + se_sq_b)
        if std_err > 0:
            dof = std_err ** 4 / (se_sq_a ** 2 / (n_a - 1) + se_sq_b ** 2 / (n_b - 1))
            margin = stats.t.ppf(1 - alpha / 2, dof) * std_err
        else:
            # No variation in either group: the interval collapses to a point
            margin = 0.0
        ci_low, ci_high = abs_diff - margin, abs_diff + margin
    else:
        ci_low = ci_high = float('nan')

    # Effect size - Cohen's d for continuous variables
    pooled_std = np.sqrt((std_a**2 + std_b**2) / 2)
    cohens_d = abs_diff / pooled_std if pooled_std != 0 else float('inf')

    # Results dictionary
    results = {
        'metric': metric,
        'mean_a': mean_a,
        'mean_b': mean_b,
        'median_a': median_a,
        'median_b': median_b,
        'std_a': std_a,
        'std_b': std_b,
        'abs_diff': abs_diff,
        'rel_lift': rel_lift,
        'p_value': p_value,
        'significant': p_value < alpha,
        'test_name': test_name,
        'ci_low': ci_low,
        'ci_high': ci_high,
        'cohens_d': cohens_d,
        'sample_size_a': n_a,
        'sample_size_b': n_b
    }

    return results

# Run the A/B analysis once per metric of the language preference test
language_analysis = {
    metric_name: analyze_ab_test(language_test_data, metric_name)
    for metric_name in ['engagement_time', 'messages_sent', 'task_completed', 'satisfaction_rating']
}

# Print a per-metric report
print("\nLanguage Preference Test Analysis Results:")
for metric, results in language_analysis.items():
    verdict = 'Yes' if results['significant'] else 'No'
    report_lines = [
        f"\nMetric: {metric}",
        f"Group A (English only): Mean = {results['mean_a']:.2f}, Median = {results['median_a']:.2f}, Std = {results['std_a']:.2f}",
        f"Group B (Bilingual): Mean = {results['mean_b']:.2f}, Median = {results['median_b']:.2f}, Std = {results['std_b']:.2f}",
        f"Absolute Difference: {results['abs_diff']:.2f}",
        f"Relative Lift: {results['rel_lift']:.2f}%",
        f"p-value ({results['test_name']}): {results['p_value']:.4f}",
        f"Statistically Significant: {verdict}",
        f"95% Confidence Interval: [{results['ci_low']:.2f}, {results['ci_high']:.2f}]",
        f"Effect Size (Cohen's d): {results['cohens_d']:.2f}",
    ]
    print("\n".join(report_lines))

In [None]:
def segment_analysis(df, metric, segment_col, group_col='group', alpha=0.05):
    """
    Analyze A/B test results segmented by a specific column

    For every distinct value of `segment_col`, compares group 'B' with group
    'A' on `metric` using Welch's t-test. Segments with fewer than 30 rows, or
    with fewer than 15 non-missing observations in either group, are skipped.

    Parameters:
    - df: DataFrame with test data
    - metric: Column name of the metric to analyze
    - segment_col: Column name for segmentation
    - group_col: Column name for the group assignment (default: 'group')
    - alpha: Significance level used for the 'significant' flag (default: 0.05)

    Returns:
    - DataFrame with one row per analyzed segment and the columns
      segment, segment_size, mean_a, mean_b, abs_diff, rel_lift, p_value,
      significant. The columns are present even when no segment qualifies.
    """
    result_columns = ['segment', 'segment_size', 'mean_a', 'mean_b',
                      'abs_diff', 'rel_lift', 'p_value', 'significant']
    results = []

    for segment in df[segment_col].unique():
        segment_df = df[df[segment_col] == segment]

        # Skip segments with too few samples
        if len(segment_df) < 30:
            continue

        group_a = segment_df.loc[segment_df[group_col] == 'A', metric].dropna()
        group_b = segment_df.loc[segment_df[group_col] == 'B', metric].dropna()

        # Skip if either group has too few samples
        if len(group_a) < 15 or len(group_b) < 15:
            continue

        mean_a = group_a.mean()
        mean_b = group_b.mean()
        abs_diff = mean_b - mean_a
        rel_lift = abs_diff / mean_a * 100 if mean_a != 0 else float('inf')

        # Welch's t-test (no equal-variance assumption)
        _, p_value = stats.ttest_ind(group_a, group_b, equal_var=False)

        results.append({
            'segment': segment,
            'segment_size': len(segment_df),
            'mean_a': mean_a,
            'mean_b': mean_b,
            'abs_diff': abs_diff,
            'rel_lift': rel_lift,
            'p_value': p_value,
            'significant': bool(p_value < alpha)
        })

    # Passing columns keeps the schema stable when `results` is empty, so
    # callers can index result columns without a KeyError.
    return pd.DataFrame(results, columns=result_columns)

In [None]:
# Break the engagement-time comparison down by each user's preferred language
language_segments = segment_analysis(
    language_test_data,
    metric='engagement_time',
    segment_col='preferred_language',
)
print("\nSegmentation Analysis for Engagement Time by Language Preference:")
print(language_segments)

In [None]:
# Box plots of the main metrics, one panel per metric
metrics_to_plot = ['engagement_time', 'satisfaction_rating', 'messages_sent']

fig, axes = plt.subplots(1, len(metrics_to_plot), figsize=(18, 6))

for panel, metric in zip(axes, metrics_to_plot):
    sns.boxplot(x='group', y=metric, data=language_test_data, ax=panel)
    panel.set_title(f'Distribution of {metric}')
    panel.set_xlabel('Test Group')
    panel.set_ylabel(metric)

plt.tight_layout()
plt.show()

# Interaction of language preference and test group on engagement time
plt.figure(figsize=(12, 8))
interaction_ax = sns.barplot(x='preferred_language', y='engagement_time', hue='group',
                             data=language_test_data, ci=68)
interaction_ax.set_title('Engagement Time by Language Preference and Test Group')
interaction_ax.set_xlabel('Preferred Language')
interaction_ax.set_ylabel('Engagement Time (minutes)')
interaction_ax.legend(title='Test Group', loc='upper right')
plt.show()

# Mean task completion per (group, language) cell
task_completion = (
    language_test_data
    .groupby(['group', 'preferred_language'])['task_completed']
    .mean()
    .reset_index()
)
plt.figure(figsize=(10, 6))
completion_ax = sns.barplot(x='preferred_language', y='task_completed', hue='group',
                            data=task_completion)
completion_ax.set_title('Task Completion Rate by Language Preference and Test Group')
completion_ax.set_xlabel('Preferred Language')
completion_ax.set_ylabel('Task Completion Rate')
completion_ax.legend(title='Test Group', loc='upper right')
plt.show()

# Share of each satisfaction rating within its test group
plt.figure(figsize=(12, 6))
satisfaction_counts = (
    language_test_data
    .groupby(['group', 'satisfaction_rating'])
    .size()
    .reset_index(name='count')
)
# Divide every count by the total of its own group to get a percentage
group_totals = satisfaction_counts.groupby('group')['count'].transform('sum')
satisfaction_pct = satisfaction_counts.assign(
    percentage=satisfaction_counts['count'] / group_totals * 100
)

rating_ax = sns.barplot(x='satisfaction_rating', y='percentage', hue='group',
                        data=satisfaction_pct)
rating_ax.set_title('Satisfaction Rating Distribution by Test Group')
rating_ax.set_xlabel('Satisfaction Rating (1-5)')
rating_ax.set_ylabel('Percentage of Users')
rating_ax.legend(title='Test Group', loc='upper right')
plt.show()

In [None]:
# Group ordering and colours shared by the interactive charts below
group_order = {'group': ['A', 'B']}
group_colors = {'A': '#636EFA', 'B': '#EF553B'}

# Interactive box plot: engagement time per preferred language, split by group
engagement_labels = {
    'preferred_language': 'Preferred Language',
    'engagement_time': 'Engagement Time (minutes)',
    'group': 'Test Group',
}
fig = px.box(
    language_test_data,
    x='preferred_language',
    y='engagement_time',
    color='group',
    title='Engagement Time by Language Preference',
    labels=engagement_labels,
    category_orders=group_order,
    color_discrete_map=group_colors,
)
fig.update_layout(legend_title_text='Test Group')
fig.show()

# Interactive bar chart: completion rate (%) per language, faceted by age group
task_success_pct = (
    language_test_data
    .groupby(['group', 'preferred_language', 'age_group'])['task_completed']
    .mean()
    .reset_index()
)
task_success_pct['task_completed_pct'] = task_success_pct['task_completed'] * 100

completion_labels = {
    'preferred_language': 'Preferred Language',
    'task_completed_pct': 'Task Completion Rate (%)',
    'group': 'Test Group',
    'age_group': 'Age Group',
}
fig = px.bar(
    task_success_pct,
    x='preferred_language',
    y='task_completed_pct',
    color='group',
    facet_col='age_group',
    title='Task Completion Rate by Segment',
    labels=completion_labels,
    category_orders=group_order,
    color_discrete_map=group_colors,
)
fig.update_layout(legend_title_text='Test Group')
fig.show()

# Sankey diagram for user flows (Group B only):
# all users -> preferred language -> task outcome -> satisfaction rating
group_b_users = language_test_data[language_test_data['group'] == 'B']

# Fixed category orders so every lookup below is defined even when a
# category happens to be absent from the data (reindex fills it with 0).
sankey_languages = ['English', 'Swahili']
sankey_outcomes = [1, 0]            # 1 = completed, 0 = not completed
sankey_ratings = [1, 2, 3, 4, 5]

# Create flow data
language_counts = (
    group_b_users['preferred_language']
    .value_counts()
    .reindex(sankey_languages, fill_value=0)
)
completed_by_language = (
    group_b_users.groupby('preferred_language')['task_completed']
    .value_counts().unstack().fillna(0)
    .reindex(index=sankey_languages, columns=sankey_outcomes, fill_value=0)
)
satisfaction_by_completion = (
    group_b_users.groupby('task_completed')['satisfaction_rating']
    .value_counts().unstack().fillna(0)
    .reindex(index=sankey_outcomes, columns=sankey_ratings, fill_value=0)
)

# Define nodes and links
label = ['Group B Users', 'English', 'Swahili', 'Completed Task', 'Didn\'t Complete', 
        'Rating 1', 'Rating 2', 'Rating 3', 'Rating 4', 'Rating 5']

# Node indices for source/target (positions in `label`)
source = [0, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4]
target = [1, 2, 3, 4, 3, 4, 5, 6, 7, 8, 9, 5, 6, 7, 8, 9]

# Flow values, all taken from the Group B data
english_users = language_counts['English']
swahili_users = language_counts['Swahili']
english_completed = completed_by_language.loc['English', 1]
english_not_completed = completed_by_language.loc['English', 0]
swahili_completed = completed_by_language.loc['Swahili', 1]
swahili_not_completed = completed_by_language.loc['Swahili', 0]

# Rating counts per task outcome, ordered Rating 1..5 to match `target`.
# These replace the earlier hard-coded example numbers, which did not add up
# to the actual number of completed / not-completed users.
completed_ratings = satisfaction_by_completion.loc[1].tolist()
not_completed_ratings = satisfaction_by_completion.loc[0].tolist()

value = [english_users, swahili_users, 
        english_completed, english_not_completed,
        swahili_completed, swahili_not_completed,
        *completed_ratings, *not_completed_ratings]

# Create Sankey diagram
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=label,
        color=["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A", 
              "#19D3F3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52"]
    ),
    link=dict(
        source=source,
        target=target,
        value=value
    ))])

fig.update_layout(title_text="User Flow for Group B (Bilingual Interface)",
                 font_size=12)
fig.show()

In [None]:
def calculate_power(sample_size_per_group, baseline_conversion, minimum_detectable_effect, alpha=0.05):
    """
    Calculate the statistical power of a two-sided, two-proportion test for a
    given sample size and effect (normal approximation; the inverse of
    calculate_sample_size's formula).

    Parameters:
    - sample_size_per_group: Number of users per variant
    - baseline_conversion: Expected conversion rate for control group
    - minimum_detectable_effect: Smallest meaningful absolute difference to
      detect; may be negative (a decrease) - only its magnitude drives the
      detectability, while its sign sets the variant's conversion rate
    - alpha: Significance level (Type I error probability)

    Returns:
    - power: Power of the test (1 - Type II error probability). The result is
      NaN when baseline_conversion or baseline_conversion +
      minimum_detectable_effect falls outside [0, 1], because the variance
      terms are then negative.
    """
    # Standard normal critical value for alpha (two-sided)
    z_alpha = stats.norm.ppf(1 - alpha/2)

    # Standard deviations under null and alternative
    variant_conversion = baseline_conversion + minimum_detectable_effect
    sd1 = np.sqrt(2 * baseline_conversion * (1 - baseline_conversion))
    sd2 = np.sqrt(baseline_conversion * (1 - baseline_conversion) + 
                  variant_conversion * (1 - variant_conversion))

    # Calculate the critical value for beta (Type II error).
    # Use the magnitude of the effect: a two-sided test detects a drop as
    # readily as a rise, whereas the signed value would drive power to ~0
    # for any negative effect.
    z_beta = (abs(minimum_detectable_effect) * np.sqrt(sample_size_per_group) - z_alpha * sd1) / sd2

    # Calculate power
    power = stats.norm.cdf(z_beta)

    return power

# Calculate power for our tests.
# calculate_power() models a proportion, so it is only valid for 0/1 metrics.
# Feeding it the mean of a continuous metric (minutes, message counts, 1-5
# ratings) as a "baseline conversion" makes the variance term negative and the
# power NaN. Continuous metrics therefore use a two-sample normal
# approximation based on the observed standard deviations instead.
power_alpha = 0.05

print("\nPower Analysis for Actual Sample Sizes:")
for metric, results in language_analysis.items():
    baseline = results['mean_a']
    observed_diff = results['abs_diff']

    # Skip if baseline is zero to avoid division by zero
    if baseline == 0:
        continue

    mde = observed_diff / 2  # Use half of observed difference as MDE
    sample_size = min(results['sample_size_a'], results['sample_size_b'])

    is_binary_metric = language_test_data[metric].dropna().isin([0, 1]).all()
    if is_binary_metric:
        power = calculate_power(sample_size, baseline, mde, alpha=power_alpha)
    else:
        # Standard error of a difference in means with n users per group
        pooled_std = np.sqrt((results['std_a'] ** 2 + results['std_b'] ** 2) / 2)
        standard_error = pooled_std * np.sqrt(2 / sample_size)
        z_alpha = stats.norm.ppf(1 - power_alpha / 2)
        if standard_error > 0:
            power = stats.norm.cdf(abs(mde) / standard_error - z_alpha)
        else:
            power = float('nan')

    print(f"Metric: {metric}")
    print(f"Sample Size per Group: {sample_size}")
    print(f"Baseline (Group A): {baseline:.4f}")
    print(f"Observed Difference: {observed_diff:.4f}")
    print(f"Minimum Detectable Effect: {mde:.4f}")
    print(f"Statistical Power: {power:.4f}\n")

In [None]:
def regression_analysis(df, target, treatment_col='group', controls=None):
    """
    Perform OLS regression of a target on the treatment indicator,
    optionally controlling for confounding variables.

    The input DataFrame is not modified: the treatment dummy is added to a copy.

    Parameters:
    - df: DataFrame with test data
    - target: Name of the target variable (left-hand side of the formula)
    - treatment_col: Name of the treatment variable; rows equal to 'B' are treated
    - controls: List of control terms to include, in formula syntax
      (e.g. 'C(age_group)' for a categorical control)

    Returns:
    - Fitted statsmodels OLS results; the 'treatment' coefficient is the
      estimated effect of Group B relative to Group A.
    """
    # Create treatment dummy (1 for B, 0 for A) on a copy so the caller's
    # DataFrame does not silently gain a 'treatment' column.
    model_data = df.assign(treatment=(df[treatment_col] == 'B').astype(int))

    # Prepare formula: "<target> ~ treatment [+ control ...]"
    terms = ['treatment', *(controls or [])]
    formula = f"{target} ~ " + " + ".join(terms)

    # Fit regression model
    model = sm.OLS.from_formula(formula, data=model_data).fit()

    return model

# Engagement time: regress on the treatment dummy, adjusting for
# preferred language and age group (both entered as categorical terms)
engagement_reg = regression_analysis(
    df=language_test_data,
    target='engagement_time',
    controls=['C(preferred_language)', 'C(age_group)'],
)

# Show only the coefficient table of the fitted model
print("\nRegression Analysis for Engagement Time:", engagement_reg.summary().tables[1], sep="\n")

# Task completion: same specification with a different target
task_reg = regression_analysis(
    df=language_test_data,
    target='task_completed',
    controls=['C(preferred_language)', 'C(age_group)'],
)

print("\nRegression Analysis for Task Completion:", task_reg.summary().tables[1], sep="\n")

In [None]:
print("\nConclusions for Language Preference A/B Test:")
print("===========================================")

# Statistical significance alone does not imply improvement: a significant
# negative lift means Group B is worse, so direction is checked explicitly.
primary_metric = 'engagement_time'
primary_result = language_analysis[primary_metric]
primary_lift = primary_result['rel_lift']
primary_significant = bool(primary_result['significant'])
primary_improved = primary_significant and primary_lift > 0
primary_declined = primary_significant and primary_lift < 0

if primary_improved:
    print(f"✅ Primary metric ({primary_metric}) shows statistically significant improvement:")
    print(f"   Group B (Bilingual interface) outperforms Group A (English only) by {primary_lift:.2f}%")
elif primary_declined:
    print(f"❌ Primary metric ({primary_metric}) shows a statistically significant decline:")
    print(f"   Group B (Bilingual interface) vs Group A (English only): {primary_lift:.2f}%")
else:
    print(f"❌ Primary metric ({primary_metric}) does not show statistically significant improvement:")
    print(f"   Group B (Bilingual interface) vs Group A (English only): {primary_lift:.2f}%")
print(f"   p-value: {primary_result['p_value']:.4f}")

# Check secondary metrics, separating significant gains from significant declines
significant_secondary = []
declining_secondary = []
non_significant_secondary = []

for metric in ['messages_sent', 'task_completed', 'satisfaction_rating']:
    lift = language_analysis[metric]['rel_lift']
    if not language_analysis[metric]['significant']:
        non_significant_secondary.append(f"{metric} ({lift:.2f}%)")
    elif lift < 0:
        # The negative sign is already part of the formatted number
        declining_secondary.append(f"{metric} ({lift:.2f}%)")
    else:
        significant_secondary.append(f"{metric} (+{lift:.2f}%)")

if significant_secondary:
    print(f"\n✅ Secondary metrics showing significant improvement:")
    for metric in significant_secondary:
        print(f"   - {metric}")

if declining_secondary:
    print(f"\n⚠️ Secondary metrics showing significant decline:")
    for metric in declining_secondary:
        print(f"   - {metric}")

if non_significant_secondary:
    print(f"\n❌ Secondary metrics without significant improvement:")
    for metric in non_significant_secondary:
        print(f"   - {metric}")

# Segment analysis: the row with the largest 'abs_diff' is the key segment.
# Guarded so an empty segment table does not raise IndexError on .iloc[0].
key_segment = None
print(f"\n👥 Segment Analysis:")
if language_segments.empty:
    print("   - No segment results available")
else:
    key_segment = language_segments.sort_values('abs_diff', ascending=False).iloc[0]
    segment_direction = 'improvement' if key_segment['rel_lift'] > 0 else 'change'
    print(f"   - Largest impact observed for {key_segment['segment']} users: {key_segment['rel_lift']:.2f}% {segment_direction}")

# Final recommendation
if primary_improved and primary_lift > 5:
    # Name the segment found in the data instead of assuming which one benefits most
    segment_note = (
        f", with the largest impact among {key_segment['segment']} users"
        if key_segment is not None else ""
    )
    print("\n🚀 RECOMMENDATION: Implement Bilingual Interface (Group B)")
    print(f"   The bilingual interface significantly improves engagement time by {primary_lift:.2f}%{segment_note}.")
    print(f"   This demonstrates the importance of language localization for PesaGuru's target audience in Kenya.")
elif primary_improved:
    print("\n🤔 RECOMMENDATION: Consider Implementing Bilingual Interface (Group B)")
    print(f"   While the improvement is statistically significant, the effect size ({primary_lift:.2f}%) is modest.")
    print(f"   Consider the development costs versus the expected user engagement benefits.")
elif primary_declined:
    print("\n⛔ RECOMMENDATION: Do Not Implement Bilingual Interface (Group B)")
    print(f"   The primary metric is significantly worse for Group B ({primary_lift:.2f}%).")
    print(f"   Keep the English-only interface and investigate the cause before retesting.")
else:
    print("\n⚠️ RECOMMENDATION: Further Testing Required")
    print(f"   The results are inconclusive. Consider a larger sample size or different implementation approach.")

# Action items follow the recommendation: rollout steps are only listed when
# the primary metric actually improved.
if primary_improved:
    action_items = [
        "Implement bilingual interface with option to toggle between English and Swahili",
        "Monitor impact on actual user engagement and conversion metrics",
        "Consider more localization features beyond language (e.g., local financial terms)",
        "Develop educational content in both languages",
    ]
else:
    action_items = [
        "Re-test the bilingual interface with a larger sample size or a revised implementation before rollout",
        "Consider more localization features beyond language (e.g., local financial terms)",
        "Develop educational content in both languages",
    ]

print("\n📊 Action Items:")
for item_number, item in enumerate(action_items, start=1):
    print(f"   {item_number}. {item}")

In [None]:
print("\nProposed Future A/B Tests for PesaGuru:")
print("====================================")
print("1. Response Style Test: Compare formal vs. conversational tone")
print("2. Recommendation Algorithm: Rule-based vs. ML-based investment suggestions")
print("3. UI Design: Text-focused vs. Visualization-rich interface")
print("4. Onboarding Flow: Quick start vs. Guided tutorial")
print("5. Notification Strategy: Frequency and content of financial alerts")


def _to_json_safe(value):
    """Convert values the json module cannot encode natively (used as `default=`)."""
    if isinstance(value, np.generic):
        # NumPy scalars such as np.bool_ / np.int64 -> plain Python scalars
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, datetime):
        # Also covers pandas.Timestamp, which subclasses datetime
        return value.isoformat()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Save results for future reference
primary_result = language_analysis[primary_metric]

# Only recommend rollout when the significant effect favours Group B;
# a significant negative lift must not be stored as "Implement".
recommend_rollout = bool(primary_result['significant']) and primary_result['rel_lift'] > 0

results_summary = {
    'test_name': ab_test_configs[0]['test_name'],
    'test_date': datetime.now().strftime("%Y-%m-%d"),
    'primary_metric': primary_metric,
    'primary_metric_results': primary_result,
    'secondary_metrics': {m: language_analysis[m] for m in ['messages_sent', 'task_completed', 'satisfaction_rating']},
    'segment_analysis': language_segments.to_dict(),
    'recommendation': 'Implement Bilingual Interface' if recommend_rollout else 'Further Testing Required'
}

print("\nSaving results summary to file...")
# Serialize first so a serialization error cannot leave a truncated JSON file behind
serialized_summary = json.dumps(results_summary, indent=2, default=_to_json_safe)
with open('ab_test_results_language_preference.json', 'w', encoding='utf-8') as f:
    f.write(serialized_summary)

print("A/B Testing complete! Results saved for review.")