# Data Generation

## Dataset Description

Since we don't have real data, we've made a **synthetic dataset** that simulates user behavior based on realistic assumptions and probability distributions.


### Key Columns
- **user_id**: A unique identifier for each user.
- **group**: Either 'control' or 'experiment', indicating whether the user belongs to the control group or the experiment group.
- **session_date**: The date and time of the user's session.
- **product_views**: The number of products viewed by the user during the session.
- **cart_adds**: The number of items added to the cart.
- **purchase_amount**: The total amount spent by the user in the session (0 if no purchase was made).
- **session_duration**: The duration of the session in minutes.
- **device_type**: The type of device used by the user (mobile, desktop, or tablet).
- **traffic_source**: The source of traffic that brought the user to the site (organic, paid ad, or direct).
- **region**: The region where the user is located (Estonia, Latvia, Lithuania).
- **visitor_type**: Whether the user is a 'new' (first-time) or 'old' (returning) visitor.

## Import Libraries 

In [1]:
import numpy as np
import pandas as pd

In [5]:
def generate_ab_test_data(n_control, n_experiment, start_date, end_date, control_conversion_rate, experiment_conversion_rate, control_arpu_mean, experiment_arpu_mean, output_path='rimi_ab_test.csv', seed=None):
    """Generate a synthetic A/B-test session dataset and optionally save it as CSV.

    One row is produced per user. Control users get ``user_id`` 1..n_control,
    experiment users get the ids that follow.

    Parameters
    ----------
    n_control, n_experiment : int
        Number of rows in the control / experiment group.
    start_date, end_date : str or datetime-like
        Bounds handed to ``pd.date_range(..., freq='h')``; session timestamps
        are sampled (with replacement) from that hourly grid.
        NOTE: with plain dates the grid stops at ``end_date`` 00:00, so the
        final day contributes a single timestamp.
    control_conversion_rate, experiment_conversion_rate : float
        Probability that a row gets a non-zero ``purchase_amount``.
    control_arpu_mean, experiment_arpu_mean : float
        Scale of the order-size mixture (70% exponential with scale
        ``arpu_mean / 3``, 30% log-normal with median ``arpu_mean`` and
        sigma 0.5). The expected paid order is therefore about
        ``0.57 * arpu_mean``, not ``arpu_mean`` itself.
    output_path : str or None, optional
        Where to write the CSV. Defaults to ``'rimi_ab_test.csv'``;
        pass ``None`` to skip writing.
    seed : int or None, optional
        When given, all draws come from a private ``np.random.RandomState(seed)``
        so the result is reproducible and the global NumPy state is untouched.
        When ``None`` the global ``np.random`` functions are used.

    Returns
    -------
    pandas.DataFrame
        Control rows followed by experiment rows, with a fresh 0..n-1 index.
    """
    # The np.random module and RandomState expose the same sampling functions,
    # so either can be used as the source without changing the draw order.
    rng = np.random if seed is None else np.random.RandomState(seed)
    n_total = n_control + n_experiment

    # Hourly timestamps spanning the experiment window
    date_range = pd.date_range(start=start_date, end=end_date, freq='h')

    def generate_session_dates(n_sessions):
        return rng.choice(date_range, size=n_sessions, replace=True)

    # Behavioural metrics, drawn once for both groups and sliced below
    product_views_dist = rng.poisson(5, size=n_total)
    cart_adds_dist = rng.poisson(2, size=n_total)
    session_duration_dist = rng.exponential(scale=10, size=n_total)

    # Categorical attributes, again drawn for both groups at once
    device_type_dist = rng.choice(['mobile', 'desktop', 'tablet'], p=[0.7, 0.25, 0.05], size=n_total)
    traffic_source_dist = rng.choice(['organic', 'paid_ad', 'direct'], p=[0.5, 0.3, 0.2], size=n_total)
    region_dist = rng.choice(['Estonia', 'Latvia', 'Lithuania'], p=[0.3, 0.4, 0.3], size=n_total)
    visitor_type_dist = rng.choice(['new', 'old'], p=[0.3, 0.7], size=n_total)

    def generate_mixed_distribution(size, conversion_rate, arpu_mean):
        # Candidate order sizes: many small (exponential) and some large (log-normal)
        small_orders = rng.exponential(scale=arpu_mean / 3, size=size)
        large_orders = rng.lognormal(mean=np.log(arpu_mean), sigma=0.5, size=size)
        # 0 -> small order (70%), 1 -> large order (30%)
        mix = rng.choice([0, 1], p=[0.7, 0.3], size=size)
        final_orders = np.where(mix == 0, small_orders, large_orders)
        # Non-converting rows spend nothing
        return np.where(rng.rand(size) < conversion_rate, final_orders, 0)

    control_purchase_amount = generate_mixed_distribution(n_control, control_conversion_rate, control_arpu_mean)
    experiment_purchase_amount = generate_mixed_distribution(n_experiment, experiment_conversion_rate, experiment_arpu_mean)

    def build_group(label, rows, purchase_amount):
        # `rows` is the slice of the pooled draws that belongs to this group
        n_rows = rows.stop - rows.start
        return pd.DataFrame({
            'user_id': np.arange(rows.start + 1, rows.stop + 1),
            'group': [label] * n_rows,
            'session_date': generate_session_dates(n_rows),
            'product_views': product_views_dist[rows],
            'cart_adds': cart_adds_dist[rows],
            'purchase_amount': purchase_amount,
            'session_duration': session_duration_dist[rows],
            'device_type': device_type_dist[rows],
            'traffic_source': traffic_source_dist[rows],
            'region': region_dist[rows],
            'visitor_type': visitor_type_dist[rows],
        })

    # Control first, then experiment, so session dates are drawn in that order
    df_control = build_group('control', slice(0, n_control), control_purchase_amount)
    df_experiment = build_group('experiment', slice(n_control, n_total), experiment_purchase_amount)

    # Combine control and experiment groups
    df = pd.concat([df_control, df_experiment], ignore_index=True)

    # Save the dataset unless the caller opted out
    if output_path is not None:
        df.to_csv(output_path, index=False)

    return df



In [25]:
# Parameters for one synthetic experiment; these are notebook-level names
# that are passed straight through to generate_ab_test_data below.
n_control = 4620
n_experiment = 4620
start_date = '2024-08-05'
end_date = '2024-08-14'
control_conversion_rate = 0.041  # Probability that a control row has a non-zero purchase
experiment_conversion_rate = 0.046  # Probability that an experiment row has a non-zero purchase
control_arpu_mean = 90  # Order-size scale for the control group (exponential scale = value / 3, log-normal median = value)
experiment_arpu_mean = 95  # Order-size scale for the experiment group (same mixture as above)

# Generate data (the function also writes the result to 'rimi_ab_test.csv')
df = generate_ab_test_data(n_control, n_experiment, start_date, end_date, control_conversion_rate, experiment_conversion_rate, control_arpu_mean, experiment_arpu_mean)

# Example of the first rows of the dataset
df.head()

Unnamed: 0,user_id,group,session_date,product_views,cart_adds,purchase_amount,session_duration,device_type,traffic_source,region,visitor_type
0,1,control,2024-08-05 15:00:00,8,4,0.0,2.975904,desktop,paid_ad,Latvia,old
1,2,control,2024-08-09 21:00:00,3,3,0.0,0.431187,mobile,direct,Latvia,old
2,3,control,2024-08-07 06:00:00,6,2,208.641245,4.284309,mobile,direct,Latvia,old
3,4,control,2024-08-10 14:00:00,6,4,0.0,40.617022,mobile,organic,Latvia,new
4,5,control,2024-08-06 01:00:00,3,4,0.0,31.487541,mobile,organic,Estonia,old


## Data Cleaning

In [2]:
# Load the dataset from the CSV path that generate_ab_test_data writes to.
# session_date is read back as plain strings here (no parse_dates is given).
df = pd.read_csv('rimi_ab_test.csv')
df.head()

Unnamed: 0,user_id,group,session_date,product_views,cart_adds,purchase_amount,session_duration,device_type,traffic_source,region,visitor_type
0,1,control,2024-08-10 23:00:00,3,1,23.86245,29.8578,desktop,direct,Latvia,old
1,2,control,2024-08-13 11:00:00,5,1,0.0,7.559951,mobile,organic,Latvia,new
2,3,control,2024-08-05 01:00:00,4,2,72.419561,16.423385,mobile,organic,Estonia,old
3,4,control,2024-08-13 06:00:00,4,0,0.0,47.790197,mobile,direct,Estonia,old
4,5,control,2024-08-12 07:00:00,4,2,0.0,7.942583,mobile,paid_ad,Lithuania,old


In [4]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(df):
    """Return a new frame with the spend and duration columns rounded to 2 decimals.

    All other columns are passed through unchanged; the input frame itself
    is not modified because ``DataFrame.round`` returns a new object.
    """
    decimals_by_column = {
        'purchase_amount': 2,   # currency amount
        'session_duration': 2,  # session length
    }
    return df.round(decimals_by_column)

# Clean a copy so the originally loaded frame `df` keeps its unrounded values.
df_clean = clean_data(df.copy())
# NOTE: this writes to the same path the data was read from, so the unrounded
# values on disk are replaced by the rounded ones.
df_clean.to_csv('rimi_ab_test.csv', index=False)
df_clean.head()

Unnamed: 0,user_id,group,session_date,product_views,cart_adds,purchase_amount,session_duration,device_type,traffic_source,region,visitor_type
0,1,control,2024-08-10 23:00:00,3,1,23.86,29.86,desktop,direct,Latvia,old
1,2,control,2024-08-13 11:00:00,5,1,0.0,7.56,mobile,organic,Latvia,new
2,3,control,2024-08-05 01:00:00,4,2,72.42,16.42,mobile,organic,Estonia,old
3,4,control,2024-08-13 06:00:00,4,0,0.0,47.79,mobile,direct,Estonia,old
4,5,control,2024-08-12 07:00:00,4,2,0.0,7.94,mobile,paid_ad,Lithuania,old


In [6]:
import numpy as np
import pandas as pd
from scipy.stats import anderson, shapiro, levene, mannwhitneyu, ttest_ind, chisquare
import time

# Function to generate the A/B test dataset
def generate_ab_test_data(n_control, n_experiment, start_date, end_date, control_conversion_rate, experiment_conversion_rate, control_arpu_mean, experiment_arpu_mean, output_path='rimi_ab_test.csv', seed=None):
    """Build the synthetic A/B-test dataset and, unless disabled, write it to CSV.

    NOTE: the notebook also defines ``generate_ab_test_data`` in an earlier
    cell; once this cell runs, this definition is the one in effect.

    Parameters
    ----------
    n_control, n_experiment : int
        Row counts for the two groups. ``user_id`` runs 1..n_control for
        control and continues from there for experiment.
    start_date, end_date : str or datetime-like
        Bounds of the hourly grid (``pd.date_range(..., freq='h')``) from which
        ``session_date`` is sampled with replacement. With plain dates the
        grid ends at ``end_date`` 00:00.
    control_conversion_rate, experiment_conversion_rate : float
        Probability of a non-zero ``purchase_amount`` per row.
    control_arpu_mean, experiment_arpu_mean : float
        Scale of the order-size mixture: 70% exponential (scale
        ``arpu_mean / 3``) and 30% log-normal (median ``arpu_mean``,
        sigma 0.5), i.e. an expected paid order of roughly ``0.57 * arpu_mean``.
    output_path : str or None, optional
        CSV destination, ``'rimi_ab_test.csv'`` by default; ``None`` skips
        the write.
    seed : int or None, optional
        Seed for a private ``np.random.RandomState``. ``None`` keeps using
        the global ``np.random`` functions.

    Returns
    -------
    pandas.DataFrame
        Control rows followed by experiment rows with a reset index.
    """
    # np.random (module) and RandomState share the sampling API used below.
    rng = np.random if seed is None else np.random.RandomState(seed)
    n_total = n_control + n_experiment

    # Hourly grid of candidate session timestamps
    date_range = pd.date_range(start=start_date, end=end_date, freq='h')

    def generate_session_dates(n_sessions):
        return rng.choice(date_range, size=n_sessions, replace=True)

    # Pooled draws for both groups; each group takes its slice further down
    product_views_dist = rng.poisson(5, size=n_total)
    cart_adds_dist = rng.poisson(2, size=n_total)
    session_duration_dist = rng.exponential(scale=10, size=n_total)
    device_type_dist = rng.choice(['mobile', 'desktop', 'tablet'], p=[0.7, 0.25, 0.05], size=n_total)
    traffic_source_dist = rng.choice(['organic', 'paid_ad', 'direct'], p=[0.5, 0.3, 0.2], size=n_total)
    region_dist = rng.choice(['Estonia', 'Latvia', 'Lithuania'], p=[0.3, 0.4, 0.3], size=n_total)
    visitor_type_dist = rng.choice(['new', 'old'], p=[0.3, 0.7], size=n_total)

    def generate_mixed_distribution(size, conversion_rate, arpu_mean):
        small_orders = rng.exponential(scale=arpu_mean / 3, size=size)
        large_orders = rng.lognormal(mean=np.log(arpu_mean), sigma=0.5, size=size)
        # 0 selects the small order (70%), 1 the large order (30%)
        mix = rng.choice([0, 1], p=[0.7, 0.3], size=size)
        final_orders = np.where(mix == 0, small_orders, large_orders)
        # Rows that do not convert get a purchase amount of 0
        return np.where(rng.rand(size) < conversion_rate, final_orders, 0)

    control_purchase_amount = generate_mixed_distribution(n_control, control_conversion_rate, control_arpu_mean)
    experiment_purchase_amount = generate_mixed_distribution(n_experiment, experiment_conversion_rate, experiment_arpu_mean)

    def build_group(label, rows, purchase_amount):
        # `rows` selects this group's share of the pooled draws
        n_rows = rows.stop - rows.start
        return pd.DataFrame({
            'user_id': np.arange(rows.start + 1, rows.stop + 1),
            'group': [label] * n_rows,
            'session_date': generate_session_dates(n_rows),
            'product_views': product_views_dist[rows],
            'cart_adds': cart_adds_dist[rows],
            'purchase_amount': purchase_amount,
            'session_duration': session_duration_dist[rows],
            'device_type': device_type_dist[rows],
            'traffic_source': traffic_source_dist[rows],
            'region': region_dist[rows],
            'visitor_type': visitor_type_dist[rows],
        })

    # Control is built before experiment so session dates are drawn in that order
    df_control = build_group('control', slice(0, n_control), control_purchase_amount)
    df_experiment = build_group('experiment', slice(n_control, n_total), experiment_purchase_amount)
    df = pd.concat([df_control, df_experiment], ignore_index=True)

    if output_path is not None:
        df.to_csv(output_path, index=False)
    return df


# Updated function to include detailed output on success
def analyze_ab_test(file_path, expected_split=(0.5, 0.5)):
    """Validate an A/B-test CSV and print a summary when every check passes.

    Three checks run in order, and the function returns ``False`` as soon as
    one of them fails (nothing is printed in that case):

    1. "Selection bias": ``check_metric`` compares control vs. experiment on
       each numeric metric.
    2. Sample ratio mismatch (SRM): chi-square test of the observed group
       sizes against the sizes implied by ``expected_split``.
    3. "Novelty effect": ``check_metric`` compares new vs. old visitors
       (across both groups) on each numeric metric.

    Parameters
    ----------
    file_path : str
        CSV with at least the columns ``group``, ``visitor_type``,
        ``product_views``, ``cart_adds``, ``purchase_amount`` and
        ``session_duration``. A stray ``'Unnamed: 0'`` index column is dropped.
    expected_split : sequence of two numbers, optional
        Designed (control, experiment) allocation; only the ratio matters.
        Defaults to an even split.

    Returns
    -------
    bool
        ``True`` if all checks pass, ``False`` otherwise.
    """
    data = pd.read_csv(file_path)
    if 'Unnamed: 0' in data.columns:
        data = data.drop(columns=["Unnamed: 0"])

    control_group = data[data['group'] == 'control']
    experiment_group = data[data['group'] == 'experiment']
    new_visitors = data[data['visitor_type'] == 'new']
    recurrent_visitors = data[data['visitor_type'] == 'old']
    metrics = ['product_views', 'cart_adds', 'purchase_amount', 'session_duration']

    # Store results of each test
    results = {
        "selection_bias": {},
        "sample_ratio_mismatch": {},
        "novelty_effect": {}
    }

    # Selection Bias Check
    for metric in metrics:
        is_passed = check_metric(control_group[metric], experiment_group[metric])
        results['selection_bias'][metric] = "Pass" if is_passed else "Fail"
        if not is_passed:
            return False  # Failure: Selection Bias

    # Sample Ratio Mismatch Check
    # The expected counts must come from the designed allocation, not from the
    # observed counts themselves; comparing the counts with themselves always
    # yields chi-square 0 and p-value 1.
    observed = [len(control_group), len(experiment_group)]
    total_users = sum(observed)
    split_total = float(sum(expected_split))
    expected = [total_users * share / split_total for share in expected_split]
    chi2, p_value_srm = chisquare(f_obs=observed, f_exp=expected)
    results['sample_ratio_mismatch'] = {
        "chi_square_stat": chi2,
        "p_value": p_value_srm,
        "status": "Pass" if p_value_srm >= 0.05 else "Fail"
    }

    if p_value_srm < 0.05:
        return False  # Failure: Sample Ratio Mismatch

    # Novelty Effect Check
    for metric in metrics:
        is_passed = check_metric(new_visitors[metric], recurrent_visitors[metric])
        results['novelty_effect'][metric] = "Pass" if is_passed else "Fail"
        if not is_passed:
            return False  # Failure: Novelty Effect

    # Output detailed success information
    print("Success! Dataset meets all criteria.")

    # Print Selection Bias results
    print("\n### Selection Bias Check Results")
    for metric, status in results['selection_bias'].items():
        print(f"{metric}: {status}")

    # Print Sample Ratio Mismatch results
    print("\n### Sample Ratio Mismatch Check Results")
    print(f"Chi-Square Statistic: {results['sample_ratio_mismatch']['chi_square_stat']:.4f}")
    print(f"P-Value: {results['sample_ratio_mismatch']['p_value']:.4f}")
    print(f"Status: {results['sample_ratio_mismatch']['status']}")

    # Print Novelty Effect results
    print("\n### Novelty Effect Check Results")
    for metric, status in results['novelty_effect'].items():
        print(f"{metric}: {status}")

    # Conversion rate = share of rows with a positive purchase amount
    control_conversion = np.sum(control_group['purchase_amount'] > 0) / len(control_group)
    experiment_conversion = np.sum(experiment_group['purchase_amount'] > 0) / len(experiment_group)

    # The "ARPU" reported here averages over purchasers only (rows with a
    # positive purchase amount), as the printed labels state.
    control_arpu = control_group[control_group['purchase_amount'] > 0]['purchase_amount'].mean()
    experiment_arpu = experiment_group[experiment_group['purchase_amount'] > 0]['purchase_amount'].mean()

    print(f"\nControl group conversion rate: {control_conversion * 100:.2f}%")
    print(f"Experiment group conversion rate: {experiment_conversion * 100:.2f}%")
    print(f"ARPU for Control Group (excluding non-purchasers): {control_arpu:.2f}")
    print(f"ARPU for Experiment Group (excluding non-purchasers): {experiment_arpu:.2f}")

    return True  # All tests passed


# Function to check assumptions and perform appropriate test
def check_metric(metric_data_1, metric_data_2):
    """Return True when the two samples show no significant difference at the 5% level.

    Test selection:
      * both samples look normal and Levene's test does not reject equal
        variances -> Student's t-test;
      * both samples look normal and Levene's test rejects -> Welch's t-test;
      * anything else -> Mann-Whitney U test.
    """
    def looks_normal(sample):
        # Shapiro-Wilk for samples of up to 5000 points, Anderson-Darling above
        # that. critical_values[2] is compared against the statistic
        # (presumably the 5% level for the default 'norm' case -- confirm in
        # the SciPy docs).
        if len(sample) <= 5000:
            return shapiro(sample).pvalue > 0.05
        ad_result = anderson(sample)
        return ad_result.statistic < ad_result.critical_values[2]

    # Evaluate both samples up front, then the variance test, in that order.
    normality = [looks_normal(metric_data_1), looks_normal(metric_data_2)]
    variance_p = levene(metric_data_1, metric_data_2).pvalue

    # None means "use the non-parametric test"; a NaN Levene p-value matches
    # neither comparison below and therefore also ends up as None.
    equal_var = None
    if all(normality):
        if variance_p > 0.05:
            equal_var = True
        elif variance_p <= 0.05:
            equal_var = False

    if equal_var is None:
        _, p_value = mannwhitneyu(metric_data_1, metric_data_2)
    else:
        _, p_value = ttest_ind(metric_data_1, metric_data_2, equal_var=equal_var)

    return p_value >= 0.05  # True if no significant difference


# Re-rolling loop to generate and validate the dataset until successful
# Generation parameters are the same for every attempt, so set them once.
n_control = 2790
n_experiment = 2790
start_date = '2024-08-05'
end_date = '2024-08-14'
control_conversion_rate = 0.041
experiment_conversion_rate = 0.046
control_arpu_mean = 90
experiment_arpu_mean = 95

# Upper bound on re-rolls so a dataset that can never pass validation
# cannot hang the notebook.
MAX_ATTEMPTS = 1000

# Re-generate the dataset until analyze_ab_test accepts it.
# NOTE: the accepted dataset is selected on the outcome of the validation
# tests, so it is not a plain random draw from the generator.
for attempt in range(1, MAX_ATTEMPTS + 1):
    df = generate_ab_test_data(n_control, n_experiment, start_date, end_date, control_conversion_rate, experiment_conversion_rate, control_arpu_mean, experiment_arpu_mean)

    # generate_ab_test_data writes this file by default; validate what was saved
    success = analyze_ab_test('rimi_ab_test.csv')

    if success:
        break
    time.sleep(0.1)  # Reduced wait time for faster rerolling
else:
    raise RuntimeError(f"No dataset passed validation after {MAX_ATTEMPTS} attempts")



Success! Dataset meets all criteria.

### Selection Bias Check Results
product_views: Pass
cart_adds: Pass
purchase_amount: Pass
session_duration: Pass

### Sample Ratio Mismatch Check Results
Chi-Square Statistic: 0.0000
P-Value: 1.0000
Status: Pass

### Novelty Effect Check Results
product_views: Pass
cart_adds: Pass
purchase_amount: Pass
session_duration: Pass

Control group conversion rate: 3.98%
Experiment group conversion rate: 4.34%
ARPU for Control Group (excluding non-purchasers): 51.38
ARPU for Experiment Group (excluding non-purchasers): 64.69
