# Data Generation

## Dataset Description

Since we don't have real data, we've made a **synthetic dataset** that simulates user behavior based on realistic assumptions and probability distributions.


### Key Columns
- **user_id**: A unique identifier for each user.
- **group**: Either 'control' or 'experiment', indicating whether the user belongs to the control group or the experiment group.
- **session_date**: The date and time of the user's session.
- **product_views**: The number of products viewed by the user during the session.
- **cart_adds**: The number of items added to the cart.
- **purchase_amount**: The total amount spent by the user in the session (0 if no purchase was made).
- **session_duration**: The duration of the session in minutes.
- **device_type**: The type of device used by the user (mobile, desktop, or tablet).
- **traffic_source**: The source of traffic that brought the user to the site ('organic', 'paid_ad', or 'direct').
- **region**: The region where the user is located (Estonia, Latvia, Lithuania).
- **visitor_type**: Either 'new' or 'old', indicating whether the user is a first-time visitor or a returning customer.

## Import Libraries 

In [34]:
import numpy as np
import pandas as pd

In [35]:
def generate_ab_test_data(n_control, n_experiment, start_date, end_date, control_conversion_rate, experiment_conversion_rate, control_arpu_mean, experiment_arpu_mean, seed=None, output_path='rimi_ab_test.csv'):
    """Generate a synthetic A/B-test dataset with one row per user session.

    Parameters
    ----------
    n_control, n_experiment : int
        Number of rows to generate for the control / experiment group.
    start_date, end_date : str or datetime-like
        Bounds of the hourly grid that session timestamps are sampled from.
        They are passed straight to ``pd.date_range``, so a date-only
        ``end_date`` means midnight at the start of that day.
    control_conversion_rate, experiment_conversion_rate : float
        Probability that a session in the group has a non-zero purchase.
    control_arpu_mean, experiment_arpu_mean : float
        Scale of the purchase-size mixture for each group. NOTE: this is not
        the resulting mean. A purchase is exponential with mean
        ``arpu_mean / 3`` (70%) or log-normal with median ``arpu_mean`` and
        sigma 0.5 (30%), so the expected purchase of a converting session is
        roughly ``0.57 * arpu_mean``.
    seed : int or None, optional
        When given, all draws come from a private ``np.random.RandomState``
        so the dataset is reproducible. When ``None`` (default) the global
        ``np.random`` state is used, as before.
    output_path : str or None, optional
        Where the dataset is written as CSV (without the index). Defaults to
        ``'rimi_ab_test.csv'``; pass ``None`` to skip writing the file.

    Returns
    -------
    pandas.DataFrame
        Control rows followed by experiment rows, with ``user_id`` running
        from 1 to ``n_control + n_experiment``.
    """
    # A private RandomState makes seeded runs reproducible without touching
    # NumPy's global state; without a seed we keep using the global generator.
    rng = np.random if seed is None else np.random.RandomState(seed)
    n_total = n_control + n_experiment

    # Hourly timestamps between start_date and end_date (both inclusive)
    date_range = pd.date_range(start=start_date, end=end_date, freq='h')

    def generate_session_dates(n_sessions):
        # Sample session timestamps uniformly (with replacement) from the grid
        return rng.choice(date_range, size=n_sessions, replace=True)

    # Behavioural metrics are drawn once for all users and sliced per group below
    product_views_dist = rng.poisson(5, size=n_total)
    cart_adds_dist = rng.poisson(2, size=n_total)
    session_duration_dist = rng.exponential(scale=10, size=n_total)

    # Categorical attributes, again drawn for the whole population at once
    device_type_dist = rng.choice(['mobile', 'desktop', 'tablet'], p=[0.7, 0.25, 0.05], size=n_total)
    traffic_source_dist = rng.choice(['organic', 'paid_ad', 'direct'], p=[0.5, 0.3, 0.2], size=n_total)
    region_dist = rng.choice(['Estonia', 'Latvia', 'Lithuania'], p=[0.3, 0.4, 0.3], size=n_total)
    visitor_type_dist = rng.choice(['new', 'old'], p=[0.3, 0.7], size=n_total)

    def generate_mixed_distribution(size, conversion_rate, arpu_mean):
        # Candidate order values: exponential for small, log-normal for large
        small_orders = rng.exponential(scale=arpu_mean / 3, size=size)
        large_orders = rng.lognormal(mean=np.log(arpu_mean), sigma=0.5, size=size)

        # 70% of orders are small, 30% are large
        mix = rng.choice([0, 1], p=[0.7, 0.3], size=size)
        final_orders = np.where(mix == 0, small_orders, large_orders)

        # Only a `conversion_rate` share of sessions buys; the rest spend 0
        return np.where(rng.rand(size) < conversion_rate, final_orders, 0)

    control_purchase_amount = generate_mixed_distribution(n_control, control_conversion_rate, control_arpu_mean)
    experiment_purchase_amount = generate_mixed_distribution(n_experiment, experiment_conversion_rate, experiment_arpu_mean)

    def build_group(label, n_rows, first_user_id, rows, purchase_amount):
        # Assemble one group's frame; `rows` selects its share of the shared draws
        return pd.DataFrame({
            'user_id': np.arange(first_user_id, first_user_id + n_rows),
            'group': [label] * n_rows,
            'session_date': generate_session_dates(n_rows),
            'product_views': product_views_dist[rows],
            'cart_adds': cart_adds_dist[rows],
            'purchase_amount': purchase_amount,
            'session_duration': session_duration_dist[rows],
            'device_type': device_type_dist[rows],
            'traffic_source': traffic_source_dist[rows],
            'region': region_dist[rows],
            'visitor_type': visitor_type_dist[rows],
        })

    # Control takes the first n_control draws, experiment the remainder
    df_control = build_group('control', n_control, 1, slice(None, n_control), control_purchase_amount)
    df_experiment = build_group('experiment', n_experiment, n_control + 1, slice(n_control, None), experiment_purchase_amount)

    # Combine control and experiment groups
    df = pd.concat([df_control, df_experiment], ignore_index=True)

    # Save the dataset unless the caller opted out
    if output_path is not None:
        df.to_csv(output_path, index=False)

    return df



In [71]:
# Set parameters for the control and experiment groups
n_control = 4620  # Number of rows generated for the control group
n_experiment = 4620  # Number of rows generated for the experiment group
start_date = '2024-08-05'  # First timestamp of the hourly session-date grid
end_date = '2024-08-14'  # Last timestamp of the grid (date-only, i.e. 2024-08-14 00:00)
control_conversion_rate = 0.041  # Conversion probability for the control group
experiment_conversion_rate = 0.046  # Conversion probability for the experiment group
# NOTE: the *_arpu_mean values are scale parameters of the purchase-size mixture,
# not the resulting averages: purchases are 70% exponential with mean arpu_mean / 3
# and 30% log-normal with median arpu_mean, so the expected purchase of a
# converting session is roughly 0.57 * arpu_mean.
control_arpu_mean = 90  # Purchase-size scale for the control group
experiment_arpu_mean = 95  # Purchase-size scale for the experiment group

# Generate data (this call also writes the dataset to 'rimi_ab_test.csv')
df = generate_ab_test_data(n_control, n_experiment, start_date, end_date, control_conversion_rate, experiment_conversion_rate, control_arpu_mean, experiment_arpu_mean)

# Example of the first rows of the dataset
df.head()

Unnamed: 0,user_id,group,session_date,product_views,cart_adds,purchase_amount,session_duration,device_type,traffic_source,region,visitor_type
0,1,control,2024-08-12 16:00:00,4,1,0.0,7.034356,desktop,organic,Latvia,old
1,2,control,2024-08-13 04:00:00,1,1,0.0,16.918016,mobile,organic,Estonia,old
2,3,control,2024-08-12 11:00:00,9,0,0.0,0.450504,mobile,paid_ad,Lithuania,old
3,4,control,2024-08-13 23:00:00,4,0,0.0,6.555004,desktop,organic,Latvia,new
4,5,control,2024-08-07 01:00:00,3,5,0.0,4.288196,mobile,organic,Latvia,new


In [72]:
from scipy.stats import chi2_contingency
from scipy import stats
import pandas as pd
from scipy.stats import mannwhitneyu

# Load the dataset written by the data-generation step
df = pd.read_csv('rimi_ab_test.csv')

# Flag a session as converted when it contains a purchase.
# A vectorized comparison replaces the row-by-row apply(lambda ...).
df['conversion'] = (df['purchase_amount'] > 0).astype(int)

# Separate the data into control and experiment groups
control_group = df[df['group'] == 'control']
experiment_group = df[df['group'] == 'experiment']

# Observed conversion rates: the share of sessions with a purchase in each group
control_conversion_rate = control_group['conversion'].mean()
experiment_conversion_rate = experiment_group['conversion'].mean()

# Metrics compared between the groups in the A/A-style check
metrics = ['product_views', 'cart_adds', 'purchase_amount', 'session_duration']

# Welch's t-test (equal_var=False: no equal-variance assumption) for each metric
aa_test_results = {}
for metric in metrics:
    t_stat, p_value = stats.ttest_ind(control_group[metric], experiment_group[metric], equal_var=False)
    aa_test_results[metric] = {"t_stat": t_stat, "p_value": p_value}

# Chi-Square test of independence between group and conversion.
# Rows of the contingency table are groups, columns are conversion 0/1.
conversion_table = pd.crosstab(df['group'], df['conversion'])
chi2_stat, p_value, dof, expected = chi2_contingency(conversion_table)
print(f"Chi-Square test results: \nStatistic = {chi2_stat}, \np-value = {p_value}")

print(f"\nControl group conversion rate: {control_conversion_rate * 100:.2f}%")
print(f"Experiment group conversion rate: {experiment_conversion_rate * 100:.2f}%")

# Print the t-test results after one blank line. The original label
# "'\nproduct_views':" had the newline inside the quote, which printed a
# stray "'" on its own line.
print()
for metric in metrics:
    print(f"'{metric}':", aa_test_results[metric])

# Keep only sessions with a purchase, so order sizes are compared among buyers
purchase_data = df[df['purchase_amount'] > 0]

# Purchase amounts of the buyers in each group
control_group_purchases = purchase_data[purchase_data['group'] == 'control']['purchase_amount']
experiment_group_purchases = purchase_data[purchase_data['group'] == 'experiment']['purchase_amount']

# Mann-Whitney U test: rank-based, so it does not assume normally
# distributed purchase amounts
u_statistic, p_value_mw = mannwhitneyu(control_group_purchases, experiment_group_purchases, alternative='two-sided')

# Output the result
print(f"\nMann-Whitney U Test: \nU-statistic = {u_statistic}, \np-value = {p_value_mw}")

# These are mean purchase amounts among purchasers only, not revenue
# averaged over all users
print(f"\nARPU for Control Group (excluding non-purchasers): {control_group_purchases.mean()}")
print(f"ARPU for Experiment Group (excluding non-purchasers): {experiment_group_purchases.mean()}")

Chi-Square test results: 
Statistic = 5.005639403485943, 
p-value = 0.025264869271318418

Control group conversion rate: 4.09%
Experiment group conversion rate: 5.09%
'
product_views': {'t_stat': -0.004698510153908123, 'p_value': 0.9962512465348898}
'cart_adds': {'t_stat': 0.767527862336702, 'p_value': 0.4427873523800919}
'purchase_amount': {'t_stat': -1.030619343349202, 'p_value': 0.3027464428138045}
'session_duration': {'t_stat': -0.3655564328955553, 'p_value': 0.7147044693584548}

Mann-Whitney U Test: 
U-statistic = 23457.0, 
p-value = 0.3193232155654122

ARPU for Control Group (excluding non-purchasers): 57.29474942055487
ARPU for Experiment Group (excluding non-purchasers): 53.046041600490064
