In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<input sub-directory>:<percent-encoded download URL>" entries
# (split on ',' and ':' by the download loop below).
# The URL's query string carries X-Goog-Date=20240330T225425Z and X-Goog-Expires=259200 (seconds).
# NOTE(review): presumably the link is unusable once that window has passed — regenerate it if the download fails.
DATA_SOURCE_MAPPING = 'ab-test-data:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2479030%2F4213024%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240330%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240330T225425Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D5f0b86438c0438cdc8ee57c072f48e97fdd4d00520b3ba0f962fb3a7a3578c8f1e855972d2ca87b7dd961d5a62208d7b1407eeb74c7d2a2ed84bcaf1e412e953dc50c86a235fb926ad19af56a7c16ee4a06171c7c33d131b9a2ad6235532a9b454c3f278b72fbd7178fdddaf2e591292c477c9e2bd0a8483b31158c306207a768d97e67bb867139040d8978221a23dd793dd104cc3d85f2e09a7742959d6985e9a7df62e3cc561e1752eac2ec707517e5ec8ad98baaee327e8c3eec01b414d8202bf6ea940cbab62cce4dfef5bf0bc262976b2d3f150eee8835f58f7306037ccc81c48189acb74cae0f45ecab679039f649b7819e1e1820d1f67937886264707'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into KAGGLE_INPUT_PATH/<directory> and unpack it.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split only on the first ':' so the entry always yields exactly two parts.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header may be absent; without it the progress bar
            # cannot be scaled, so it simply stays empty instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the `tarfile` module is not shadowed.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        # HTTPError is handled before OSError because it is the more specific of the two.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Failed to load (likely expired) https://storage.googleapis.com/kaggle-data-sets/2479030/4213024/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240330%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240330T225425Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=5f0b86438c0438cdc8ee57c072f48e97fdd4d00520b3ba0f962fb3a7a3578c8f1e855972d2ca87b7dd961d5a62208d7b1407eeb74c7d2a2ed84bcaf1e412e953dc50c86a235fb926ad19af56a7c16ee4a06171c7c33d131b9a2ad6235532a9b454c3f278b72fbd7178fdddaf2e591292c477c9e2bd0a8483b31158c306207a768d97e67bb867139040d8978221a23dd793dd104cc3d85f2e09a7742959d6985e9a7df62e3cc561e1752eac2ec707517e5ec8ad98baaee327e8c3eec01b414d8202bf6ea940cbab62cce4dfef5bf0bc262976b2d3f150eee8835f58f7306037ccc81c48189acb74cae0f45ecab679039f649b7819e1e1820d1f67937886264707 to path /kaggle/input/ab-test-data
Data source import complete.


The AB test is a randomized experiment that is used by most large companies to evaluate the launch of new functionality. Various difficulties may arise in the analysis and conduct of the experiment. Several typical problematic cases from real life are given in this dataset and analysis.

### Content
1. [Loading and processing data](#Loading)
2. [Statistical Analysis](#Statistical)
3. [Conclusions](#Conclusions)

### Loading and processing data <a name="Loading"/>

Let's import the necessary libraries and load the data

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu
from scipy.stats import ttest_ind
from scipy.stats import norm
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import seaborn as sns
from scipy.stats import pearsonr
from scipy.stats import shapiro

In [None]:
# Location of the A/B test results CSV inside the Kaggle input directory.
path = '/kaggle/input/ab-test-data/AB_Test_Results.csv'
# Read the whole file into the DataFrame every later cell works on.
df = pd.read_csv(filepath_or_buffer=path)

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/ab-test-data/AB_Test_Results.csv'

In [None]:
# TODO : take a quick look at the data

# Column dtypes and non-null counts (info() prints by itself).
df.info()
# `shape` and `columns` are attributes, not methods: calling them raises TypeError.
print(f"Shape (rows, columns): {df.shape}")
print(f"Columns: {list(df.columns)}")
# Data-quality checks: missing values per column and fully duplicated rows.
print("Missing values per column:")
print(df.isnull().sum())
print(f"Fully duplicated rows: {df.duplicated().sum()}")
# Summary statistics of the numeric columns.
print(df.describe())
# Last expression of the cell, so the notebook renders it as a table.
df.head()

Let's see if there are users who have two A/B test groups

In [None]:
# TODO : Look at the distribution of the number of unique groups for each user

# NOTE(review): 'USER_ID' is assumed to be the user-id column (this notebook's plotting
# cells only confirm 'VARIANT_NAME' and 'REVENUE') — verify with df.columns.
unique_groups_per_user = df.groupby('USER_ID')['VARIANT_NAME'].nunique()

print("Distribution of Unique Groups per User:")
print(unique_groups_per_user.describe())

max_groups = unique_groups_per_user.max()
fig, ax = plt.subplots(figsize=(10, 6))
# One unit-wide bin per possible count; align='left' centres each bar on its integer.
ax.hist(unique_groups_per_user, bins=range(1, max_groups + 2), edgecolor='black', align='left')
ax.set_xticks(range(1, max_groups + 1))
ax.set_title('Distribution of Number of Unique Groups per User')
ax.set_xlabel('Number of Unique Groups')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# TODO : Look at the distribution of the number of unique groups for each user in relative terms

# NOTE(review): 'USER_ID' is assumed to be the user-id column (this notebook's plotting
# cells only confirm 'VARIANT_NAME' and 'REVENUE') — verify with df.columns.
unique_groups_per_user = df.groupby('USER_ID')['VARIANT_NAME'].nunique()
# Share of users per number of groups, ordered by the number of groups.
relative_frequency = unique_groups_per_user.value_counts(normalize=True).sort_index()

print("Relative Distribution of Unique Groups per User:")
print(relative_frequency)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(relative_frequency.index, relative_frequency.values, edgecolor='black')
ax.set_title('Relative Distribution of Number of Unique Groups per User')
ax.set_xlabel('Number of Unique Groups')
ax.set_ylabel('Proportion')
ax.set_xticks(relative_frequency.index)
ax.grid(True, axis='y')
plt.show()

What can you observe ? Should we exclude these users ? Why ?

In [None]:
# TODO : users have 1 A/B test group ?

# NOTE(review): 'USER_ID' is assumed to be the user-id column (this notebook's plotting
# cells only confirm 'VARIANT_NAME' and 'REVENUE') — verify with df.columns.
unique_groups_per_user = df.groupby('USER_ID')['VARIANT_NAME'].nunique()
users_with_one_group = unique_groups_per_user[unique_groups_per_user == 1]
print(f"Number of users with exactly one A/B test group: {len(users_with_one_group)}")
print("\nSample of users with exactly one A/B test group:")
print(users_with_one_group.head())

# TODO : Take only these users

# Keep every record (both variants) that belongs to a user seen in exactly one variant.
filtered_df = df[df['USER_ID'].isin(users_with_one_group.index)]
print("Filtered DataFrame Info:")
filtered_df.info()  # info() prints itself and returns None, so it is not wrapped in print()
print("\nSample of Filtered DataFrame:")
print(filtered_df.head())

In [None]:
# TODO : Leave only records with users with the 1st ab test group

# NOTE(review): 'USER_ID' is assumed to be the user-id column (this notebook's plotting
# cells only confirm 'VARIANT_NAME' and 'REVENUE') — verify with df.columns.
# "1st ab test group" is read here as "exactly one A/B test group": keeping a single
# variant would leave nothing to compare, so the records of both variants are retained.
unique_groups_per_user = df.groupby('USER_ID')['VARIANT_NAME'].nunique()
users_with_one_group = unique_groups_per_user[unique_groups_per_user == 1].index
final_filtered_df = df[df['USER_ID'].isin(users_with_one_group)]
print("Final Filtered DataFrame Info:")
final_filtered_df.info()  # info() prints itself and returns None
print("\nSample of Final Filtered DataFrame:")
print(final_filtered_df.head())


# TODO : Check that the number of users matches what it was before the exclusion

# The filter must keep exactly the users that had one group: compare against that
# count, not against the total number of users (which includes the excluded ones).
initial_user_count = df['USER_ID'].nunique()
expected_user_count = len(users_with_one_group)
final_user_count = final_filtered_df['USER_ID'].nunique()
print(f"Initial number of unique users: {initial_user_count}")
print(f"Number of users with exactly one A/B test group: {expected_user_count}")
print(f"Number of unique users after final filtering: {final_user_count}")

if expected_user_count == final_user_count:
    print("The number of unique users matches before and after filtering.")
else:
    print("The number of unique users does not match before and after filtering.")

Let's see how the data is distributed

In [None]:
# TODO : Check how the data is distributed using a box plot

# NOTE(review): 'USER_ID' is assumed to be the user-id column (this notebook's plotting
# cells only confirm 'VARIANT_NAME' and 'REVENUE') — verify with df.columns.
unique_groups_per_user = df.groupby('USER_ID')['VARIANT_NAME'].nunique()
users_with_one_group = unique_groups_per_user[unique_groups_per_user == 1].index
# Both variants are kept so the two distributions can be compared side by side.
filtered_df = df[df['USER_ID'].isin(users_with_one_group)]

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=filtered_df, x='VARIANT_NAME', y='REVENUE', ax=ax)
ax.set_title('Box Plot of Revenue by A/B Test Group')
ax.set_xlabel('A/B Test Group')
ax.set_ylabel('Revenue')
ax.grid(True)
plt.show()

It can be seen that there is a strong outlier in the data - we will find it by sorting these revenue values in descending order

In [None]:
# TODO : Sort these revenue values in descending order

# NOTE(review): 'USER_ID' is assumed to be the user-id column (this notebook's plotting
# cells only confirm 'VARIANT_NAME' and 'REVENUE') — verify with df.columns.
unique_groups_per_user = df.groupby('USER_ID')['VARIANT_NAME'].nunique()
users_with_one_group = unique_groups_per_user[unique_groups_per_user == 1].index
filtered_df = df[df['USER_ID'].isin(users_with_one_group)]

sorted_revenue_df = filtered_df.sort_values(by='REVENUE', ascending=False)
print("Sorted Revenue Values:")
# Show the user and variant next to the revenue so the owner of the top record is visible.
print(sorted_revenue_df[['USER_ID', 'VARIANT_NAME', 'REVENUE']].head(10))

In [None]:
# TODO : See if user 3342 had other events with a strong outlier

OUTLIER_USER_ID = 3342

# NOTE(review): 'USER_ID' is assumed to be the user-id column (this notebook's plotting
# cells only confirm 'VARIANT_NAME' and 'REVENUE') — verify with df.columns.
unique_groups_per_user = df.groupby('USER_ID')['VARIANT_NAME'].nunique()
users_with_one_group = unique_groups_per_user[unique_groups_per_user == 1].index
filtered_df = df[df['USER_ID'].isin(users_with_one_group)]

# All events of the user, largest revenue first.
user_df = filtered_df[filtered_df['USER_ID'] == OUTLIER_USER_ID].sort_values('REVENUE', ascending=False)
print(f"Events of user {OUTLIER_USER_ID}:")
print(user_df)

# Tukey's 1.5 * IQR rule, with the quartiles taken over all paying records.
# Quartiles computed inside a single user's handful of records cannot flag that
# user's own outlier: with one record the IQR is 0 and nothing exceeds the fence.
paying_revenue = filtered_df.loc[filtered_df['REVENUE'] > 0, 'REVENUE']
Q1 = paying_revenue.quantile(0.25)
Q3 = paying_revenue.quantile(0.75)
IQR = Q3 - Q1
upper_fence = Q3 + 1.5 * IQR
outliers = user_df[user_df['REVENUE'] > upper_fence]

print(f"Upper outlier fence (paying records): {upper_fence}")
print(f"Number of strong outliers for user {OUTLIER_USER_ID}: {outliers.shape[0]}")
print("Strong outliers:")
print(outliers[['REVENUE']])

We see that there is only one outlier - in conditions of limited information, we will remove this entry and look at the distribution of data again

In [None]:
# TODO : We see that there is only one outlier - in conditions of limited information, remove this entry and visualize the distribution of data again.

OUTLIER_USER_ID = 3342

# NOTE(review): 'USER_ID' is assumed to be the user-id column (this notebook's plotting
# cells only confirm 'VARIANT_NAME' and 'REVENUE') — verify with df.columns.
unique_groups_per_user = df.groupby('USER_ID')['VARIANT_NAME'].nunique()
users_with_one_group = unique_groups_per_user[unique_groups_per_user == 1].index
filtered_df = df[df['USER_ID'].isin(users_with_one_group)]

# Tukey's upper fence (Q3 + 1.5 * IQR) over all paying records; quartiles taken inside
# one user's few records would give IQR = 0 for a single record and flag nothing.
paying_revenue = filtered_df.loc[filtered_df['REVENUE'] > 0, 'REVENUE']
Q1 = paying_revenue.quantile(0.25)
Q3 = paying_revenue.quantile(0.75)
upper_fence = Q3 + 1.5 * (Q3 - Q1)

# Remove only this user's records above the fence; every other record is kept.
is_user_outlier = (filtered_df['USER_ID'] == OUTLIER_USER_ID) & (filtered_df['REVENUE'] > upper_fence)
updated_final_filtered_df = filtered_df[~is_user_outlier]

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=updated_final_filtered_df, x='VARIANT_NAME', y='REVENUE', ax=ax)
ax.set_title('Box Plot of Revenue After Removing Outlier')
ax.set_xlabel('A/B Test Group')
ax.set_ylabel('Revenue')
ax.grid(True)
plt.show()

print(f"Number of strong outliers for user {OUTLIER_USER_ID} removed: {int(is_user_outlier.sum())}")


What can you say about the users and their purchases ?

In [None]:
# TODO : Check if there are users who have records with zero and positive revenue

# NOTE(review): 'USER_ID' is assumed to be the user-id column (this notebook's plotting
# cells only confirm 'VARIANT_NAME' and 'REVENUE') — verify with df.columns.
unique_groups_per_user = df.groupby('USER_ID')['VARIANT_NAME'].nunique()
users_with_one_group = unique_groups_per_user[unique_groups_per_user == 1].index
filtered_df = df[df['USER_ID'].isin(users_with_one_group)]

users_with_zero_revenue = set(filtered_df.loc[filtered_df['REVENUE'] == 0, 'USER_ID'])
users_with_positive_revenue = set(filtered_df.loc[filtered_df['REVENUE'] > 0, 'USER_ID'])
users_with_both = users_with_zero_revenue & users_with_positive_revenue

print(f"Number of users with both zero and positive revenue: {len(users_with_both)}")
# Print a short sorted sample rather than the whole set to keep the output readable.
print("First users with both zero and positive revenue:")
print(sorted(users_with_both)[:20])

Can a user have records with both zero and positive revenue ?

Let's make the assumption that the records are user visits to the service, and the experimental unit is users.


In [None]:
# TODO : Group the data for each user and sum the revenue for each to get one record per user

# NOTE(review): 'USER_ID' is assumed to be the user-id column (this notebook's plotting
# cells only confirm 'VARIANT_NAME' and 'REVENUE') — verify with df.columns.
unique_groups_per_user = df.groupby('USER_ID')['VARIANT_NAME'].nunique()
users_with_one_group = unique_groups_per_user[unique_groups_per_user == 1].index
filtered_df = df[df['USER_ID'].isin(users_with_one_group)]

# Group by user AND variant: each remaining user has exactly one variant, so this
# still yields one row per user while keeping the variant label for later comparisons.
user_revenue_sum = filtered_df.groupby(['USER_ID', 'VARIANT_NAME'], as_index=False)['REVENUE'].sum()
print("Aggregated Data (Total Revenue per User):")
print(user_revenue_sum.head())

In [None]:
# TODO : Visualize the resulting distribution of the data again.

# NOTE(review): 'USER_ID' is assumed to be the user-id column (this notebook's plotting
# cells only confirm 'VARIANT_NAME' and 'REVENUE') — verify with df.columns.
unique_groups_per_user = df.groupby('USER_ID')['VARIANT_NAME'].nunique()
users_with_one_group = unique_groups_per_user[unique_groups_per_user == 1].index
filtered_df = df[df['USER_ID'].isin(users_with_one_group)]

# One row per user, with the variant kept so the two groups can be compared.
user_revenue_sum = filtered_df.groupby(['USER_ID', 'VARIANT_NAME'], as_index=False)['REVENUE'].sum()

fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(14, 6))

sns.histplot(data=user_revenue_sum, x='REVENUE', hue='VARIANT_NAME', bins=30, kde=True, ax=ax_hist)
ax_hist.set_title('Histogram of Total Revenue per User')
ax_hist.set_xlabel('Total Revenue')
ax_hist.set_ylabel('Frequency')

sns.boxplot(data=user_revenue_sum, x='VARIANT_NAME', y='REVENUE', ax=ax_box)
ax_box.set_title('Box Plot of Total Revenue per User')
ax_box.set_xlabel('A/B Test Group')
ax_box.set_ylabel('Total Revenue')

fig.tight_layout()
plt.show()

It should be noted that during the transformation, the quantiles in the test group increased

It can be seen that in the test group, almost all quantile statistics, except for the minimum, are at a slightly lower level.

Let's look at various statistics in the context of AB test groups for all users

In [None]:
# TODO : get metrics by groups
# TODO : create two additional metrics to see relative changes
# TODO : add them to the rest of the metrics in a separate group

BASELINE_GROUP = 'control'  # variant label used as the reference for relative changes

# NOTE(review): 'USER_ID' is assumed to be the user-id column (this notebook's plotting
# cells only confirm 'VARIANT_NAME' and 'REVENUE') — verify with df.columns.
unique_groups_per_user = df.groupby('USER_ID')['VARIANT_NAME'].nunique()
users_with_one_group = unique_groups_per_user[unique_groups_per_user == 1].index
# Both variants are kept: with a single variant every relative metric is trivially 0.
filtered_df = df[df['USER_ID'].isin(users_with_one_group)]

metrics_by_group = filtered_df.groupby('VARIANT_NAME')['REVENUE'].agg(['mean', 'median', 'std', 'count']).reset_index()
metrics_by_group.columns = ['group', 'mean_revenue', 'median_revenue', 'std_revenue', 'count_records']

# Relative metric 1: deviation of each group's mean from the average of the group means.
overall_mean_revenue = metrics_by_group['mean_revenue'].mean()
metrics_by_group['relative_change_from_mean'] = (metrics_by_group['mean_revenue'] - overall_mean_revenue) / overall_mean_revenue

# Relative metric 2: percentage change of each group's mean versus the baseline group.
baseline_mean_revenue = metrics_by_group.loc[metrics_by_group['group'] == BASELINE_GROUP, 'mean_revenue'].iloc[0]
metrics_by_group['percentage_change_from_baseline'] = ((metrics_by_group['mean_revenue'] - baseline_mean_revenue) / baseline_mean_revenue) * 100

print("Combined Metrics by A/B Test Group:")
print(metrics_by_group)

# (column, panel title, y-axis label) for each of the six bar charts.
panels = [
    ('mean_revenue', 'Mean Revenue by Group', 'Mean Revenue'),
    ('median_revenue', 'Median Revenue by Group', 'Median Revenue'),
    ('std_revenue', 'Standard Deviation of Revenue by Group', 'Standard Deviation'),
    ('count_records', 'Count of Records by Group', 'Count of Records'),
    ('relative_change_from_mean', 'Relative Change from Mean Revenue', 'Relative Change'),
    ('percentage_change_from_baseline', 'Percentage Change from Baseline Revenue', 'Percentage Change'),
]

sns.set(style="whitegrid")
fig, axes = plt.subplots(3, 2, figsize=(16, 10))
for ax, (metric_column, panel_title, y_label) in zip(axes.ravel(), panels):
    sns.barplot(x='group', y=metric_column, data=metrics_by_group, palette='viridis', ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Group')
    ax.set_ylabel(y_label)

fig.tight_layout()
plt.show()


What can you say about the test group: did the total amount of revenue, the average check per user, and the number of orders per user slightly increase?

Let's also see how paying users behave :

In [None]:
# TODO : get metrics by groups
# TODO : create two additional metrics to see relative changes
# TODO : add them to the rest of the metrics in a separate group

BASELINE_GROUP = 'control'  # variant label used as the reference for relative changes

# NOTE(review): 'USER_ID' is assumed to be the user-id column (this notebook's plotting
# cells only confirm 'VARIANT_NAME' and 'REVENUE') — verify with df.columns.
unique_groups_per_user = df.groupby('USER_ID')['VARIANT_NAME'].nunique()
users_with_one_group = unique_groups_per_user[unique_groups_per_user == 1].index
filtered_df = df[df['USER_ID'].isin(users_with_one_group)]
# This cell is about paying behaviour: keep only records with positive revenue.
paying_df = filtered_df[filtered_df['REVENUE'] > 0]

basic_metrics_by_group = paying_df.groupby('VARIANT_NAME')['REVENUE'].agg(['mean', 'median', 'std', 'count']).reset_index()
basic_metrics_by_group.columns = ['group', 'mean_revenue', 'median_revenue', 'std_revenue', 'count_records']

# Relative metric 1: deviation of each group's mean from the average of the group means.
overall_mean_revenue = basic_metrics_by_group['mean_revenue'].mean()
basic_metrics_by_group['relative_change_from_mean'] = (basic_metrics_by_group['mean_revenue'] - overall_mean_revenue) / overall_mean_revenue

# Relative metric 2: percentage change of each group's mean versus the baseline group.
baseline_mean_revenue = basic_metrics_by_group.loc[basic_metrics_by_group['group'] == BASELINE_GROUP, 'mean_revenue'].iloc[0]
basic_metrics_by_group['percentage_change_from_baseline'] = ((basic_metrics_by_group['mean_revenue'] - baseline_mean_revenue) / baseline_mean_revenue) * 100
print("Combined Metrics by A/B Test Group:")
print(basic_metrics_by_group)

# (column, panel title, y-axis label) for each of the six bar charts.
panels = [
    ('mean_revenue', 'Mean Revenue by Group', 'Mean Revenue'),
    ('median_revenue', 'Median Revenue by Group', 'Median Revenue'),
    ('std_revenue', 'Standard Deviation of Revenue by Group', 'Standard Deviation'),
    ('count_records', 'Count of Records by Group', 'Count of Records'),
    ('relative_change_from_mean', 'Relative Change from Mean Revenue', 'Relative Change'),
    ('percentage_change_from_baseline', 'Percentage Change from Baseline Revenue', 'Percentage Change'),
]

sns.set(style="whitegrid")
fig, axes = plt.subplots(3, 2, figsize=(16, 12))
for ax, (metric_column, panel_title, y_label) in zip(axes.ravel(), panels):
    sns.barplot(x='group', y=metric_column, data=basic_metrics_by_group, palette='viridis', ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Group')
    ax.set_ylabel(y_label)

fig.tight_layout()
plt.show()

Let's look at the distributions of all and only paying users

In [None]:
# Two stacked panels: revenue of all records (top) and of paying records only (bottom).
# NOTE(review): sns.distplot is flagged as deprecated by recent seaborn releases —
# consider histplot/kdeplot once the installed version is confirmed.
f, axes = plt.subplots(2, figsize=(10,8))
# build graphs of distributions of all users
sns.distplot(df.loc[df['VARIANT_NAME'] == 'control', 'REVENUE'], ax = axes[0], label='control')
sns.distplot(df.loc[df['VARIANT_NAME'] == 'variant', 'REVENUE'], ax = axes[0], label='variant')
axes[0].set_title('Distribution of revenue of all users')
# Each axes needs its own legend: plt.legend() would only annotate the last one.
axes[0].legend()

# build graphs of distributions of paying users
sns.distplot(df.loc[(df['VARIANT_NAME'] == 'control') & (df['REVENUE'] > 0), 'REVENUE'], ax = axes[1], label='control' )
sns.distplot(df.loc[(df['VARIANT_NAME'] == 'variant') & (df['REVENUE'] > 0), 'REVENUE'], ax = axes[1], label='variant' )
axes[1].set_title('Paying user revenue distribution')
axes[1].legend()
plt.subplots_adjust(hspace = 0.3)

### Statistical Analysis <a name="Statistical"/>

#### Checking if the distribution is normal

Based on the previous graph, we see that the data is not normally distributed.

In [None]:
# TODO : Use the Shapiro-Wilk criterion to confirm it

# NOTE(review): 'USER_ID' is assumed to be the user-id column (this notebook's plotting
# cells only confirm 'VARIANT_NAME' and 'REVENUE') — verify with df.columns.
unique_groups_per_user = df.groupby('USER_ID')['VARIANT_NAME'].nunique()
users_with_one_group = unique_groups_per_user[unique_groups_per_user == 1].index
filtered_df = df[df['USER_ID'].isin(users_with_one_group)]
# The notebook treats users as the experimental unit, so test one total per user.
user_revenue = filtered_df.groupby(['USER_ID', 'VARIANT_NAME'], as_index=False)['REVENUE'].sum()

alpha = 0.05
# Test each variant separately: normality is an assumption about each sample.
# NOTE(review): SciPy warns that Shapiro-Wilk p-values may be inaccurate for samples
# larger than 5000 observations — check the group sizes.
for group_name, group_revenue in user_revenue.groupby('VARIANT_NAME')['REVENUE']:
    stat, p_value = shapiro(group_revenue)
    print(f"[{group_name}] Shapiro-Wilk Test Statistic: {stat}")
    print(f"[{group_name}] Shapiro-Wilk p-value: {p_value}")
    if p_value > alpha:
        print("The data appears to be normally distributed (fail to reject H0).")
    else:
        print("The data does not appear to be normally distributed (reject H0).")

Is the null hypothesis about the normal distribution of the data rejected ?

#### Mann-Whitney test

Let's check the value of the statistics of the Mann-Whitney test. Some sources have a limitation of applicability in case of duplicate data. There are a lot of repetitions in our sample, and especially a lot of zero values, so in this case we need to be careful about this criterion.

In [None]:
# How many records have zero revenue (True) versus non-zero revenue (False).
df['REVENUE'].eq(0).value_counts()

In [None]:
# TODO : Check on the data of all users the value of the statistics of the Mann-Whitney test

# NOTE(review): 'USER_ID' is assumed to be the user-id column (this notebook's plotting
# cells only confirm 'VARIANT_NAME' and 'REVENUE') — verify with df.columns.
unique_groups_per_user = df.groupby('USER_ID')['VARIANT_NAME'].nunique()
users_with_one_group = unique_groups_per_user[unique_groups_per_user == 1].index
filtered_df = df[df['USER_ID'].isin(users_with_one_group)]
# The notebook treats users as the experimental unit: compare one total per user
# instead of treating repeated visits of the same user as independent observations.
user_revenue = filtered_df.groupby(['USER_ID', 'VARIANT_NAME'], as_index=False)['REVENUE'].sum()

groups = list(user_revenue['VARIANT_NAME'].unique())
results = []

# Compare every unordered pair of variants once.
for i, group1 in enumerate(groups):
    for group2 in groups[i + 1:]:
        data_group1 = user_revenue.loc[user_revenue['VARIANT_NAME'] == group1, 'REVENUE']
        data_group2 = user_revenue.loc[user_revenue['VARIANT_NAME'] == group2, 'REVENUE']

        stat, p_value = mannwhitneyu(data_group1, data_group2, alternative='two-sided')

        results.append({
            'Group1': group1,
            'Group2': group2,
            'Mann-Whitney U Statistic': stat,
            'p-value': p_value
        })

results_df = pd.DataFrame(results)
print("Mann-Whitney U Test Results:")
print(results_df)

In [None]:
# TODO : Check on the data of only the users that pay, the value of the statistics of the Mann-Whitney test

# NOTE(review): 'USER_ID' is assumed to be the user-id column (this notebook's plotting
# cells only confirm 'VARIANT_NAME' and 'REVENUE') — verify with df.columns.
unique_groups_per_user = df.groupby('USER_ID')['VARIANT_NAME'].nunique()
users_with_one_group = unique_groups_per_user[unique_groups_per_user == 1].index
filtered_df = df[df['USER_ID'].isin(users_with_one_group)]
# One total per user (users are the experimental unit), then keep the paying users.
# Both variants are kept: with a single variant there is no pair left to compare.
user_revenue = filtered_df.groupby(['USER_ID', 'VARIANT_NAME'], as_index=False)['REVENUE'].sum()
filtered_paying_df = user_revenue[user_revenue['REVENUE'] > 0]

groups = list(filtered_paying_df['VARIANT_NAME'].unique())
results = []

# Compare every unordered pair of variants once.
for i, group1 in enumerate(groups):
    for group2 in groups[i + 1:]:
        data_group1 = filtered_paying_df.loc[filtered_paying_df['VARIANT_NAME'] == group1, 'REVENUE']
        data_group2 = filtered_paying_df.loc[filtered_paying_df['VARIANT_NAME'] == group2, 'REVENUE']

        stat, p_value = mannwhitneyu(data_group1, data_group2, alternative='two-sided')

        results.append({
            'Group1': group1,
            'Group2': group2,
            'Mann-Whitney U Statistic': stat,
            'p-value': p_value
        })

results_df = pd.DataFrame(results)

print("Mann-Whitney U Test Results for Paying Users:")
print(results_df)

#### Bootstrap

In order to get more complete information about the differences between the average values of the A/B test groups, we will use the bootstrap.

Let's create one function to generate bootstrap samples and another to compute a confidence interval, and then look at the sample statistics

In [None]:
def get_bootstrap_samples(data, n_samples=1000):
    """Draw bootstrap resamples (sampling with replacement) from ``data``.

    Parameters
    ----------
    data : array-like
        Observations to resample. Lists and pandas Series are accepted; they are
        converted to a NumPy array first so positional indexing works regardless of
        the Series index.
    n_samples : int, default 1000
        Number of resamples to draw.

    Returns
    -------
    numpy.ndarray
        Array whose first two dimensions are ``(n_samples, len(data))``; each row is
        one resample of the same size as the input.
    """
    values = np.asarray(data)
    # One row of random positions per resample, each as long as the original sample.
    indices = np.random.randint(0, len(values), (n_samples, len(values)))
    samples = values[indices]
    return samples

def stat_intervals(stat, alpha=0.05):
    """Return the two-sided percentile interval of ``stat`` at level ``1 - alpha``.

    The result is the pair of percentiles ``100 * alpha / 2`` and
    ``100 * (1 - alpha / 2)`` of ``stat``, as computed by ``np.percentile``.
    """
    lower_percentile = 100 * alpha / 2.
    upper_percentile = 100 * (1 - alpha / 2.)
    return np.percentile(stat, [lower_percentile, upper_percentile])

In [None]:
# TODO : Generate samples for all users

# Fix the seed so the bootstrap resamples are reproducible between runs.
np.random.seed(42)

# NOTE(review): 'USER_ID' is assumed to be the user-id column (this notebook's plotting
# cells only confirm 'VARIANT_NAME' and 'REVENUE') — verify with df.columns.
unique_groups_per_user = df.groupby('USER_ID')['VARIANT_NAME'].nunique()
users_with_one_group = unique_groups_per_user[unique_groups_per_user == 1].index
filtered_df = df[df['USER_ID'].isin(users_with_one_group)]
# Users are the experimental unit, so resample one total per user.
user_revenue = filtered_df.groupby(['USER_ID', 'VARIANT_NAME'], as_index=False)['REVENUE'].sum()

# `control` and `variant` hold one bootstrap resample per row; the plotting cells
# below take np.mean(..., axis=1) of them to get the distribution of the sample mean.
control = get_bootstrap_samples(user_revenue.loc[user_revenue['VARIANT_NAME'] == 'control', 'REVENUE'].values)
variant = get_bootstrap_samples(user_revenue.loc[user_revenue['VARIANT_NAME'] == 'variant', 'REVENUE'].values)

print(f"Bootstrap samples for all users: control {control.shape}, variant {variant.shape}")

In [None]:
# TODO : Generate samples for only paying users

# Fix the seed so the bootstrap resamples are reproducible between runs.
np.random.seed(42)

# NOTE(review): 'USER_ID' is assumed to be the user-id column (this notebook's plotting
# cells only confirm 'VARIANT_NAME' and 'REVENUE') — verify with df.columns.
unique_groups_per_user = df.groupby('USER_ID')['VARIANT_NAME'].nunique()
users_with_one_group = unique_groups_per_user[unique_groups_per_user == 1].index
filtered_df = df[df['USER_ID'].isin(users_with_one_group)]
# One total per user (users are the experimental unit), restricted to paying users.
user_revenue = filtered_df.groupby(['USER_ID', 'VARIANT_NAME'], as_index=False)['REVENUE'].sum()
paying_users_df = user_revenue[user_revenue['REVENUE'] > 0]

# `control_paid` and `variant_paid` hold one bootstrap resample per row; the plotting
# cells below take np.mean(..., axis=1) of them.
control_paid = get_bootstrap_samples(paying_users_df.loc[paying_users_df['VARIANT_NAME'] == 'control', 'REVENUE'].values)
variant_paid = get_bootstrap_samples(paying_users_df.loc[paying_users_df['VARIANT_NAME'] == 'variant', 'REVENUE'].values)

print(f"Bootstrap samples for paying users: control {control_paid.shape}, variant {variant_paid.shape}")

Let's look at the distribution of means in the ab test groups

In [None]:
# Distribution of the bootstrap sample means (one mean per resample row) per variant.
f, ax = plt.subplots()
# `fill=` replaces the deprecated `shade=` keyword of kdeplot.
sns.kdeplot(np.mean(control, axis=1), fill=True, label='control', ax=ax)
sns.kdeplot(np.mean(variant, axis=1), fill=True, label='variant', ax=ax)
ax.set_title('Sample mean distribution for all users')
# Without a legend the 'control'/'variant' labels are never displayed.
ax.legend()

In [None]:
# Distribution of the bootstrap sample means (one mean per resample row) per variant.
f, ax = plt.subplots()
# `fill=` replaces the deprecated `shade=` keyword of kdeplot.
sns.kdeplot(np.mean(control_paid, axis=1), fill=True, label='control', ax=ax)
sns.kdeplot(np.mean(variant_paid, axis=1), fill=True, label='variant', ax=ax)
ax.set_title('Sample mean distribution for paying users')
# Without a legend the 'control'/'variant' labels are never displayed.
ax.legend()

Do you see any difference ? What about the confidence intervals ? Conclude.

Let's evaluate the difference between the groups: look at the distribution of the mean difference and build confidence intervals for it. To do this, we will create a function for visualization

In [None]:
def plot_distribution_and_stat_intervals(variant, control, title, alpha=0.05):
    """Plot the distribution of the mean difference and return the confidence interval.

    Parameters
    ----------
    variant, control : array-like
        Element-wise subtractable collections of statistics (the calling cells pass
        bootstrap sample means); the plotted quantity is ``variant - control``.
    title : str
        Text inserted into the plot title to identify the population.
    alpha : float, default 0.05
        Significance level; the interval covers ``100 * (1 - alpha)`` percent.

    Returns
    -------
    The percentile interval of ``variant - control`` at the requested ``alpha``,
    as returned by ``stat_intervals``.
    """
    diff = variant - control
    # Compute the interval once, at the caller's alpha, and reuse it everywhere.
    bounds = stat_intervals(diff, alpha)
    lower, upper = bounds[0], bounds[1]
    confidence_level = 100 * (1 - alpha)

    f, ax = plt.subplots()
    # Draw the KDE as a plain line so get_lines()[0] always exists.
    # NOTE(review): a filled KDE is presumably drawn without a Line2D by recent seaborn
    # versions — confirm; the fill_between calls below do all the shading anyway.
    points = sns.kdeplot(diff, ax=ax).get_lines()[0].get_data()
    x = points[0]
    y = points[1]
    ymin, ymax = ax.get_ylim()
    # highlight the zero value and the bounds of the confidence interval
    ax.vlines(0, 0, ymax, label='0', color='gray')
    ax.vlines(lower, 0, ymax, linestyles="dashed")
    ax.vlines(upper, 0, ymax, linestyles="dashed")
    # color the confidence interval and zones outside it
    ax.fill_between(x, y, where=(x >= upper), color='gainsboro')
    ax.fill_between(x, y, where=(x <= lower), color='gainsboro')
    ax.fill_between(x, y,
                    where=((x >= lower) & (x <= upper)),
                    color='red',
                    label=f'{confidence_level:g}% confidence interval')
    ax.set_title(f'Distribution of difference between means (variant - control) {title}; {confidence_level}% Confidence interval for difference of means: {bounds}')
    ax.legend(prop={'size':13})
    # return confidence interval data for the same alpha that was plotted
    return bounds

Let's build a graph of the distribution of the difference in the means and get a confidence interval

For all users

In [None]:
# Per-resample means of each group, then the distribution of their difference.
variant_means = np.mean(variant, axis=1)
control_means = np.mean(control, axis=1)
plot_distribution_and_stat_intervals(variant_means, control_means, title='all users')

In [None]:
# Per-resample means of each paying group, then the distribution of their difference.
variant_paid_means = np.mean(variant_paid, axis=1)
control_paid_means = np.mean(control_paid, axis=1)
plot_distribution_and_stat_intervals(variant_paid_means, control_paid_means, title='paying users')

What can you observe ? Conclude about the statistical change in average revenue between A/B test groups.
