In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read() call in the download loop.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<URL-encoded download URL>" entries. The
# download loop splits each entry on ':' and percent-decodes the URL part.
# NOTE(review): the URL is a signed link carrying X-Goog-Date / X-Goog-Expires
# parameters, so it presumably stops working after a limited time -- the
# recorded output of this cell shows the download failing as "likely expired".
DATA_SOURCE_MAPPING = 'ab-test-data:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2479030%2F4213024%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240331%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240331T132436Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D617d3939cff07398140761ff600a37ff240c40add19c2e6dc32ac5cdc77abe63de131fc50a445677ac4b4c0c415e0b0309c74705f3b1366edb77c2c367ed65dc5966a00c38bd378c0b4983c24f36a64d50c3c77b26460c851c318476943a7a8052c6d426066f76da5c725be139e41b3fad28d4773ea117cf6b8ba373c3099ebf9cd5a2f061ca59a0ed6422984e0442831371db9e70c2df9e1e0c42330da9f314a6f1ce6cf7f4501850a1b10e7d61c4a059d8009b05ac50264960b954c4aa3430a3f623fabfab5c793e57fb28482e12a5854571f7d610bf91c00cbfde476898ad3d242aa004aeb3e708464093db062a7220078786c63f267b9fc91936a9a181c7'

# Root directory the downloaded archives are extracted under (one sub-directory per source).
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created next to the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere else in the visible cells.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every data source listed in DATA_SOURCE_MAPPING and unpack it under
# KAGGLE_INPUT_PATH/<directory>. A failure for one source is reported and the
# loop moves on to the next entry.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a ':' later in the entry can never be
    # mistaken for the directory/URL separator.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # Content-Length may be missing (None), malformed or zero; the
            # progress bar is drawn only when a usable total is known.
            try:
                total_bytes = int(total_length)
            except (TypeError, ValueError):
                total_bytes = 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk and rewind: the tar branch reopens
            # the file by name and must see the complete archive.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here
                # would replace the imported module with the archive object.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is handled before the more general OSError it derives from.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Failed to load (likely expired) https://storage.googleapis.com/kaggle-data-sets/2479030/4213024/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240331%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240331T132436Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=617d3939cff07398140761ff600a37ff240c40add19c2e6dc32ac5cdc77abe63de131fc50a445677ac4b4c0c415e0b0309c74705f3b1366edb77c2c367ed65dc5966a00c38bd378c0b4983c24f36a64d50c3c77b26460c851c318476943a7a8052c6d426066f76da5c725be139e41b3fad28d4773ea117cf6b8ba373c3099ebf9cd5a2f061ca59a0ed6422984e0442831371db9e70c2df9e1e0c42330da9f314a6f1ce6cf7f4501850a1b10e7d61c4a059d8009b05ac50264960b954c4aa3430a3f623fabfab5c793e57fb28482e12a5854571f7d610bf91c00cbfde476898ad3d242aa004aeb3e708464093db062a7220078786c63f267b9fc91936a9a181c7 to path /kaggle/input/ab-test-data
Data source import complete.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import statsmodels.stats.api as sms
from scipy.stats import (ttest_1samp, shapiro, levene, ttest_ind, mannwhitneyu,
                         pearsonr, spearmanr, kendalltau, f_oneway, kruskal)
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.multicomp import MultiComparison
# Display settings: a 700-character output width and no cap on displayed columns.
for option_name, option_value in (("display.width", 700), ("display.max_columns", None)):
    pd.set_option(option_name, option_value)

# Load the A/B test results and preview the first rows.
df = pd.read_csv("/kaggle/input/ab-test-data/AB_Test_Results.csv")
print(df.head())

# Data Understanding

In [None]:
#TODO : compute the following : DataFrame Information, Columns in DataFrame, Counting Missing Values, Descriptive Statistics, Value Counts for VARIANT_NAME, Counting Unique Users in USER_ID

# A notebook cell echoes only its last expression. Every intermediate result
# is therefore printed explicitly; as bare expressions they would be computed
# and then discarded.
df.info()  # info() writes its report to stdout itself
print("Columns:", df.columns)
print("Missing values per column:", df.isnull().sum(), sep="\n")
print("Descriptive statistics:", df.describe().T, sep="\n")
print("Rows per VARIANT_NAME:", df["VARIANT_NAME"].value_counts(), sep="\n")
# Number of unique users; shown as the cell's output value.
df["USER_ID"].nunique()

Hi Charlott, I just want to let you know that I could not find the name of the dataset used from Kaggle, and this code was giving me an error message, so I could not actually view the data. I just tried to write the code to answer the questions as best I could.

# Average Revenue by VARIANT_NAME

In [None]:
#TODO

# Mean revenue per variant. Printed explicitly because a bare expression in the
# middle of a cell is evaluated and then discarded.
print(df.groupby('VARIANT_NAME')['REVENUE'].mean())


from scipy import stats

# Select the revenue of each group. Labels are compared case-insensitively.
# NOTE(review): assumes the two groups are labelled "variant" and "control" in
# some capitalisation, as the later cells do -- TODO confirm with
# df['VARIANT_NAME'].unique(). A label that does not occur yields an empty
# sample and a NaN test result.
group_label = df['VARIANT_NAME'].str.lower()
variant1_revenue = df[group_label == 'variant']['REVENUE']  # group 1: variant
variant2_revenue = df[group_label == 'control']['REVENUE']  # group 2: control

# Perform a two-sample t-test
t_stat, p_value = stats.ttest_ind(variant1_revenue, variant2_revenue)

# Output the results
print(f"T-statistic: {t_stat}, P-value: {p_value}")

What if H0: M1 = M2?
What about H1: M1 ≠ M2?
(M1 and M2 denote the mean revenue of the two groups.)

# Assumptions Control:
*# 1- Normality Assumptions*
*# 2- Homogeneity of Variances Assumption*

In [None]:
# TODO : test the normality of revenue data in two groups (variant and control) within your DataFrame df using the Shapiro-Wilk test, what can you conclude ?

# Shapiro-Wilk tests the null hypothesis that a sample was drawn from a normal
# distribution. `shapiro` is already imported in the notebook's import cell.
# Group labels are compared case-insensitively.
# NOTE(review): assumes the groups are labelled "variant" and "control" in some
# capitalisation -- TODO confirm with df['VARIANT_NAME'].unique().
group_label = df['VARIANT_NAME'].str.lower()
variant_revenue = df[group_label == 'variant']['REVENUE']
control_revenue = df[group_label == 'control']['REVENUE']
stat_variant, p_variant = shapiro(variant_revenue)
stat_control, p_control = shapiro(control_revenue)

print(f"Variant Group: Shapiro-Wilk Test Statistic={stat_variant}, p-value={p_variant}")
print(f"Control Group: Shapiro-Wilk Test Statistic={stat_control}, p-value={p_control}")

# Conclusion at the 5% significance level: a p-value below alpha rejects normality.
alpha = 0.05
for group_name, group_p in (("Variant", p_variant), ("Control", p_control)):
    verdict = "rejected" if group_p < alpha else "not rejected"
    print(f"{group_name} Group: normality is {verdict} at alpha={alpha}")

In [None]:
# TODO : test for the homogeneity of variances between two groups (variant and control) within your DataFrame df using the Levene's test,  what can you conclude ?

# Levene's test checks the null hypothesis that the groups have equal
# variances. `levene` is already imported in the notebook's import cell.
# Group labels are compared case-insensitively.
# NOTE(review): assumes the groups are labelled "variant" and "control" in some
# capitalisation -- TODO confirm with df['VARIANT_NAME'].unique().
group_label = df['VARIANT_NAME'].str.lower()
variant_revenue = df[group_label == 'variant']['REVENUE']
control_revenue = df[group_label == 'control']['REVENUE']
stat, p_value = levene(variant_revenue, control_revenue)

print(f"Levene's Test Statistic={stat}, p-value={p_value}")

# Conclusion at the 5% significance level: a p-value below alpha rejects equal variances.
alpha = 0.05
verdict = "rejected" if p_value < alpha else "not rejected"
print(f"Homogeneity of variances is {verdict} at alpha={alpha}")

# Non-Parametric Test (mannwhitneyu Test)

In [None]:
# TODO : perform the Mann-Whitney U test for comparing the average revenue between the variant and control groups in your DataFrame, what can you conclude ?

# Two-sided Mann-Whitney U test on the revenue of the two groups.
# `mannwhitneyu` is already imported in the notebook's import cell.
# Group labels are compared case-insensitively.
# NOTE(review): assumes the groups are labelled "variant" and "control" in some
# capitalisation -- TODO confirm with df['VARIANT_NAME'].unique().
group_label = df['VARIANT_NAME'].str.lower()
variant_revenue = df[group_label == 'variant']['REVENUE']
control_revenue = df[group_label == 'control']['REVENUE']
stat, p_value = mannwhitneyu(variant_revenue, control_revenue, alternative='two-sided')

print(f"Mann-Whitney U Test Statistic={stat}, p-value={p_value}")

# Conclusion at the 5% significance level: a p-value below alpha indicates a
# statistically significant difference between the two groups.
alpha = 0.05
verdict = "significant" if p_value < alpha else "not significant"
print(f"The revenue difference between variant and control is {verdict} at alpha={alpha}")

In [None]:
# TODO : create a bar plot showing the average revenues by variant names using the seaborn library

import matplotlib.pyplot as plt
import seaborn as sns

# First figure: the raw frame is passed to seaborn, which aggregates REVENUE
# per variant itself.
sns.barplot(data=df, x='VARIANT_NAME', y='REVENUE')
plt.show()

# Second figure: a labelled bar chart built from explicitly computed group means.
avg_revenue = df.groupby('VARIANT_NAME')['REVENUE'].mean().reset_index()
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=avg_revenue, x='VARIANT_NAME', y='REVENUE', ax=ax)
ax.set_title('Average Revenue by Variant Name')
ax.set_xlabel('Variant Name')
ax.set_ylabel('Average Revenue')
plt.show()