In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import warnings
warnings.filterwarnings('ignore')

# Collect the full path of every file found under the Kaggle input directory.
paths = [
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
]

# Load the second file discovered by the walk.
# NOTE(review): the order in which os.walk yields files may differ between
# environments -- confirm that paths[1] is always the intended CSV.
df = pd.read_csv(paths[1])

# 1. Cleaning and Organising

# 1. 1. Quick Look at the Data

In [2]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,11:48.6,control,old_page,0
1,804228,01:45.2,control,old_page,0
2,661590,55:06.2,treatment,new_page,0
3,853541,28:03.1,treatment,new_page,0
4,864975,52:26.2,control,old_page,1


In [3]:
# Dimensions of the loaded data as (rows, columns).
df.shape

(294480, 5)

In [4]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,user_id,converted
count,294480.0,294480.0
mean,787973.538896,0.119658
std,91210.917091,0.324562
min,630000.0,0.0
25%,709031.75,0.0
50%,787932.5,0.0
75%,866911.25,0.0
max,945999.0,1.0


In [5]:
# Check for missing values: report every column that contains at least one NaN
# together with the number of NaNs it holds. Nothing is printed when no column
# has missing values.
df_columns_mask = df.isna().any(axis=0)
columns = df.columns[df_columns_mask]
for col in columns:
    # Count on `df`, the same frame the mask was built from; referencing an
    # undefined name here would raise NameError as soon as a NaN column exists.
    print(f"Column {col} has {df[col].isna().sum()} NaN values")

# There are no NA values in the dataset.

In [6]:
# Drop every row whose user_id occurs more than once (keep=False retains none
# of the repeated rows) and print the frame's shape before and after.
print(df.shape)
is_repeated_user = df.duplicated(subset='user_id', keep=False)
df = df[~is_repeated_user]
print(df.shape)

(294480, 5)
(286690, 5)


# There are no more duplicate values.

# 1. 2. Group Data by Landing Page and Groups

In [7]:
# Number of observations for each (landing_page, group) combination.
page_and_group = df.groupby(['landing_page', 'group'])
grouped = page_and_group.agg({'landing_page': lambda values: values.value_counts()})
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,landing_page
landing_page,group,Unnamed: 2_level_1
new_page,treatment,143397
old_page,control,143293


In [8]:
# Total number of conversions for each (landing_page, group) combination.
grouped = df.groupby(['landing_page', 'group'])[['converted']].sum()
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,converted
landing_page,group,Unnamed: 2_level_1
new_page,treatment,17025
old_page,control,17220


In [9]:
# Observation counts per (landing_page, group) pair; this repeats the count
# table already computed earlier in the notebook.
grouped = (
    df.groupby(['landing_page', 'group'])
      .agg({'landing_page': lambda s: s.value_counts()})
)
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,landing_page
landing_page,group,Unnamed: 2_level_1
new_page,treatment,143397
old_page,control,143293


In [10]:
# Percentage of rows served by each landing page, shown as a one-column frame.
(df['landing_page'].value_counts(normalize=True) * 100).to_frame()

Unnamed: 0_level_0,proportion
landing_page,Unnamed: 1_level_1
new_page,50.018138
old_page,49.981862


In [11]:
# Conversion rate (mean of the `converted` column) per group and landing page.
grouped = df.groupby(['group', 'landing_page'])[['converted']].mean()
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,converted
group,landing_page,Unnamed: 2_level_1
control,old_page,0.120173
treatment,new_page,0.118726


# Both the old page and the new page have a conversion rate close to 12%.


# 2. Testing

#  H0: There is no significant difference between the two sample distributions.
    The conversion rates are identical.
#  H1: There is a significant difference between the two sample distributions.
    The conversion rates differ (the test is two-sided); the outcome we are hoping for is that the new landing page has a better conversion rate.

# 2.1 Using Power Analysis to Compare Two Means

In [12]:
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.power import NormalIndPower

#parameters
# p1 is the target conversion rate and p2 the baseline rate (the observed rate
# is close to 12%); their difference, 0.01 (one percentage point), is the
# minimum effect we want to be able to detect
p1 = 0.13
p2 = 0.12
#power is the probability of correctly rejecting the null when it is false
power = 0.8
#alpha is the probability of falsely rejecting the null
alpha = 0.05

In [13]:
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportions_ztest

# Standardised effect size (a measure of magnitude) needed to determine the
# sample size: the difference in proportions divided by the square root of the
# average of the two binomial variances p * (1 - p).
# Note: this is not Cohen's h, which is 2 * (arcsin(sqrt(p1)) - arcsin(sqrt(p2))),
# although the two are nearly equal for rates this close together.
effect_size = (p1 - p2) / ((p1 * (1 - p1) + p2 * (1 - p2)) / 2) ** 0.5
# Solve for the number of observations per group (ratio=1.0: equally sized
# groups) that reaches the requested power in a two-sided test.
analysis = NormalIndPower()
sample_size = analysis.solve_power(effect_size=effect_size, power=power, alpha=alpha, ratio=1.0, alternative='two-sided')
print(f'Effect size: {effect_size:.2f}')
# Report the required size once.
print(f'Required sample size per group: {sample_size:.2f}')

Effect size: 0.03
Required sample size per group: 17165.46
Required sample size per group: 17165.46


In [14]:
# Set the random seed for reproducibility; the sampling below draws from
# NumPy's global random state because no random_state is passed.
np.random.seed(45)

# Draw 17165 observations per group without replacement (duplicate user_ids
# were already removed in an earlier cell, not here).
# GroupBy.sample is used instead of groupby(...).apply(lambda x: x.sample(...)):
# letting apply operate on the grouping column is deprecated since pandas 2.2,
# and once the grouping column is excluded the `group` column would be lost
# after reset_index(drop=True), breaking the per-group aggregation that follows.
# NOTE(review): the power analysis printed ~17165.46, so strictly reaching the
# target power needs 17166 per group -- confirm whether rounding down is intended.
df = (df.groupby('group')
        .sample(n=17165, replace=False)
        .reset_index(drop=True))

In [15]:
# Display the sampled frame; the rendered output elides the middle rows.
df

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,807689,47:50.8,control,old_page,0
1,817401,00:36.8,control,old_page,1
2,652424,28:57.7,control,old_page,0
3,912117,41:07.2,control,old_page,0
4,671687,03:04.9,control,old_page,0
...,...,...,...,...,...
34325,724420,47:05.8,treatment,new_page,0
34326,646390,07:56.9,treatment,new_page,0
34327,838051,35:19.1,treatment,new_page,0
34328,702806,07:35.5,treatment,new_page,0


In [16]:
# Summarise each group: number of observations and number of conversions.
# The summary gets its own name instead of overwriting `df`, so the sampled
# frame stays available and this cell can be re-run without failing on the
# missing `user_id` / `converted` columns.
group_summary = (df.groupby('group')
                   .agg(total_observations=('user_id', 'size'),
                        total_converted=('converted', 'sum'))
                   .reset_index())

# Extract counts as arrays, one entry per group in sorted group order
conv = group_summary['total_converted'].values
n = group_summary['total_observations'].values

# Two-sample z-test for proportions: checks whether the conversion rates of the
# two groups differ significantly (two-sided, the function's default alternative)
z_stat, p_value = proportions_ztest(count=conv, nobs=n)

In [17]:
# Report the test statistic and its p-value, one labelled line each.
for label, value in (("Z-statistic:", z_stat), ("P-value:", p_value)):
    print(label, value)

Z-statistic: -0.43399652860477944
P-value: 0.664290961882086


# The p-value is 0.66, well above alpha = 0.05, so we cannot reject the null hypothesis.
 There is no significant difference between the two pages.

# 3. Conclusion

There is no significant difference between the two landing pages in terms of conversion rates. We should keep the original landing page until further changes to the new landing page show a significant improvement in conversion rate.