## Import libraries + load the data

In [116]:
import pandas as pd
import scipy.stats as stats
from statsmodels.stats import proportion as proptests
import numpy as np

In [117]:
df = pd.read_csv("homepage-experiment-data.csv")

## Check the data

In [118]:
df.head()

Unnamed: 0,Day,Control Cookies,Control Downloads,Control Licenses,Experiment Cookies,Experiment Downloads,Experiment Licenses
0,1,1764,246,1,1850,339,3
1,2,1541,234,2,1590,281,2
2,3,1457,240,1,1515,274,1
3,4,1587,224,1,1541,284,2
4,5,1606,253,2,1643,292,3


In [119]:
df.describe()

Unnamed: 0,Day,Control Cookies,Control Downloads,Control Licenses,Experiment Cookies,Experiment Downloads,Experiment Licenses
count,29.0,29.0,29.0,29.0,29.0,29.0,29.0
mean,15.0,1615.551724,260.482759,24.482759,1632.62069,294.758621,25.241379
std,8.514693,116.308268,28.338037,13.873461,113.02636,22.404807,13.76241
min,1.0,1457.0,223.0,1.0,1458.0,256.0,1.0
25%,8.0,1529.0,240.0,12.0,1555.0,279.0,20.0
50%,15.0,1602.0,254.0,30.0,1606.0,290.0,29.0
75%,22.0,1700.0,276.0,34.0,1728.0,300.0,36.0
max,29.0,1822.0,331.0,42.0,1861.0,349.0,44.0


In [120]:
n_obs = (df["Control Cookies"] + df["Experiment Cookies"]).sum()
n_control = df["Control Cookies"].sum()

In [121]:
n_control

np.int64(46851)

In [122]:
p = 0.5
sd = np.sqrt(p*(1-p) * n_obs)
z_score = ((n_control + 0.5) - p * n_obs) / sd
p_values = 2 * stats.norm.cdf(z_score)

In [123]:
p_values

np.float64(0.1074929405013041)

## Check the evaluation metric

Download Rate

In [124]:
df_test = df.copy() 

In [125]:
n_control_cnt = df_test["Control Cookies"].sum()
n_exper_cnt = df_test["Experiment Cookies"].sum()
n_obs_cnt = n_control + n_exper

print(f"Control obs: {n_control_cnt}")
print(f"Exper. obs: {n_exper_cnt}")
print(f"Total obs: {n_obs_cnt}")

Control obs: 46851
Exper. obs: 47346
Total obs: 81189


In [126]:
n_control_downloads = df['Control Downloads'].sum()
n_experiment_downloads = df['Experiment Downloads'].sum()
n_total_downloads = n_control_downloads + n_experiment_downloads

print(f"Control downloads: {n_control_downloads}")
print(f"Exper. downloads: {n_experiment_downloads}")
print(f"Total downloads: {n_total_downloads}")

Control downloads: 7554
Exper. downloads: 8548
Total downloads: 16102


In [127]:
p_control_downloads = n_control_downloads / n_control_cnt
p_exper_downloads = n_experiment_downloads / n_exper_cnt
p_total_downloads = (n_control_downloads + n_experiment_downloads) / (n_control_cnt + n_exper_cnt)

print(f"Control downloads rate: {p_control_downloads}")
print(f"Exper. downloads rate: {p_exper_downloads}")
print(f"Total downloads rate: {p_total_downloads}")

Control downloads rate: 0.16123455209067042
Exper. downloads rate: 0.180543234908968
Total downloads rate: 0.1709396265273841


In [128]:
#calc SE
se_p = np.sqrt(p_total_downloads * (1-p_total_downloads) * (1 / n_control_cnt + 1 / n_exper_cnt))

#calc z score
z_score = (p_exper_downloads - p_control_downloads) / se_p

print(f"Standard Error: {se_p}")
print(f"Z-Score: {z_score}")
print('p value: ', 1 - stats.norm.cdf(z_score))

Standard Error: 0.0024531940948456393
Z-Score: 7.870833726066236
p value:  1.7763568394002505e-15


Licence

In [129]:
df_licences = df[df["Day"]<22]

n_control = df_licences["Control Cookies"].sum()
n_exper = df_licences["Experiment Cookies"].sum()

n_total = n_control + n_exper

print(f"n_control: {n_control}")
print(f"n_experiment: {n_exper}")
print(f"n_total: {n_total}")

n_control: 33758
n_experiment: 34338
n_total: 68096


In [130]:
n_control_licences = df_licences['Control Licenses'].sum()
n_experiment_licences = df_licences['Experiment Licenses'].sum()
n_total_licences = n_control_licences + n_experiment_licences

print(f"Control Licences: {n_control_licences}")
print(f"Exper. Licences: {n_experiment_licences}")
print(f"Total Licences: {n_total_licences}")

Control Licences: 443
Exper. Licences: 456
Total Licences: 899


In [131]:
p_control_licences = n_control_licences / n_control
p_exper_licences = n_experiment_licences / n_exper
p_total_licences = (n_control_licences + n_experiment_licences) / (n_control + n_exper)

print(f"Control downloads rate: {p_control_licences}")
print(f"Exper. downloads rate: {p_exper_licences}")
print(f"Total downloads rate: {p_total_licences}")

Control downloads rate: 0.013122815332661887
Exper. downloads rate: 0.013279748383714835
Total downloads rate: 0.013201950187969925


In [114]:
se_p = np.sqrt(p_total_licences * (1-p_total_licences) * (1/n_control + 1/n_exper))
z_score = (p_exper_licences - p_control_licences) / se_p

p_val = 1- stats.norm.cdf(z_score)

print("z-score: {z}".format(z=z_score))
print("p-value from z-score: {p}".format(p=p_val))

z-score: 0.17938919612010049
p-value from z-score: 0.4288160570177023
