In [1]:
import numpy as np
import pandas as pd
import math as mt
from scipy.stats import norm

In [2]:
baseline = {"Cookies":40000,"Clicks":3200,"Enrollments":660,"CTP":0.08,"GConversion":0.20625, "Retention":0.53,"NConversion":0.109313}

In [3]:
baseline["Cookies"] = 5000
baseline["Clicks"] = baseline["Clicks"]*(5000/40000)
baseline["Enrollments"] = baseline["Enrollments"]*(5000/40000)
baseline

{'Cookies': 5000,
 'Clicks': 400.0,
 'Enrollments': 82.5,
 'CTP': 0.08,
 'GConversion': 0.20625,
 'Retention': 0.53,
 'NConversion': 0.109313}

In [4]:
# Gross Conversion is the number of user ids to complete checkout and enroll in the free trial divided by number of unique cookies to click the "Start free trial" button.
GC={}
GC["d"] = 0.01
GC["p"] = baseline["GConversion"]
GC["n"] = baseline["Clicks"]
GC["standard_deviation"] = round(mt.sqrt(GC["p"]*(1-GC["p"])/GC["n"]),4)
GC

{'d': 0.01, 'p': 0.20625, 'n': 400.0, 'standard_deviation': 0.0202}

In [5]:
# Retention is the number of user ids to remain enrolled past the 14-day boundary (and thus make at least one payment) divided by number of user ids to complete checkout.
R={}
R["d"] = 0.01
R["p"] = baseline["Retention"]
R["n"] = baseline["Enrollments"]
R["standard_deviation"] = round(mt.sqrt(R["p"]*(1-R["p"])/R["n"]),4)
R

{'d': 0.01, 'p': 0.53, 'n': 82.5, 'standard_deviation': 0.0549}

In [6]:
# Net conversion is the number of user ids to remain enrolled past the 14 day boundary (and thus make at least one payment) divided by the number of unique cookies to click the "Start free trial" button.
NC={}
NC["d"] = 0.0075
NC["p"] = baseline["NConversion"]
NC["n"] = baseline["Clicks"]
NC["standard_deviation"] = round(mt.sqrt(NC["p"]*(1-NC["p"])/NC["n"]),4)
NC

{'d': 0.0075, 'p': 0.109313, 'n': 400.0, 'standard_deviation': 0.0156}

In [7]:
def get_z_score(alpha):
    return norm.ppf(alpha)

def get_sds(p,d):
    sd1=mt.sqrt(2*p*(1-p))
    sd2=mt.sqrt(p*(1-p)+(p+d)*(1-(p+d)))
    sds=[sd1,sd2]
    return sds

def get_sampSize(sds,alpha,beta,d):
    n=pow((get_z_score(1-alpha/2)*sds[0]+get_z_score(1-beta)*sds[1]),2)/pow(d,2)
    return n

In [8]:
GC["SampSize"]=round(get_sampSize(get_sds(GC["p"],GC["d"]),0.05,0.2,GC["d"]))
GC["SampSize"]
# enrollments per group

25835

In [9]:
GC["SampSize"]=round(GC["SampSize"]/0.08*2) # clicks per pageview
GC["SampSize"]
# pageviews

645875

In [10]:
R["SampSize"]=round(get_sampSize(get_sds(R["p"],R["d"]),0.05,0.2,R["d"]))
R["SampSize"]
# enrollments per group

39087

In [11]:
R["SampSize"]=round(R["SampSize"]*2/(660/40000)) # enrollments per pageview
R["SampSize"]
# 39087 enrolled per group then convert to page views

4737818

In [12]:
NC["SampSize"]=round(get_sampSize(get_sds(NC["p"],NC["d"]),0.05,0.2,NC["d"]))
NC["SampSize"]
# enrollments per group

27413

In [13]:
NC["SampSize"]=round(NC["SampSize"]/0.08*2)
NC["SampSize"]
# pageviews

685325

In [14]:
4737818 / 40000 # too long so we drop retention.

118.44545

In [15]:
685325 / 40000 # the highest sample size will be the effective size.

17.133125

In [16]:
df_control = pd.read_csv("/content/control_data.csv")
df_experiment = pd.read_csv("/content/experiment_data.csv")
df_control.head()

Unnamed: 0,Date,Pageviews,Clicks,Enrollments,Payments
0,"Sat, Oct 11",7723,687,134.0,70.0
1,"Sun, Oct 12",9102,779,147.0,70.0
2,"Mon, Oct 13",10511,909,167.0,95.0
3,"Tue, Oct 14",9871,836,156.0,105.0
4,"Wed, Oct 15",10014,837,163.0,64.0


In [17]:
results = {"Control":pd.Series([df_control["Pageviews"].sum(), df_control["Clicks"].sum(), df_control["Enrollments"].sum(), df_control["Payments"].sum()], index=["Cookies","Clicks","Enrollments","Payments"]),
           "Experiment":pd.Series([df_experiment["Pageviews"].sum(), df_experiment["Clicks"].sum(), df_experiment["Enrollments"].sum(), df_experiment["Payments"].sum()], index=["Cookies","Clicks","Enrollments","Payments"])}
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,Control,Experiment
Cookies,345543.0,344660.0
Clicks,28378.0,28325.0
Enrollments,3785.0,3423.0
Payments,2033.0,1945.0


In [18]:
df_results['Total'] = df_results.Control + df_results.Experiment
df_results['Prob'] = 0.5
df_results['StdErr'] = np.sqrt(df_results.Prob *(1-df_results.Prob)/df_results.Total)
df_results['MargErr'] = abs(get_z_score(0.05/2)) * df_results.StdErr
df_results['CI_lower'] = df_results.Prob - df_results.MargErr
df_results['CI_upper'] = df_results.Prob + df_results.MargErr
df_results['Obs_val'] = df_results.Experiment / df_results.Total
df_results['Pass_Sanity'] = df_results.apply(lambda x: (x.Obs_val> x.CI_lower) and (x.Obs_val<x.CI_upper),axis=1)
df_results['Diff'] = abs((df_results.Experiment - df_results.Control)/df_results.Total)
df_results

Unnamed: 0,Control,Experiment,Total,Prob,StdErr,MargErr,CI_lower,CI_upper,Obs_val,Pass_Sanity,Diff
Cookies,345543.0,344660.0,690203.0,0.5,0.000602,0.00118,0.49882,0.50118,0.49936,True,0.001279
Clicks,28378.0,28325.0,56703.0,0.5,0.0021,0.004115,0.495885,0.504115,0.499533,True,0.000935
Enrollments,3785.0,3423.0,7208.0,0.5,0.005889,0.011543,0.488457,0.511543,0.474889,False,0.050222
Payments,2033.0,1945.0,3978.0,0.5,0.007928,0.015538,0.484462,0.515538,0.488939,True,0.022122


In [19]:
control_cookies=df_results.loc['Cookies','Control']
control_clicks=df_results.loc['Clicks','Control']
exp_cookies = df_results.loc['Cookies','Experiment']
exp_clicks = df_results.loc['Clicks','Experiment']
d_hat = round(exp_clicks/exp_cookies - control_clicks/control_cookies, 4)
p_pool = (control_clicks+exp_clicks)/(exp_cookies+control_cookies)
SE_ClickProb = np.sqrt((p_pool*(1-p_pool))*((1/control_cookies)+(1/exp_cookies)))
ME_ClickProb = SE_ClickProb*abs(get_z_score(1-(0.05/2)))
upper_CI = d_hat + ME_ClickProb
lower_CI = d_hat - ME_ClickProb
print(lower_CI, d_hat, upper_CI)

-0.001195655390242568 0.0001 0.001395655390242568


In [20]:
df_control_notnull = df_control[pd.isnull(df_control.Enrollments)!= True]
df_experiment_notnull = df_experiment[pd.isnull(df_experiment.Enrollments)!= True]

In [21]:
results_notnull = {"Control": pd.Series([df_control_notnull.Pageviews.sum(), df_control_notnull.Clicks.sum(),
                                         df_control_notnull.Enrollments.sum(), df_control_notnull.Payments.sum()],
                                        index = ["cookies", "clicks", "enrollments", "payments"]),
                   "Experiment": pd.Series([df_experiment_notnull.Pageviews.sum(), df_experiment_notnull.Clicks.sum(),
                                            df_experiment_notnull.Enrollments.sum(), df_experiment_notnull.Payments.sum()],
                                           index = ["cookies", "clicks", "enrollments", "payments"])}

df_results_notnull = pd.DataFrame(results_notnull)
df_results_notnull

Unnamed: 0,Control,Experiment
cookies,212163.0,211362.0
clicks,17293.0,17260.0
enrollments,3785.0,3423.0
payments,2033.0,1945.0


In [22]:
df_results_notnull["Total"] = df_results_notnull["Control"] + df_results_notnull["Experiment"]
df_results_notnull

Unnamed: 0,Control,Experiment,Total
cookies,212163.0,211362.0,423525.0
clicks,17293.0,17260.0,34553.0
enrollments,3785.0,3423.0,7208.0
payments,2033.0,1945.0,3978.0


In [23]:
enrollments_exp = df_results_notnull.loc["enrollments"].Experiment
clicks_exp = df_results_notnull.loc["clicks"].Experiment
payments_exp = df_results_notnull.loc["payments"].Experiment

enrollments_cont = df_results_notnull.loc["enrollments"].Control
clicks_cont = df_results_notnull.loc["clicks"].Control
payments_cont = df_results_notnull.loc["payments"].Control

Grossconversion_exp = enrollments_exp/clicks_exp
Netconversion_exp = payments_exp/clicks_exp
Grossconversion_cont = enrollments_cont/clicks_cont
Netconversion_cont = payments_cont/clicks_cont

Grossconversion_p_pool = (enrollments_exp + enrollments_cont) / (clicks_exp + clicks_cont)
Netconversion_p_pool = (payments_exp + payments_cont)/ (clicks_exp + clicks_cont)

Grossconversion_d_hat = Grossconversion_exp - Grossconversion_cont
Netconversion_d_hat = Netconversion_exp - Netconversion_cont

In [24]:
GC_SE = np.sqrt((Grossconversion_p_pool*(1-Grossconversion_p_pool))*((1/clicks_exp)+(1/clicks_cont)))
GC_ME = abs(get_z_score(1-(0.05/2))) * GC_SE
GC_CI_lower = Grossconversion_d_hat - GC_ME
GC_CI_upper = Grossconversion_d_hat + GC_ME
print(GC_CI_lower, Grossconversion_d_hat, GC_CI_upper)

-0.02912320088750467 -0.020554874580361565 -0.011986548273218463


In [25]:
NC_SE = np.sqrt((Netconversion_p_pool*(1-Netconversion_p_pool))*((1/clicks_exp)+(1/clicks_cont)))
NC_ME = abs(get_z_score(1-(0.05/2))) * NC_SE
NC_CI_lower = Netconversion_d_hat - NC_ME
NC_CI_upper = Netconversion_d_hat + NC_ME
print(NC_CI_lower, Netconversion_d_hat, NC_CI_upper)

-0.011604500677993734 -0.0048737226745441675 0.0018570553289053993


In [34]:
df_signtest = pd.merge(df_control_notnull, df_experiment_notnull, on="Date")
df_signtest["Grossconversion_cont"] = df_signtest.Enrollments_x / df_signtest.Clicks_x
df_signtest["Grossconversion_exp"] = df_signtest.Enrollments_y / df_signtest.Clicks_y
df_signtest["Netconversion_cont"] = df_signtest.Payments_x / df_signtest.Clicks_x
df_signtest["Netconversion_exp"] = df_signtest.Payments_y / df_signtest.Clicks_y
cols = ['Date', 'Grossconversion_cont', 'Grossconversion_exp', 'Netconversion_cont', 'Netconversion_exp']

In [35]:
df_signtest = df_signtest[cols]

In [36]:
df_signtest.head()

Unnamed: 0,Date,Grossconversion_cont,Grossconversion_exp,Netconversion_cont,Netconversion_exp
0,"Sat, Oct 11",0.195051,0.153061,0.101892,0.049563
1,"Sun, Oct 12",0.188703,0.147771,0.089859,0.115924
2,"Mon, Oct 13",0.183718,0.164027,0.10451,0.089367
3,"Tue, Oct 14",0.186603,0.166868,0.125598,0.111245
4,"Wed, Oct 15",0.194743,0.168269,0.076464,0.112981


In [37]:
df_signtest['GC_sign'] = df_signtest['Grossconversion_exp'] - df_signtest['Grossconversion_cont']
df_signtest['NC_sign'] = df_signtest['Netconversion_exp'] - df_signtest['Netconversion_cont']

In [53]:
n=len(df_signtest)

In [56]:
GC_x=len(df_signtest[df_signtest['GC_sign']>0])
NC_x=len(df_signtest[df_signtest['NC_sign']>0])

In [57]:
def get_prob(x,n):
  p=round(mt.factorial(n)/(mt.factorial(x)*mt.factorial(n-x))*0.5**x*0.5**(n-x),4)
  return p

def get_2side_pvalue(x,n):
  p=0
  for i in range(0,x+1):
    p = p+get_prob(i,n)
  return 2*p

In [59]:
print(get_2side_pvalue(GC_x,n))
print(get_2side_pvalue(NC_x,n))

0.0026000000000000003
0.6774
