In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Read the CSV without treating its first row as a header, then name the
# four columns by hand.
data = pd.read_csv('effect_tb.csv', header=None)
data.columns = ['dt', 'user_id', 'label', 'dmp_id']
# Preview the first rows to confirm the column assignment.
data.head()

Unnamed: 0,dt,user_id,label,dmp_id
0,1,1,0,1
1,1,1000004,0,1
2,1,1000004,0,2
3,1,1000006,0,1
4,1,1000006,0,3


In [3]:
# Remove the 'dt' column. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: running it a second time no
# longer raises KeyError because 'dt' is already gone.
data = data.drop(columns='dt', errors='ignore')

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

Unnamed: 0,user_id,label,dmp_id
count,2645958.0,2645958.0,2645958.0
mean,3112995.0,0.01456297,1.395761
std,1828262.0,0.1197952,0.692048
min,1.0,0.0,1.0
25%,1526772.0,0.0,1.0
50%,3062184.0,0.0,1.0
75%,4721132.0,0.0,2.0
max,6265402.0,1.0,3.0


In [5]:
# (rows, columns) of the frame.
data.shape

(2645958, 3)

In [6]:
# Number of distinct values per column.
data.nunique()

user_id    2410683
label            2
dmp_id           3
dtype: int64

In [7]:
# Flag every row that is part of an exact duplicate (keep=False marks all
# copies, not just the repeats) and list them ordered by user.
dup_mask = data.duplicated(keep=False)
data.loc[dup_mask].sort_values('user_id')

Unnamed: 0,user_id,label,dmp_id
8529,1027,0,1
1485546,1027,0,1
1579415,1471,0,1
127827,1471,0,1
404862,2468,0,1
1779459,2468,0,1
666354,3538,0,1
2002511,3538,0,1
694759,3645,0,1
2022683,3645,0,1


In [8]:
# Keep the first occurrence of each fully identical row and discard the rest.
data = data.drop_duplicates(keep='first')
# Sanity check: selecting duplicated rows should now yield an empty frame.
data.loc[data.duplicated(keep=False)]

Unnamed: 0,user_id,label,dmp_id


In [9]:
# Column dtypes plus non-null counts. `show_counts` replaces the
# `null_counts` keyword, which was deprecated and later removed from pandas;
# the flag is passed explicitly so the counts are always printed.
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2632975 entries, 0 to 2645957
Data columns (total 3 columns):
user_id    2632975 non-null int64
label      2632975 non-null int64
dmp_id     2632975 non-null int64
dtypes: int64(3)
memory usage: 80.4 MB


In [10]:
# Row counts for every (dmp_id, label) pair, with 'All' totals on both margins.
data.pivot_table(
    values='user_id',
    index='dmp_id',
    columns='label',
    aggfunc='count',
    margins=True,
)

label,0,1,All
dmp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1881745,23918,1905663
2,404811,6296,411107
3,307923,8282,316205
All,2594479,38496,2632975


In [11]:
# Per-column dtypes.
data.dtypes

user_id    int64
label      int64
dmp_id     int64
dtype: object

In [12]:
# sample size calculator
# https://www.evanmiller.org/ab-testing/sample-size.html

In [13]:
# Mean of the 0/1 label among rows with dmp_id == 1.
data['label'][data['dmp_id'] == 1].mean()

0.012551012429794775

In [14]:
# Number of rows in each dmp_id group.
data['dmp_id'].value_counts()

1    1905663
2     411107
3     316205
Name: dmp_id, dtype: int64

In [29]:
# Mean of the 0/1 label per dmp_id group. dmp_id 1 is reported as the control,
# dmp_id 2 as plan 1 and dmp_id 3 as plan 2.
is_control = data['dmp_id'] == 1
is_plan_1 = data['dmp_id'] == 2
is_plan_2 = data['dmp_id'] == 3

rc = data.loc[is_control, 'label'].mean()
r1 = data.loc[is_plan_1, 'label'].mean()
r2 = data.loc[is_plan_2, 'label'].mean()

# The "raised" figure is the absolute difference from the control rate.
print('control:{:.2%}'.format(rc))
for plan_line, plan_rate in (('plan 1:{:.2%}, raised {:.2%}', r1),
                             ('plan 2:{:.2%}, raised {:.2%}', r2)):
    print(plan_line.format(plan_rate, plan_rate - rc))

control:1.26%
plan 1:1.53%, raised 0.28%
plan 2:2.62%, raised 1.36%


In [31]:
# only plan 2 achieves a raise of at least 1 percentage point over control
# hypothesis testing
# test 1 -> H0: rc >= r1, H1: rc < r1
# test 2 -> H0: rc >= r2, H1: rc < r2

In [32]:
# method 1
# Group sizes for dmp_id 1 (control), 2 (plan 1) and 3 (plan 2).
# Series.sum() is vectorised; the builtin sum() would walk every row in
# Python, which is needlessly slow on a frame of this size.
nc = (data['dmp_id'] == 1).sum()
n1 = (data['dmp_id'] == 2).sum()
n2 = (data['dmp_id'] == 3).sum()

# Sum of the 0/1 label within each group, i.e. the number of rows with label == 1.
ncc = data.loc[data['dmp_id'] == 1, 'label'].sum()
nc1 = data.loc[data['dmp_id'] == 2, 'label'].sum()
nc2 = data.loc[data['dmp_id'] == 3, 'label'].sum()

# Pooled rate of the control group combined with each plan; used as the
# common proportion in the z statistic.
rtest1 = (ncc + nc1) / (nc + n1)
rtest2 = (ncc + nc2) / (nc + n2)

print(rtest1, rtest2)

0.013041432684297536 0.014492310074225832


In [33]:
# Two-proportion z statistics: difference between the control rate and each
# plan's rate, divided by the standard error built from the pooled rate.
se1 = np.sqrt(rtest1 * (1 - rtest1) * (1 / nc + 1 / n1))
se2 = np.sqrt(rtest2 * (1 - rtest2) * (1 / nc + 1 / n2))
z1 = (rc - r1) / se1
z2 = (rc - r2) / se2
print(z1, z2)

-14.165873564308429 -59.44168632985996


In [34]:
# one-sided test
# Critical value for a left-tailed test: the 5th percentile of the standard
# normal distribution. A z statistic below this value rejects H0 at the 5% level.
from scipy.stats import norm
z_alpha = norm.ppf(0.05)
z_alpha

-1.6448536269514729

In [37]:
# method 2
# Cross-check the hand-computed z statistics with statsmodels.
import statsmodels.stats.proportion as sp

# alternative='smaller' is the one-sided test that the first proportion
# (control) is smaller than the second (the plan).
control_vs_plan_1 = sp.proportions_ztest(
    count=[ncc, nc1], nobs=[nc, n1], alternative='smaller'
)
control_vs_plan_2 = sp.proportions_ztest(
    count=[ncc, nc2], nobs=[nc, n2], alternative='smaller'
)
z_score_1, p1 = control_vs_plan_1
z_score_2, p2 = control_vs_plan_2
print('test 1:', z_score_1, p1)
print('test 2:', z_score_2, p2)

test 1: -14.165873564308429 7.450121742737582e-46
test 2: -59.44168632985996 0.0


In [35]:
# z1 < z_alpha and z2 < z_alpha, so we reject the null hypothesis in both test 1 and test 2

In [None]:
# plan 2 is better