In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency, beta, lognorm, norm, invgamma

In [5]:
df = pd.read_csv('ab_data.csv')

In [6]:
df.sample(20)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
135173,918691,2017-01-07 18:49:26.579183,treatment,new_page,0
54671,921263,2017-01-08 00:55:04.657552,control,old_page,0
239298,838994,2017-01-12 13:44:08.943994,treatment,new_page,0
231675,731889,2017-01-18 01:27:08.845685,treatment,new_page,0
148057,764318,2017-01-10 00:50:13.801442,treatment,new_page,0
27307,934834,2017-01-17 12:56:19.793523,control,old_page,0
191167,861558,2017-01-20 04:17:31.972816,control,old_page,0
93108,743874,2017-01-12 07:24:39.780380,treatment,new_page,0
217384,764323,2017-01-03 13:56:36.521688,control,old_page,0
153550,805150,2017-01-09 17:45:06.965068,control,old_page,0


In [44]:
for column in df.columns:
    print(column)
    print(df[column].unique())

user_id
[851104 804228 661590 ... 734608 697314 715931]
timestamp
['2017-01-21 22:11:48.556739' '2017-01-12 08:01:45.159739'
 '2017-01-11 16:55:06.154213' ... '2017-01-22 11:45:03.439544'
 '2017-01-15 01:20:28.957438' '2017-01-16 12:40:24.467417']
group
['control' 'treatment']
landing_page
['old_page' 'new_page']
converted
[0 1]
week
[3 2 1 4]


In [7]:
df_control_converted = df[(df['group'] == 'control') & (df['converted'] == 1)]

In [8]:
df_treatment_converted = df[(df['group'] == 'treatment') & (df['converted'] == 1)]

In [9]:
df_treatment_unconverted = df[(df['group'] == 'treatment') & (df['converted'] == 0)]

In [10]:
df_control_unconverted = df[(df['group'] == 'control') & (df['converted'] == 0)]

In [11]:
len(df_treatment_converted)/len(df_treatment_unconverted)

0.13497017616867804

In [12]:
len(df_control_converted) / len(df_control_unconverted)

0.1368793395067926

In [13]:
(len(df_treatment_converted)/len(df_treatment_unconverted)) - (len(df_control_converted) / len(df_control_unconverted))

-0.0019091633381145556

In [48]:
counter = df['user_id'].value_counts()
#counter

851104    1
701718    1
646178    1
715222    1
886963    1
         ..
683486    1
671693    1
875919    1
701344    1
715931    1
Name: user_id, Length: 286690, dtype: int64

In [15]:
valid_users = pd.DataFrame(counter[counter == 1].index, columns=['user_id'])
df =df.merge(valid_users, on='user_id')

In [47]:
df

Unnamed: 0,user_id,timestamp,group,landing_page,converted,week
0,851104,2017-01-21 22:11:48.556739,control,old_page,0,3
1,804228,2017-01-12 08:01:45.159739,control,old_page,0,2
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0,2
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0,1
4,864975,2017-01-21 01:52:26.210827,control,old_page,1,3
...,...,...,...,...,...,...
286685,751197,2017-01-03 22:28:38.630509,control,old_page,0,1
286686,945152,2017-01-12 00:51:57.078372,control,old_page,0,2
286687,734608,2017-01-22 11:45:03.439544,control,old_page,0,3
286688,697314,2017-01-15 01:20:28.957438,control,old_page,0,2


In [49]:
import datetime
df['week'] = df['timestamp'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f').isocalendar().week)

In [51]:
start_time = datetime.datetime.strptime(df['timestamp'].min(), '%Y-%m-%d %H:%M:%S.%f')
end_time = datetime.datetime.strptime(df['timestamp'].max(), '%Y-%m-%d %H:%M:%S.%f')
data_duration = (end_time - start_time).days
#print(start_time, end_time)

2017-01-02 13:42:05.378582 2017-01-24 13:41:54.460509


In [20]:
df[df['group'] == 'control'].shape[0]*100/ df.shape[0]

49.98186194146988

In [21]:
df['week'].value_counts()

2    91380
3    91056
1    83745
4    20509
Name: week, dtype: int64

In [22]:
control = df[df['group'] == 'control']

In [23]:
treatment = df[df['group'] == 'treatment']

In [24]:
control_conversion = control['converted'].sum()*100/control['converted'].count()

In [25]:
control_conversion 

12.017335110577628

In [26]:
treatment_conversion = treatment['converted'].sum()*100/treatment['converted'].count()

In [27]:
control_conversion - treatment_conversion

0.14470179188895216

In [28]:
treatment_converted = treatment['converted'].sum()
treatment_unconverted = treatment['converted'].count() - treatment['converted'].sum()
control_converted = control['converted'].sum() 
control_unconverted = control['converted'].count() - control['converted'].sum()
print(treatment_converted, control_converted, treatment_unconverted, control_unconverted)


17025 17220 126372 126073


In [29]:
contingency_table = np.array([[control_converted, control_unconverted],[treatment_converted, treatment_unconverted]])

In [30]:
chi2, p_value, _, _ = chi2_contingency(contingency_table, correction=False)

In [31]:
contingency_table

array([[ 17220, 126073],
       [ 17025, 126372]], dtype=int64)

In [32]:
chi2, p_value

(1.426794609399621, 0.23228827305833816)

In [33]:
prior_data = df[(df['week'] == 3) & (df['group']=='control')]

In [34]:
prior_means = []
for i in range(1000):
    prior_means.append(prior_data.sample(1000)['converted'].mean())

In [35]:
prior_means

[0.11,
 0.113,
 0.124,
 0.103,
 0.131,
 0.122,
 0.122,
 0.136,
 0.138,
 0.125,
 0.113,
 0.111,
 0.13,
 0.131,
 0.131,
 0.12,
 0.136,
 0.128,
 0.13,
 0.118,
 0.102,
 0.141,
 0.15,
 0.129,
 0.145,
 0.148,
 0.113,
 0.134,
 0.118,
 0.136,
 0.118,
 0.102,
 0.124,
 0.127,
 0.108,
 0.118,
 0.13,
 0.119,
 0.135,
 0.12,
 0.128,
 0.127,
 0.121,
 0.116,
 0.106,
 0.126,
 0.111,
 0.132,
 0.12,
 0.123,
 0.115,
 0.141,
 0.112,
 0.125,
 0.116,
 0.124,
 0.112,
 0.122,
 0.138,
 0.12,
 0.123,
 0.119,
 0.118,
 0.138,
 0.126,
 0.114,
 0.119,
 0.123,
 0.126,
 0.127,
 0.127,
 0.123,
 0.127,
 0.122,
 0.141,
 0.111,
 0.106,
 0.106,
 0.114,
 0.111,
 0.106,
 0.127,
 0.118,
 0.134,
 0.117,
 0.12,
 0.126,
 0.104,
 0.106,
 0.121,
 0.117,
 0.118,
 0.123,
 0.121,
 0.134,
 0.123,
 0.129,
 0.129,
 0.14,
 0.137,
 0.135,
 0.108,
 0.124,
 0.123,
 0.097,
 0.112,
 0.116,
 0.128,
 0.127,
 0.131,
 0.13,
 0.113,
 0.125,
 0.136,
 0.127,
 0.126,
 0.126,
 0.119,
 0.134,
 0.132,
 0.12,
 0.123,
 0.117,
 0.119,
 0.106,
 0.127,
 0.11

In [36]:
alpha, beta1, _, _ = beta.fit(prior_means, floc=0, fscale=1)


In [37]:
number_of_weeks = 4
experiment_data = df[df['week'] < number_of_weeks]
control = experiment_data[experiment_data['group'] == 'control']
treatment = experiment_data[experiment_data['group'] == 'treatment']

In [38]:
posterior_control = beta(alpha + control_converted, beta1 + control_unconverted)
posterior_treatment = beta(alpha + treatment_converted, beta1 + treatment_unconverted)


In [39]:
posterior_control
posterior_treatment

<scipy.stats._distn_infrastructure.rv_continuous_frozen at 0x27b75582b80>

In [40]:
control_samples = posterior_control.rvs(1000)
treatment_samples = posterior_treatment.rvs(1000)

In [41]:
prob = np.mean(control_samples > treatment_samples)
prob

0.896

In [42]:
increase_percentage = (treatment_samples - control_samples)/ control_samples
print(f'probability of seeing a 1 percent increase is  {np.mean((increase_percentage * 100) > 1)* 100}%')

probability of seeing a 1 percent increase is  1.4000000000000001%


In [43]:
increase_percentage

array([-1.46626705e-02, -1.42333885e-02, -3.15491072e-02, -1.11868877e-02,
        3.07502211e-03, -2.51777632e-03, -1.04874282e-02, -1.22649266e-02,
        4.65761602e-03, -1.06323559e-02,  2.81615983e-03, -5.42461694e-03,
       -1.44424767e-02, -1.17775274e-02, -6.43720125e-04, -4.49430729e-03,
       -6.27489407e-03, -1.83787122e-02, -8.46312444e-04, -2.60595497e-02,
       -1.65413875e-02, -2.47299520e-02, -2.53495885e-02, -1.10429178e-02,
       -1.16256780e-02, -1.09634816e-02, -1.21335432e-02, -1.95550637e-02,
       -1.07341494e-02,  1.21384920e-02, -1.60487319e-02, -9.73863974e-03,
       -1.95163953e-02, -8.91569824e-03, -1.26928598e-03,  1.11229656e-03,
       -2.43397830e-02, -6.63116148e-03, -1.73177111e-02, -3.76084747e-03,
       -1.81129796e-02, -2.16965514e-02,  5.16467594e-03, -2.62921553e-02,
       -2.36284874e-03, -6.88244618e-03,  1.52447056e-03, -2.48793940e-02,
       -1.38842557e-02, -2.13104895e-02, -2.92348567e-03, -1.94977572e-02,
       -5.10188570e-03, -