In [29]:
import pandas as pd

In [30]:
# Load the two customer tables from CSV files in the working directory:
# one for desktop customers and one for laptop customers.
desktop = pd.read_csv('desktop.csv')
laptop = pd.read_csv('laptop.csv')

In [31]:
# Preview the first three desktop rows to check the columns that were loaded.
desktop.head(3)

Unnamed: 0,userid,spending,age,visits
0,1,1250,31,126
1,2,900,27,5
2,3,0,30,459


In [32]:
# Preview the first three laptop rows; the columns should match the desktop table.
laptop.head(3)

Unnamed: 0,userid,spending,age,visits
0,31,1499,32,12
1,32,799,23,40
2,33,1200,45,22


In [33]:
from scipy.stats import ttest_ind

In [34]:
# Independent two-sample t-test (scipy defaults) on spending: desktop vs. laptop.
# A negative statistic means the first sample (desktop) has the lower mean.
ttest_ind(desktop['spending'],laptop['spending'])

TtestResult(statistic=-2.109853741030508, pvalue=0.03919630411621095, df=58.0)

In [35]:
# Same two-sample t-test, now comparing customer age between the two tables.
ttest_ind(desktop['age'],laptop['age'])

TtestResult(statistic=-0.7101437106800108, pvalue=0.4804606394128761, df=58.0)

In [36]:
# Same two-sample t-test, now comparing the number of visits between the two tables.
ttest_ind(desktop['visits'],laptop['visits'])

TtestResult(statistic=0.20626752311535543, pvalue=0.8373043059847984, df=58.0)

## Running experiments to test new hypotheses

In [37]:
import numpy as np

In [38]:
# Partition the desktop customers at the median age:
#   groupa -> customers whose age is at or below the median
#   groupb -> customers whose age is strictly above the median
ages = desktop['age']
medianage = np.median(ages)
at_or_below_median = ages <= medianage
above_median = ages > medianage
groupa = desktop.loc[at_or_below_median, :]
groupb = desktop.loc[above_median, :]

In [39]:
# Load the outcome of the first email experiment; it is joined to the
# desktop groups on userid further down.
emailresults1 = pd.read_csv('emailresults1.csv')

In [40]:
# Preview the first three result rows (the join key and the revenue column).
emailresults1.head(3)

Unnamed: 0,userid,revenue
0,1,100
1,2,0
2,3,50


In [41]:
# Attach each customer's email-experiment revenue to both age groups.
# The join is pandas' default inner join on userid, so a customer without
# a matching row in emailresults1 is dropped from the merged frame.
groupa_withrevenue = pd.merge(groupa, emailresults1, on='userid')
groupb_withrevenue = pd.merge(groupb, emailresults1, on='userid')

In [42]:
# Two-sample t-test on revenue: at-or-below-median-age group (groupa) vs.
# above-median-age group (groupb).
ttest_ind(groupa_withrevenue['revenue'], groupb_withrevenue['revenue'])

TtestResult(statistic=-2.186454851070545, pvalue=0.03730073920038287, df=28.0)

In [43]:
# Difference in mean revenue, older group (groupb) minus younger group (groupa).
print(np.mean(groupb_withrevenue['revenue'])-np.mean(groupa_withrevenue['revenue']))

125.0


In [44]:
# Mean revenue of the at-or-below-median-age group.
print(np.mean(groupa_withrevenue['revenue']))

104.0


In [45]:
# Mean revenue of the above-median-age group.
print(np.mean(groupb_withrevenue['revenue']))

229.0


## Translating math into practice

In [46]:
# Randomly assign every laptop customer to one of two groups.
# The fixed seed makes the assignment repeatable from a fresh kernel.
np.random.seed(18811015)
draws = np.random.random(len(laptop.index))
# A uniform draw above 0.5 maps to 1, anything else to 0.
laptop.loc[:, 'groupassignment1'] = 1 * (draws > 0.5)
assignment = laptop['groupassignment1']
# Copy each slice so later edits to the groups cannot touch `laptop`.
groupc = laptop.loc[assignment == 0, :].copy()
groupd = laptop.loc[assignment == 1, :].copy()

In [47]:
# Preview group C: every row should carry groupassignment1 == 0.
groupc.head(3)

Unnamed: 0,userid,spending,age,visits,groupassignment1
0,31,1499,32,12,0
2,33,1200,45,22,0
4,35,1350,17,85,0


In [48]:
# Preview group D: every row should carry groupassignment1 == 1.
groupd.head(3)

Unnamed: 0,userid,spending,age,visits,groupassignment1
1,32,799,23,40,1
3,34,0,59,126,1
6,37,3400,65,428,1


In [50]:
# Load the outcome of the second email experiment; it is joined to the
# randomly assigned laptop groups on userid further down.
emailresults2 = pd.read_csv('emailresults2.csv')

In [51]:
# Preview the first three result rows (the join key and the revenue column).
emailresults2.head(3)

Unnamed: 0,userid,revenue
0,31,100
1,32,0
2,33,50


In [52]:
# Attach each customer's revenue from the second experiment to both random groups.
# The join is pandas' default inner join on userid, so a customer without
# a matching row in emailresults2 is dropped from the merged frame.
groupc_withrevenue = pd.merge(groupc, emailresults2, on='userid')
groupd_withrevenue = pd.merge(groupd, emailresults2, on='userid')

In [53]:
# Two-sample t-test on revenue between the randomly assigned groups C and D.
print(ttest_ind(groupc_withrevenue['revenue'], groupd_withrevenue['revenue']))

TtestResult(statistic=-2.381320497676198, pvalue=0.024288828555138562, df=28.0)


In [54]:
# Difference in mean revenue, group D minus group C.
# Each mean is taken separately and then subtracted, matching the
# groupb-minus-groupa calculation earlier in the notebook. The previous form
# subtracted group C's mean from every group D row before averaging; that
# gives the same number up to float rounding but hides the intent.
print(np.mean(groupd_withrevenue['revenue'])-np.mean(groupc_withrevenue['revenue']))

260.3333333333333


## Understanding Effect Sizes

In [61]:
# Three example GDP figures spanning several orders of magnitude; used below
# to show how the scale of the data changes a standardized effect size.
gdps = [
    365_303_000_000,
    65_994_000_000,
    220_000_000,
]

In [62]:
# Spread of the GDP figures. np.std defaults to the population standard
# deviation (ddof=0), not the sample one.
np.std(gdps)

158884197328.32672

In [63]:
# 125 is the mean revenue difference printed for the desktop email experiment above.
# Dividing by the standard deviation expresses that same difference in units of
# this data's spread: against GDP-sized numbers it is vanishingly small.
125/np.std(gdps)

7.867365169217765e-10

In [64]:
# Three example burger prices; a small-scale counterpart to the GDP figures.
burgers = [
    9.00,
    12.99,
    10.50,
]

In [65]:
# Spread of the burger prices (population standard deviation, np.std default ddof=0).
np.std(burgers)

1.6455394252341695

In [66]:
# The same 125 difference measured against burger-price spread is enormous,
# showing that an effect only has meaning relative to the variation in the data.
125/np.std(burgers)

75.96293232671214

## Calculating the Significance of Data

In [67]:
from statsmodels.stats.power import TTestIndPower

In [70]:
# Statistical power of the planned A/B test.
# Power is the probability that a correctly run test rejects a null
# hypothesis that is in fact false; aim for at least 80% before proceeding.
alpha = 0.05        # significance level
nobs = 45           # observations in the first group (passed as nobs1)
effectsize = 0.5    # standardized effect size the test should detect
analysis = TTestIndPower()
# `power` is the one argument left unspecified, so solve_power returns it.
power = analysis.solve_power(effect_size=effectsize, nobs1=nobs, alpha=alpha)

In [69]:
# Show the computed power. Run this right after the calculation above:
# a later cell rebinds `power` to the 0.8 target.
power

0.6501855020289931

In [72]:
# calculate the number of observations needed to get to an 80% power
analysis = TTestIndPower()
alpha=0.05
power=0.8
effectsize=0.5
observations = analysis.solve_power(effect_size=effectsize, power=power, alpha=alpha)

In [73]:
# The solver returns a fractional count; round up to whole observations
# when planning the experiment.
print(observations)

63.765611775409525
