# A/B Testing

## Test of Means Analysis
Statistical calculation of whether the mean values of the treatment and control groups are the same

## T-Test
Gives a p-value: the probability of observing a difference between the means at least as large as the one measured if the true difference were zero.  
p-value < 0.05 (statistically significant)  
The t-test is especially useful when working with small data sets.

There are three types of t-tests you can use:  
1. Paired  
2. Equal variance  
3. Unequal variance  

In a randomized experiment you usually cannot assume that the two groups have equal variances, so we’ll use an unequal-variance (Welch's) t-test.


In [1]:
# load package
import pandas as pd
from scipy.stats import ttest_ind

## Super Hero Data

In [2]:
# Load the raw super hero workbook and display it. The sheet carries extra
# unnamed columns and non-hero rows (an "Average" row and a blank row), so it
# is trimmed in the following cells before any testing.
super_hero = pd.read_excel(io='superherodata.xlsx')
super_hero

Unnamed: 0,Name,Alignment,Intelligence,Strength,Speed,Durability,Power,Combat,Unnamed: 8,Hero,Rival,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,Iron Man,good,100.0,85.0,58.0,85.0,100.0,64.0,,Iron Man,Ultron,,,,
1,Captain America,good,63.0,65.0,35.0,56.0,46.0,100.0,,Captain America,Red Skull,,,,
2,Hulk,good,88.0,100.0,47.0,100.0,41.0,85.0,,Hulk,Abomination,,,,
3,Hawkeye,good,62.0,12.0,23.0,14.0,26.0,80.0,,Hawkeye,Bullseye,,,,
4,Thor,good,69.0,100.0,92.0,100.0,100.0,85.0,,Thor,Loki,,,,
5,Spider-Man,good,88.0,55.0,60.0,74.0,58.0,85.0,,Spider-Man,Green Goblin,,,,
6,Average,,78.333333,69.5,52.5,71.5,61.833333,83.166667,,,,,,,
7,,,,,,,,,,,,,,,
8,Ultron,bad,95.0,83.0,42.0,100.0,100.0,64.0,,,,,,,
9,Red Skull,bad,90.0,30.0,12.0,14.0,19.0,80.0,,,,,,,


In [3]:
# The first eight columns (Name through Combat) hold the character attributes.
# The remaining columns (blank "Unnamed" columns and the Hero/Rival pairing)
# are not used in this analysis.
useful_columns = super_hero.columns[:8].tolist()
useful_columns

['Name',
 'Alignment',
 'Intelligence',
 'Strength',
 'Speed',
 'Durability',
 'Power',
 'Combat']

In [4]:
# Keep only the attribute columns, then drop every row that has a missing
# value. This discards the spreadsheet's "Average" summary row and the blank
# separator row, leaving one row per character.
super_hero_clean_data = super_hero.loc[:, useful_columns].dropna()
super_hero_clean_data

Unnamed: 0,Name,Alignment,Intelligence,Strength,Speed,Durability,Power,Combat
0,Iron Man,good,100.0,85.0,58.0,85.0,100.0,64.0
1,Captain America,good,63.0,65.0,35.0,56.0,46.0,100.0
2,Hulk,good,88.0,100.0,47.0,100.0,41.0,85.0
3,Hawkeye,good,62.0,12.0,23.0,14.0,26.0,80.0
4,Thor,good,69.0,100.0,92.0,100.0,100.0,85.0
5,Spider-Man,good,88.0,55.0,60.0,74.0,58.0,85.0
8,Ultron,bad,95.0,83.0,42.0,100.0,100.0,64.0
9,Red Skull,bad,90.0,30.0,12.0,14.0,19.0,80.0
10,Abomination,bad,85.0,80.0,53.0,90.0,55.0,95.0
11,Bullseye,bad,75.0,11.0,25.0,70.0,20.0,70.0


In [5]:
# Attribute scores of the "good" characters, used as one sample of the t-test.
# Columns are selected by what the test needs (numeric dtypes) instead of by
# excluding object dtype: a text column stored with a non-object dtype (for
# example pandas' dedicated string dtype) would survive `exclude='O'` and then
# break the numeric test.
is_good = super_hero_clean_data['Alignment'] == 'good'
good = super_hero_clean_data.loc[is_good].select_dtypes(include='number')
good

Unnamed: 0,Intelligence,Strength,Speed,Durability,Power,Combat
0,100.0,85.0,58.0,85.0,100.0,64.0
1,63.0,65.0,35.0,56.0,46.0,100.0
2,88.0,100.0,47.0,100.0,41.0,85.0
3,62.0,12.0,23.0,14.0,26.0,80.0
4,69.0,100.0,92.0,100.0,100.0,85.0
5,88.0,55.0,60.0,74.0,58.0,85.0


In [6]:
# Attribute scores of the "bad" characters, used as the other sample of the
# t-test. Columns are selected by what the test needs (numeric dtypes) instead
# of by excluding object dtype: a text column stored with a non-object dtype
# (for example pandas' dedicated string dtype) would survive `exclude='O'` and
# then break the numeric test.
is_bad = super_hero_clean_data['Alignment'] == 'bad'
bad = super_hero_clean_data.loc[is_bad].select_dtypes(include='number')
bad

Unnamed: 0,Intelligence,Strength,Speed,Durability,Power,Combat
8,95.0,83.0,42.0,100.0,100.0,64.0
9,90.0,30.0,12.0,14.0,19.0,80.0
10,85.0,80.0,53.0,90.0,55.0,95.0
11,75.0,11.0,25.0,70.0,20.0,70.0
12,87.0,57.0,47.0,85.0,85.0,56.0
13,85.0,35.0,35.0,48.0,38.0,28.0


In [7]:
# Unequal-variance t-test of good vs. bad characters. Both inputs are
# DataFrames, so the test runs once per column and the result arrays follow
# the column order (Intelligence, Strength, Speed, Durability, Power, Combat).
sp = ttest_ind(good, bad, equal_var=False)
for label, value in (('Statistic:', sp.statistic), ('pvalue:', sp.pvalue)):
    print(label, value)

Statistic: [-1.12050955  1.11471746  1.45838682  0.19621283  0.47871287  1.69207353]
pvalue: [0.30094185 0.2916016  0.18086733 0.84837569 0.64249665 0.13204121]


## Customer Support Time Study Data

In [8]:
# Load the customer-support time study. header=2 uses the third spreadsheet
# row as the column header and ignores the rows above it. The sheet contains
# two stacked tables, so the raw frame still includes blank rows and a
# repeated header row that the next cell removes.
customer_data = pd.read_excel(
    io='customersupporttimestudydata.xlsx',
    header=2,
)
customer_data

Unnamed: 0,Customer Onboarding Process,Employee 1,Employee 2,Employee 3,Employee 4,Employee 5,Employee 6,Employee 7,Employee 8,Employee 9,Employee 10
0,Identify Key Customer Contacts,1.82,2.89,3.75,1.5,3.59,3.24,1.33,2.48,2.52,3.98
1,Assign Customer Support Resources,0.86,0.44,0.78,0.73,0.51,0.65,0.67,0.67,0.89,0.36
2,Enter Customer Information into Database,1.48,1.98,1.59,1.34,1.03,1.58,1.33,1.86,1.18,1.67
3,Establish Success Metrics,10.9,10.12,9.79,11.53,11.03,9.77,10.37,10.89,10.71,10.26
4,,,,,,,,,,,
5,,,,,,,,,,,
6,,,,,,,,,,,
7,,Joe,,,,,,,,,
8,Customer Onboarding Process,Employee 1,Employee 2,Employee 3,Employee 4,Employee 5,Employee 6,Employee 7,Employee 8,Employee 9,Employee 10
9,Identify Key Customer Contacts,4.44,4.83,2.85,4.34,4.46,3,3.27,4.51,2.2,4.74


In [9]:
# Reduce the raw sheet to a purely numeric table of timings (one row per
# process step, one column per employee).
#
# The cell overwrites `customer_data`, so it must be safe to run twice: the
# guard skips the cleaning once the label column has already been removed.
# The repeated header row of the second table is located by its content (its
# label cell repeats the column name) rather than by a hard-coded row index.
process_col = 'Customer Onboarding Process'
if process_col in customer_data.columns:
    customer_data = (
        customer_data
        .dropna()                                        # blank / partially filled separator rows
        .loc[lambda df: df[process_col] != process_col]  # repeated header row
        .drop(columns=process_col)                       # text labels are not needed for the test
        .astype('float')
    )
customer_data

Unnamed: 0,Employee 1,Employee 2,Employee 3,Employee 4,Employee 5,Employee 6,Employee 7,Employee 8,Employee 9,Employee 10
0,1.82,2.89,3.75,1.5,3.59,3.24,1.33,2.48,2.52,3.98
1,0.86,0.44,0.78,0.73,0.51,0.65,0.67,0.67,0.89,0.36
2,1.48,1.98,1.59,1.34,1.03,1.58,1.33,1.86,1.18,1.67
3,10.9,10.12,9.79,11.53,11.03,9.77,10.37,10.89,10.71,10.26
9,4.44,4.83,2.85,4.34,4.46,3.0,3.27,4.51,2.2,4.74
10,0.65,0.59,0.34,0.01,0.64,0.01,0.11,0.14,0.04,0.54
11,2.17,1.79,1.84,1.69,2.02,1.65,1.46,1.65,2.06,1.55
12,12.27,10.2,9.27,9.79,9.89,10.68,12.21,9.91,12.95,10.15


In [10]:
# Total onboarding time per employee for each of the two stacked tables:
# the first four rows are summed into `nataly`, the remaining rows into `joe`.
# NOTE(review): presumably the first table belongs to "Nataly" (only the
# "Joe" label is visible in the loaded frame) - confirm against the workbook.
nataly = customer_data.iloc[:4].sum().to_numpy()
joe = customer_data.iloc[4:].sum().to_numpy()
joe

array([19.53, 17.41, 14.3 , 15.83, 17.01, 15.34, 17.05, 16.21, 17.25,
       16.98])

In [12]:
# Unequal-variance t-test on the per-employee totals. `joe` is passed first,
# so a positive statistic means Joe's mean total time is the larger one.
sp = ttest_ind(joe, nataly, equal_var=False)
for label, value in (('Statistic:', sp.statistic), ('pvalue:', sp.pvalue)):
    print(label, value)

Statistic: 2.5569719803766966
pvalue: 0.02312427188943783


In [None]:
# Randomized Design Tests
# Load the grocery website A/B test data and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='grocerywebsiteabtestdata.csv')
data.head(n=5)

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes).
data.info()