# A/B Testing Project

# 0. IMPORTS

In [30]:
import math
import pandas as pd
import numpy  as np

from statsmodels.stats import api as sms
from scipy.stats       import chi2_contingency

# 1. LOADING DATA

In [2]:
df_raw = pd.read_csv('../data/raw/ab_data.csv')

In [3]:
df_raw.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [4]:
df_raw.shape

(294478, 5)

# 2. DESIGN OF EXPERIMENT

## 2.1 - Hyphotesis Formulation

In [None]:
# H0: The new page conversion is yet 13%
# H1: The new page conversion is different from 13%

## 2.2 - Parameters of the experiment

In [5]:
# To define the confidence level
confidence_level = 0.95

# To define the significance level: 1 - confidence_level
significance_level = 1 - confidence_level

# conversion rate of the old page and new page
p1 = 0.13
p2 = 0.15

# To define the effect size
effect_size = sms.proportion_effectsize( p1, p2 )

# To define the statistics power (Standard is around 80%)
power = 0.8

# To calculate the sample size: What is the minimun sample size to attend the above requirements? 
min_sample_size = sms.NormalIndPower().solve_power(
                     effect_size,
                     power=power,
                     alpha=significance_level
)

min_sample_size = math.ceil(min_sample_size)

print('The minimum sample size required is: {}'.format( 2*min_sample_size) )
print('The minimum sample size required for the control group is: {}'.format( min_sample_size) )
print('The minimum sample size required for the treatment group is: {}'.format( min_sample_size) )


The minimum sample size required is: 9440
The minimum sample size required for the control group is: 4720
The minimum sample size required for the treatment group is: 4720


# 3.0 - EXPLORATORY DATA ANALYSIS

In [None]:
n_sample = 4720

In [None]:
# We should be aware that to get 10.000 answers it is necessary to calculate the number of people that should be contacted. It is not everyone that will
# open an email or even reply the research, for example.

Size of the dataset:

In [6]:
print('Number of rows: {}'.format( df_raw.shape[0] ) )
print('Number of columns: {}'.format( df_raw.shape[1] ) )

Number of rows: 294478
Number of columns: 5


Is there NaN Values? ( NaN is an acronimum for is not a number)

In [7]:
df_raw.isna().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

Checking flags of A/B Test

In [8]:
df_raw[['group','landing_page','user_id']].groupby(['group','landing_page']).count().reset_index()

Unnamed: 0,group,landing_page,user_id
0,control,new_page,1928
1,control,old_page,145274
2,treatment,new_page,145311
3,treatment,old_page,1965


There are some users that got access to the two pages:
- users classified as "control" has to had seen just the old page
- users classified as "treatment" has to had seen just the new page

To fix it:
- check the volum of duplicated values and if the volum is small, remove it.

In [9]:
df_duplicated = df_raw[['group','landing_page','user_id']].groupby(['user_id']).count().reset_index().query('group > 1')['user_id']
df1 = df_raw[~df_raw['user_id'].isin(df_duplicated)]
df1.shape

(286690, 5)

## 3.1 - Treatment and Control datasets creation

In [11]:
n_sample = 4720

In [18]:
# Control dataset creation: as previously seen, the size is 4720 samples
df_control = df1[df1['group'] == 'control'].sample( n=n_sample, random_state=42 )
print('The size of control dataset is: {}'.format(df_control.shape[0] ) )

# Treatment dataset creation: as previously seen, the size is 4720 samples
df_treatment = df1[df1['group'] == 'treatment'].sample( n=n_sample, random_state=42 )
print('The size of treatment dataset is: {}'.format(df_treatment.shape[0] ) )


# Concat both dataset
df_abtest = pd.concat( [df_control, df_treatment]).reset_index(drop=True)

The size of control dataset is: 4720
The size of treatment dataset is: 4720


## 3.2 Calculus of metric between groups

In [19]:
# ------------ CONTROL ------------
sales = df_control.loc[df_control['converted'] == 1, 'converted'].sum()
visit = len(df_control)

conversion_rate_c = sales / visit
print('Conversion rate - Control group: {:.4f}'.format( conversion_rate_c ) )

# ------------ TREATMENT------------
sales = df_treatment.loc[df_treatment['converted'] == 1, 'converted'].sum()
visit = len(df_treatment)

conversion_rate_c = sales / visit
print('Conversion rate - Treatment group: {:.4f}'.format( conversion_rate_c ) )

# ------------ CONVERSION = BUYERS / SELLERS
df_ab_table = df_abtest[['group','converted']].groupby( 'group' ).agg( {'converted': ['sum', 'count'] })
df_ab_table.columns = ['converted', 'non_converted']

# Hypothesis Testing
chi_val, pval, dof, expected = chi2_contingency( df_ab_table )

print( 'p-value: {:.2f}'.format( pval ))

if pval < significance_level:
    print( 'Reject H0')
else:
    print( 'Do not reject H0')

Conversion rate - Control group: 0.1155
Conversion rate - Treatment group: 0.1290
p-value: 0.08
Do not reject H0


In [20]:
df_ab_table

Unnamed: 0_level_0,converted,non_converted
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,545,4720
treatment,609,4720


# 4.0 - Conversao da pagina em faturamento

In [21]:
df4 = df1.copy()

In [None]:
pagina_atual = 13
pagina_nova = 15

In [None]:
n de visitantes diarios * 0.13 
gmv = compradores * 4500

In [24]:
# Tratando a coluna data
df4['timestamp'] = pd.to_datetime( df4['timestamp'] ).apply( lambda x: x.strftime( '%Y-%m-%d' ) )

In [34]:
df5 = df4[['user_id', 'timestamp']].groupby( 'timestamp' ).count().reset_index()

# Current GMV (Gross Margin Revenue)
df5['current_purchases'] = np.ceil( df5['user_id'] * 0.13 ).astype( int )
df5['current_gmv'] = df5['current_purchases'] * 4500

current_gmv = df5['current_gmv'].sum()
print( 'GMV On Period: {}'.format( current_gmv ))

# Expected GMV
df5['new_purchases'] = np.ceil( df5['user_id'] * 0.15 ).astype( int )
df5['new_gmv'] = df5['new_purchases'] * 4500

new_gmv = df5['new_gmv'].sum()
print( 'New GMV On Period: {}'.format( new_gmv ))

lift = 100*( new_gmv - current_gmv ) / current_gmv
print( 'Expected Lift: {:.2f}%'.format( lift ))

GMV On Period: 167760000
New GMV On Period: 193563000
Expected Lift: 15.38%
