# 0.0 Imports

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib        import pyplot as plt
from statsmodels.stats import api    as sms
from scipy.stats       import chi2_contingency

## 0.1 Load Data

In [4]:
df_raw = pd.read_csv('../data/raw/ab_data.csv', low_memory=False)

df_raw.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


# 1.0 Test Parameters

In [None]:
# H0: A conversão da nova página é de 13%
# H1: A conversão da nova página é diferente de 13%

In [5]:
# nivel de confiança
confidence_level = 0.95

# nivel de significancia
significance_level = 0.05

# conversoes da pagina atual e da nova pagina
p1 = 0.13
p2 = 0.15

# tamanho do efeito
effect_size = sms.proportion_effectsize(p1, p2)

# poder estatístico
power = 0.80

# sample size
sample_n = sms.NormalIndPower().solve_power(effect_size=effect_size, power=power, alpha=significance_level)

sample_n = np.round(sample_n).astype(int)

In [6]:
conversion_rate = 0.10
x = sample_n / conversion_rate

print(f'O numero total da base de email para enviar: {2*int(x)}')
print(f'O numero de email para enviar: {int(x)}')

O numero total da base de email para enviar: 94380
O numero de email para enviar: 47190


In [7]:
print(f'O tamanho total da amostra: {2*sample_n}')
print(f'O tamanho da amostra do grupo controle é de: {sample_n}')
print(f'O tamanho da amostra do grupo de tratamento é de: {sample_n}')

O tamanho total da amostra: 9438
O tamanho da amostra do grupo controle é de: 4719
O tamanho da amostra do grupo de tratamento é de: 4719


# 2.0 Data Description

In [8]:
df1 = df_raw.copy()
print(f'Number of rows: {df1.shape[0]}')
print(f'Number of columns: {df1.shape[1]}')

Number of rows: 294478
Number of columns: 5


## 2.1 Check flags of A/B

In [9]:
df1[['user_id', 'group', 'landing_page']].groupby(['group', 'landing_page']).count().reset_index()

Unnamed: 0,group,landing_page,user_id
0,control,new_page,1928
1,control,old_page,145274
2,treatment,new_page,145311
3,treatment,old_page,1965


In [10]:
# deletando usuários que estiveram nos dois grupos
df_user_delete = df1[['user_id', 'group']].groupby('user_id').count().reset_index().query('group > 1')['user_id']

df1 = df1[~df1['user_id'].isin(df_user_delete)]
df1.shape

(286690, 5)

## 2.2 Sample of the groups A/B

In [11]:
# Control group
df1_control_sample = df1[df1['group'] == 'control'].sample(n=sample_n, random_state=42)
print(f'Size of Control Group: {df1_control_sample.shape[0]}')

# Treatment group
df1_treatment_sample = df1[df1['group'] == 'treatment'].sample(n=sample_n, random_state=42)
print(f'Size of Treatment Group: {df1_treatment_sample.shape[0]}')

# Total sample size
df1_ab = pd.concat([df1_control_sample, df1_treatment_sample]).reset_index(drop=True)

Size of Control Group: 4719
Size of Treatment Group: 4719


## 2.3 Calculation of interest metrics between groups

In [12]:
# Control group
sales = df1_control_sample.loc[df1_control_sample['converted'] == 1, 'converted'].sum()
visit = len(df1_control_sample)

conversion_rate_control = sales / visit
print(f'Conversion Rate - Control Group: {conversion_rate_control}')


# Treatment group
sales = df1_treatment_sample.loc[df1_treatment_sample['converted'] == 1, 'converted'].sum()
visit = len(df1_treatment_sample)

conversion_rate_control = sales / visit
print(f'Conversion Rate - Treatment Group: {conversion_rate_control}')

Conversion Rate - Control Group: 0.11549057003602459
Conversion Rate - Treatment Group: 0.1290527654164018


# 3.0 Experiment Design

In [18]:
# H0: A conversão da nova página é de 13%
# H1: A conversão da nova página é diferente de 13%

# Teste de Hipótese
df_ab = pd.concat([df1_control_sample, df1_treatment_sample])

df_table = df_ab[['group', 'converted']].groupby('group').agg({'converted': ['sum', 'count']})
df_table.columns = ['converted', 'not_converted']

chi_val, pval, dof, expected = chi2_contingency(df_table)

print(f'p-value: {pval}')

# Conclusão
if pval < significance_level:
    print('Rejeita a hipótese nula')
else:
    print('Falha em rejeitar a hipótese nula')

p-value: 0.08059550419194315
Falha em rejeitar a hipótese nula


In [21]:
# Conversão de resultado para R$
# pagina_atual = 13%
# pagina_nova = 15%

# compradores = num * 0.13
# gmv = compradores * 4500

df2 = df1.copy()

df2['timestamp'] = pd.to_datetime(df2['timestamp']).apply(lambda x: x.strftime('%Y-%m-%d'))

df2_aux = df2[['user_id', 'timestamp']].groupby('timestamp').count().reset_index()

# Current GMV
df2_aux['current_purchases'] = df2_aux['user_id'] * 0.13
df2_aux['current_gmv'] = df2_aux['current_purchases'] * 4500

current_gmv = df2_aux['current_gmv'].sum()
print(f'GMV on period: {current_gmv}')

# Expect GMV
df2_aux['new_purchases'] = df2_aux['user_id'] * 0.15
df2_aux['new_gmv'] = df2_aux['new_purchases'] * 4500

new_gmv = df2_aux['new_gmv'].sum()
print(f'New GMV on period: {new_gmv}')

lift = 100*(new_gmv - current_gmv) / current_gmv
print(f'Expected lift: {lift:.2f} %')

GMV on period: 167713650.0
New GMV on period: 193515750.0
Expected lift: 15.38 %
