In [None]:
import numpy as np
import pandas as pd
import math
from scipy.stats import norm
from statsmodels.stats.power import TTestIndPower
from scipy.stats import ttest_ind, shapiro
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scikit_posthocs import posthoc_tukey
from statannotations.Annotator import Annotator
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [15]:
df_main = pd.read_csv("Server/static/pages/pagelite_main.csv", index_col=0)
df_login = pd.read_csv("Server/static/pages/pagelite_login.csv", index_col=0)
df_pricing = pd.read_csv("Server/static/pages/pagelite_pricing.csv", index_col=0)
df_main

Unnamed: 0,id,group,clicks,lifetime,country,browser,date
1,1,3,8,42.0,China,Chrome,2019-08-25 15:01:46
2,2,6,6,24.0,China,Opera,2019-07-06 00:42:09
3,3,2,9,36.0,India,Chrome,2019-02-21 18:22:14
4,4,2,6,33.0,India,Opera,2019-09-01 23:18:51
5,5,9,7,27.0,India,Chrome,2019-12-14 13:12:22
...,...,...,...,...,...,...,...
1217,1217,7,7,36.0,Spain,Chrome,2019-01-17 21:17:34
1218,1218,5,12,27.0,Germany,Chrome,2019-03-07 20:08:57
1219,1219,4,10,30.0,Germany,Mozilla,2019-03-22 15:49:56
1220,1220,7,11,21.0,United Kingdom,Chrome,2019-04-02 10:40:43


In [14]:
def Data_Prep(dataframe, group, target):
    
    dataframe[group] = dataframe[group].apply(str)
    
    def is_outlier(s):
        Q1 = s.quantile(0.25)
        Q3 = s.quantile(0.75)
        IQR = Q3 - Q1
        lower_limit = Q1 - 1.5 * IQR
        upper_limit = Q3 + 1.5 * IQR
        
        return ~s.between(lower_limit, upper_limit)
    
    # Remove outliers
    dataframe = dataframe[~dataframe.groupby(group)[target].apply(is_outlier)]
    dataframe.reset_index(drop=True, inplace=True)
    
    return dataframe

In [16]:
def Multivar_AB_Testing(dataframe, page_name, group, target):
    
    dataframe = Data_Prep(dataframe, group, target)
    
    # extract unique groups
    groups = []
    for g in dataframe[group].unique():
        df = dataframe[dataframe[group]==g]
        groups.append(df[target].values)
    
    # perform one-way ANOVA
    anova = stats.f_oneway(*groups)
    
    if anova[1] < 0.05: # if p<0.05, then at least one of the mean values is significantly different
        
        # perform Tukey's test
        tukey = pairwise_tukeyhsd(endog=dataframe[target],
                                  groups=dataframe[group],
                                  alpha=0.05)
        df_tukey = pd.DataFrame(data=tukey._results_table.data[1:], 
                                columns=tukey._results_table.data[0])
        
        # fetch the info about the groups and merge
        df_info = pd.read_csv("Server/static/pages_info_legacy.csv", usecols=["page.version", "group.codes"])
        df_info = df_info[df_info['page.version'].str.contains(page_name.lower())]
        df_info['page.version.num'] = df_info['page.version'].str.split('.').str[1]
        df = df_tukey.merge(df_info, left_on="group1", right_on="page.version.num").merge(df_info, left_on="group2", right_on="page.version.num")
        df = df[["group1", "group.codes_x", "group2", "group.codes_y", "meandiff", "lower", "upper", "p-adj", "reject"]]
        df = df.rename({'group.codes_x': 'group1.codes', 'group.codes_y': 'group2.codes'}, axis=1)
        
        df1 = dataframe.groupby('group', as_index=False)[target].mean()
        df2 = dataframe.groupby('group', as_index=False).size()
        df3 = dataframe.groupby('group', as_index=False)[target].std()
        df['size.group1'] = df['size.group1'].astype(float)
        df['size.group2'] = df['size.group2'].astype(float)
        
        df = (df.merge(df1, left_on="group1", right_on="group")
               .merge(df1, left_on="group2", right_on="group")
               .merge(df2, left_on='group1', right_on="group")
               .merge(df2, left_on='group2', right_on="group")
               .merge(df3, left_on='group1', right_on="group")
               .merge(df3, left_on='group2', right_on="group"))
        
        df = df.drop(['group_x', 'group_y', 'group_x', 'group_y', 'group_x', 'group_y'], axis=1)
        df.columns = ['group1', 'group1.codes', 'group2', 'group2.codes', 'meandiff', 'lower', 'upper',
               'p-adj', 'reject', 'mean.group1', 'mean.group2', 'size.group1', 'size.group2',
               'std.group1', 'std.group2']
        sd_pooled = math.sqrt((df['mean.group1'].std()**2 + df['mean.group2'].std()**2)/2)
        
        df['effect_size'] = df.meandiff / sd_pooled
        df['power_size'] = np.round(norm.cdf(-1.96 + df['effect_size'] / 
                             np.sqrt(df['std.group1']**2 / df['size.group1'] + 
                                       df['std.group2']**2 / df['size.group2'])), 5)
        df = df[["group1", "group1.codes", "group2", "group2.codes",
                 "meandiff", "lower", "upper", "p-adj", "reject", "effect_size", "power_size"]]
        
    else:  # if p>0.05
        return "There is no significant difference across the means"

    return df

In [17]:
Multivar_AB_Testing(df_main, "main", "group", "clicks").head()

TypeError: loop of ufunc does not support argument 0 of type str which has no callable sqrt method