# AB Test Analysis

In [1]:
import pandas as pd
from scipy import stats
# Load the experiment data. Later cells read these columns from it:
# variant, s_duration, days_elapsed, iap_net, offerwall_net, video_net.
df = pd.read_csv('data.csv')

## Frequentist Statistics

In [2]:
def zTest(X1, sd1, n1, X2, sd2, n2, mudiff, z_decimals=3, p_decimals=4):
    """Two-sided, two-sample z-test computed from summary statistics.

    Tests H0: (mean1 - mean2) == mudiff using the unpooled standard error
    sqrt(sd1**2/n1 + sd2**2/n2).

    Parameters
    ----------
    X1, X2 : float
        Sample means of group 1 and group 2.
    sd1, sd2 : float
        Sample standard deviations of group 1 and group 2.
    n1, n2 : int
        Sample sizes of group 1 and group 2.
    mudiff : float
        Hypothesised difference between the two means under H0.
    z_decimals, p_decimals : int or None, optional
        Number of decimals the z statistic / p-value are rounded to.
        Pass None to get the unrounded value (useful for very small
        p-values, which otherwise display as 0.0).

    Returns
    -------
    tuple
        (z, pval): the z statistic and the two-sided p-value.
    """
    # Local imports keep the function self-contained; importing the modules
    # (rather than `abs`/`round` by name) avoids shadowing the builtins.
    import numpy as np
    from scipy.stats import norm

    std_err = np.sqrt(sd1**2 / n1 + sd2**2 / n2)
    z = ((X1 - X2) - mudiff) / std_err
    # norm.sf is the upper-tail probability; unlike 1 - cdf it does not
    # cancel to exactly 0.0 when |z| is large.
    pval = 2 * norm.sf(np.abs(z))

    if z_decimals is not None:
        z = np.round(z, z_decimals)
    if p_decimals is not None:
        pval = np.round(pval, p_decimals)
    return z, pval

In [3]:
# Total revenue per row: the sum of the iap_net, offerwall_net and video_net columns.
df['revenue'] = df['iap_net'] + df['offerwall_net'] + df['video_net']
# Number of rows per variant, indexed by variant label
# (value_counts orders by frequency, not alphabetically).
n_var = df['variant'].value_counts()

For each A/B comparison, the following hypotheses are tested:
Null Hypothesis (H0): The two variant means are equal; any observed difference is not statistically significant.
Alternative Hypothesis (H1): The two variant means are different, and the observed difference is statistically significant.

In [4]:
# Retention metric: s_duration per variant, scaled by 1/60
# (presumably seconds -> minutes; confirm the column's units).
duration_by_variant = df.groupby('variant')['s_duration']
d1 = duration_by_variant.mean() / 60  # per-variant mean
s1 = duration_by_variant.std() / 60   # per-variant standard deviation
# Two-sample z statistic used below: (mean1 - mean2) / sqrt(s1^2/n1 + s2^2/n2)
print(d1)

variant
1D_Cooldown    9.523712
3D_Cooldown    9.405551
Name: s_duration, dtype: float64


### Z-test for Retention

In [5]:
# Compare 1D_Cooldown (group 1) with 3D_Cooldown (group 2).
# Every statistic is selected by variant label: n_var comes from value_counts(),
# which is ordered by frequency, so positional indices need not line up with the
# alphabetically ordered groupby results (and positional `[]` on a label-indexed
# Series is deprecated in recent pandas).
zTest(
    d1['1D_Cooldown'], s1['1D_Cooldown'], n_var['1D_Cooldown'],
    d1['3D_Cooldown'], s1['3D_Cooldown'], n_var['3D_Cooldown'],
    0,
)

(1.2829999999999999, 0.19950000000000001)

As shown above, the p-value (0.1995) is greater than 0.05, so at the 5% significance level (95% confidence) we cannot reject the null hypothesis that the two means are equal.
Hence, although the sample means differ slightly, the difference is not statistically significant and we cannot infer that either variant is better than the other for player retention.

### t-test for Retention 

In [6]:
# t-test from summary statistics: 1D_Cooldown (group 1) vs 3D_Cooldown (group 2).
# Statistics are selected by variant label so that each group's mean, std and
# sample size are guaranteed to belong to the same variant (n_var is ordered by
# frequency, the groupby results alphabetically).
stats.ttest_ind_from_stats(
    d1['1D_Cooldown'], s1['1D_Cooldown'], n_var['1D_Cooldown'],
    d1['3D_Cooldown'], s1['3D_Cooldown'], n_var['3D_Cooldown'],
)

Ttest_indResult(statistic=1.2834951290347938, pvalue=0.19932107834723095)

In [7]:
# Monetization metric: revenue per variant.
revenue_by_variant = df.groupby('variant')['revenue']
d2 = revenue_by_variant.mean()  # per-variant mean
s2 = revenue_by_variant.std()   # per-variant standard deviation
print(d2)

variant
1D_Cooldown    2.414446
3D_Cooldown    2.537845
Name: revenue, dtype: float64


### Z-test for Monetization

In [8]:
# Compare 1D_Cooldown (group 1) with 3D_Cooldown (group 2).
# Every statistic is selected by variant label: n_var comes from value_counts(),
# which is ordered by frequency, so positional indices need not line up with the
# alphabetically ordered groupby results (and positional `[]` on a label-indexed
# Series is deprecated in recent pandas).
zTest(
    d2['1D_Cooldown'], s2['1D_Cooldown'], n_var['1D_Cooldown'],
    d2['3D_Cooldown'], s2['3D_Cooldown'], n_var['3D_Cooldown'],
    0,
)

(-6.0339999999999998, 0.0)

As shown above, the p-value (≈ 0.0) is less than 0.05, so at the 5% significance level (95% confidence) we reject the null hypothesis and accept the alternative hypothesis that the two means are significantly different from each other. Since 3D_Cooldown has the higher mean revenue, we can infer that 3D_Cooldown performs better than 1D_Cooldown both mathematically and statistically.

### t-test for Monetization

In [9]:
# t-test from summary statistics: 1D_Cooldown (group 1) vs 3D_Cooldown (group 2).
# Statistics are selected by variant label so that each group's mean, std and
# sample size are guaranteed to belong to the same variant (n_var is ordered by
# frequency, the groupby results alphabetically).
stats.ttest_ind_from_stats(
    d2['1D_Cooldown'], s2['1D_Cooldown'], n_var['1D_Cooldown'],
    d2['3D_Cooldown'], s2['3D_Cooldown'], n_var['3D_Cooldown'],
)

Ttest_indResult(statistic=-6.0301204613729089, pvalue=1.6430142394875688e-09)

As shown above, the p-value (≈ 1.6e-09) is less than 0.05, so at the 5% significance level (95% confidence) we reject the null hypothesis that the two monetization means are equal; this agrees with the z-test result.
Hence 3D_Cooldown performs better than 1D_Cooldown in terms of monetization both mathematically (higher mean revenue) and statistically (the difference is significant).

In [10]:
# Engagement metric: days_elapsed per variant.
days_by_variant = df.groupby('variant')['days_elapsed']
d3 = days_by_variant.mean()  # per-variant mean
s3 = days_by_variant.std()   # per-variant standard deviation
print(d3)

variant
1D_Cooldown    1.189791
3D_Cooldown    1.201319
Name: days_elapsed, dtype: float64


### Z-test for Engagement

In [11]:
# Compare 1D_Cooldown (group 1) with 3D_Cooldown (group 2).
# Every statistic is selected by variant label: n_var comes from value_counts(),
# which is ordered by frequency, so positional indices need not line up with the
# alphabetically ordered groupby results (and positional `[]` on a label-indexed
# Series is deprecated in recent pandas).
zTest(
    d3['1D_Cooldown'], s3['1D_Cooldown'], n_var['1D_Cooldown'],
    d3['3D_Cooldown'], s3['3D_Cooldown'], n_var['3D_Cooldown'],
    0,
)

(-1.3460000000000001, 0.1782)

As shown above, the p-value (0.1782) is greater than 0.05, so at the 5% significance level (95% confidence) we cannot reject the null hypothesis that the two means are equal. Hence, although the sample means differ slightly, the difference is not statistically significant and we cannot infer that either variant is better than the other for player engagement.

### t-test for Engagement

In [12]:
# t-test from summary statistics: 1D_Cooldown (group 1) vs 3D_Cooldown (group 2).
# Statistics are selected by variant label so that each group's mean, std and
# sample size are guaranteed to belong to the same variant (n_var is ordered by
# frequency, the groupby results alphabetically).
stats.ttest_ind_from_stats(
    d3['1D_Cooldown'], s3['1D_Cooldown'], n_var['1D_Cooldown'],
    d3['3D_Cooldown'], s3['3D_Cooldown'], n_var['3D_Cooldown'],
)

Ttest_indResult(statistic=-1.3464581785603205, pvalue=0.17815727547503668)

### Final Conclusion-

Comparing the three results above: 3D_Cooldown had the higher mean on two of the three metrics (monetization and engagement), but only the monetization difference was statistically significant. On that basis, it can be inferred that 3D_Cooldown works better than 1D_Cooldown.

## Bayesian Statistics

In [None]:
import pymc3 as pm
import seaborn as sb

### Bayes test for Retention

In [None]:
# Per-variant total of s_duration, scaled by 1/60
# (presumably seconds -> minutes; confirm the column's units).
sample = df.groupby('variant')['s_duration'].sum()/60
# Combined total across both variants; passed as the Binomial `n` in the model cell.
n = sample.sum()
# Select by variant label: plain `[0]` / `[1]` on a label-indexed Series relies on
# positional fallback, which is deprecated in recent pandas.
obs_r1 = sample['1D_Cooldown']
obs_r2 = sample['3D_Cooldown']

In [None]:
# Retention model: one Beta-distributed parameter per variant, each tied to that
# variant's observed total through a Binomial likelihood.
# NOTE(review): `n`, `obs_r1` and `obs_r2` are summed s_duration values divided by 60,
# so they are probably not integer counts; confirm a Binomial likelihood is appropriate.
with pm.Model() as model: # variables created inside this block belong to `model`
    # Priors: Beta(alpha=2, beta=2) for each variant's parameter.
    prior_r1 = pm.Beta('prior_r1', alpha=2, beta=2)
    prior_r2 = pm.Beta('prior_r2', alpha=2, beta=2)
    
    # Likelihoods: both variants use the same `n` (the combined total of the two variants).
    like_r1 = pm.Binomial('like_r1', n=n, p=prior_r1, observed=obs_r1)
    like_r2 = pm.Binomial('like_r2', n=n, p=prior_r2, observed=obs_r2)
    
    # Inference: 50000 draws with a Metropolis step, started from the find_MAP() estimate.
    # NOTE(review): no random seed is passed, so results will differ between runs.
    trace = pm.sample(draws=50000, step=pm.Metropolis(), start=pm.find_MAP(), progressbar=False)

In [None]:
# Plot the trace with the first 1000 draws sliced off (presumably as burn-in).
# Assigning to `_` keeps the return value from being echoed as cell output.
_ = pm.traceplot(trace[1000:], grid=True)

### Bayes test for Monetization

In [None]:
# Per-variant total of revenue, scaled by 1/60.
# NOTE(review): the /60 divisor looks carried over from the s_duration cell;
# confirm it is intended for revenue.
sample = df.groupby('variant')['revenue'].sum()/60
# Combined total across both variants; passed as the Binomial `n` in the model cell.
n = sample.sum()
# Select by variant label: plain `[0]` / `[1]` on a label-indexed Series relies on
# positional fallback, which is deprecated in recent pandas.
obs_m1 = sample['1D_Cooldown']
obs_m2 = sample['3D_Cooldown']

In [None]:
# Monetization model: one Beta-distributed parameter per variant, each tied to that
# variant's observed total through a Binomial likelihood.
# NOTE(review): `n`, `obs_m1` and `obs_m2` are summed revenue values divided by 60,
# so they are probably not integer counts; confirm a Binomial likelihood is appropriate.
# NOTE(review): the pymc3 variable names use a `_v1`/`_v2` suffix while the Python
# names use `_m1`/`_m2`; the trace will be keyed by the `_v` names.
with pm.Model() as model: # variables created inside this block belong to `model`
    # Priors: Beta(alpha=2, beta=2) for each variant's parameter.
    prior_m1 = pm.Beta('prior_v1', alpha=2, beta=2)
    prior_m2 = pm.Beta('prior_v2', alpha=2, beta=2)
    
    # Likelihoods: both variants use the same `n` (the combined total of the two variants).
    like_m1 = pm.Binomial('like_v1', n=n, p=prior_m1, observed=obs_m1)
    like_m2 = pm.Binomial('like_v2', n=n, p=prior_m2, observed=obs_m2)
    
    # Inference: 50000 draws with a Metropolis step, started from the find_MAP() estimate.
    # NOTE(review): no random seed is passed, so results will differ between runs.
    trace = pm.sample(draws=50000, step=pm.Metropolis(), start=pm.find_MAP(), progressbar=False)

In [None]:
# Plot the trace with the first 1000 draws sliced off (presumably as burn-in).
# Assigning to `_` keeps the return value from being echoed as cell output.
_ = pm.traceplot(trace[1000:], grid=True)

### Bayes test for Engagement

In [None]:
# Per-variant total of days_elapsed, scaled by 1/60.
# NOTE(review): the /60 divisor looks carried over from the s_duration cell;
# confirm it is intended for days_elapsed.
sample = df.groupby('variant')['days_elapsed'].sum()/60
# Combined total across both variants; passed as the Binomial `n` in the model cell.
n = sample.sum()
# Select by variant label: plain `[0]` / `[1]` on a label-indexed Series relies on
# positional fallback, which is deprecated in recent pandas.
obs_e1 = sample['1D_Cooldown']
obs_e2 = sample['3D_Cooldown']

In [None]:
# Engagement model: one Beta-distributed parameter per variant, each tied to that
# variant's observed total through a Binomial likelihood.
# NOTE(review): `n`, `obs_e1` and `obs_e2` are summed days_elapsed values divided by 60,
# so they are probably not integer counts; confirm a Binomial likelihood is appropriate.
with pm.Model() as model: # variables created inside this block belong to `model`
    # Priors: Beta(alpha=2, beta=2) for each variant's parameter.
    prior_e1 = pm.Beta('prior_e1', alpha=2, beta=2)
    prior_e2 = pm.Beta('prior_e2', alpha=2, beta=2)
    
    # Likelihoods: both variants use the same `n` (the combined total of the two variants).
    like_e1 = pm.Binomial('like_e1', n=n, p=prior_e1, observed=obs_e1)
    like_e2 = pm.Binomial('like_e2', n=n, p=prior_e2, observed=obs_e2)
    
    # Inference: 50000 draws with a Metropolis step, started from the find_MAP() estimate.
    # NOTE(review): no random seed is passed, so results will differ between runs.
    trace = pm.sample(draws=50000, step=pm.Metropolis(), start=pm.find_MAP(), progressbar=False)

In [None]:
# Plot the trace with the first 1000 draws sliced off (presumably as burn-in).
# Assigning to `_` keeps the return value from being echoed as cell output.
_ = pm.traceplot(trace[1000:], grid=True)