In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
from scipy.stats import lognorm, kstest
from pathlib import Path
from ab_testing.constants import DISTRIBUTIONS, client, target_col

from fitter import Fitter, get_distributions, get_common_distributions

# Load the raw export for the configured client.
raw_path = Path("raw_data", f"{client}_data.p")
df = pd.read_parquet(raw_path)

# Keep only rows whose target value is strictly positive; the later cells
# read this filtered frame as `data`.
positive_mask = df[target_col].gt(0)
data = df.loc[positive_mask]

# Plain white background with enlarged "paper" fonts, then a 100-bin
# histogram of the positive target values.
sns.set_style('white')
sns.set_context("paper", font_scale=2)
sns.displot(data=data, x=target_col, kind="hist", bins=100, aspect=1.5)

In [None]:
# Rank the common candidate distributions by information criteria.
# Each candidate is fitted with scipy's `fit` and scored with
#   AIC = 2k - 2 ln(L)      BIC = k ln(n) - 2 ln(L)
# where k is the number of fitted parameters, n the sample size and ln(L)
# the log-likelihood of the sample under the fitted parameters.

# Use the configured target column, like every other cell in this notebook,
# instead of a hard-coded column name.
# NOTE(review): assumes target_col is the column previously spelled
# "total_wins_spend" here -- confirm against ab_testing.constants.
values = data[target_col].values  # loop-invariant, extract once
n = len(values)

records = []
for com_dist in get_common_distributions():
    # Attribute lookup instead of eval(): same object, no string execution.
    dist = getattr(scipy.stats, com_dist)
    params = dist.fit(values)

    logLik = np.sum(dist.logpdf(values, *params))
    k = len(params)
    records.append(
        {
            "distribution": com_dist,
            "AIC": 2 * k - 2 * logLik,
            "BIC": k * np.log(n) - 2 * logLik,
        }
    )

# Build the table once from the collected records, best (lowest) AIC first.
# NOTE: this rebinds `df`, which held the raw frame loaded earlier; the name
# is kept so anything downstream that reads this table keeps working.
df = (
    pd.DataFrame(records, columns=["distribution", "AIC", "BIC"])
    .sort_values(by="AIC")
    .reset_index(drop=True)
)
df

In [None]:
# Overlay the fitted PDF of every candidate in DISTRIBUTIONS on a histogram
# of the positive target values.
y = data[target_col].values
n_obs = len(y)

# The view is clipped at the 99th percentile of the data.
x_max = np.quantile(y, 0.99)

# Unit-width bins: with a bin width of 1, `pdf * n_obs` is directly
# comparable to the bin counts.
# NOTE(review): the histogram stops at 47 while the x-axis runs to the 99th
# percentile -- confirm 48 is the intended cut-off for this data.
plt.hist(y, bins=range(48))

# Evaluate the PDFs on an integer grid spanning the plotted range. The grid
# was previously sized by the number of rows, which is unrelated to the value
# range: wasteful for large samples, and it truncated the curves whenever the
# sample had fewer rows than the 99th-percentile value.
x = np.arange(int(np.ceil(x_max)) + 1)

for dist_name in DISTRIBUTIONS:
    dist = getattr(scipy.stats, dist_name)
    params = dist.fit(y)
    # scipy returns (*shape_args, loc, scale); shape_args is empty for
    # distributions without shape parameters, so no special case is needed.
    *arg, loc, scale = params
    pdf_fitted = dist.pdf(x, *arg, loc=loc, scale=scale) * n_obs
    plt.plot(x, pdf_fitted, label=dist_name)

# Axis limits do not depend on the loop, so set them once.
plt.xlim(0, x_max)
plt.legend(loc='upper right')
plt.show()

In [None]:
# Kolmogorov-Smirnov goodness-of-fit checks for the exponential and
# log-normal candidates, each evaluated at its own fitted parameters.
# NOTE: the parameters are estimated from the same sample that is tested, so
# the reported p-values are not exact (they tend to be optimistic); use them
# to compare candidates rather than as strict significance levels.
observed = data[target_col].values

# Fit the exponential scale (location pinned at 0) before testing. Passing
# only the name 'expon' would test against the *standard* exponential
# (loc=0, scale=1), which says nothing about how well an exponential shape
# describes this data.
expon_loc, expon_scale = scipy.stats.expon.fit(observed, floc=0)

print("Exponential distribution:")
print(scipy.stats.kstest(observed, 'expon', args=(expon_loc, expon_scale)))

# Log-normal with location pinned at 0: `sigma` is the shape parameter and
# `scale` equals exp(mu) of the underlying normal distribution.
sigma, loc, scale = lognorm.fit(observed, floc=0)

mu = np.log(scale)

print("Log normal distribution:")
print(f"mu    = {mu:9.5f}")
print(f"sigma = {sigma:9.5f}")

stat, p = kstest(observed, 'lognorm', args=(sigma, 0, scale), alternative='two-sided')
print("KS Test:")
print(f"stat    = {stat:9.5f}")
print(f"p-value = {p:9.5f}")