In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from fitter import Fitter, get_distributions, get_common_distributions
from pathlib import Path

# Load the raw export and keep only rows whose "total_wins_spend" is strictly
# positive; the cells below work on this filtered frame (`data`).
# NOTE(review): a ".p" suffix is unusual for a parquet file — confirm the format.
raw_path = Path("raw_data") / "bingo_aloha_data.p"
df = pd.read_parquet(raw_path)
has_positive_spend = df["total_wins_spend"] > 0
data = df[has_positive_spend]

# Seaborn look applied to the figures drawn after this point.
sns.set_style('white')
sns.set_context("paper", font_scale=2)

# Histogram of the positive values using 100 bins.
sns.displot(data=data, x="total_wins_spend", kind="hist", bins=100, aspect=1.5)

In [None]:
# Fit every distribution returned by get_common_distributions() to the positive
# spend values and tabulate the results, using `method` as the criterion.
method = "aic"
distributions = get_common_distributions()

f = Fitter(
    data["total_wins_spend"].values,
    distributions=distributions,
    timeout=300,  # presumably seconds allowed per distribution — see fitter docs
)
f.fit()

# Ask for as many rows as there are candidates, and skip the plot.
summary_df = f.summary(Nbest=len(distributions), plot=False, method=method)
summary_df

In [None]:
# Show the best distribution under the chosen criterion, then the parameters
# fitted for the exponential distribution.
best_fit = f.get_best(method=method)
print(best_fit)
expon_params = f.fitted_param["expon"]
print(expon_params)

In [None]:
# Number the summary rows 1..N in their current order and display the number
# assigned to the exponential distribution.
# NOTE(review): assumes summary_df is already ordered best-first — confirm.
n_rows = len(summary_df)
summary_df["rank"] = range(1, n_rows + 1)
is_expon = summary_df.index == "expon"
summary_df.loc[is_expon, "rank"].values[0]


In [None]:
import scipy.stats as ss
import matplotlib.pyplot as plt

def plot_exponential(x, mu=0, sigma=1, cdf=False):
    """Draw an exponential distribution curve with ``plt.plot``.

    Parameters
    ----------
    x : array-like
        Points at which the curve is evaluated.
    mu : float, default 0
        Passed as the second positional argument of ``ss.expon.pdf`` /
        ``ss.expon.cdf``, i.e. scipy's ``loc``.
    sigma : float, default 1
        Passed as the third positional argument, i.e. scipy's ``scale``.
    cdf : bool, default False
        Plot the cumulative distribution when truthy, the density otherwise.
    """
    curve = ss.expon.cdf if cdf else ss.expon.pdf
    plt.plot(x, curve(x, mu, sigma))

# Evaluate the fitted exponential on 1000 evenly spaced points between 0 and
# the largest observed value, then draw its density.
upper = data["total_wins_spend"].values.max()
x = np.linspace(0, upper, 1000)

expon_fit = f.fitted_param["expon"]
plot_exponential(x, expon_fit[0], expon_fit[1])


In [None]:
from scipy.stats import kstest

# Test the data against the *fitted* exponential. Without ``args`` kstest uses
# the standard exponential (loc=0, scale=1), which says nothing about the fit.
# NOTE: the parameters were estimated from this same sample, so the p-value is
# optimistic; read the statistic as a descriptive goodness-of-fit measure.
print(kstest(data["total_wins_spend"].values, 'expon', args=f.fitted_param["expon"]))

In [None]:
import numpy as np
from scipy.stats import lognorm, kstest


# Fit a log-normal with the location pinned at 0, report the parameters of the
# underlying normal, and run a two-sided KS test against the fitted law.
x = data["total_wins_spend"].values

# floc=0 fixes the location, so only the shape and the scale are estimated.
sigma, loc, scale = lognorm.fit(x, floc=0)

# In scipy's parameterisation scale = exp(mu), hence mu = log(scale).
mu = np.log(scale)

print("mu    = %9.5f" % mu)
print("sigma = %9.5f" % sigma)

fitted_args = (sigma, 0, scale)
ks_outcome = kstest(x, 'lognorm', args=fitted_args, alternative='two-sided')
stat, p = ks_outcome

print("KS Test:")
print("stat    = %9.5f" % stat)
print("p-value = %9.5f" % p)

In [None]:
import scipy.stats

# Fit each common distribution directly with scipy and compute AIC and BIC
# from the maximised log-likelihood:
#   AIC = 2k - 2 logL,   BIC = k ln(n) - 2 logL
# where k is the number of fitted parameters (shape(s), loc and scale) and n
# is the sample size.
values = data["total_wins_spend"].values  # loop-invariant, extracted once
n = len(values)

records = []
for com_dist in get_common_distributions():
    # Attribute lookup instead of eval(): same object, no string execution.
    dist = getattr(scipy.stats, com_dist)
    params = dist.fit(values)

    logLik = np.sum(dist.logpdf(values, *params))
    k = len(params)
    records.append(
        {
            "distribution": com_dist,
            "AIC": 2 * k - 2 * logLik,
            "BIC": k * np.log(n) - 2 * logLik,
        }
    )

# NOTE(review): this rebinds `df`, shadowing the raw frame loaded in the first
# cell; the name is kept so existing references keep working.
df = pd.DataFrame(records, columns=["distribution", "AIC", "BIC"]).sort_values(by="AIC")
df

In [None]:
# KS test of the data against every common distribution, each evaluated at its
# own fitted parameters. Calling kstest with only the distribution name uses
# loc=0/scale=1 and fails outright for distributions that require shape
# parameters, so the parameters are fitted first and passed through ``args``.
# NOTE: parameters are estimated from the same sample, so p-values are optimistic.
values = data["total_wins_spend"].values
for com_dist in get_common_distributions():
    try:
        params = getattr(scipy.stats, com_dist).fit(values)
        ks_result = kstest(values, com_dist, args=params)
        print(f"For {com_dist} it is {ks_result}")
    except Exception as exc:
        # Report why the test failed instead of hiding it; a bare ``except``
        # would also swallow KeyboardInterrupt.
        print(f"Not able to do KS test for {com_dist}: {exc!r}")