## Evaluating distribution fit for 30 days of personalised spend data for each client:

#### Set Up:

In [1]:
from typing import List, Dict, Union, Literal
from pathlib import Path

import pandas as pd
import numpy as np
from scipy import stats

from ab_testing.data_acquisition.acquire_data import queries_dict

from ml_lib.feature_store import configure_offline_feature_store
from ml_lib.feature_store.offline.client import FeatureStoreOfflineClient

# Configure the offline feature store once, before any query is issued.
# NOTE(review): "primary" is presumably the Athena workgroup the queries in this
# notebook run under — TODO confirm against ml_lib.feature_store.
configure_offline_feature_store(workgroup="primary")



In [2]:
# Type of the scipy distribution objects stored in DISTRIBUTIONS.
# Every entry is an instance of a subclass of the public base class
# ``stats.rv_continuous``; using it avoids depending on the per-distribution
# ``*_gen`` classes, which live in the private ``stats._continuous_distns``
# module and may move between scipy releases.
ScipyDists = stats.rv_continuous

# Candidate distributions to fit, keyed by their scipy name.
DISTRIBUTIONS: Dict[str, ScipyDists] = {
    "expon": stats.expon,
    "lognorm": stats.lognorm,
    "norm": stats.norm,
    "gamma": stats.gamma,
    "dweibull": stats.dweibull,
}


class FitDistribution:
    """Fit every candidate in ``DISTRIBUTIONS`` to one column and rank them by AIC.

    The resulting ranking table is returned and also written as a parquet file
    named ``fname`` inside the ``data_dir_str`` directory.
    """

    def __init__(self, fname: str, data_dir_str: str = "processed_data"):
        """Remember the output location and make sure the directory exists.

        Args:
            fname: File name (inside ``data_dir_str``) the ranking is written to.
            data_dir_str: Directory for the output file; created, including
                parents, if it does not exist yet.
        """
        self.data_dir_path = Path(data_dir_str)
        self.fname = fname

        # exist_ok=True makes this a no-op when the directory is already there,
        # so no separate exists() check is needed.
        self.data_dir_path.mkdir(parents=True, exist_ok=True)

    def fit(self, data: pd.DataFrame, target: str) -> pd.DataFrame:
        """Fit each candidate distribution to ``data[target]`` and rank by AIC.

        For every distribution the parameters are estimated with ``dist.fit``,
        the log-likelihood is the sum of ``dist.logpdf`` over the observations,
        and the information criteria are computed as
        ``AIC = 2k - 2 ln L`` and ``BIC = k ln(n) - 2 ln L``, where ``k`` is the
        number of parameters returned by ``dist.fit`` and ``n`` the number of rows.

        Args:
            data: Frame containing the observations.
            target: Name of the column in ``data`` to fit.

        Returns:
            DataFrame with columns ``distribution``, ``AIC``, ``BIC``, ``n`` and
            ``k``, sorted by ascending AIC (best fit first) with a fresh index.
            The same frame is written to ``data_dir_path / fname`` as parquet.

        Raises:
            ValueError: If ``data`` has no rows, since nothing can be fitted.
        """
        # Extract the observations once instead of on every loop iteration.
        values = data[target].values
        n = len(values)
        if n == 0:
            raise ValueError(f"Cannot fit distributions: column '{target}' is empty.")

        records = []
        for dist_name, dist in DISTRIBUTIONS.items():
            params = dist.fit(values)
            log_lik = np.sum(dist.logpdf(values, *params))
            k = len(params)
            records.append(
                {
                    "distribution": dist_name,
                    "AIC": 2 * k - 2 * log_lik,
                    "BIC": k * np.log(n) - 2 * log_lik,
                    "n": n,
                    "k": k,
                }
            )

        # Build the table in one go and rank without in-place mutation.
        ranking = (
            pd.DataFrame(records, columns=["distribution", "AIC", "BIC", "n", "k"])
            .sort_values(by="AIC")
            .reset_index(drop=True)
        )

        ranking.to_parquet(self.data_dir_path / self.fname)

        return ranking

#### Distribution Fit:

In [4]:
# Clients to evaluate; each needs a "<client>_small" entry in queries_dict.
client_map = [
    "bingo_aloha",
    "homw",
    "idle_mafia",
    "spongebob",
    "terra_genesis",
    "ultimex",
]
# Column whose strictly positive values the distributions are fitted to.
result = "total_wins_spend"
# Passed through to the query as its "spend_type" parameter; same for every client.
spend_type = 0

# NOTE(review): in the recorded output of this cell, "ultimex" reports exactly the
# same scanned bytes, n, AIC and BIC as "homw" — verify that
# queries_dict["ultimex_small"] does not resolve to the homw query.
# Iterate over the list itself so adding/removing a client cannot go out of sync
# with a hard-coded range.
for client_name in client_map:
    client_name_small = client_name + "_small"
    initial_data2 = FeatureStoreOfflineClient.run_athena_query_pandas(
        queries_dict[client_name_small],
        {
            "strt_date": "2022-11-05",
            "end_date": "2022-12-05",
            "strt_fl": "2020-01-01",
            "end_fl": "2022-12-04",
            "spend_type": spend_type,
        },
    )
    # NOTE(review): the file is written with DataFrame.to_parquet despite the
    # ".p" suffix; the name is left unchanged in case downstream readers rely on it.
    fit_dist = FitDistribution(fname=f"{client_name}_distribution_fit.p")
    # Despite its name this holds the full ranking table (best fit in row 0).
    best_distribution = fit_dist.fit(
        initial_data2.loc[initial_data2[result] > 0], result
    )

    print("Distribution fit for:", client_name)
    print(best_distribution)

DummyStatsClient._send unknown.athena_query_runs_total:1|c
DummyStatsClient._send unknown.athena_query_queue_ms:133.000000|ms
DummyStatsClient._send unknown.athena_query_execution_ms:3032.000000|ms
DummyStatsClient._send unknown.athena_query_scanned_bytes:43799433|c


Distribution fit for: bingo_aloha
  distribution            AIC            BIC      n  k
0      lognorm  420578.340922  420605.395481  60973  3
1        gamma  425984.766608  426011.821167  60973  3
2     dweibull  434836.531039  434863.585598  60973  3
3        expon  435776.016888  435794.053260  60973  2
4         norm  544168.663806  544186.700179  60973  2


DummyStatsClient._send unknown.athena_query_runs_total:1|c
DummyStatsClient._send unknown.athena_query_queue_ms:117.000000|ms
DummyStatsClient._send unknown.athena_query_execution_ms:2551.000000|ms
DummyStatsClient._send unknown.athena_query_scanned_bytes:12557744|c


Distribution fit for: homw
  distribution            AIC            BIC      n  k
0     dweibull   48877.607884   48900.296786  14228  3
1      lognorm   94875.581809   94898.270710  14228  3
2        gamma   99923.174973   99945.863875  14228  3
3        expon  110234.223775  110249.349709  14228  2
4         norm  145132.993895  145148.119829  14228  2


DummyStatsClient._send unknown.athena_query_runs_total:1|c
DummyStatsClient._send unknown.athena_query_queue_ms:146.000000|ms
DummyStatsClient._send unknown.athena_query_execution_ms:3617.000000|ms
DummyStatsClient._send unknown.athena_query_scanned_bytes:38160962|c


Distribution fit for: idle_mafia
  distribution           AIC           BIC      n  k
0        gamma -27213.984213 -27192.227464  10428  3
1      lognorm   1361.427840   1383.184589  10428  3
2     dweibull  50104.430759  50126.187509  10428  3
3        expon  69564.771213  69579.275713  10428  2
4         norm  90874.638076  90889.142575  10428  2


DummyStatsClient._send unknown.athena_query_runs_total:1|c
DummyStatsClient._send unknown.athena_query_queue_ms:121.000000|ms
DummyStatsClient._send unknown.athena_query_execution_ms:2709.000000|ms
DummyStatsClient._send unknown.athena_query_scanned_bytes:42779126|c


Distribution fit for: spongebob
  distribution            AIC            BIC     n  k
0     dweibull    6150.272167    6170.186149  5642  3
1      lognorm   35256.993068   35276.907050  5642  3
2        expon   40859.137009   40872.412997  5642  2
3         norm   61482.262966   61495.538954  5642  2
4        gamma  111139.436471  111159.350453  5642  3


DummyStatsClient._send unknown.athena_query_runs_total:1|c
DummyStatsClient._send unknown.athena_query_queue_ms:114.000000|ms
DummyStatsClient._send unknown.athena_query_execution_ms:4749.000000|ms
DummyStatsClient._send unknown.athena_query_scanned_bytes:29973229|c


Distribution fit for: terra_genesis
  distribution           AIC           BIC     n  k
0      lognorm  17436.961942  17454.511083  2565  3
1        gamma  17730.151049  17747.700190  2565  3
2        expon  18008.961441  18020.660868  2565  2
3     dweibull  18362.896544  18380.445685  2565  3
4         norm  23380.120687  23391.820114  2565  2


DummyStatsClient._send unknown.athena_query_runs_total:1|c
DummyStatsClient._send unknown.athena_query_queue_ms:140.000000|ms
DummyStatsClient._send unknown.athena_query_execution_ms:2667.000000|ms
DummyStatsClient._send unknown.athena_query_scanned_bytes:12557744|c


Distribution fit for: ultimex
  distribution            AIC            BIC      n  k
0     dweibull   48877.607884   48900.296786  14228  3
1      lognorm   94875.581809   94898.270710  14228  3
2        gamma   99923.174973   99945.863875  14228  3
3        expon  110234.223775  110249.349709  14228  2
4         norm  145132.993895  145148.119829  14228  2
