# Imports

In [77]:
import json

from drift_calculator import DriftCalculator
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import warnings
from typing import Union, Optional


In [78]:
# Identifiers of the statistical drift tests passed to DriftCalculator below.
# Commented-out entries are tests that were tried and disabled (reason noted where known).
# NOTE(review): the "<= 1000" / "> 1000" notes presumably refer to a sample-size
# range for the test — confirm against the drift library's documentation.
stat_tests = [
    'ks', # Kolmogorov–Smirnov test (<= 1000)
    'wasserstein', # Wasserstein distance, normed (> 1000)
    'kl_div', # Kullback–Leibler divergence
    'psi', # Population Stability Index
    'jensenshannon',  # Jensen–Shannon distance (> 1000)
    # 'anderson', # Anderson–Darling test (disabled)
    'cramer_von_mises', # Cramér–von Mises test
    'hellinger', # Hellinger distance, normed
    # 'mannw', # Mann–Whitney U-rank test (disabled: too slow, ~23 s per iteration)
    'ed', # Energy distance
    # 'es', # Epps–Singleton test (disabled)
    't_test', # T-test
    # 'emperical_mmd', # Empirical MMD (disabled: takes too much space to compute on sampled data)
    #                    NOTE(review): check the identifier spelling before re-enabling.
]

In [79]:
# Per-cell agent reward/quality statistics; the first CSV column is used as the row index.
rewards = pd.read_csv('data/generated/drift/by_cell_agent/drift_scores_rewards_new_agent_train-test_no_sample20-23.csv', index_col=0)
# Display the table (one row per cell_id).
rewards

Unnamed: 0,cell_id,drift_score,quality_avg,quality_min,quality_max,quality_std,cum_reward_avg,cum_reward_max,cum_reward_std,mom_reward_avg,mom_reward_min,mom_reward_max,mom_reward_std
0,10932,0.875,0.991378,0.973693,0.994549,0.002269,1040.000000,2070.0,598.999165,10.000000,10.0,10.0,0.000000
1,12762,0.875,0.986723,0.790479,0.992018,0.019664,1010.000000,2040.0,598.999165,9.855072,-20.0,10.0,2.085144
2,12781,0.875,0.987593,0.949088,0.992366,0.004489,1040.000000,2070.0,598.999165,10.000000,10.0,10.0,0.000000
3,12782,0.875,0.986304,0.972973,0.992206,0.003253,1040.000000,2070.0,598.999165,10.000000,10.0,10.0,0.000000
4,12783,1.000,0.949899,0.779899,0.991859,0.049240,925.797101,1920.0,558.289010,9.275362,-20.0,10.0,4.617035
...,...,...,...,...,...,...,...,...,...,...,...,...,...
989,5683,0.875,0.992266,0.967622,0.993875,0.002159,800.000000,1590.0,460.434577,10.000000,10.0,10.0,0.000000
990,5682,0.875,0.991784,0.980978,0.994072,0.001335,800.000000,1590.0,460.434577,10.000000,10.0,10.0,0.000000
991,12771,0.875,0.990560,0.978193,0.994569,0.002849,570.000000,1130.0,327.643099,10.000000,10.0,10.0,0.000000
992,12772,1.000,0.968540,0.751103,0.986172,0.028634,550.884956,1100.0,315.678288,9.734513,-20.0,10.0,2.822163


In [80]:
# Load the train and test splits used as reference / current data in the drift experiments.
train_df = pd.read_csv('data/train_2020-2023.csv')
test_df = pd.read_csv('data/test_2020-2023.csv')

In [81]:
# Summary statistics of the numeric columns in the training split.
train_df.describe()

Unnamed: 0,Cell ID,Number of Available\nTCH,HR Usage Rate,"TCH Blocking Rate, BH","TCH Traffic (Erl), BH",Param 1,Param 2
count,838168.0,838168.0,838168.0,838168.0,838168.0,838168.0,838168.0
mean,14607.319064,15.421654,59.916053,0.219738,7.451956,33.745386,47.468981
std,12003.337626,8.834072,30.609431,2.661036,7.266931,24.073022,26.284482
min,701.0,0.0,0.0,0.0,0.01,0.0,5.0
25%,5067.0,11.0,34.76,0.0,2.58,17.0,28.0
50%,10931.0,13.0,59.0,0.0,5.25,26.0,43.0
75%,24235.0,20.0,92.0,0.0,9.79,50.0,67.0
max,42857.0,65.0,100.0,100.0,105.62,95.0,100.0


In [82]:
# Summary statistics of the numeric columns in the test split (compare with train above).
test_df.describe()

Unnamed: 0,Cell ID,Number of Available\nTCH,HR Usage Rate,"TCH Blocking Rate, BH","TCH Traffic (Erl), BH",Param 1,Param 2
count,210035.0,210035.0,210035.0,210035.0,210035.0,210035.0,210035.0
mean,14607.204242,13.710167,66.952841,0.264242,5.106496,23.560787,37.178351
std,12003.274211,6.679496,27.671954,2.511474,4.838647,20.895874,23.67077
min,701.0,0.0,0.0,0.0,0.01,0.0,5.0
25%,5067.0,12.0,45.0,0.0,1.85,9.0,19.0
50%,10931.0,13.0,68.0,0.0,3.73,17.0,29.0
75%,24235.0,19.0,96.0,0.0,6.73,33.0,55.0
max,42857.0,45.0,100.0,95.89,70.16,91.0,100.0


# experiments

## Train | No window | Not Sampled 

In [None]:
save_path = 'data/generated/drift/by_cell_agent/run_7/'
# exist_ok=False: a re-run fails with FileExistsError instead of writing into an
# existing run folder. Use a new run number before executing this cell again.
Path(save_path).mkdir(exist_ok=False, parents=True)
# params
sampled_ref: bool = False  # True -> reference is a fixed 1000-row sample of train_df
sampled_drift: Union[bool, Optional[int]] = None  # forwarded as `sample=` to the drift calculator
dist_window: bool = False  # True -> pad each cell's rows with reference rows
weighted: bool = False  # only forwarded to the non-regressive get_drift call
regressive: bool = True  # True -> get_drift_regressive, False -> get_drift

# NOTE(review): presumably (stat tests, result names), with None/'default' selecting
# the calculator's default test — confirm against DriftCalculator.
drift_calc = DriftCalculator([None] + stat_tests , ['default'] + stat_tests )

drift_scores = []
cols = ['Number of Available\nTCH', 'HR Usage Rate', 'TCH Blocking Rate, BH',
        'TCH Traffic (Erl), BH', 'Param 1', 'Param 2']
# whether to use a sampled train set as the reference
if sampled_ref:
    ref = train_df.sample(n=1000, random_state=0)[cols]
else:
    ref = train_df[cols]

# The plot-name suffix depends only on the flags above, so build it once.
plot_suffix = f"{'sample_ref' if sampled_ref else ''}-{'window-1k' if dist_window else ''}"

# value_counts() yields the cell ids ordered from most to fewest training rows.
for cell in tqdm(train_df['Cell ID'].value_counts().index):
    # Reference is the whole train set (see `ref`), current data is this cell's test rows.
    cur = test_df[test_df['Cell ID'] == cell]
    # add original distribution
    if dist_window:
        # Pad the cell's rows with reference rows up to len(ref).
        # - clamp at 0 so a cell with more rows than `ref` cannot make sample() raise;
        # - fixed seed so that repeated runs produce identical windows.
        # NOTE(review): the label says "window-1k", but the window is len(ref) rows,
        # which is 1000 only when sampled_ref is True — confirm the intended size.
        n_pad = max(len(ref) - len(cur), 0)
        cur = pd.concat([ref.sample(n=n_pad, random_state=0), cur])

    # Silence library warnings only for the duration of the drift computation.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        name = f"{save_path}/cell_{cell}_plot_{plot_suffix}"
        if regressive:
            score = drift_calc.get_drift_regressive(current_data=cur[cols], reference_data=ref[cols], sample=sampled_drift, save_plot=name, n=8)
        else:
            score = drift_calc.get_drift(current_data=cur[cols], reference_data=ref[cols], sample=sampled_drift, weighted=weighted, save_plot=name)

    score['cell_id'] = cell
    drift_scores.append(score)

# One row per cell: the per-test drift reports plus the cell id.
drift_scores_df = pd.DataFrame(drift_scores)
save_name = (f"{save_path}/by_train_{'regressive' if regressive else ''}_"
             f"{'sampled-ref' if sampled_ref else ''}_"
             f"sampled-drift-{sampled_drift}_"
             f"{'window-1k' if dist_window else 'no-window'}_"
             f"{'weighted' if weighted else ''}.csv")
drift_scores_df.to_csv(save_name)
drift_scores_df

In [72]:
# Inspect the raw results: one row per cell, one JSON report string per statistical test.
drift_scores_df

Unnamed: 0,default,ks,wasserstein,kl_div,psi,jensenshannon,cramer_von_mises,hellinger,ed,t_test,cell_id
0,"{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.05,...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.05,...",25771
1,"{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.05,...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.05,...",22944
2,"{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.05,...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.05,...",26335
3,"{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.05,...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.05,...",26332
4,"{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.05,...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.05,...",26336
...,...,...,...,...,...,...,...,...,...,...,...
989,"{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.05,...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.05,...",5683
990,"{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.05,...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.05,...",5682
991,"{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.05,...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.05,...",12772
992,"{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.05,...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.1, ...","{""HR Usage Rate"": {""stattest_threshold"": 0.05,...",12771


In [73]:
def to_score(x: str) -> float:
    """Convert one JSON drift report into the share of features flagged as drifted.

    Args:
        x: JSON text of an object that maps each feature name to a result
            object containing a ``drift_detected`` entry.

    Returns:
        Sum of the ``drift_detected`` values divided by the number of features
        (a value in [0, 1] when the flags are booleans). ``nan`` when the
        report contains no features, instead of raising ZeroDivisionError.

    Raises:
        json.JSONDecodeError: if ``x`` is not valid JSON.
        KeyError: if a feature entry has no ``drift_detected`` key.
    """
    report = json.loads(x)
    if not report:
        # No features were evaluated, so the fraction is undefined.
        return float('nan')

    flagged = sum(result['drift_detected'] for result in report.values())
    return flagged / len(report)

In [74]:
# Turn every JSON drift report into the fraction of drifted features (via to_score),
# column by column, leaving the cell identifier out of the conversion.
ds_df = (
    drift_scores_df
    .drop(columns='cell_id')
    .apply(lambda reports: [to_score(report) for report in reports])
)
# Put the cell identifier back as the first column.
ds_df.insert(0, 'cell_id', drift_scores_df.cell_id)
ds_df

Unnamed: 0,cell_id,default,ks,wasserstein,kl_div,psi,jensenshannon,cramer_von_mises,hellinger,ed,t_test
0,25771,0.833333,0.833333,0.833333,0.833333,0.833333,0.833333,1.0,0.833333,0.833333,0.833333
1,22944,0.833333,1.000000,0.833333,0.833333,0.833333,0.833333,1.0,0.833333,0.833333,0.833333
2,26335,1.000000,1.000000,1.000000,0.833333,0.833333,0.833333,1.0,0.833333,1.000000,0.666667
3,26332,1.000000,1.000000,1.000000,0.833333,0.833333,0.833333,1.0,0.833333,1.000000,0.833333
4,26336,0.833333,0.833333,0.833333,0.833333,0.833333,0.833333,1.0,0.833333,0.833333,0.833333
...,...,...,...,...,...,...,...,...,...,...,...
989,5683,0.833333,0.833333,0.833333,0.833333,0.833333,0.833333,1.0,0.833333,0.833333,0.833333
990,5682,0.833333,0.833333,0.833333,0.833333,0.833333,0.833333,1.0,0.833333,0.833333,0.666667
991,12772,1.000000,1.000000,1.000000,0.833333,0.833333,0.833333,1.0,0.833333,1.000000,0.833333
992,12771,0.833333,1.000000,0.833333,0.833333,0.833333,0.833333,1.0,0.833333,0.833333,0.833333


In [75]:
# Distribution of the drift fractions per statistical test across all cells.
ds_df.describe()

Unnamed: 0,cell_id,default,ks,wasserstein,kl_div,psi,jensenshannon,cramer_von_mises,hellinger,ed,t_test
count,994.0,994.0,994.0,994.0,994.0,994.0,994.0,994.0,994.0,994.0,994.0
mean,14601.775654,0.878773,0.926392,0.878773,0.834172,0.84222,0.842388,1.0,0.842388,0.893025,0.802314
std,11994.634963,0.074257,0.082806,0.074257,0.011797,0.037464,0.037796,0.0,0.037796,0.07995,0.107168
min,701.0,0.833333,0.833333,0.833333,0.833333,0.833333,0.833333,1.0,0.833333,0.833333,0.333333
25%,5071.25,0.833333,0.833333,0.833333,0.833333,0.833333,0.833333,1.0,0.833333,0.833333,0.833333
50%,10931.5,0.833333,1.0,0.833333,0.833333,0.833333,0.833333,1.0,0.833333,0.833333,0.833333
75%,24234.5,1.0,1.0,1.0,0.833333,0.833333,0.833333,1.0,0.833333,1.0,0.833333
max,42857.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [76]:
# Join the agent reward statistics with the drift fractions on the cell id
# (inner join: only cells present in both tables), then correlate all columns.
merged = pd.merge(rewards, ds_df, on='cell_id', how='inner')
merged.corr()

Unnamed: 0,cell_id,drift_score,quality_avg,quality_min,quality_max,quality_std,cum_reward_avg,cum_reward_max,cum_reward_std,mom_reward_avg,...,default,ks,wasserstein,kl_div,psi,jensenshannon,cramer_von_mises,hellinger,ed,t_test
cell_id,1.0,-0.042054,0.036162,0.018046,-0.001332,-0.023063,0.000967,-0.002974,-0.010015,-0.015267,...,-0.042054,-0.014083,-0.042054,-0.004797,-0.0294,-0.035427,,-0.027563,-0.032091,-0.013158
drift_score,-0.042054,1.0,-0.637122,-0.609597,-0.289934,0.642976,-0.395991,-0.362477,-0.190325,-0.412066,...,1.0,0.34888,1.0,0.116137,0.387639,0.391487,,0.391487,0.781903,0.24057
quality_avg,0.036162,-0.637122,1.0,0.689293,0.462465,-0.782348,0.615313,0.612029,0.471206,0.680864,...,-0.637122,-0.237932,-0.637122,-0.490077,-0.601974,-0.594971,,-0.568248,-0.562093,-0.305755
quality_min,0.018046,-0.609597,0.689293,1.0,0.197964,-0.955504,0.643323,0.588608,0.291974,0.660707,...,-0.609597,-0.056428,-0.609597,-0.164067,-0.547614,-0.547616,,-0.564737,-0.50225,-0.325514
quality_max,-0.001332,-0.289934,0.462465,0.197964,1.0,-0.190559,0.1189,0.117325,0.090388,0.110648,...,-0.289934,-0.168453,-0.289934,-0.067899,-0.125177,-0.119252,,-0.115056,-0.309426,-0.129502
quality_std,-0.023063,0.642976,-0.782348,-0.955504,-0.190559,1.0,-0.760596,-0.714535,-0.413002,-0.800001,...,0.642976,0.09376,0.642976,0.266944,0.676731,0.676923,,0.680283,0.531415,0.357839
cum_reward_avg,0.000967,-0.395991,0.615313,0.643323,0.1189,-0.760596,1.0,0.963393,0.626701,0.936032,...,-0.395991,-0.090844,-0.395991,-0.337963,-0.650346,-0.641243,,-0.646223,-0.318352,-0.293259
cum_reward_max,-0.002974,-0.362477,0.612029,0.588608,0.117325,-0.714535,0.963393,1.0,0.804616,0.884176,...,-0.362477,-0.079346,-0.362477,-0.35704,-0.622673,-0.614825,,-0.607506,-0.291314,-0.26986
cum_reward_std,-0.010015,-0.190325,0.471206,0.291974,0.090388,-0.413002,0.626701,0.804616,1.0,0.53523,...,-0.190325,-0.045423,-0.190325,-0.389688,-0.401941,-0.398805,,-0.366093,-0.15487,-0.135084
mom_reward_avg,-0.015267,-0.412066,0.680864,0.660707,0.110648,-0.800001,0.936032,0.884176,0.53523,1.0,...,-0.412066,-0.100554,-0.412066,-0.373832,-0.685072,-0.677233,,-0.669658,-0.335622,-0.292552


## Train | No ref window | Sampled

In [None]:
save_path = 'data/generated/drift/by_cell_agent/run_8/'
save_path_plot = f'{save_path}/plots/'
# exist_ok=False: a re-run fails with FileExistsError instead of writing into an
# existing run folder. Use a new run number before executing this cell again.
Path(save_path_plot).mkdir(exist_ok=False, parents=True)

# params
sampled_ref: bool = False  # True -> reference is a fixed 1000-row sample of train_df
sampled_drift: Union[bool, Optional[int]] = 1000  # forwarded as `sample=` to the drift calculator
dist_window: bool = False  # True -> pad each cell's rows with reference rows
weighted: bool = False  # only forwarded to the non-regressive get_drift call
regressive: bool = True  # True -> get_drift_regressive, False -> get_drift

# NOTE(review): presumably (stat tests, result names), with None/'default' selecting
# the calculator's default test — confirm against DriftCalculator.
drift_calc = DriftCalculator([None] + stat_tests , ['default'] + stat_tests )

drift_scores = []
cols = ['Number of Available\nTCH', 'HR Usage Rate', 'TCH Blocking Rate, BH',
        'TCH Traffic (Erl), BH', 'Param 1', 'Param 2']
# whether to use a sampled train set as the reference
if sampled_ref:
    ref = train_df.sample(n=1000, random_state=0)[cols]
else:
    ref = train_df[cols]

# The plot-name suffix depends only on the flags above, so build it once.
plot_suffix = f"{'sample_ref' if sampled_ref else ''}-{'window-1k' if dist_window else ''}"

# value_counts() yields the cell ids ordered from most to fewest training rows.
for cell in tqdm(train_df['Cell ID'].value_counts().index):
    # Reference is the whole train set (see `ref`), current data is this cell's test rows.
    cur = test_df[test_df['Cell ID'] == cell]
    # add original distribution
    if dist_window:
        # Pad the cell's rows with reference rows up to len(ref).
        # - clamp at 0 so a cell with more rows than `ref` cannot make sample() raise;
        # - fixed seed so that repeated runs produce identical windows.
        # NOTE(review): the label says "window-1k", but the window is len(ref) rows,
        # which is 1000 only when sampled_ref is True — confirm the intended size.
        n_pad = max(len(ref) - len(cur), 0)
        cur = pd.concat([ref.sample(n=n_pad, random_state=0), cur])

    # Silence library warnings only for the duration of the drift computation.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        name = f"{save_path_plot}/cell_{cell}_plot_{plot_suffix}"
        if regressive:
            score = drift_calc.get_drift_regressive(current_data=cur[cols], reference_data=ref[cols], sample=sampled_drift, save_plot=name, n=8)
        else:
            score = drift_calc.get_drift(current_data=cur[cols], reference_data=ref[cols], sample=sampled_drift, weighted=weighted, save_plot=name)

    score['cell_id'] = cell
    drift_scores.append(score)

# One row per cell: the per-test drift reports plus the cell id.
drift_scores_df = pd.DataFrame(drift_scores)
save_name = (f"{save_path}/by_train_{'regressive' if regressive else ''}_"
             f"{'sampled-ref' if sampled_ref else ''}_"
             f"sampled-drift-{sampled_drift}_"
             f"{'window-1k' if dist_window else 'no-window'}_"
             f"{'weighted' if weighted else ''}.csv")
drift_scores_df.to_csv(save_name)
drift_scores_df

In [None]:
# Turn every JSON drift report into the fraction of drifted features (via to_score),
# column by column, leaving the cell identifier out of the conversion.
ds_df = (
    drift_scores_df
    .drop(columns='cell_id')
    .apply(lambda reports: [to_score(report) for report in reports])
)
# Put the cell identifier back as the first column.
ds_df.insert(0, 'cell_id', drift_scores_df.cell_id)
ds_df

In [None]:
# Distribution of the drift fractions per statistical test across all cells.
ds_df.describe()

In [None]:
# Join the agent reward statistics with the drift fractions on the cell id
# (inner join: only cells present in both tables), then correlate all columns.
merged = pd.merge(rewards, ds_df, on='cell_id', how='inner')
merged.corr()

## Train | ref window 1000 | no Sample

In [None]:
save_path = 'data/generated/drift/by_cell_agent/run_9/'
save_path_plot = f'{save_path}/plots/'
# exist_ok=False: a re-run fails with FileExistsError instead of writing into an
# existing run folder. Use a new run number before executing this cell again.
Path(save_path_plot).mkdir(exist_ok=False, parents=True)

# params
sampled_ref: bool = False  # True -> reference is a fixed 1000-row sample of train_df
sampled_drift: Union[bool, Optional[int]] = None  # forwarded as `sample=` to the drift calculator
dist_window: bool = True  # True -> pad each cell's rows with reference rows
weighted: bool = False  # only forwarded to the non-regressive get_drift call
regressive: bool = True  # True -> get_drift_regressive, False -> get_drift

# NOTE(review): presumably (stat tests, result names), with None/'default' selecting
# the calculator's default test — confirm against DriftCalculator.
drift_calc = DriftCalculator([None] + stat_tests , ['default'] + stat_tests )

drift_scores = []
cols = ['Number of Available\nTCH', 'HR Usage Rate', 'TCH Blocking Rate, BH',
        'TCH Traffic (Erl), BH', 'Param 1', 'Param 2']
# whether to use a sampled train set as the reference
if sampled_ref:
    ref = train_df.sample(n=1000, random_state=0)[cols]
else:
    ref = train_df[cols]

# The plot-name suffix depends only on the flags above, so build it once.
plot_suffix = f"{'sample_ref' if sampled_ref else ''}-{'window-1k' if dist_window else ''}"

# value_counts() yields the cell ids ordered from most to fewest training rows.
for cell in tqdm(train_df['Cell ID'].value_counts().index):
    # Reference is the whole train set (see `ref`), current data is this cell's test rows.
    cur = test_df[test_df['Cell ID'] == cell]
    # add original distribution
    if dist_window:
        # Pad the cell's rows with reference rows up to len(ref).
        # - clamp at 0 so a cell with more rows than `ref` cannot make sample() raise;
        # - fixed seed so that repeated runs produce identical windows.
        # NOTE(review): the label says "window-1k", but the window is len(ref) rows,
        # which is 1000 only when sampled_ref is True — confirm the intended size.
        n_pad = max(len(ref) - len(cur), 0)
        cur = pd.concat([ref.sample(n=n_pad, random_state=0), cur])

    # Silence library warnings only for the duration of the drift computation.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        name = f"{save_path_plot}/cell_{cell}_plot_{plot_suffix}"
        if regressive:
            score = drift_calc.get_drift_regressive(current_data=cur[cols], reference_data=ref[cols], sample=sampled_drift, save_plot=name, n=8)
        else:
            score = drift_calc.get_drift(current_data=cur[cols], reference_data=ref[cols], sample=sampled_drift, weighted=weighted, save_plot=name)

    score['cell_id'] = cell
    drift_scores.append(score)

# One row per cell: the per-test drift reports plus the cell id.
drift_scores_df = pd.DataFrame(drift_scores)
save_name = (f"{save_path}/by_train_{'regressive' if regressive else ''}_"
             f"{'sampled-ref' if sampled_ref else ''}_"
             f"sampled-drift-{sampled_drift}_"
             f"{'window-1k' if dist_window else 'no-window'}_"
             f"{'weighted' if weighted else ''}.csv")
drift_scores_df.to_csv(save_name)
drift_scores_df

In [None]:
# Turn every JSON drift report into the fraction of drifted features (via to_score),
# column by column, leaving the cell identifier out of the conversion.
ds_df = (
    drift_scores_df
    .drop(columns='cell_id')
    .apply(lambda reports: [to_score(report) for report in reports])
)
# Put the cell identifier back as the first column.
ds_df.insert(0, 'cell_id', drift_scores_df.cell_id)
ds_df

In [None]:
# Distribution of the drift fractions per statistical test across all cells.
ds_df.describe()

In [None]:
# Join the agent reward statistics with the drift fractions on the cell id
# (inner join: only cells present in both tables), then correlate all columns.
merged = pd.merge(rewards, ds_df, on='cell_id', how='inner')
merged.corr()

## Train | Ref window 1000 | Sampled

In [None]:
save_path = 'data/generated/drift/by_cell_agent/run_10/'
save_path_plot = f'{save_path}/plots/'
# exist_ok=False: a re-run fails with FileExistsError instead of writing into an
# existing run folder. Use a new run number before executing this cell again.
Path(save_path_plot).mkdir(exist_ok=False, parents=True)

# params
sampled_ref: bool = False  # True -> reference is a fixed 1000-row sample of train_df
sampled_drift: Union[bool, Optional[int]] = 1000  # forwarded as `sample=` to the drift calculator
dist_window: bool = True  # True -> pad each cell's rows with reference rows
weighted: bool = False  # only forwarded to the non-regressive get_drift call
regressive: bool = True  # True -> get_drift_regressive, False -> get_drift

# NOTE(review): presumably (stat tests, result names), with None/'default' selecting
# the calculator's default test — confirm against DriftCalculator.
drift_calc = DriftCalculator([None] + stat_tests , ['default'] + stat_tests )

drift_scores = []
cols = ['Number of Available\nTCH', 'HR Usage Rate', 'TCH Blocking Rate, BH',
        'TCH Traffic (Erl), BH', 'Param 1', 'Param 2']
# whether to use a sampled train set as the reference
if sampled_ref:
    ref = train_df.sample(n=1000, random_state=0)[cols]
else:
    ref = train_df[cols]

# The plot-name suffix depends only on the flags above, so build it once.
plot_suffix = f"{'sample_ref' if sampled_ref else ''}-{'window-1k' if dist_window else ''}"

# value_counts() yields the cell ids ordered from most to fewest training rows.
for cell in tqdm(train_df['Cell ID'].value_counts().index):
    # Reference is the whole train set (see `ref`), current data is this cell's test rows.
    cur = test_df[test_df['Cell ID'] == cell]
    # add original distribution
    if dist_window:
        # Pad the cell's rows with reference rows up to len(ref).
        # - clamp at 0 so a cell with more rows than `ref` cannot make sample() raise;
        # - fixed seed so that repeated runs produce identical windows.
        # NOTE(review): the label says "window-1k", but the window is len(ref) rows,
        # which is 1000 only when sampled_ref is True — confirm the intended size.
        n_pad = max(len(ref) - len(cur), 0)
        cur = pd.concat([ref.sample(n=n_pad, random_state=0), cur])

    # Silence library warnings only for the duration of the drift computation.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        name = f"{save_path_plot}/cell_{cell}_plot_{plot_suffix}"
        if regressive:
            score = drift_calc.get_drift_regressive(current_data=cur[cols], reference_data=ref[cols], sample=sampled_drift, save_plot=name, n=8)
        else:
            score = drift_calc.get_drift(current_data=cur[cols], reference_data=ref[cols], sample=sampled_drift, weighted=weighted, save_plot=name)

    score['cell_id'] = cell
    drift_scores.append(score)

# One row per cell: the per-test drift reports plus the cell id.
drift_scores_df = pd.DataFrame(drift_scores)
save_name = (f"{save_path}/by_train_{'regressive' if regressive else ''}_"
             f"{'sampled-ref' if sampled_ref else ''}_"
             f"sampled-drift-{sampled_drift}_"
             f"{'window-1k' if dist_window else 'no-window'}_"
             f"{'weighted' if weighted else ''}.csv")
drift_scores_df.to_csv(save_name)
drift_scores_df

In [None]:
# Turn every JSON drift report into the fraction of drifted features (via to_score),
# column by column, leaving the cell identifier out of the conversion.
ds_df = (
    drift_scores_df
    .drop(columns='cell_id')
    .apply(lambda reports: [to_score(report) for report in reports])
)
# Put the cell identifier back as the first column.
ds_df.insert(0, 'cell_id', drift_scores_df.cell_id)
ds_df

In [None]:
# Distribution of the drift fractions per statistical test across all cells.
ds_df.describe()

In [None]:
# Join the agent reward statistics with the drift fractions on the cell id
# (inner join: only cells present in both tables), then correlate all columns.
merged = pd.merge(rewards, ds_df, on='cell_id', how='inner')
merged.corr()