<a href="https://colab.research.google.com/github/RaduW/volume-rebalance/blob/main/volume_rebalancing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# Volume Rebalancing Algorithm


The volume rebalancing algorithm is based on the following assumption: given a global sample rate and a set of classes, we want to adjust the sample rate of each individual class so that the number of sampled elements is equalised across classes, while the overall sample rate is maintained.

In [1]:
# Fetch the sampling model module into the working directory so that it can be
# imported right below (needed when the notebook runs outside the repository,
# e.g. on Colab).
url = "https://raw.githubusercontent.com/RaduW/volume-rebalance/main/transaction_adjustment_model.py"
!wget --no-cache --backups=1 {url}

from  transaction_adjustment_model import adjust_sample_rate

# Same approach for the helper module used later to load the rules file.
utils_url = "https://raw.githubusercontent.com/RaduW/volume-rebalance/main/utils.py"
!wget --no-cache --backups=1 {utils_url}

import utils

--2023-03-30 15:00:06--  https://raw.githubusercontent.com/RaduW/volume-rebalance/main/transaction_adjustment_model.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8002::154, 2606:50c0:8001::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8002::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5943 (5.8K) [text/plain]
Saving to: ‘transaction_adjustment_model.py’


2023-03-30 15:00:06 (33.5 MB/s) - ‘transaction_adjustment_model.py’ saved [5943/5943]

--2023-03-30 15:00:06--  https://raw.githubusercontent.com/RaduW/volume-rebalance/main/utils.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8002::154, 2606:50c0:8001::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8002::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 943 [text/plain]
Sa

In [2]:

from operator import itemgetter
from typing import Tuple, Union
from ipywidgets import interact, widgets
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")


In [3]:
# Load the transaction data; the rendered frame has the columns
# name, freq, proj_id and proj_name.
projects = "https://raw.githubusercontent.com/RaduW/volume-rebalance/main/projects-2023-03-30.json"
trans_data = pd.read_json(projects)

# Project name -> project id. The keys populate the "Project:" dropdown further down.
PROJECTS = {
    "sentry": 1,
    "javascript": 11276,
    "snuba": 300688,
    "gibpotato-backend": 4504044639748096,
    #"gibpotato-frontend": 4504044640927744,
}

# Sort in place by project id, then by ascending frequency, renumbering the index.
# NOTE(review): inplace=True mutates the frame loaded above; re-running only this
# cell is still safe because the frame is re-read first.
trans_data.sort_values(by=["proj_id","freq"],ignore_index=True, ascending=True, inplace=True)
trans_data

Unnamed: 0,name,freq,proj_id,proj_name
0,/api/0/customers/,1,1,sentry
1,/organizations/:orgId/alerts/rules/details/:ru...,1,1,sentry
2,/api/0/organizations/{organization_slug}/auth-...,1,1,sentry
3,//index.php,1,1,sentry
4,/api/0/users/{user_id}/,1,1,sentry
...,...,...,...,...
1121,HEAD /login,1,4504044639748096,gibpotato-backend
1122,GET /QMVi,1,4504044639748096,gibpotato-backend
1123,HEAD /,13,4504044639748096,gibpotato-backend
1124,GET /login,28,4504044639748096,gibpotato-backend


In [4]:
# Load the rebalancing rules through the downloaded `utils` helper.
# The rendered frame has one row per rule with the columns `factor` and `name`.
rules_file_name = "https://raw.githubusercontent.com/RaduW/volume-rebalance/main/rules-2023-03-30.json"
rules_data = utils.json_rules_to_frame(rules_file_name)
rules_data

Unnamed: 0,factor,name
0,1.362553,/auth/login/{organization_slug}/
1,1.343255,sentry.tasks.files.delete_file
2,1.233424,sentry.tasks.check_auth_identity
3,1.029705,/dashboard/:dashboardId/
4,0.976686,sentry.profiles.task.process_profile
5,0.898527,sentry.tasks.relay.build_project_config
6,0.882867,/api/0/organizations/{organization_slug}/stats...
7,0.834599,/api/0/projects/{organization_slug}/{project_s...
8,0.782446,sentry.tasks.files.delete_unreferenced_blobs
9,0.621138,/api/0/projects/{organization_slug}/{project_s...


In [5]:


    
def counts_to_labeled_counts( counts: Union[float,int,Tuple[str,float]]):
  """Sort ``counts`` ascending and pair every value with a synthetic label.

  Returns a list of ``(label, count)`` tuples where the label is ``"t-<rank>"``
  and ``<rank>`` is the zero-based position of the value in the sorted order.

  NOTE(review): the annotation describes a single element, but the body sorts
  and enumerates ``counts``, so an iterable of comparable values is expected.
  """
  labeled = []
  for rank, value in enumerate(sorted(counts)):
    labeled.append((f"t-{rank}", value))
  return labeled

# Model params

The model has the following input parameters:

*   a list of initial counts representing the number of elements for each class `counts`
*   an overall desired sample rate: `global_rate` (input via slider)



In [6]:
def process_data(global_rate:float, items_high:int, items_low:int, project:str, trans_data: pd.DataFrame):
    """Compute original and rebalanced sample counts for one project's transactions.

    Parameters
    ----------
    global_rate : overall sample rate applied to every transaction before rebalancing.
    items_high : number of highest-frequency transactions that get an explicit rate.
    items_low : number of lowest-frequency transactions that get an explicit rate.
    project : value of ``proj_name`` selecting the rows to process.
    trans_data : frame with at least the columns ``name``, ``freq`` and ``proj_name``.

    Returns
    -------
    DataFrame indexed by transaction name, containing the project's rows plus:
    ``original`` (``freq * global_rate``), ``adjusted_rate`` (the explicit rate,
    or the implicit rate where no explicit one was returned), ``explicit``
    (whether the row received an explicit rate) and ``adjusted``
    (``freq * adjusted_rate``).

    Note: the result comes from an outer join, which pandas documents as sorting
    the index, so rows are ordered by name rather than by frequency.
    """
    # Sort first and filter afterwards: the order of operations is kept so that
    # the relative order of rows with equal frequency is unchanged.
    data = trans_data.sort_values(by=["freq"],ignore_index=True, ascending=True)
    data = data[data["proj_name"] == project]

    counts = data["freq"]
    total = counts.sum()
    num_classes = len(counts)

    data = data.set_index("name", drop=False)
    data["original"] = data["freq"] * global_rate

    # Pick the transactions that receive an explicit rate: the `items_low`
    # least frequent plus the `items_high` most frequent ones.
    if items_low + items_high >= num_classes:
        # the two ends cover everything: every transaction is resized explicitly
        explicit_transactions = data
    elif items_high == 0:
        # `iloc[-0:]` would select the whole frame, so the high end is skipped
        explicit_transactions = data.iloc[:items_low]
    else:
        explicit_transactions = pd.concat([data.iloc[:items_low], data.iloc[-items_high:]])

    explicit_transactions_tuple = list(
        explicit_transactions[["name", "freq"]].itertuples(name=None, index=False)
    )
    # `adjusted` is consumed as a name -> rate mapping; `implicit_rate` is applied
    # to every transaction missing from that mapping.
    adjusted, implicit_rate = adjust_sample_rate(
        classes=explicit_transactions_tuple,
        rate=global_rate,
        total_num_classes=num_classes,
        total=total,
    )
    adjusted_df = pd.DataFrame.from_dict(data=adjusted, orient="index", columns=["adjusted_rate"])

    # attach the explicit rates; rows without one are NaN at this point
    df = data.join(adjusted_df, how="outer")

    # remember which rows were explicit before the gaps are filled
    df["explicit"] = ~df["adjusted_rate"].isna()
    # implicit rows all share the implicit rate
    df["adjusted_rate"] = df["adjusted_rate"].fillna(implicit_rate)

    df["adjusted"] = df["freq"] * df["adjusted_rate"]

    print( f"Num classes:{num_classes} implicit_rate:{implicit_rate} original_rate:{global_rate}")
    return df



def plot_rates(ax, data, last_low, first_high, global_rate, x_limit=None, log=False):
  """Scatter the original and adjusted sampled counts against transaction frequency.

  Parameters
  ----------
  ax : matplotlib axes to draw on.
  data : long-format frame with the columns ``freq``, ``rate``, ``series`` and
      ``explicit``.
  last_low, first_high : frequencies at which the blue (low) and red (high)
      border lines are drawn.
  global_rate : overall sample rate, used to place the "ideal rate" line at
      ``mean(freq) * global_rate``.
  x_limit : optional ``(low, high)`` x-axis limits; rows with ``freq`` at or above
      ``high`` are dropped before plotting.
  log : use log/log axes; the guide lines and label are only drawn when False.
  """
  # Scale the axes that was passed in. The pyplot helpers (plt.xscale/plt.yscale)
  # act on pyplot's *current* axes, which need not be `ax` in a multi-panel figure.
  scale = "log" if log else "linear"
  ax.set_xscale(scale)
  ax.set_yscale(scale)

  if x_limit is not None:
    ax.set_xlim(*x_limit)
    data = data[data["freq"] < x_limit[1]]

  counts_series = data["freq"]

  cnts_min = counts_series.min()
  cnts_max = counts_series.max()
  ideal_rate = counts_series.mean() * global_rate
  rate_max = data["rate"].max()

  if not log:
    # ideal level
    ax.text((cnts_min+cnts_max)/2, ideal_rate, "ideal rate", horizontalalignment='center', verticalalignment="bottom", size='medium', color='black')
    sns.lineplot(x=[cnts_min, cnts_max], y=[ideal_rate, ideal_rate], ax=ax)
    # border of the lower values (a near-vertical segment)
    sns.lineplot(x=[last_low, last_low+0.001], y=[0, rate_max], color="blue", ax=ax)
    # border of the higher values
    sns.lineplot(x=[first_high, first_high+0.001], y=[0, rate_max], color="red", ax=ax)

  sns.scatterplot(data=data, x="freq", y="rate", hue="series", style="explicit", ax=ax)

def draw_rate_change(ax, df):
  """Scatter ``adjusted_rate`` against ``freq`` on log/log axes.

  ``df`` needs the columns ``freq``, ``adjusted_rate`` and ``explicit``; points
  are coloured and styled by ``explicit``. Minor x ticks are labelled with their
  leading digit.
  """
  def minor_tick_format(tick_val, tick_pos):
    """Return the leading digit of ``tick_val`` as a string ("" if undefined)."""
    tick_val = abs(tick_val)
    # 0 would never leave the scaling loop below, inf would never leave the
    # second one, and NaN cannot be converted to int: give those no label.
    if not (0 < tick_val < float("inf")):
      return ""
    # bring the value into the range [1, 10]
    while tick_val < 1:
      tick_val *= 10
    while tick_val > 10:
      tick_val /= 10
    # NOTE(review): repeated scaling may leave the value a hair below the intended
    # integer; the small tolerance keeps truncation from dropping a digit.
    return f"{int(tick_val + 1e-9)%10}"

  ax.set_yscale("log")
  ax.set_xscale("log")
  ax.grid(which='minor')
  sns.scatterplot(data=df, x="freq", y="adjusted_rate", hue="explicit", style="explicit", ax=ax)
  ax.xaxis.set_minor_formatter(minor_tick_format)
  ax.set_title("Sample Rate")


def draw_rebalance_graphs(global_rate, items_high, items_low, project, trans_data):
    """Run the rebalancing for one project and draw the four diagnostic plots.

    Panels, top to bottom: adjusted sample rate per frequency (log/log), sampled
    counts on linear axes, the same zoomed to x in (-20, 220), and the same on
    log/log axes.

    Parameters mirror ``process_data``: the global sample rate, the number of
    highest/lowest-frequency transactions given explicit rates, the project name
    and the transaction frame.
    """
    df = process_data(global_rate, items_high, items_low, project, trans_data)

    # process_data builds its result with an outer join, so the rows come back
    # ordered by index (transaction name) rather than by frequency. The border
    # positions are frequency ranks, so rank on a sorted copy of the counts.
    counts_sorted = df["freq"].sort_values(ignore_index=True)
    num_classes = len(counts_sorted)

    # Clamp both positions: the widgets allow values beyond the number of
    # transactions of the selected project.
    last_low = counts_sorted.iloc[min(items_low, num_classes - 1)]
    if items_high > 0:
        first_high = counts_sorted.iloc[max(num_classes - items_high, 0)]
    else:
        # `iloc[-0]` is the first row; with no high items, put the border at the maximum
        first_high = counts_sorted.iloc[-1]

    # long format: one row per (transaction, series) with the sampled count in "rate"
    rates_long = df.melt(
        id_vars=["freq", "explicit"],
        value_vars=["adjusted", "original"],
        var_name="series",
        value_name="rate",
    )
    fig, axes = plt.subplots(nrows=4, figsize=(20, 20))

    draw_rate_change(axes[0], df)
    plot_rates(axes[1], rates_long, last_low, first_high, global_rate)
    plot_rates(axes[2], rates_long, last_low, first_high, global_rate, x_limit=(-20, 220))
    plot_rates(axes[3], rates_long, last_low, first_high, global_rate, log=True)





### Counts

In [7]:
# Preview the first rows of the transaction frame loaded above.
trans_data.head()

Unnamed: 0,name,freq,proj_id,proj_name
0,/api/0/customers/,1,1,sentry
1,/organizations/:orgId/alerts/rules/details/:ru...,1,1,sentry
2,/api/0/organizations/{organization_slug}/auth-...,1,1,sentry
3,//index.php,1,1,sentry
4,/api/0/users/{user_id}/,1,1,sentry


### Sample rate

In [8]:
# Interactive controls for the rebalancing demo.
global_rate=widgets.FloatSlider(min=0,max=1,value=0.1, step=0.001 , description= "sample rate");
# NOTE(review): this is the row count of the whole frame (all projects), so the
# bounds below can exceed the number of transactions of the selected project.
num_classes = len(trans_data)
items_high=widgets.BoundedIntText(min=0,max=num_classes,value=20,
                                  description="items high");
items_low =widgets.BoundedIntText(min=0,max=num_classes,value=20,step=1,
    description='items low')

# the dropdown offers the project names declared in PROJECTS
project_names = list(PROJECTS.keys())

project= widgets.Dropdown(
    options=project_names,
    value='sentry',
    description='Project:',
)
#global_rate = widgets.FloatText(min=0,max=1,value=0.1, step=0.001 )
def rebalance_generator(trans_data):
  """Build a widget callback that redraws the rebalance graphs for ``trans_data``.

  The returned function accepts only the four interactive parameters; the
  transaction frame is captured by the closure.
  """
  def redraw(global_rate, items_high, items_low, project):
    """Redraw all graphs for the current widget values."""
    draw_rebalance_graphs(
      global_rate=global_rate,
      items_high=items_high,
      items_low=items_low,
      project=project,
      trans_data=trans_data,
    )
  return redraw

# Wire the controls to the plotting callback; the keyword names must match the
# parameter names of the function returned by rebalance_generator.
widgets.interact(rebalance_generator(trans_data), global_rate = global_rate, items_high=items_high, items_low=items_low, project=project);

interactive(children=(FloatSlider(value=0.1, description='sample rate', max=1.0, step=0.001), BoundedIntText(v…

In [51]:
# NOTE(review): these assignments rebind `items_high` / `items_low`, which the
# widget cell above defined as widgets, to plain ints.
items_high = 10
items_low = 20
rate = 0.1

data = process_data(rate, items_high, items_low, "sentry", trans_data)
# Factor of the catch-all rule named "*".
# NOTE(review): `.iloc[0]` fails if rules_data contains no such row — confirm it is always present.
implicit_factor= rules_data.loc[ rules_data["name"]=="*"].iloc[0]["factor"]
# factor per rule name, for joining on the name index of `data`
rules_series = rules_data.set_index("name", drop=True)["factor"]

# Attach each transaction's rule factor; transactions without a rule of their
# own get the catch-all factor.
full_data = data.join(rules_series, how="left")
full_data["factor"] = full_data["factor"].fillna(implicit_factor)
full_data


Num classes:600 implicit_rate:0.27972316050849144 original_rate:0.1


Unnamed: 0,name,freq,proj_id,proj_name,original,adjusted_rate,explicit,adjusted,factor
,,2897,1,sentry,289.7,0.279723,False,810.357996,30.609375
/,/,2737,1,sentry,273.7,0.279723,False,765.602290,30.609375
/*,/*,300,1,sentry,30.0,0.279723,False,83.916948,30.609375
/.*/,/.*/,597,1,sentry,59.7,0.279723,False,166.994727,30.609375
/.well-known/dnt-policy.txt,/.well-known/dnt-policy.txt,115,1,sentry,11.5,0.279723,False,32.168163,30.609375
...,...,...,...,...,...,...,...,...,...
sentry.tasks.store.symbolicate_event,sentry.tasks.store.symbolicate_event,3777,1,sentry,377.7,0.279723,False,1056.514377,30.609375
sentry.tasks.store.symbolicate_event_low_priority,sentry.tasks.store.symbolicate_event_low_priority,1,1,sentry,0.1,0.279723,False,0.279723,30.609375
sentry.tasks.update_code_owners_schema,sentry.tasks.update_code_owners_schema,31,1,sentry,3.1,0.279723,False,8.671418,30.609375
sentry.tasks.user_report,sentry.tasks.user_report,1,1,sentry,0.1,0.279723,False,0.279723,30.609375


In [35]:
# Show up to the first 50 rules.
rules_data.head(50)

Unnamed: 0,factor,name
0,1.362553,/auth/login/{organization_slug}/
1,1.343255,sentry.tasks.files.delete_file
2,1.233424,sentry.tasks.check_auth_identity
3,1.029705,/dashboard/:dashboardId/
4,0.976686,sentry.profiles.task.process_profile
5,0.898527,sentry.tasks.relay.build_project_config
6,0.882867,/api/0/organizations/{organization_slug}/stats...
7,0.834599,/api/0/projects/{organization_slug}/{project_s...
8,0.782446,sentry.tasks.files.delete_unreferenced_blobs
9,0.621138,/api/0/projects/{organization_slug}/{project_s...


# Scratch pad below

Ignore....
