<a href="https://colab.research.google.com/github/RaduW/volume-rebalance/blob/main/volume_rebalancing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# Volume Rebalancing Algorithm


The volume rebalancing algorithm starts from a global sample rate that applies across a set of classes. It adjusts the sampling rate of each individual class so that the number of sampled elements is equalised across classes as far as possible, while the overall sample rate is maintained.

In [1]:
from ipywidgets import interact, widgets
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

In [2]:
import operator
from collections import namedtuple
from copy import copy
from typing import List, Mapping, MutableMapping, Tuple


def adjust_sample_rate(
    transactions: List[Tuple[str, int]], rate: float
) -> Mapping[str, float]:
    """
    Computes a sample rate per transaction type, favouring the small types.

    The (name, count) pairs are ordered from the smallest count to the largest
    and then handed to ``adjust_sample_rate_full``.

    :param transactions: the transaction types as (name, count) tuples, in any order
    :param rate: the overall desired rate
    :return: whatever ``adjust_sample_rate_full`` returns for the ordered pairs
    """
    # work on a private, ascending-by-count copy so the caller's list is untouched
    ordered = list(transactions)
    ordered.sort(key=lambda entry: entry[1])
    return adjust_sample_rate_full(ordered, rate)


def adjust_sample_rate_full(
    transactions: List[Tuple[str, int]], rate: float
) -> MutableMapping[str, float]:
    """
    Resamples all transaction types towards an equal share of the overall budget.

    The overall budget is ``rate`` times the total count. Types are visited in
    the order given, and each one is offered an equal share of the budget that
    is still unspent. A type that is not larger than its share is kept entirely,
    and the part of the share it could not use is passed on to the types that
    follow.

    :param transactions: (name, count) tuples, expected in ascending count order
        (``adjust_sample_rate`` sorts before delegating here); not modified
    :param rate: the overall desired rate
    :return: a mapping from transaction name to the sample rate for that type
    """
    ret_val = {}
    # how many transactions we are allowed to keep overall; tracking a single
    # pool lets unused budget flow from small types to larger ones
    total_budget = sum(count for _, count in transactions) * rate
    num_types = len(transactions)
    for idx, (name, count) in enumerate(transactions):
        # The share is recomputed every iteration from what is left, to account
        # for earlier types that could not spend all of their allocation.
        budget_per_transaction_type = total_budget / (num_types - idx)
        if count <= budget_per_transaction_type:
            # Not more transactions than the budget allows: keep everything.
            # Using <= (rather than <) yields the same rate of 1.0 when the two
            # are equal and avoids dividing 0 by 0 for an empty type when no
            # budget is left.
            ret_val[name] = 1.0
            total_budget -= count
        else:
            # enough transactions in this type: keep exactly the share
            ret_val[name] = budget_per_transaction_type / count
            total_budget -= budget_per_transaction_type
    return ret_val

def total_transactions(transactions: List[Tuple[str, int]]) -> int:
    """
    Adds up the counts of all (name, count) tuples.

    :param transactions: the transaction types as (name, count) tuples
    :return: the sum of the counts, 0 when there are no transactions
    """
    return sum(count for _, count in transactions)

def counts_to_labeled_counts(counts):
    """
    Turns raw counts into (label, count) tuples ordered by ascending count.

    :param counts: the number of elements in each class, in any order
    :return: a list of ``("t-<position>", count)`` tuples, where position is the
        zero-based rank of the count after sorting
    """
    labeled = []
    for position, value in enumerate(sorted(counts)):
        labeled.append((f"t-{position}", value))
    return labeled

# Model params

The model has the following input parameters:

*   a list of initial counts representing the number of elements for each class `counts`
*   an overall desired sample rate: `global_rate` (input via slider)



In [3]:
def process_data(global_rate, counts):
    """
    Builds a per-class table comparing uniform sampling with rebalanced sampling.

    :param global_rate: the overall desired sample rate
    :param counts: the number of elements in each class, in any order
    :return: a DataFrame indexed by class label with the columns ``counts``,
        ``original`` (``counts`` scaled by ``global_rate``), ``adjusted_rate``
        (the per-class rate from ``adjust_sample_rate``) and ``adjusted``
        (``counts`` scaled by ``adjusted_rate``)
    """
    transactions = counts_to_labeled_counts(counts)
    labels = [label for label, _ in transactions]
    # Read the counts back from the labeled transactions instead of the raw
    # input: the labels are assigned after sorting, so pairing them with the
    # unsorted input would attach the wrong count (and later the wrong adjusted
    # rate) to each label whenever the input is not already ascending.
    cnts = np.array([count for _, count in transactions])

    sampled_counts = cnts * global_rate

    original_df = pd.DataFrame(
        data={"counts": cnts, "original": sampled_counts}, index=labels
    )
    adjusted = adjust_sample_rate(transactions, global_rate)
    adjusted_df = pd.DataFrame.from_dict(
        data=adjusted, orient="index", columns=["adjusted_rate"]
    )

    # line the adjusted rates up with the counts by label, then derive the
    # adjusted sample size
    df = original_df.join(adjusted_df, how="outer")
    df["adjusted"] = df["counts"] * df["adjusted_rate"]
    return df

def draw_rebalance_graphs(global_rate, counts):
    """
    Draws the original and the adjusted sample sizes against the class counts.

    Two stacked charts are produced from the table returned by ``process_data``:
    the upper one uses linear axes and also shows a horizontal reference line at
    the mean count scaled by ``global_rate``; the lower one shows the same
    points on log-log axes.

    :param global_rate: the overall desired sample rate
    :param counts: the number of elements in each class
    """
    df = process_data(global_rate, counts)

    counts_series = df["counts"]
    cnts_min = counts_series.min()
    cnts_max = counts_series.max()
    # height of the reference line: mean class count scaled by the global rate
    ideal_rate = counts_series.mean() * global_rate

    # long format: one row per (class, series) so both series share one scatter
    df2 = df.melt(
        id_vars=["counts"],
        value_vars=["adjusted", "original"],
        var_name="series",
        value_name="rate",
    )
    _, (linear_ax, log_ax) = plt.subplots(nrows=2, figsize=(20, 16))

    # Upper chart: axes stay on their default linear scale.
    sns.lineplot(x=[cnts_min, cnts_max], y=[ideal_rate, ideal_rate], ax=linear_ax)
    sns.scatterplot(
        data=df2, x="counts", y="rate", hue="series", style="series", ax=linear_ax
    ).set(title="Sampling Adjustment")

    # Lower chart: set the scale on the axes object itself rather than through
    # pyplot's implicit "current axes", so it is unambiguous which subplot
    # becomes logarithmic.
    log_ax.set_xscale("log")
    log_ax.set_yscale("log")
    sns.scatterplot(
        data=df2, x="counts", y="rate", hue="series", style="series", ax=log_ax
    ).set(title="Sampling Adjustment (Log Scale)")




### Counts

In [7]:
# number of elements in each class, used as the model input below
counts = [
    10,
    500.37,
    3999.88,
    7000,
    9000,
]

### Sample rate

In [8]:
# Slider for the overall sample rate, from 0 to 1 in steps of 0.001, starting at 0.1.
# A widgets.FloatText built with the same arguments can be used here instead.
global_rate = widgets.FloatSlider(
    min=0,
    max=1,
    value=0.1,
    step=0.001,
)
def rebalance_generator(counts):
    """
    Creates a one-argument redraw callback bound to the given counts.

    :param counts: the number of elements in each class
    :return: a function taking ``global_rate`` that calls
        ``draw_rebalance_graphs(global_rate, counts)``
    """

    def redraw(global_rate):
        draw_rebalance_graphs(global_rate, counts)

    return redraw

# Hook the slider up to the redraw callback built for `counts`. The trailing
# semicolon keeps the notebook from echoing the value returned by interact.
widgets.interact(rebalance_generator(counts), global_rate = global_rate);

interactive(children=(FloatSlider(value=0.1, description='global_rate', max=1.0, step=0.001), Output()), _dom_…

In [6]:
# Print the slider's current value, then compute the table for that value;
# as the last expression of the cell, the table is what the notebook renders.
print(f"Global Rate is {global_rate.value}")
process_data(global_rate.value, counts)


Global Rate is 0.1


Unnamed: 0,counts,original,adjusted_rate,adjusted
t-0,10,1.0,1.0,10.0
t-1,500,50.0,1.0,500.0
t-2,3999,399.9,0.12844,513.633333
t-3,7000,700.0,0.073376,513.633333
t-4,9000,900.0,0.05707,513.633333


# Scratch pad below

Ignore....