In [97]:
import pandas as pd 
import numpy as np

In [98]:
# Experiment configuration: histogram resolution and which benchmark to load.
nbins = 16      # bin count handed to the histogram / query helpers
test = "test6"  # benchmark name; selects the data/query CSV pair below
fdata = f"test/{test}_data.csv"
fquery = f"test/{test}_query.csv"

In [99]:
# Read the data table from the CSV path assembled in the configuration cell.
data = pd.read_csv(fdata)
# Preview the first rows (the recorded output shows the columns QUANTITY and TAX).
data.head()

Unnamed: 0,QUANTITY,TAX
0,17,0.02
1,36,0.06
2,8,0.02
3,28,0.06
4,24,0.04


In [100]:
# Read the query workload. Per the recorded output each row carries lower
# bounds (lb1, lb2), upper bounds (ub1, ub2) and a GT value that the
# evaluation cell compares its estimate against.
query = pd.read_csv(fquery)
# Preview the first rows.
query.head()

Unnamed: 0,lb1,lb2,ub1,ub2,GT
0,19.155929,0.02118,31.666566,0.033354,8139.0
1,13.940659,0.061838,47.710976,0.075778,23468.0
2,2.573148,0.054048,47.615124,0.066391,31066.0
3,2.582024,0.008924,15.99305,0.037829,26992.0
4,28.299891,0.05443,30.270209,0.072882,2709.0


In [101]:
# Min-max scale every column of `data`: (value - column min) / (column max - column min).
# The per-column extrema stay available as `data_min` / `data_max` because the
# evaluation cell rescales the query bounds with the same values.
# NOTE: `data` is overwritten here, so re-running this cell recomputes the
# extrema from the already scaled frame.
data_min, data_max = data.min(), data.max()
data_range = data_max - data_min
data = (data - data_min) / data_range
data.head()

Unnamed: 0,QUANTITY,TAX
0,0.326531,0.25
1,0.714286,0.75
2,0.142857,0.25
3,0.55102,0.75
4,0.469388,0.5


In [102]:
def calculate_hist(x, y, bins):
    """Count points into a ``bins`` x ``bins`` equi-width 2-D histogram.

    Parameters
    ----------
    x, y : array-like of float
        Coordinates of the points, one entry per point, already scaled to
        ``[0, 1]``. Any 1-D sequence works (Series, ndarray, list); values are
        read positionally, so a pandas index does not matter.
    bins : int
        Number of cells per dimension.

    Returns
    -------
    numpy.ndarray
        ``float64`` array of shape ``(bins, bins)``; entry ``[i][j]`` is the
        number of points whose x falls in cell ``i`` and whose y falls in
        cell ``j``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in shape, or a coordinate is NaN or lies
        outside ``[0, 1]``.
    """
    xs = np.asarray(x, dtype=np.float64)
    ys = np.asarray(y, dtype=np.float64)
    if xs.shape != ys.shape:
        raise ValueError("x and y must have the same shape")

    # Written as a positive range test so that NaN fails it as well.
    in_range = (xs >= 0.0) & (xs <= 1.0) & (ys >= 0.0) & (ys <= 1.0)
    if not np.all(in_range):
        raise ValueError("coordinates must be finite and lie in [0, 1]")

    # Cell index = trunc(bins * value). A value of exactly 1.0 would land in
    # cell `bins`, one past the end, so it is folded into the last cell.
    ix = np.minimum(np.trunc(bins * xs).astype(np.int64), bins - 1)
    iy = np.minimum(np.trunc(bins * ys).astype(np.int64), bins - 1)

    hist = np.zeros((bins, bins), dtype=np.float64)
    # np.add.at accumulates repeated (ix, iy) pairs; a plain fancy-indexed
    # `+=` would count each distinct pair only once.
    np.add.at(hist, (ix, iy), 1.0)
    return hist

In [103]:
def query_quantization(x, bins):
    """Split a normalized scalar coordinate into a cell index and an in-cell offset.

    Parameters
    ----------
    x : float
        Coordinate on the normalized axis. Values below 0 or above 1 are
        clamped to the nearest end of ``[0, 1]``. Without the clamp a
        coordinate outside the range would yield a negative or too-large cell
        index, which then wraps around or overruns when used as an array index.
    bins : int
        Number of cells per dimension.

    Returns
    -------
    (numpy.int64, numpy.float64)
        ``nX`` is the cell index in ``[0, bins - 1]``; ``rX`` is
        ``x * bins - nX``, the offset inside that cell in ``[0, 1]``
        (exactly 1.0 when ``x`` is at or above the upper end).
    """
    # Clamp to the normalized range before quantizing.
    x = min(max(x, 0.0), 1.0)
    X = x * bins
    nX = np.trunc(X).astype(np.int64)
    # x == 1.0 quantizes to `bins`; use the last cell with a full offset instead.
    if nX == bins:
        nX = nX - 1
    rX = (X - nX).astype(np.float64)
    return nX, rX

In [104]:
def calculate_prefix(hist):
    """Return the 2-D inclusive prefix-sum table of a histogram.

    ``prefix[i][j]`` is the sum of ``hist[a][b]`` over all ``a <= i`` and
    ``b <= j``.

    Parameters
    ----------
    hist : array-like, 2-D
        Histogram counts. Rows and columns may differ in number.

    Returns
    -------
    numpy.ndarray
        ``float64`` array with the same shape as ``hist``.
    """
    # Running sums down the rows, then across the columns, give the same table
    # as the inclusion-exclusion recurrence and work for any rectangular shape.
    row_running = np.cumsum(hist, axis=0, dtype=np.float64)
    return np.cumsum(row_running, axis=1)

In [105]:
def calculate_prefix_sum(prefix, x, y):
    """Look up the prefix table with an exclusive upper corner ``(x, y)``.

    Returns ``prefix[x - 1][y - 1]``, or 0 when either ``x`` or ``y`` is 0
    (there is no row/column before index 0 to read from).
    """
    on_zero_edge = (x == 0) or (y == 0)
    return 0 if on_zero_edge else prefix[x - 1][y - 1]

In [106]:
def calculate_baseline(prefix, qx, qy, rx, ry):
    """Blend the prefix sums at the four grid corners around a point.

    The corners are ``(qx, qy)``, ``(qx + 1, qy)``, ``(qx, qy + 1)`` and
    ``(qx + 1, qy + 1)``, each read through ``calculate_prefix_sum``. They are
    weighted by ``(1 - rx)(1 - ry)``, ``rx (1 - ry)``, ``ry (1 - rx)`` and
    ``rx ry`` respectively, and the weighted sum is returned.
    """
    keep_x = 1 - rx
    keep_y = 1 - ry

    corner_00 = calculate_prefix_sum(prefix, qx, qy)
    corner_10 = calculate_prefix_sum(prefix, qx + 1, qy)
    corner_01 = calculate_prefix_sum(prefix, qx, qy + 1)
    corner_11 = calculate_prefix_sum(prefix, qx + 1, qy + 1)

    return (
        keep_x * keep_y * corner_00
        + rx * keep_y * corner_10
        + ry * keep_x * corner_01
        + rx * ry * corner_11
    )

In [107]:
def point_query(x, y, prefix, bins):
    """Evaluate the interpolated prefix table at the point ``(x, y)``.

    Each coordinate is split by ``query_quantization`` into a cell index and
    an in-cell offset; the two pairs are then passed to ``calculate_baseline``
    together with ``prefix``, and its result is returned.
    """
    (cell_x, frac_x), (cell_y, frac_y) = (
        query_quantization(x, bins),
        query_quantization(y, bins),
    )
    return calculate_baseline(prefix, cell_x, cell_y, frac_x, frac_y)

In [108]:
# Build the 2-D histogram over the first two data columns and its prefix-sum table.
hist = calculate_hist(data.iloc[:, 0], data.iloc[:, 1], nbins)
prefix = calculate_prefix(hist)

MAX_QUERIES = 1000  # evaluate at most this many queries from the workload
MIN_CARD = 10       # floor applied to both estimate and ground truth before the ratio

# Loop-invariant scaling terms: the query bounds are mapped with the same
# min / range that normalized the data (dimension 1 -> column 0, dimension 2 -> column 1).
x_min, y_min = data_min.iloc[0], data_min.iloc[1]
x_span = data_max.iloc[0] - x_min
y_span = data_max.iloc[1] - y_min

Qerr = []  # natural log of each query's q-error

# Capping at len(query) keeps a workload with fewer than MAX_QUERIES rows from
# raising IndexError on an out-of-bounds positional lookup.
for i in range(min(MAX_QUERIES, len(query))):
    row = query.iloc[i]

    lb1 = float((row["lb1"] - x_min) / x_span)
    lb2 = float((row["lb2"] - y_min) / y_span)
    ub1 = float((row["ub1"] - x_min) / x_span)
    ub2 = float((row["ub2"] - y_min) / y_span)
    GT = float(row["GT"])

    # Inclusion-exclusion over the four corners of the query rectangle.
    est = (
        point_query(ub1, ub2, prefix, nbins)
        - point_query(lb1, ub2, prefix, nbins)
        - point_query(ub1, lb2, prefix, nbins)
        + point_query(lb1, lb2, prefix, nbins)
    )

    est = max(int(np.trunc(est)), MIN_CARD)
    GT = max(GT, MIN_CARD)
    # q-error: how many times the estimate is off, in whichever direction.
    qerr = max(GT / est, est / GT)
    Qerr.append(np.log(qerr))

# Means / percentiles are taken in log space and mapped back, so "GMQ" is the
# geometric mean of the q-errors.
print(f"GMQ: {np.exp(np.mean(Qerr))}, 95th: {np.exp(np.percentile(Qerr, 95))}")

GMQ: 2.1484733003502083, 95th: 289.5403174650215
