In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the white-wine CSV of the UCI Wine Quality dataset
# (source: https://archive.ics.uci.edu/dataset/186/wine+quality).
# The file is semicolon-delimited, so the separator is passed explicitly.
data_raw = pd.read_csv(
    "../data/winequality-white.csv",
    sep=";",
)
# Preview the first five rows as the cell output (head() defaults to 5).
data_raw.head()

In [None]:
# Min-max scaling, column by column: (value - column min) / (column max - column min),
# which maps each column's minimum to 0 and its maximum to 1.
# NOTE(review): a column whose min equals its max would produce 0/0 = NaN here.
data_norm = (
    data_raw
    .sub(data_raw.min())
    .div(data_raw.max() - data_raw.min())
)
data_norm.head()

In [None]:
# Dimensions of the normalized table: shape is (number of rows, number of columns).
n_rows, n_cols = data_norm.shape

# Printed as "columns rows" (columns first).
print(n_cols, n_rows)

In [None]:
# Simple function to generate a random query
def generate_query():
    """Draw a random axis-aligned rectangle from four uniform samples.

    Four values are drawn with ``np.random.rand()`` in the order
    x1, y1, x2, y2; the two x values and the two y values are then each
    put in ascending order.

    Returns:
        tuple: ``(x1, y1, x2, y2)`` with ``x1 <= x2`` and ``y1 <= y2``.
    """
    # The generator is consumed left to right, so the draw order is xa, ya, xb, yb.
    xa, ya, xb, yb = (np.random.rand() for _ in range(4))
    x_lo, x_hi = (xa, xb) if xa <= xb else (xb, xa)
    y_lo, y_hi = (ya, yb) if ya <= yb else (yb, ya)
    return x_lo, y_lo, x_hi, y_hi

In [None]:
def calculate_hist(x, y, bins):
    """Build a ``bins`` x ``bins`` count histogram of the points ``(x[i], y[i])``.

    Each coordinate ``v`` is assigned to cell ``trunc(bins * v)``; a result
    equal to ``bins`` (i.e. ``v == 1.0``) is folded into the last cell so the
    upper edge of the unit interval is included.

    Args:
        x: 1-D array-like of first coordinates; values are expected in [0, 1].
        y: 1-D array-like of second coordinates, same shape as ``x``.
        bins: number of cells along each axis.

    Returns:
        np.ndarray: ``(bins, bins)`` array of dtype ``int64`` where entry
        ``[a, b]`` counts the points whose x falls in cell ``a`` and whose y
        falls in cell ``b``.

    Raises:
        ValueError: if ``x`` and ``y`` do not have the same shape.
        IndexError: if a coordinate maps to a cell index beyond ``bins - 1``
            (raised by the indexed accumulation).
    """
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)
    if x.shape != y.shape:
        raise ValueError(
            f"x and y must have the same shape, got {x.shape} and {y.shape}"
        )

    # Vectorized binning: one truncation per coordinate array instead of a
    # Python-level loop over every point.
    ix = np.trunc(bins * x).astype(np.int64)
    iy = np.trunc(bins * y).astype(np.int64)
    # A coordinate of exactly 1.0 lands on index == bins; move it to the last cell.
    ix[ix == bins] -= 1
    iy[iy == bins] -= 1

    hist = np.zeros((bins, bins), dtype=np.int64)
    # np.add.at accumulates repeated (ix, iy) pairs correctly, whereas
    # ``hist[ix, iy] += 1`` would count each duplicated cell only once.
    np.add.at(hist, (ix, iy), 1)
    return hist

In [None]:
def query_quantization(x, bins):
    """Split a query coordinate into a cell index and an in-cell remainder.

    The coordinate is scaled by ``bins``; the truncated integer part is the
    cell index and what is left over is the remainder. When the integer part
    equals ``bins`` it is lowered to ``bins - 1`` (so the remainder is then
    measured from the start of the last cell).

    Args:
        x: scalar query coordinate.
        bins: number of cells along the axis.

    Returns:
        tuple: ``(cell, remainder)`` where ``cell`` is an ``np.int64`` and
        ``remainder`` is an ``np.float64`` equal to ``x * bins - cell``.
    """
    scaled = x * bins
    cell = np.trunc(scaled).astype(np.int64)
    # Fold an index of exactly ``bins`` back into the last cell.
    cell = cell - 1 if cell == bins else cell
    remainder = (scaled - cell).astype(np.float64)
    return cell, remainder

In [None]:
def label_count(x, y, qx, qy):
    """Count the points ``(x[i], y[i])`` with ``x[i] <= qx`` and ``y[i] <= qy``.

    Both comparisons are inclusive, so a point lying exactly on the query
    bounds is counted.

    Args:
        x: 1-D array-like of first coordinates.
        y: 1-D array-like of second coordinates, same shape as ``x``.
        qx: scalar upper bound for the first coordinate.
        qy: scalar upper bound for the second coordinate.

    Returns:
        np.int64: number of points satisfying both bounds.

    Raises:
        ValueError: if ``x`` and ``y`` do not have the same shape.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if x.shape != y.shape:
        raise ValueError(
            f"x and y must have the same shape, got {x.shape} and {y.shape}"
        )
    # One vectorized mask replaces the per-point Python loop.
    inside = (x <= qx) & (y <= qy)
    # Keep the return type of the original accumulator (np.int64).
    return np.int64(np.count_nonzero(inside))

In [None]:
n_bins = 8
n_quries = 100

# Every output row is the flattened n_bins x n_bins histogram of one column
# pair followed by five query values: qX, qY, rX, rY, label.
row_width = n_bins * n_bins + 5

# Rows are gathered in a list and stacked once after the loops. Growing the
# result with np.append(..., axis=0) per row copies the whole array each time,
# which makes the build quadratic in the number of rows.
out_rows = []

# NOTE(review): queries are drawn from the global NumPy RNG and no seed is set
# in the visible cells, so the generated table differs from run to run.
for i in range(n_cols - 1):
    for j in range(i + 1, n_cols):
        # Unordered pair of columns (i < j) treated as 2-D points.
        sample = data_norm.iloc[:, [i, j]]
        x = sample.iloc[:, 0].to_numpy()
        y = sample.iloc[:, 1].to_numpy()
        # The histogram depends only on the column pair, so it is computed
        # once and reused for every query on that pair.
        hist = calculate_hist(x, y, bins=n_bins).reshape(-1)
        for k in range(n_quries):
            qx = np.random.rand()
            qy = np.random.rand()
            qX, rX = query_quantization(qx, bins=n_bins)
            qY, rY = query_quantization(qy, bins=n_bins)
            # Target: number of points with x <= qx and y <= qy.
            label = label_count(x, y, qx, qy)
            new_row = np.append(hist, [qX, qY, rX, rY, label])
            out_rows.append(new_row)

# Fall back to an empty (0, row_width) array when no row was produced
# (fewer than two columns or zero queries), matching the original result.
data_out = np.vstack(out_rows) if out_rows else np.empty((0, row_width))


In [None]:
# Column labels: the flat histogram cell indices 0 .. n_bins*n_bins - 1,
# followed by the names of the five query values appended to every row.
column_names = np.concatenate(
    (np.arange(n_bins * n_bins), ["qX", "qY", "rX", "rY", "label"])
)
data_df = pd.DataFrame(data_out, columns=column_names)
# to_csv is called with its defaults, so the row index is written as well.
data_df.to_csv("../data/winequality-white.processed.csv")