In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the white-wine quality table (semicolon-delimited CSV).
# Source: https://archive.ics.uci.edu/dataset/186/wine+quality
data_raw = pd.read_csv("../data/winequality-white.csv", delimiter=";")
data_raw.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
# Min-max normalization: each column is shifted by its minimum and divided
# by its range (max - min).
# NOTE(review): a column whose max equals its min would yield 0/0 here.
data_norm = data_raw.pipe(lambda df: (df - df.min()) / (df.max() - df.min()))
data_norm.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0.307692,0.186275,0.216867,0.308282,0.106825,0.149826,0.37355,0.267785,0.254545,0.267442,0.129032,0.5
1,0.240385,0.215686,0.204819,0.015337,0.118694,0.041812,0.285383,0.132832,0.527273,0.313953,0.241935,0.5
2,0.413462,0.196078,0.240964,0.096626,0.121662,0.097561,0.204176,0.154039,0.490909,0.255814,0.33871,0.5
3,0.326923,0.147059,0.192771,0.121166,0.145401,0.156794,0.410673,0.163678,0.427273,0.209302,0.306452,0.5
4,0.326923,0.147059,0.192771,0.121166,0.145401,0.156794,0.410673,0.163678,0.427273,0.209302,0.306452,0.5


In [4]:
# Dataset dimensions; DataFrame.shape is (rows, columns)
n_rows, n_cols = data_norm.shape

print(n_cols, n_rows)

12 4898


In [5]:
# Simple function to generate a random rectangular query
def generate_query():
    """Draw a random axis-aligned rectangle from four uniform samples.

    The samples are taken with ``np.random.rand`` in the order
    x1, y1, x2, y2. Each coordinate pair is then ordered so that the
    first corner is never greater than the second.

    Returns:
        tuple: ``(x1, y1, x2, y2)`` with ``x1 <= x2`` and ``y1 <= y2``.
    """
    # The generator is consumed left to right, so the RNG draw order is
    # x1, y1, x2, y2.
    xa, ya, xb, yb = (np.random.rand() for _ in range(4))
    return min(xa, xb), min(ya, yb), max(xa, xb), max(ya, yb)

In [6]:
def calculate_hist(x, y, tot, bins):
    """Build a normalized 2-D histogram of the points ``(x[i], y[i])``.

    Each coordinate is mapped to the cell index ``trunc(bins * value)``;
    an index equal to ``bins`` (i.e. a value of exactly 1.0) is folded
    into the last cell. The counts are then divided by ``tot``.

    Args:
        x: 1-D array-like of first coordinates, expected to lie in [0, 1].
        y: 1-D array-like of second coordinates, same shape as ``x``.
        tot: Divisor applied to the raw counts (the caller passes the
            number of points).
        bins: Number of cells along each axis.

    Returns:
        numpy.ndarray: ``(bins, bins)`` float64 array where entry
        ``[X][Y]`` is the count of points in that cell divided by ``tot``.

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same shape.
    """
    xs = np.asarray(x)
    ys = np.asarray(y)
    if xs.shape != ys.shape:
        raise ValueError("x and y must have the same shape")

    # Vectorized cell lookup instead of a Python loop over every row.
    ix = np.trunc(bins * xs).astype(np.int64)
    iy = np.trunc(bins * ys).astype(np.int64)
    # Only the upper boundary value is folded into the last cell.
    ix[ix == bins] = bins - 1
    iy[iy == bins] = bins - 1

    hist = np.zeros((bins, bins), dtype=np.float64)
    # np.add.at accumulates correctly when the same cell is hit repeatedly
    # (plain fancy-index "+=" would count a repeated cell only once).
    np.add.at(hist, (ix, iy), 1)
    return hist / tot

In [7]:
def query_quantization(x, bins):
    """Split a scalar query coordinate into a cell index and a remainder.

    Args:
        x: Scalar query coordinate.
        bins: Number of cells along the axis.

    Returns:
        tuple: ``(nX, rX)`` where ``nX`` is ``trunc(x * bins)`` as int64
        (reduced by one when it equals ``bins``) and ``rX`` is the
        float64 difference ``x * bins - nX``.
    """
    scaled = x * bins
    cell = np.trunc(scaled).astype(np.int64)
    # An index equal to `bins` is moved back into the last cell; the
    # remainder then becomes 1.0 instead of 0.0.
    cell = cell - 1 if cell == bins else cell
    remainder = (scaled - cell).astype(np.float64)
    return cell, remainder

In [8]:
def label_count(x, y, qx, qy, tot):
    """Return the fraction of points with ``x <= qx`` and ``y <= qy``.

    Args:
        x: 1-D array-like of first coordinates.
        y: 1-D array-like of second coordinates, same shape as ``x``.
        qx: Inclusive upper bound on the first coordinate.
        qy: Inclusive upper bound on the second coordinate.
        tot: Divisor applied to the count (the caller passes the number
            of points).

    Returns:
        The number of matching points divided by ``tot``.

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same shape.
    """
    xs = np.asarray(x)
    ys = np.asarray(y)
    if xs.shape != ys.shape:
        raise ValueError("x and y must have the same shape")

    # Vectorized comparison instead of a Python loop over every row.
    inside = (xs <= qx) & (ys <= qy)
    # Keep the count as np.int64 so the division behaves as numpy
    # division (same result type as before).
    return np.int64(np.count_nonzero(inside)) / tot

In [9]:
def calculate_prefix(hist):
    """Compute the 2-D inclusive prefix-sum table of ``hist``.

    Entry ``[i][j]`` of the result is the sum of ``hist[a][b]`` over all
    ``a <= i`` and ``b <= j``.

    The previous implementation iterated the column index over
    ``shape[0]``, which was only correct for square inputs; cumulative
    sums along both axes handle any 2-D shape.

    Args:
        hist: 2-D array-like of cell values.

    Returns:
        numpy.ndarray: float64 array with the same shape as ``hist``.
    """
    cells = np.asarray(hist)
    # Accumulate down the rows first, then across the columns.
    row_sums = np.cumsum(cells, axis=0, dtype=np.float64)
    return np.cumsum(row_sums, axis=1)

In [10]:
def calculate_prefix_sum(prefix, x, y):
    """Look up the prefix sum for a corner given in 1-based cell counts.

    Args:
        prefix: Prefix-sum table indexed as ``prefix[row][col]``.
        x: Number of leading rows to include.
        y: Number of leading columns to include.

    Returns:
        ``prefix[x - 1][y - 1]``, or ``0`` when either ``x`` or ``y`` is
        zero (nothing is included).
    """
    if x != 0 and y != 0:
        return prefix[x - 1][y - 1]
    return 0

In [11]:
def calculate_baseline(prefix, qx, qy, rx, ry):
    """Estimate the query answer by weighting four prefix-sum corners.

    The prefix sums at the corners ``(qx, qy)``, ``(qx + 1, qy)``,
    ``(qx, qy + 1)`` and ``(qx + 1, qy + 1)`` are combined with weights
    formed from the remainders ``rx`` and ``ry``.

    Args:
        prefix: Prefix-sum table passed through to ``calculate_prefix_sum``.
        qx: Cell index of the query along the first axis.
        qy: Cell index of the query along the second axis.
        rx: Remainder of the query along the first axis.
        ry: Remainder of the query along the second axis.

    Returns:
        The weighted sum of the four corner prefix sums.
    """
    # Weights, in the same order as the corners they multiply.
    w_near = (1 - rx) * (1 - ry)
    w_next_x = rx * (1 - ry)
    w_next_y = ry * (1 - rx)
    w_far = rx * ry

    s_near = calculate_prefix_sum(prefix, qx, qy)
    s_next_x = calculate_prefix_sum(prefix, qx + 1, qy)
    s_next_y = calculate_prefix_sum(prefix, qx, qy + 1)
    s_far = calculate_prefix_sum(prefix, qx + 1, qy + 1)

    return w_near * s_near + w_next_x * s_next_x + w_next_y * s_next_y + w_far * s_far

In [12]:
def build_dataset(data, n_bins, n_quries):
    """Generate query rows for every unordered pair of columns in ``data``.

    For each column pair ``(i, j)`` with ``i < j`` a normalized
    ``n_bins x n_bins`` histogram and its prefix-sum table are built.
    Then ``n_quries`` random query points are drawn with
    ``np.random.rand`` and, for each, one output row is written holding
    the flattened histogram followed by
    ``qX, qY, rX, rY, label, baseline``.

    The output array is allocated once up front; the previous version
    grew it with ``np.append`` on every row, which copies the whole array
    each time.

    Args:
        data: pandas DataFrame; columns are accessed positionally.
        n_bins: Number of histogram cells per axis.
        n_quries: Number of random queries per column pair.

    Returns:
        numpy.ndarray: float64 array of shape
        ``(n_pairs * n_quries, n_bins * n_bins + 6)``.
    """
    n_cols = data.shape[1]
    n_pairs = n_cols * (n_cols - 1) // 2
    n_cells = n_bins * n_bins
    data_out = np.empty((n_pairs * max(n_quries, 0), n_cells + 6))

    row = 0
    for i in range(n_cols - 1):
        x = data.iloc[:, i].to_numpy()
        tot = x.shape[0]
        for j in range(i + 1, n_cols):
            y = data.iloc[:, j].to_numpy()
            hist = calculate_hist(x, y, bins=n_bins, tot=tot)
            prefix = calculate_prefix(hist)
            # The flattened histogram is the same for every query on this pair.
            hist_flat = hist.reshape(-1)
            for _ in range(n_quries):
                # Draw order (qx, then qy) is kept so a seeded run
                # reproduces the same query sequence as before.
                qx = np.random.rand()
                qy = np.random.rand()
                qX, rX = query_quantization(qx, bins=n_bins)
                qY, rY = query_quantization(qy, bins=n_bins)
                label = label_count(x, y, qx, qy, tot=tot)
                baseline = calculate_baseline(prefix, qX, qY, rX, rY)
                data_out[row, :n_cells] = hist_flat
                data_out[row, n_cells:] = (qX, qY, rX, rY, label, baseline)
                row += 1
    return data_out

In [13]:
# Generation settings: histogram cells per axis and queries per column pair.
n_bins = 8
n_quries = 1000

# NOTE(review): no RNG seed is set in the visible cells, so the generated
# queries will presumably differ between runs.
data_out = build_dataset(data_norm, n_bins, n_quries)

# Header: one name per flattened histogram cell, then the six query fields.
column_names = np.append(np.arange(n_bins * n_bins), ["qX", "qY", "rX", "rY", "label", "baseline"])
data_df = pd.DataFrame(data_out, columns=column_names)
data_df.to_csv("../data/winequality-white.processed.csv")

In [17]:
# Same pipeline on synthetic input: 1000 rows x 5 columns of uniform random values.
data_rand = pd.DataFrame(data=np.random.rand(1000, 5))

# Reuses n_bins, n_quries and column_names defined for the wine dataset.
data_rand_out = build_dataset(data_rand, n_bins, n_quries)

data_rand_df = pd.DataFrame(data_rand_out, columns=column_names)
data_rand_df.to_csv("../data/winequality-white.randTest.csv")