# In [9]:
import numpy as np
import pandas as pd
import pickle


def preprocess(mode, dir_name="dataset/", tau=0.3):
    """Load one benchmark table and return standardized features and target.

    Processing steps, in order:

    1. Read the file selected by ``mode`` from ``dir_name`` and split it into
       a feature matrix ``X`` and a target vector ``y``.
    2. Standardize every column of ``X`` to zero mean and unit (population)
       standard deviation.
    3. Add Gaussian noise (mean 0, standard deviation 0.1) to every column
       whose number of distinct values is below ``tau * n_rows``. The
       generator is seeded with 0, so the output is reproducible.
    4. Fit an ordinary least-squares model of ``y`` on ``X`` (no intercept
       column), then center ``y`` and divide it by the standard deviation of
       the fit residuals, computed with ``ddof`` equal to the number of
       features.

    Args:
        mode: Dataset name. One of ``"heating_load"``, ``"cooling_load"``,
            ``"gas_turbine"``, ``"red_wine"``, ``"white_wine"``,
            ``"abalone"``, ``"concrete"`` or ``"housing"``.
        dir_name: Directory containing the raw files. A trailing path
            separator is optional.
        tau: Distinct-value ratio below which a column receives noise.

    Returns:
        Tuple ``(X, y)`` of float arrays: the processed feature matrix and
        the rescaled target vector.

    Raises:
        ValueError: If ``mode`` is not one of the supported dataset names.
    """
    import os  # function-scope import keeps the file's import block untouched

    def locate(file_name):
        # os.path.join accepts dir_name with or without a trailing separator
        # and yields the same path as plain concatenation for "dataset/".
        return os.path.join(dir_name, file_name)

    if mode in ("heating_load", "cooling_load"):
        table = pd.read_excel(locate("ENB2012_data.xlsx")).values
        X = table[:, :8]
        # Both modes share the first eight columns as features;
        # heating_load reads column 8 as target, cooling_load column 9.
        y = table[:, 8 if mode == "heating_load" else 9]
    elif mode == "gas_turbine":
        data = pd.read_csv(locate("gt_2015.csv"))
        X = data.drop(columns=["TEY"]).values
        y = data["TEY"].values
    elif mode in ("red_wine", "white_wine"):
        file_name = (
            "winequality-red.csv" if mode == "red_wine"
            else "winequality-white.csv"
        )
        table = pd.read_csv(locate(file_name), delimiter=";").values
        X = table[:, :-1]
        y = table[:, -1]
    elif mode == "abalone":
        data = pd.read_csv(locate("abalone.csv"), header=None)
        # Column 0 is excluded from the features (presumably a non-numeric
        # label -- confirm against the raw file); the last column is the target.
        X = data.drop(columns=0).values[:, :-1]
        y = data.values[:, -1]
    elif mode in ("concrete", "housing"):
        file_name = (
            "Concrete_Data.xls" if mode == "concrete"
            else "Real estate valuation data set.xlsx"
        )
        table = pd.read_excel(locate(file_name)).values
        X = table[:, :-1]
        y = table[:, -1]
    else:
        raise ValueError(f"unknown dataset mode: {mode!r}")

    # Tables that contain a non-numeric column come back as object arrays.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
    n_rows, n_features = X.shape

    # Columns are visited in order so the seeded noise stream is consumed
    # deterministically.
    rng = np.random.default_rng(0)
    for j in range(n_features):
        if np.unique(X[:, j]).size < tau * n_rows:
            X[:, j] += rng.normal(0, 0.1, n_rows)

    # Solve the least-squares problem directly instead of evaluating
    # X @ inv(X.T @ X) @ X.T @ y, which builds an n_rows x n_rows matrix
    # and relies on an explicit inverse.
    coef, *_ = np.linalg.lstsq(X, y, rcond=None)
    residuals = y - X @ coef

    y = (y - np.mean(y)) / np.std(residuals, ddof=n_features)
    return X, y

# Every benchmark name understood by preprocess(), in the order they are stored.
keys = [
    "heating_load",
    "cooling_load",
    "gas_turbine",
    "red_wine",
    "white_wine",
    "abalone",
    "concrete",
    "housing",
]

# Map each benchmark name to the (X, y) pair returned by preprocess().
dataset = {name: preprocess(name) for name in keys}

# Persist the whole collection as a single pickle file in the working directory.
with open("table_dataset.pkl", "wb") as f:
    pickle.dump(dataset, f)
