In [2]:
import pandas as pd
import numpy as np

In [3]:
# Synthetic data
# Merge the stored train/test splits back into a single table and export it
# as a headerless CSV with the target column(s) placed after the features.
x_train = np.load("data/synthetic/X_train.npy").T
x_test = np.load("data/synthetic/X_test.npy").T
y_train = np.load("data/synthetic/y_train.npy")
y_test = np.load("data/synthetic/y_test.npy")

# The feature arrays are transposed on load so that both splits can be
# stacked along axis 0 (presumably one sample per row — confirm against the
# files); the targets are stacked the same way without transposing.
features = np.concatenate((x_train, x_test), axis=0)
vectors = np.concatenate((y_train, y_test), axis=0)

# Side-by-side join: features first, targets last.
combined = np.concatenate((features, vectors), axis=1)
df = pd.DataFrame(combined, index=None, columns=None)

print(df.head(5))
df.to_csv("data/synthetic.csv", header=False, index=False)

          0         1    2
0  0.562178  0.342779  1.0
1  0.965500 -0.180960  1.0
2  0.812396  0.088968  1.0
3  0.719061  0.148323  1.0
4  0.409213 -0.138452  1.0


In [4]:
# Breast cancer dataset
# Reads the raw headerless file, encodes every non-numeric feature column as
# integer category codes, min-max scales each feature column to roughly
# [0, 1], maps the last column from "no"/"yes" to -1/+1 and writes the result
# to data/bc.csv without header or index.
# NOTE(review): the last column is used as the target here; in the UCI
# breast-cancer file the class label appears to be the first column — confirm
# that the last column is the intended label.
df = pd.read_csv("data/breast_cancer/breast-cancer.data", header=None, index_col=None)
label_col = df.shape[1] - 1

for idx in range(label_col):
    # Decide by column dtype instead of probing only the first row, so a text
    # column whose first entry happens to look numeric is still encoded.
    if not pd.api.types.is_numeric_dtype(df[idx]):
        df[idx] = pd.factorize(df[idx])[0]

    values = df[idx].to_numpy(dtype=float)
    low, high = values.min(), values.max()
    # The small epsilon keeps a constant column from dividing by zero.
    # Assigning the array directly avoids any index alignment against `df`.
    df[idx] = (values - low) / (high - low + 1e-9)

# `map` is used instead of `replace`: the recorded output shows a warning on
# the `replace` line (likely pandas' FutureWarning about silent downcasting
# of object columns). Unmapped values become NaN, so check for them
# explicitly rather than writing a corrupt label column.
labels = df[label_col].map({"no": -1, "yes": 1})
if labels.isna().any():
    raise ValueError("unexpected label value in breast cancer data")
df[label_col] = labels.astype(int)

# Row-norm scaling (applied in the iris and parkinsons cells) is not applied
# to this dataset.

print(df.head(5))
df.to_csv("data/bc.csv", header=False, index=False)

     0    1    2    3    4    5    6    7    8  9
0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0 -1
1  0.0  0.2  0.0  0.1  0.0  0.0  0.5  1.0  0.2 -1
2  0.0  0.2  0.0  0.1  0.0  0.0  0.5  0.0  0.0 -1
3  0.0  0.4  0.5  0.2  0.0  0.0  0.5  1.0  0.4 -1
4  0.0  0.2  0.0  0.3  0.0  0.0  0.5  1.0  0.6 -1


  df[df.shape[1] - 1] = df[df.shape[1] - 1].replace({"no": -1, "yes": 1})


In [5]:
# Iris dataset
# Reads the raw headerless file, drops the "Iris-versicolor" rows to leave a
# two-class problem, min-max scales each feature column, maps the remaining
# class names to +1/-1, rescales all feature rows by the largest row norm and
# writes the result to data/iris.csv without header or index.
df = pd.read_csv("data/iris/iris.data", header=None, index_col=None)
label_col = df.shape[1] - 1

# reset_index gives the filtered frame a fresh 0..n-1 index while keeping the
# column dtypes; rebuilding the frame from `to_numpy()` would turn every
# column into object dtype.
df = df[df[label_col] != "Iris-versicolor"].reset_index(drop=True)

for idx in range(label_col):
    # Decide by column dtype instead of probing only the first row, so a text
    # column whose first entry happens to look numeric is still encoded.
    if not pd.api.types.is_numeric_dtype(df[idx]):
        df[idx] = pd.factorize(df[idx])[0]

    values = df[idx].to_numpy(dtype=float)
    low, high = values.min(), values.max()
    # The small epsilon keeps a constant column from dividing by zero.
    # Assigning the array directly avoids any index alignment against `df`.
    df[idx] = (values - low) / (high - low + 1e-9)

# `map` is used instead of `replace`: the recorded output shows a warning on
# the `replace` line (likely pandas' FutureWarning about silent downcasting
# of object columns). Unmapped values become NaN, so check for them
# explicitly before the numeric conversion below.
labels = df[label_col].map({"Iris-setosa": 1, "Iris-virginica": -1})
if labels.isna().any():
    raise ValueError("unexpected label value in iris data")
df[label_col] = labels.astype(int)

# Work on a separately named float32 array so `df` always refers to a
# DataFrame. Every feature row is divided by the largest Euclidean row norm,
# so no feature row has norm above 1; the label column is left untouched.
scaled = df.to_numpy().astype(np.float32)
scaled[:, :-1] = scaled[:, :-1] / np.max(np.linalg.norm(scaled[:, :-1], axis=1))
df = pd.DataFrame(scaled, index=None, columns=None)

print(df.head(5))
df.to_csv("data/iris.csv", header=False, index=False)

          0         1         2        3    4
0  0.125811  0.334543  0.038383  0.02359  1.0
1  0.094358  0.205873  0.038383  0.02359  1.0
2  0.062906  0.257341  0.028787  0.02359  1.0
3  0.047179  0.231607  0.047979  0.02359  1.0
4  0.110085  0.360277  0.038383  0.02359  1.0


  df[df.shape[1] - 1] = df[df.shape[1] - 1].replace(


In [6]:
# Parkinsons dataset
# Reads the file (first row is the header), moves the "status" column to the
# end so it acts as the label, renames columns to 0..n-1, min-max scales each
# feature column, maps label 0 to -1, rescales all feature rows by the
# largest row norm and writes the result to data/park.csv without header or
# index.
df = pd.read_csv("data/parkinsons/parkinsons.data", header=0, index_col=None)

# `list.remove` raises ValueError if the file has no "status" column.
ordered_cols = list(df.columns)
ordered_cols.remove("status")
ordered_cols.append("status")
df = df.reindex(columns=ordered_cols)
df.columns = list(range(df.shape[1]))
label_col = df.shape[1] - 1

# NOTE(review): any non-numeric column (presumably a recording identifier) is
# category-encoded below and kept as a feature — confirm this is intended.
for idx in range(label_col):
    # Decide by column dtype instead of probing only the first row, so a text
    # column whose first entry happens to look numeric is still encoded.
    if not pd.api.types.is_numeric_dtype(df[idx]):
        df[idx] = pd.factorize(df[idx])[0]

    values = df[idx].to_numpy(dtype=float)
    low, high = values.min(), values.max()
    # The small epsilon keeps a constant column from dividing by zero.
    # Assigning the array directly avoids any index alignment against `df`.
    df[idx] = (values - low) / (high - low + 1e-9)

# Only 0 is remapped; every other label value is kept as it is.
df[label_col] = df[label_col].replace({0: -1})

# Work on a separately named array so `df` always refers to a DataFrame.
# Every feature row is divided by the largest Euclidean row norm, so no
# feature row has norm above 1; the label column is left untouched.
scaled = df.to_numpy()
scaled[:, :-1] = scaled[:, :-1] / np.max(np.linalg.norm(scaled[:, :-1], axis=1))
df = pd.DataFrame(scaled, index=None, columns=None)

print(df.head(5))
df.to_csv("data/park.csv", header=False, index=False)

         0         1         2         3         4         5         6   \
0  0.000000  0.045541  0.027820  0.013544  0.048351  0.061528  0.035945   
1  0.001274  0.049005  0.023456  0.068771  0.062793  0.071295  0.047252   
2  0.002547  0.040780  0.014610  0.065550  0.069229  0.081061  0.056655   
3  0.003821  0.040771  0.018020  0.065281  0.065069  0.081061  0.051656   
4  0.005095  0.039819  0.019992  0.064270  0.087596  0.100594  0.069866   

         7         8         9   ...        14        15        16        17  \
0  0.061177  0.035899  0.077145  ...  0.082179  0.016878  0.126448  0.091215   
1  0.079980  0.047205  0.116846  ...  0.127511  0.014660  0.106886  0.116338   
2  0.091236  0.056685  0.096522  ...  0.109540  0.009784  0.122611  0.099927   
3  0.080245  0.051608  0.102364  ...  0.117486  0.010130  0.122541  0.102853   
4  0.108053  0.069895  0.123410  ...  0.144435  0.013386  0.112550  0.092698   

         18        19        20        21        22   23  
0  0.2372