In [1]:
import ucimlrepo as uci
import numpy as np
import tqdm
import pickle

# Glass Dataset

In [2]:
# Fetch the Glass dataset (UCI repository id 42)
data = uci.fetch_ucirepo(id=42)

X = data.data.features.values

# Map the sorted raw target values onto contiguous class ids 0..n_classes-1.
class_maps = {item: i for i, item in enumerate(sorted(data.data.targets["Type_of_glass"].unique()))}
# Build a fresh label array instead of overwriting `targets.values` in place:
# that array can be a view of the DataFrame's own buffer (so the write would
# silently alter `data`), and it is read-only when pandas Copy-on-Write is on.
y = np.array([class_maps[item] for item in data.data.targets.values[:, 0]])
n_classes = len(class_maps)

In [3]:
# Stratified split: every class is shuffled and cut into train/val/test on its own.
proportion = [0.7, 0.1, 0.2]
train_X, train_y = [], []
val_X, val_y = [], []
test_X, test_y = [], []

for c in range(n_classes):
    class_mask = y == c
    class_X, class_y = X[class_mask], y[class_mask]
    n = len(class_X)
    # NOTE: 0.7 + 0.1 evaluates to 0.7999999999999999 in floating point, so
    # val_stop can land one below int(n * 0.8) when n * 0.8 is a whole number.
    train_stop = int(n * proportion[0])
    val_stop = int(n * (proportion[0] + proportion[1]))
    train_index, val_index, test_index = np.split(
        np.random.permutation(n), [train_stop, val_stop]
    )
    train_X.append(class_X[train_index])
    train_y.append(class_y[train_index])
    val_X.append(class_X[val_index])
    val_y.append(class_y[val_index])
    test_X.append(class_X[test_index])
    test_y.append(class_y[test_index])

# Stack the per-class pieces back into one array per split.
train_X, train_y = np.concatenate(train_X, axis=0), np.concatenate(train_y, axis=0)
val_X, val_y = np.concatenate(val_X, axis=0), np.concatenate(val_y, axis=0)
test_X, test_y = np.concatenate(test_X, axis=0), np.concatenate(test_y, axis=0)

# Min-max scale all three splits using statistics of the training split only.
train_max = train_X.max(axis=0, keepdims=True)
train_min = train_X.min(axis=0, keepdims=True)
feature_range = train_max - train_min + 1e-12  # epsilon avoids 0/0 on constant columns
train_X = (train_X - train_min) / feature_range
val_X = (val_X - train_min) / feature_range
test_X = (test_X - train_min) / feature_range


In [4]:
# Sanity check: split sizes, number of classes and per-sample feature shape.
print(train_X.shape[0], val_X.shape[0], test_X.shape[0], n_classes, test_X[-1].shape)

148 20 46 6 (9,)


In [5]:
def get_uniform_probs(num_classes):
    """Return a length-`num_classes` float array whose entries are all equal and sum to 1."""
    weights = np.full(num_classes, 1.0)
    return weights / weights.sum()

def one_hot(arr, num_classes):
    """Return a float vector of length `num_classes` with 1 at index/indices `arr`, 0 elsewhere."""
    encoded = np.zeros(num_classes)
    encoded[arr] = 1
    return encoded

def save_obj(obj, name):
    """Pickle `obj` to the file `name` + ".pkl" using the highest pickle protocol."""
    pickle_path = name + ".pkl"
    with open(pickle_path, "wb") as handle:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)

def _simulate_logged_split(split_X, split_y, num_classes, num_passes=1,
                           add_labeled_flag=True, verbose=False):
    """Draw one uniformly random action per point and return the shuffled logged rows.

    Each row is the concatenation of: the feature vector, the one-hot label
    (`num_classes` columns), the logging propensities (`num_classes` columns),
    the sampled action (1 column) and, if `add_labeled_flag`, a constant 1.0
    "labeled" column. `num_passes` repeats the whole split that many times,
    drawing a new action on every pass.
    """
    feats, targets, props, actions, labeled = [], [], [], [], []
    for _ in range(num_passes):
        if verbose:
            print(range(split_X.shape[0]))
        for point_num in range(split_X.shape[0]):
            probs = get_uniform_probs(num_classes=num_classes)
            # Re-normalise so the probabilities handed to multinomial sum to 1.
            u = probs.astype(np.float64)
            action = np.argmax(np.random.multinomial(1, u / np.sum(u)))

            feats.append(split_X[point_num])
            targets.append(one_hot(split_y[point_num], num_classes))
            props.append(probs)
            actions.append([action])
            labeled.append(np.array([1.0]))

    columns = [feats, targets, props, actions]
    if add_labeled_flag:
        columns.append(labeled)
    rows = np.concatenate(columns, axis=1)
    # Shuffle with a permutation of THIS split's own length so no row is dropped.
    return rows[np.random.permutation(len(rows))]


def create_bandit_dataset(data, store_folder, num_classes, num_sample=1):
    """Turn supervised train/val/test splits into logged bandit data and pickle them.

    For every point an action is sampled from a uniform logging policy; the
    resulting rows (see `_simulate_logged_split` for the column layout) are
    shuffled and written with `save_obj` to
    ``data/{store_folder}/bandit_data_sampled_{num_sample}_{train,val,test}.pkl``.

    Args:
        data: ``((train_X, train_y), (val_X, val_y), (test_X, test_y))``; each
            ``*_X`` is indexed by row and exposes ``.shape[0]``, each ``*_y``
            holds integer class ids usable as indices into a one-hot vector.
        store_folder: sub-folder of ``data/`` the pickles are written to.
        num_classes: number of actions / classes.
        num_sample: how many times the training split is replayed, with a new
            action drawn for every point on each pass.

    Train and val rows carry a trailing constant 1.0 "labeled" column; test
    rows do not.
    """
    import os  # local import: the notebook's import cell does not bring in os

    print("Pre-processing for num sample = " + str(num_sample))
    (train_X, train_y), (val_X, val_y), (test_X, test_y) = data

    train = _simulate_logged_split(
        train_X, train_y, num_classes, num_passes=num_sample, verbose=True
    )
    val = _simulate_logged_split(val_X, val_y, num_classes)
    # Fix: the test rows used to be indexed with a permutation of len(val),
    # which kept only the first len(val) test rows. The helper shuffles each
    # split with its own length, so the full test set is saved.
    test = _simulate_logged_split(test_X, test_y, num_classes, add_labeled_flag=False)

    filename = f"data/{store_folder}/bandit_data_"
    # Make sure the target folder exists so save_obj does not fail on a fresh checkout.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    filename += "sampled_" + str(num_sample)
    print("file name = ", filename)

    save_obj(train, filename + "_train")
    save_obj(test, filename + "_test")
    save_obj(val, filename + "_val")


In [6]:
# Simulate bandit feedback for the Glass splits and pickle it under data/glass/base.
glass_splits = ((train_X, train_y), (val_X, val_y), (test_X, test_y))
create_bandit_dataset(glass_splits, 'glass/base', n_classes)

Pre-processing for num sample = 1
range(0, 148)
file name =  data/glass/base/bandit_data_sampled_1


# Letter Dataset

In [7]:
# Fetch the Letter dataset (UCI repository id 59)
data = uci.fetch_ucirepo(id=59)

X = data.data.features.values

# Map the sorted raw target values onto contiguous class ids 0..n_classes-1.
class_maps = {item: i for i, item in enumerate(sorted(data.data.targets["lettr"].unique()))}
# Build a fresh label array instead of overwriting `targets.values` in place:
# that array can be a view of the DataFrame's own buffer (so the write would
# silently alter `data`), and it is read-only when pandas Copy-on-Write is on.
y = np.array([class_maps[item] for item in data.data.targets.values[:, 0]])
n_classes = len(class_maps)

In [8]:
# Stratified split: every class is shuffled and cut into train/val/test on its own.
proportion = [0.7, 0.1, 0.2]
train_X, train_y = [], []
val_X, val_y = [], []
test_X, test_y = [], []

for c in range(n_classes):
    class_mask = y == c
    class_X, class_y = X[class_mask], y[class_mask]
    n = len(class_X)
    # NOTE: 0.7 + 0.1 evaluates to 0.7999999999999999 in floating point, so
    # val_stop can land one below int(n * 0.8) when n * 0.8 is a whole number.
    train_stop = int(n * proportion[0])
    val_stop = int(n * (proportion[0] + proportion[1]))
    train_index, val_index, test_index = np.split(
        np.random.permutation(n), [train_stop, val_stop]
    )
    train_X.append(class_X[train_index])
    train_y.append(class_y[train_index])
    val_X.append(class_X[val_index])
    val_y.append(class_y[val_index])
    test_X.append(class_X[test_index])
    test_y.append(class_y[test_index])

# Stack the per-class pieces back into one array per split.
train_X, train_y = np.concatenate(train_X, axis=0), np.concatenate(train_y, axis=0)
val_X, val_y = np.concatenate(val_X, axis=0), np.concatenate(val_y, axis=0)
test_X, test_y = np.concatenate(test_X, axis=0), np.concatenate(test_y, axis=0)

# This dataset is scaled by the fixed constant 15 rather than by train-set
# min/max (presumably the features range over 0..15 — TODO confirm).
train_X = train_X / 15
val_X = val_X / 15
test_X = test_X / 15


In [9]:
# Sanity check: split sizes, number of classes and per-sample feature shape.
print(train_X.shape[0], val_X.shape[0], test_X.shape[0], n_classes, test_X[-1].shape)

13989 2000 4011 26 (16,)


In [10]:
# Simulate bandit feedback for the Letter splits and pickle it under data/letter/base.
letter_splits = ((train_X, train_y), (val_X, val_y), (test_X, test_y))
create_bandit_dataset(letter_splits, 'letter/base', n_classes)

Pre-processing for num sample = 1
range(0, 13989)
file name =  data/letter/base/bandit_data_sampled_1


# Optdigits Dataset

In [11]:
# Fetch the Optdigits dataset (UCI repository id 80)
data = uci.fetch_ucirepo(id=80)

X = data.data.features.values

# Map the sorted raw target values onto contiguous class ids 0..n_classes-1.
class_maps = {item: i for i, item in enumerate(sorted(data.data.targets["class"].unique()))}
# Build a fresh label array instead of overwriting `targets.values` in place:
# that array can be a view of the DataFrame's own buffer (so the write would
# silently alter `data`), and it is read-only when pandas Copy-on-Write is on.
y = np.array([class_maps[item] for item in data.data.targets.values[:, 0]])
n_classes = len(class_maps)

In [12]:
# Quick look at the number of classes and the raw feature-matrix shape.
(n_classes, X.shape)

(10, (5620, 64))

In [13]:
# Stratified split: every class is shuffled and cut into train/val/test on its own.
proportion = [0.7, 0.1, 0.2]
train_X, train_y = [], []
val_X, val_y = [], []
test_X, test_y = [], []

for c in range(n_classes):
    class_mask = y == c
    class_X, class_y = X[class_mask], y[class_mask]
    n = len(class_X)
    # NOTE: 0.7 + 0.1 evaluates to 0.7999999999999999 in floating point, so
    # val_stop can land one below int(n * 0.8) when n * 0.8 is a whole number.
    train_stop = int(n * proportion[0])
    val_stop = int(n * (proportion[0] + proportion[1]))
    train_index, val_index, test_index = np.split(
        np.random.permutation(n), [train_stop, val_stop]
    )
    train_X.append(class_X[train_index])
    train_y.append(class_y[train_index])
    val_X.append(class_X[val_index])
    val_y.append(class_y[val_index])
    test_X.append(class_X[test_index])
    test_y.append(class_y[test_index])

# Stack the per-class pieces back into one array per split.
train_X, train_y = np.concatenate(train_X, axis=0), np.concatenate(train_y, axis=0)
val_X, val_y = np.concatenate(val_X, axis=0), np.concatenate(val_y, axis=0)
test_X, test_y = np.concatenate(test_X, axis=0), np.concatenate(test_y, axis=0)

# Min-max scale all three splits using statistics of the training split only.
train_max = train_X.max(axis=0, keepdims=True)
train_min = train_X.min(axis=0, keepdims=True)
feature_range = train_max - train_min + 1e-12  # epsilon avoids 0/0 on constant columns
train_X = (train_X - train_min) / feature_range
val_X = (val_X - train_min) / feature_range
test_X = (test_X - train_min) / feature_range


In [14]:
# Sanity check: split sizes, number of classes and per-sample feature shape.
print(train_X.shape[0], val_X.shape[0], test_X.shape[0], n_classes, test_X[-1].shape)

3928 563 1129 10 (64,)


In [15]:
# Simulate bandit feedback for the Optdigits splits and pickle it under data/optdigits/base.
optdigits_splits = ((train_X, train_y), (val_X, val_y), (test_X, test_y))
create_bandit_dataset(optdigits_splits, 'optdigits/base', n_classes)

Pre-processing for num sample = 1
range(0, 3928)
file name =  data/optdigits/base/bandit_data_sampled_1


# Yeast Dataset

In [16]:
# Fetch the Yeast dataset (UCI repository id 110)
data = uci.fetch_ucirepo(id=110)

X = data.data.features.values

# Map the sorted raw target values onto contiguous class ids 0..n_classes-1.
class_maps = {item: i for i, item in enumerate(sorted(data.data.targets["localization_site"].unique()))}
# Build a fresh label array instead of overwriting `targets.values` in place:
# that array can be a view of the DataFrame's own buffer (so the write would
# silently alter `data`), and it is read-only when pandas Copy-on-Write is on.
y = np.array([class_maps[item] for item in data.data.targets.values[:, 0]])
n_classes = len(class_maps)

In [17]:
# Quick look at the number of classes and the raw feature-matrix shape.
(n_classes, X.shape)

(10, (1484, 8))

In [18]:
# Stratified split: every class is shuffled and cut into train/val/test on its own.
proportion = [0.7, 0.1, 0.2]
train_X, train_y = [], []
val_X, val_y = [], []
test_X, test_y = [], []

for c in range(n_classes):
    class_mask = y == c
    class_X, class_y = X[class_mask], y[class_mask]
    n = len(class_X)
    # NOTE: 0.7 + 0.1 evaluates to 0.7999999999999999 in floating point, so
    # val_stop can land one below int(n * 0.8) when n * 0.8 is a whole number.
    train_stop = int(n * proportion[0])
    val_stop = int(n * (proportion[0] + proportion[1]))
    train_index, val_index, test_index = np.split(
        np.random.permutation(n), [train_stop, val_stop]
    )
    train_X.append(class_X[train_index])
    train_y.append(class_y[train_index])
    val_X.append(class_X[val_index])
    val_y.append(class_y[val_index])
    test_X.append(class_X[test_index])
    test_y.append(class_y[test_index])

# Stack the per-class pieces back into one array per split.
train_X, train_y = np.concatenate(train_X, axis=0), np.concatenate(train_y, axis=0)
val_X, val_y = np.concatenate(val_X, axis=0), np.concatenate(val_y, axis=0)
test_X, test_y = np.concatenate(test_X, axis=0), np.concatenate(test_y, axis=0)

# Min-max scale all three splits using statistics of the training split only.
train_max = train_X.max(axis=0, keepdims=True)
train_min = train_X.min(axis=0, keepdims=True)
feature_range = train_max - train_min + 1e-12  # epsilon avoids 0/0 on constant columns
train_X = (train_X - train_min) / feature_range
val_X = (val_X - train_min) / feature_range
test_X = (test_X - train_min) / feature_range


In [19]:
# Sanity check: split sizes, number of classes and per-sample feature shape.
print(train_X.shape[0], val_X.shape[0], test_X.shape[0], n_classes, test_X[-1].shape)

1035 146 303 10 (8,)


In [20]:
# Simulate bandit feedback for the Yeast splits and pickle it under data/yeast/base.
yeast_splits = ((train_X, train_y), (val_X, val_y), (test_X, test_y))
create_bandit_dataset(yeast_splits, 'yeast/base', n_classes)

Pre-processing for num sample = 1
range(0, 1035)
file name =  data/yeast/base/bandit_data_sampled_1
