In [1]:
import torch
import numpy as np
import random
import pickle
import h5py # for .jld2 files
import os

In [2]:
###
#  Load Data
###
# Open the data file in a context manager so the HDF5 handle is released as soon
# as the datasets have been copied into memory (`[:]` reads each one in full).
with h5py.File("targets_and_normalized_features.jld2", "r") as file:
    # feature matrix
    X = torch.from_numpy(np.transpose(file["X"][:]))
    # simulation data (index 0: "henry_y", index 1: "gcmc_y")
    y = [torch.from_numpy(np.transpose(file["henry_y"][:])),
         torch.from_numpy(np.transpose(file["gcmc_y"][:]))]
    # associated simulation costs, in the same order as `y`
    cost = [np.transpose(file["henry_total_elapsed_time"][:]),
            np.transpose(file["gcmc_elapsed_time"][:])]

# total number of COFs in data set
nb_COFs = X.shape[0]

print("raw data - \n\tX:", X.shape)
# iterate over however many fidelities were loaded rather than a hard-coded 2
for f in range(len(y)):
    print("\tfidelity:", f)
    print("\t\ty:", y[f].shape)
    print("\t\tcost: ", cost[f].shape)

# per-feature extrema; a width of 1 in every column means each feature spans a unit range
feature_max = torch.max(X, 0).values
feature_min = torch.min(X, 0).values

print("\nEnsure features are normalized - ")
print("max:\n", feature_max)
print("min:\n", feature_min)
print("width:\n", feature_max - feature_min)

raw data - 
	X: torch.Size([608, 14])
	fidelity: 0
		y: torch.Size([608])
		cost:  (608,)
	fidelity: 1
		y: torch.Size([608])
		cost:  (608,)

Ensure features are normalized - 
max:
 tensor([0.7144, 0.4136, 0.4696, 0.6677, 0.9579, 0.8383, 0.3595, 0.3207, 0.9938,
        0.8242, 0.9692, 0.9869, 0.9868, 0.9762], dtype=torch.float64)
min:
 tensor([-0.2856, -0.5864, -0.5304, -0.3323, -0.0421, -0.1617, -0.6405, -0.6793,
        -0.0062, -0.1758, -0.0308, -0.0131, -0.0132, -0.0238],
       dtype=torch.float64)
width:
 tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000], dtype=torch.float64)


In [3]:
###
#  helper functions
###

def get_initializing_COF(X):
    """
    Find the COF closest to the center of feature space.

    Parameters
    ----------
    X : 2-D array-like (torch.Tensor or numpy.ndarray)
        Feature matrix with one row per COF and one column per feature.

    Returns
    -------
    numpy integer
        Row index of the point in `X` with the smallest Euclidean distance to
        the per-feature mean of `X`.
    """
    # work on a numpy view so tensors and ndarrays are handled the same way
    features = np.asarray(X)
    # center of feature space: mean of every feature column
    data_center = features.mean(axis=0)
    # Euclidean distance of every point to that center
    dist_to_center = np.linalg.norm(features - data_center, axis=1)
    return np.argmin(dist_to_center)

def diverse_set(X, seed_cof, train_size):
    """
    Greedily build a maximally diverse set of points, starting from `seed_cof`.

    At every step the point whose distance to its nearest already-selected
    point is largest (max-min Euclidean distance) is added to the set.

    Parameters
    ----------
    X : 2-D array-like (torch.Tensor or numpy.ndarray)
        Feature matrix with one row per point.
    seed_cof : int
        Row index of the point the set is initialized with.
    train_size : int
        Total number of points in the returned set, including the seed.

    Returns
    -------
    numpy.ndarray
        The selected row indices, in order of acquisition (seed first).

    Raises
    ------
    AssertionError
        If the selected indices are not `train_size` distinct points.
    """
    features = np.asarray(X)
    # size of *this* data set; do not rely on a notebook-level global
    nb_points = features.shape[0]

    # initialize with the given seed point; pick others in a max diverse fashion
    ids_train = [seed_cof]
    # for each point in data set, its min dist to the current training set
    min_dist_to_a_training_pt = np.linalg.norm(features - features[seed_cof], axis=1)

    # select remaining training points
    for _ in range(train_size - 1):
        assert np.size(min_dist_to_a_training_pt) == nb_points

        # acquire point with max(min distance to train set) i.e. furthest from train set
        next_id = np.argmax(min_dist_to_a_training_pt)
        ids_train.append(next_id)

        # only the newly added point can lower a point's min distance, so a
        # running minimum replaces recomputing every pairwise distance
        dist_to_new_pt = np.linalg.norm(features - features[next_id], axis=1)
        min_dist_to_a_training_pt = np.minimum(min_dist_to_a_training_pt, dist_to_new_pt)

    assert np.size(np.unique(ids_train)) == train_size # must be unique
    return np.array(ids_train)

In [4]:
###
#  number of initializing sets to generate
###
nb_runs = 100

###
#  number of COFs in each initializing set
###
nb_init = 3

###
#  seed for the random draw below, so a fresh Restart & Run All
#  regenerates the same initializing sets
###
RANDOM_SEED = 0

###
#  identify the COF at the center of data space
###
central_cof = get_initializing_COF(X)

###
#  list of COF IDs to sample: every COF except the central one
#  (filter by value rather than popping by list position)
###
cof_ids_to_sample = [cof_id for cof_id in range(nb_COFs) if cof_id != central_cof]

###
#  randomly select the rest without replacement
#  (a dedicated generator leaves the global `random` state untouched)
###
seed_cofs = random.Random(RANDOM_SEED).sample(cof_ids_to_sample, nb_runs-1)


###
#  generate initializing sets using max diversity
#  the first set is seeded with the central COF, the others with the random draws
###
init_cof_ids = []

for cof_id in [central_cof] + seed_cofs:
    # get diverse set
    div_set = diverse_set(X, cof_id, nb_init)
    init_cof_ids.append(div_set)

init_cof_ids[:15]

[array([112, 522,  45]),
 array([508,  45,  73]),
 array([237, 176,  73]),
 array([308,  45,  73]),
 array([539,  71, 522]),
 array([411, 522,  73]),
 array([ 38,  45, 522]),
 array([ 71, 494,  98]),
 array([80, 73, 45]),
 array([259, 522,  73]),
 array([172,  98, 522]),
 array([423, 522,  45]),
 array([ 74,  45, 522]),
 array([ 89,  45, 522]),
 array([575, 522,  45])]

In [5]:
###
#  save data
###
initializing_cof_ids = {'init_cof_ids': init_cof_ids}

# make sure the output directory exists; open() does not create it
os.makedirs('search_results', exist_ok=True)

with open('search_results/initializing_cof_ids.pkl', 'wb') as file:
    pickle.dump(initializing_cof_ids, file)