In [1]:
import h5py
import fsspec
import pandas as pd

### Requesting Full Embeddings from AWS and Splitting Into Train/Valid/Test

`fsspec` and `h5py` are used to fetch the full embeddings hosted on AWS, and `sequences.bed` is loaded into a Pandas DataFrame whose `dataset` column determines how the embeddings are split into train/valid/test sets before they are saved. Because large HDF5 files crash the Jupyter kernel, the embeddings and labels are each partitioned into multiple smaller files.

In [2]:
# Load the tab-separated BED index. Column names are supplied explicitly, so
# every line of the file is read as data; `dataset` holds each row's split name.
index_df = pd.read_csv(
    'sequences.bed',
    sep='\t',
    names=['chromosome', 'id_1', 'id_2', 'dataset'],
)
index_df

Unnamed: 0,chromosome,id_1,id_2,dataset
0,chr18,928386,1059458,train
1,chr4,113630947,113762019,train
2,chr11,18427720,18558792,train
3,chr16,85805681,85936753,train
4,chr3,158386188,158517260,train
...,...,...,...,...
38166,chr19,33204702,33335774,test
38167,chr14,41861379,41992451,test
38168,chr19,30681544,30812616,test
38169,chr14,61473198,61604270,test


In [3]:
# For each split name, collect the row labels of `index_df` whose `dataset`
# value equals that name. The (commented-out) partitioning cells use these
# arrays to select rows from the embeddings and labels datasets.
train_indices, valid_indices, test_indices = (
    index_df.loc[index_df['dataset'].eq(split_name)].index.to_numpy()
    for split_name in ('train', 'valid', 'test')
)

In [4]:
# Inspect the row-index arrays of the train, valid, and test splits.
train_indices, valid_indices, test_indices

(array([    0,     1,     2, ..., 34018, 34019, 34020]),
 array([34021, 34022, 34023, ..., 36231, 36232, 36233]),
 array([36234, 36235, 36236, ..., 38168, 38169, 38170]))

In [5]:
# Number of rows in each split. The chunk counts hard-coded in the partitioning
# cells below (341 / 23 / 20 chunks of 100) are derived from these sizes.
len(train_indices), len(valid_indices), len(test_indices)

(34021, 2213, 1937)

In [6]:
# NOTE(review): this cell is entirely commented out. The commented-out
# embedding-partitioning cells below index into `data`, so this cell must be
# restored first if they are ever re-run.
# When enabled it opens the S3-hosted `embeddings.h5` via fsspec, hands the
# file object to h5py, binds the 'embeddings' dataset to `data`, and prints
# the file's keys, the dataset shape, and the first element.
# Neither `remote_f` nor `f` is closed in this cell.
# AWS_url = 'https://cs282-datasets.s3.us-west-1.amazonaws.com/embeddings.h5'
# remote_f = fsspec.open(AWS_url, mode='rb')

# if hasattr(remote_f, 'open'):
#     remote_f = remote_f.open()

# f = h5py.File(remote_f)
# data = f['embeddings']

# print("Keys: ", f.keys())
# print("Shape: ", data.shape)
# print("First element: ", data[0])

#### Partitioning Embeddings

In [7]:
# NOTE(review): disabled cell. If re-enabled it needs `data` (the 'embeddings'
# dataset) and `train_indices` in scope, and it writes into ./data/embeds/,
# which this cell does not create. Output: train_chunk_X1.h5 ... train_chunk_X341.h5,
# each holding one 'embeddings' dataset.
# Save data into chunks of 100 points.
# There will be 341 chunks for the train set (340*100 + 1*21 = 34021).
# for i in range(341):
#     if i < 340:
#         indices = train_indices[100*i:100*(i+1)]
#     else:
#         indices = train_indices[100*i:]
#     train_chunk = data[indices]
#     train_embeds_f = h5py.File(f'./data/embeds/train_chunk_X{i+1}.h5', 'w')
#     train_embeds_f.create_dataset('embeddings', data=train_chunk)
#     train_embeds_f.close()
#     print(f"Saved chunk {i+1}!")

In [8]:
# NOTE(review): disabled cell. If re-enabled it needs `data` (the 'embeddings'
# dataset) and `valid_indices` in scope, and it writes into ./data/embeds/,
# which this cell does not create. Output: valid_chunk_X1.h5 ... valid_chunk_X23.h5,
# each holding one 'embeddings' dataset of up to 100 rows.
# There will be 23 chunks for the validation set (22*100 + 1*13 = 2213).
# for j in range(23):
#     if j < 22:
#         indices = valid_indices[100*j:100*(j+1)]
#     else:
#         indices = valid_indices[100*j:]
#     valid_chunk = data[indices]
#     valid_embeds_f = h5py.File(f'./data/embeds/valid_chunk_X{j+1}.h5', 'w')
#     valid_embeds_f.create_dataset('embeddings', data=valid_chunk)
#     valid_embeds_f.close()
#     print(f"Saved chunk {j+1}!")

In [9]:
# NOTE(review): disabled cell. If re-enabled it needs `data` (the 'embeddings'
# dataset) and `test_indices` in scope, and it writes into ./data/embeds/,
# which this cell does not create. Output: test_chunk_X1.h5 ... test_chunk_X20.h5,
# each holding one 'embeddings' dataset of up to 100 rows.
# There will be 20 chunks for the test set (19*100 + 1*37 = 1937).
# for k in range(20):
#     if k < 19:
#         indices = test_indices[100*k:100*(k+1)]
#     else:
#         indices = test_indices[100*k:]
#     test_chunk = data[indices]
#     test_embeds_f = h5py.File(f'./data/embeds/test_chunk_X{k+1}.h5', 'w')
#     test_embeds_f.create_dataset('embeddings', data=test_chunk)
#     test_embeds_f.close()
#     print(f"Saved chunk {k+1}!")

#### Partitioning Labels

In [10]:
# NOTE(review): disabled cell. When enabled it opens the local file
# 'dataset_14-lmnb1_4-cpg.h5' read-only, binds its '128bp_bins' dataset to
# `labels` (used by the commented-out label-partitioning cells below), and
# displays that dataset's shape. `full_labels_f` is not closed in this cell.
# full_labels_f = h5py.File('dataset_14-lmnb1_4-cpg.h5', 'r')
# labels = full_labels_f['128bp_bins']
# labels.shape

In [11]:
# NOTE(review): disabled cell. If re-enabled it needs `labels` (the '128bp_bins'
# dataset) and `train_indices` in scope, and it writes into ./data/labels/,
# which this cell does not create. Output: train_chunk_y1.h5 ... train_chunk_y341.h5,
# each holding one '128bp_bins' dataset of up to 100 rows.
# There will be 341 chunks for the train set (340*100 + 1*21 = 34021).
# for i in range(341):
#     if i < 340:
#         indices = train_indices[100*i:100*(i+1)]
#     else:
#         indices = train_indices[100*i:]
#     train_chunk = labels[indices]
#     train_labels_f = h5py.File(f'./data/labels/train_chunk_y{i+1}.h5', 'w')
#     train_labels_f.create_dataset('128bp_bins', data=train_chunk)
#     train_labels_f.close()
#     print(f"Saved chunk {i+1}!")

In [12]:
# NOTE(review): disabled cell. If re-enabled it needs `labels` (the '128bp_bins'
# dataset) and `valid_indices` in scope, and it writes into ./data/labels/,
# which this cell does not create. Output: valid_chunk_y1.h5 ... valid_chunk_y23.h5,
# each holding one '128bp_bins' dataset of up to 100 rows.
# There will be 23 chunks for the validation set (22*100 + 1*13 = 2213).
# for j in range(23):
#     if j < 22:
#         indices = valid_indices[100*j:100*(j+1)]
#     else:
#         indices = valid_indices[100*j:]
#     valid_chunk = labels[indices]
#     valid_labels_f = h5py.File(f'./data/labels/valid_chunk_y{j+1}.h5', 'w')
#     valid_labels_f.create_dataset('128bp_bins', data=valid_chunk)
#     valid_labels_f.close()
#     print(f"Saved chunk {j+1}!")

In [13]:
# NOTE(review): disabled cell. If re-enabled it needs `labels` (the '128bp_bins'
# dataset) and `test_indices` in scope, and it writes into ./data/labels/,
# which this cell does not create. Output: test_chunk_y1.h5 ... test_chunk_y20.h5,
# each holding one '128bp_bins' dataset of up to 100 rows.
# There will be 20 chunks for the test set (19*100 + 1*37 = 1937).
# for k in range(20):
#     if k < 19:
#         indices = test_indices[100*k:100*(k+1)]
#     else:
#         indices = test_indices[100*k:]
#     test_chunk = labels[indices]
#     test_labels_f = h5py.File(f'./data/labels/test_chunk_y{k+1}.h5', 'w')
#     test_labels_f.create_dataset('128bp_bins', data=test_chunk)
#     test_labels_f.close()
#     print(f"Saved chunk {k+1}!")