In [4]:
import os
import numpy as np
import pandas as pd

from collections import Counter

from sklearn.model_selection import train_test_split

### Loading raw data

In [3]:
# Label vector, one entry per gene row (file name suggests IPF log fold-change
# values -- TODO confirm against the data README).
labels = np.load(
    'data/raw/labels_ipf_logfc.npy'
)
print(np.shape(labels))  # sanity check: 1-D, same length as the other raw arrays

(2901,)


In [5]:
# Embedding matrix, one row per gene (presumably BioBERT text embeddings, going
# by the file name -- TODO confirm how they were produced).
gene_biobert_embs = np.load(
    'data/raw/gene_biobert_embs.npy'
)
print(np.shape(gene_biobert_embs))  # sanity check: (n_genes, embedding_dim)

(2901, 768)


In [6]:
# Expression matrix, one row per gene (column meaning is not visible here --
# presumably one column per sample/condition; TODO confirm).
gene_exprs = np.load(
    'data/raw/gene_exprs.npy'
)
print(np.shape(gene_exprs))  # sanity check: (n_genes, n_expression_columns)

(2901, 160)


In [7]:
# Additional per-gene feature matrix (feature definitions are not visible in
# this notebook -- TODO document their source).
gene_feats = np.load(
    'data/raw/gene_feats.npy'
)
print(np.shape(gene_feats))  # sanity check: (n_genes, n_features)

(2901, 70)


In [8]:
# Gene identifiers, one per row; row i of every other raw array is assumed to
# describe gene_symbols[i] (TODO confirm the files share the same ordering).
gene_symbols = np.load(
    'data/raw/gene_nodes.npy'
)
print(np.shape(gene_symbols))  # sanity check: 1-D, length n_genes

(2901,)


In [9]:
# combined info
# Build one row per gene laid out as:
#   [gene symbol | gene_feats | gene_exprs | gene_biobert_embs | label]
#
# reshape(-1, 1) is used instead of np.expand_dims so the cell is idempotent:
# the names are rebound to column vectors, and re-running the cell leaves an
# (n, 1) array as (n, 1) instead of adding yet another axis (which would make
# the concatenate below fail on the second execution).
gene_symbols = np.reshape(gene_symbols, (-1, 1))
labels = np.reshape(labels, (-1, 1))

# NOTE(review): if gene_symbols holds strings, np.concatenate promotes every
# numeric column to a string dtype as well -- confirm the downstream loader
# casts the feature/label columns back to float.
gene_info = np.concatenate(
    [gene_symbols, gene_feats, gene_exprs, gene_biobert_embs, labels],
    axis=1,
)
print(gene_info.shape)

(2901, 1000)


### Train / validation / test splits

In [10]:
# generating train / val / test splits (60% / 20% / 20% of the gene rows)
# A fixed seed makes the split -- and therefore every file written from it --
# identical after Restart Kernel -> Run All.
SPLIT_SEED = 42

n_genes = gene_symbols.shape[0]
# First carve off 40% as a hold-out pool, then halve that pool into val / test.
train_idx, holdout_idx = train_test_split(
    range(n_genes), test_size=0.4, random_state=SPLIT_SEED
)
val_idx, test_idx = train_test_split(
    holdout_idx, test_size=0.5, random_state=SPLIT_SEED
)
print(f'Train: {len(train_idx)}, Val: {len(val_idx)}, Test: {len(test_idx)}')

# Number of indices shared between each pair of splits; all three must be 0.
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
overlap_counts = (
    np.sum(np.isin(train_idx, val_idx)),
    np.sum(np.isin(train_idx, test_idx)),
    np.sum(np.isin(test_idx, val_idx)),
)
print(*overlap_counts)
assert not any(overlap_counts), 'train/val/test splits must be disjoint'

Train: 1740, Val: 580, Test: 581
0 0 0


In [15]:
# saving full data
outdir = 'data/processed/'
# np.save does not create missing directories; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs(outdir, exist_ok=True)


def _with_index_column(rows, idx):
    """Return ``rows[idx]`` with the original row indices prepended as column 0.

    Args:
        rows: 2-D array holding one record per row.
        idx: sequence of integer row positions to select from ``rows``.

    Returns:
        Array of shape ``(len(idx), rows.shape[1] + 1)``; column 0 holds the
        values of ``idx`` and the remaining columns are the selected rows.
    """
    idx_col = np.asarray(idx)[:, np.newaxis]
    return np.concatenate([idx_col, rows[idx]], axis=1)


# Each saved array keeps the source row index so records can be traced back
# to their position in the raw files.
train_data = _with_index_column(gene_info, train_idx)
np.save(os.path.join(outdir, 'train.npy'), train_data, allow_pickle=True)

val_data = _with_index_column(gene_info, val_idx)
np.save(os.path.join(outdir, 'val.npy'), val_data, allow_pickle=True)

test_data = _with_index_column(gene_info, test_idx)
np.save(os.path.join(outdir, 'test.npy'), test_data, allow_pickle=True)

In [12]:
# generating sample data
# Draw a random 20% subset of each split's indices. train_test_split returns
# (rest, subset); only the 20% subset (element [1]) is kept. Indexing the
# result avoids binding `_`, which IPython uses for the last cell output.
SAMPLE_SEED = 42  # fixed so the sampled subsets are reproducible across runs

train_sample_idx = train_test_split(train_idx, test_size=0.20, random_state=SAMPLE_SEED)[1]
val_sample_idx = train_test_split(val_idx, test_size=0.20, random_state=SAMPLE_SEED)[1]
test_sample_idx = train_test_split(test_idx, test_size=0.20, random_state=SAMPLE_SEED)[1]

# The two f-string pieces are concatenated by Python, so the separator after
# the Val count has to be written explicitly.
print(f'Train (sample): {len(train_sample_idx)}, Val (sample): {len(val_sample_idx)}, '
      f'Test (sample): {len(test_sample_idx)}')

Train (sample): 348, Val (sample): 116Test (sample): 117


In [22]:
# saving sample data
outdir = 'data/processed/'
# np.save does not create missing directories; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs(outdir, exist_ok=True)

# Each saved array is [original row index | gene_info columns] for the sampled
# rows. NOTE: train_data / val_data / test_data are rebound here to the
# *sample* subsets (names kept unchanged for any later cell that reads them).
train_index_col = np.asarray(train_sample_idx)[:, np.newaxis]
train_data = np.concatenate([train_index_col, gene_info[train_sample_idx]], axis=1)
np.save(os.path.join(outdir, 'train_sample.npy'), train_data, allow_pickle=True)

val_index_col = np.asarray(val_sample_idx)[:, np.newaxis]
val_data = np.concatenate([val_index_col, gene_info[val_sample_idx]], axis=1)
np.save(os.path.join(outdir, 'val_sample.npy'), val_data, allow_pickle=True)

test_index_col = np.asarray(test_sample_idx)[:, np.newaxis]
test_data = np.concatenate([test_index_col, gene_info[test_sample_idx]], axis=1)
np.save(os.path.join(outdir, 'test_sample.npy'), test_data, allow_pickle=True)