In [1]:
import warnings
warnings.simplefilter(action='ignore')
from Bio import SeqIO
import numpy as np
from tqdm import tqdm
import seaborn as sns
from matplotlib import pyplot as plt
import tensorflow as tf
import keras.backend as K
import keras
import os
from keras.layers import Dense, Flatten, Conv2D, Conv1D, MaxPooling2D, Embedding, Input, Dropout, Reshape, Activation
from keras.models import Sequential, load_model
from keras.utils import multi_gpu_model 
import matplotlib.pylab as plt
from skimage.transform import resize
from keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from tqdm import tnrange
tf.logging.set_verbosity(tf.logging.ERROR)

Using TensorFlow backend.


In [2]:
def transform(data): #transform sequence into one hot encoding
    """One-hot encode a nucleotide sequence.

    Parameters
    ----------
    data : sequence
        Only ``data[0]`` is read; it must be an iterable of single-character
        bases (e.g. the row of a ``(1, L)`` character array, or a string).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(L, 5, 1)`` built from ``np.eye(5)``. Columns follow
        the order ``a, c, g, n, t``; any symbol outside that alphabet is
        encoded as ``n``. Matching is case-insensitive.
    """
    mapping = dict(zip("acgnt", range(5)))
    unknown = mapping['n']

    def one_hot_encode(seq):
        # Lower-case each base first: without this, upper-case FASTA input
        # would be silently encoded as all-'n'.
        codes = [mapping.get(str(base).lower(), unknown) for base in seq]
        # An explicit integer dtype keeps the lookup valid for an empty read.
        return np.eye(5)[np.asarray(codes, dtype=np.intp)]

    seq = one_hot_encode(data[0])
    # Trailing singleton axis, as in the (read_length, 5, 1) arrays filled by the cells below.
    return np.expand_dims(seq, axis=-1)

def read_data(root_dir, file):
    """Split the records of a FASTA file into viral and human sequences.

    The file at ``os.path.join(root_dir, file)`` is parsed with ``SeqIO``.
    A record whose id contains ``'v'`` is collected as viral, and one whose
    id contains ``'chr1'`` as human. The two checks are independent, so a
    record may end up in both lists or in neither.

    Returns
    -------
    tuple
        ``(viral, hum)`` -- lists holding the ``.seq`` of the matching records.
    """
    fasta_path = os.path.join(root_dir, file)
    viral, hum = [], []
    for record in SeqIO.parse(fasta_path, "fasta"):
        record_id = record.id
        # Substring match on the record id decides the destination list(s).
        for marker, bucket in (('v', viral), ('chr1', hum)):
            if marker in record_id:
                bucket.append(record.seq)
    return viral, hum

In [3]:
# Directory holding the FASTA inputs and the generated .npy files.
# Set the VIRAL_DATA_DIR environment variable to relocate it without editing
# the notebook; the original location remains the default.
# os.path.join(..., '') guarantees the trailing separator that the
# `root_dir + filename` concatenations in the cells below rely on.
root_dir = os.path.join(os.environ.get('VIRAL_DATA_DIR') or '/home/ecvol/data/viral/', '')

### Get training reads from viral sequences

In [None]:
# Load every reference genome from the viral FASTA file.
train_seq = [record.seq for record in SeqIO.parse(root_dir + "hpv.fas", "fasta")]

############################################################################################
smallest_len = min(len(i) for i in train_seq)
num_reads = 2000
read_length = 150
if smallest_len < read_length:
    raise ValueError("shortest viral genome (%d bp) is shorter than read_length (%d)"
                     % (smallest_len, read_length))
# The same start offsets are reused for every genome, so they are bounded by the
# shortest one. randint's `high` is exclusive: the last valid start is
# smallest_len - read_length (previously a hard-coded 152 unrelated to read_length).
rand_reads = np.random.randint(low = 0, high = smallest_len - read_length + 1, size = num_reads)
train_set = np.empty([len(train_seq)*num_reads, read_length, 5, 1], np.int8)
train_id = 0
for seq in tqdm(train_seq, total = len(train_seq)):
    # Lower-case once per genome, as the human/test cells do for their reads;
    # transform() only recognises the lower-case alphabet 'acgnt'.
    genome = str(seq).lower()
    for read_start in rand_reads:
        read = np.array(list(genome[read_start:read_start+read_length]))
        train_set[train_id] = transform(np.expand_dims(read, axis=0))
        train_id += 1
############################################################################################
np.save(root_dir + 'v_ref_reads', train_set) #when loading, call np.random.shuffle()

### Get training and test reads from human sequences

In [None]:
# Only the first record of the genome FASTA is used as the human reference.
chr_seq = next(iter(SeqIO.parse("./data/hg19full.fa", "fasta"))).seq

############################################################################################
train_seq = chr_seq
num_reads = 2000000
read_length = 150
smallest_len = len(train_seq)
# randint's `high` is exclusive: the last valid start is smallest_len - read_length.
rand_reads = np.random.randint(low = 0, high = smallest_len - read_length + 1, size = num_reads)

############################################################################################
# The raw reads are still dumped to text.txt (one per line), but the lower-cased
# copies are collected in memory instead of re-reading the file. `with` closes
# the handle even if a write fails.
content = []
with open('text.txt', 'w') as myfile:
    for read_start in tqdm(rand_reads, total=num_reads):
        read = str(train_seq[read_start:read_start+read_length])
        myfile.write(read + '\n')
        content.append(read.strip().lower())

############################################################################################
train_set = np.empty([len(content), read_length, 5, 1], np.int8)
for idx, contents in tqdm(enumerate(content), total=len(content)):
    train_set[idx] = transform(np.expand_dims((np.array(list(contents))), axis=0))

# Saved under root_dir because the set-building cell loads root_dir + 'h_ref_reads.npy'.
np.save(root_dir + 'h_ref_reads', train_set) #already shuffled

### Get test reads from viral sequence

In [95]:
# Keep the last record found in the test-virus FASTA file.
viral_seq = None
for seq in SeqIO.parse("./data/agpv1.fa", "fasta"):
    viral_seq = seq.seq
if viral_seq is None:
    raise ValueError("no FASTA records found in ./data/agpv1.fa")

train_seq = viral_seq
num_reads = 10000
read_length = 150
smallest_len = len(train_seq)
# randint's `high` is exclusive: the last valid start is smallest_len - read_length.
rand_reads = np.random.randint(low = 0, high = smallest_len - read_length + 1, size = num_reads)
# The raw reads are still dumped to text.txt (one per line), but the lower-cased
# copies are collected in memory instead of re-reading the file. `with` closes
# the handle even if a write fails.
content = []
with open('text.txt', 'w') as myfile:
    for read_start in tqdm(rand_reads, total=num_reads):
        read = str(train_seq[read_start:read_start+read_length])
        myfile.write(read + '\n')
        content.append(read.strip().lower())
train_set = np.empty([len(content), read_length, 5, 1], np.int8)
for idx, contents in tqdm(enumerate(content), total=len(content)):
    train_set[idx] = transform(np.expand_dims((np.array(list(contents))), axis=0))

############################################################################################
# Saved under root_dir because the set-building cell loads root_dir + 'v_ref_reads_test.npy'.
np.save(root_dir + 'v_ref_reads_test', train_set) #already shuffled

7722

### Create test and training set

#### 1 = virus, 0 = human

In [4]:
# Assemble the read-level classification sets (label 1 = virus, 0 = human).
v_ref_reads = np.load(root_dir + 'v_ref_reads.npy')
np.random.shuffle(v_ref_reads)  # in-place shuffle along the first axis
h_ref_reads = np.load(root_dir + 'h_ref_reads.npy')

# The first n_hum_test human reads go to the test set, the remainder to training.
n_hum_test = 1000000

hum_test = h_ref_reads[:n_hum_test]
viral_test = np.load(root_dir + 'v_ref_reads_test.npy')
X_test = np.concatenate((viral_test, hum_test), axis=0) # 1 = virus, 0 human
Y_test = np.concatenate((np.ones(viral_test.shape[0]), np.zeros(hum_test.shape[0])))
np.save(root_dir+'test_set.npy', X_test)
np.save(root_dir+'test_set_label.npy', Y_test)

hum_train = h_ref_reads[n_hum_test:]
# np.concatenate joins the arrays directly; unpacking every read into a Python
# list before np.array() gives the same result at a much higher memory/time cost.
X_train = np.concatenate((v_ref_reads, hum_train), axis=0)
Y_train = np.concatenate((np.ones(v_ref_reads.shape[0]), np.zeros(hum_train.shape[0])))
np.save(root_dir+'train_set.npy', X_train)
np.save(root_dir+'train_set_label.npy', Y_train)

print("X_train shape: ", X_train.shape, "Y_train shape: ", Y_train.shape, "X_test.shape", X_test.shape)

X_train shape:  (1674000, 150, 5, 1) Y_train shape:  (1674000,) X_test.shape (1010000, 150, 5, 1)


### Generate training set for integrated sequences

In [38]:
# Build chimeric training reads: every sample joins one human and one viral read
# at a random cut point and carries a per-base label (1 = viral, 0 = human).
num_train = 1000000
num_vreads = v_ref_reads.shape[0]
num_hreads = hum_train.shape[0]
X2_train = np.empty([num_train, 150, 5, 1])
Y2_train = np.empty([num_train, 150])
# randint excludes `high`, so the cut points span 0..148.
pos = np.random.randint(low = 0, high = 149, size= num_train)
vreads = np.random.randint(low = 0, high = num_vreads, size=num_train)
hreads = np.random.randint(low = 0, high = num_hreads, size=num_train)
p_hstart = np.random.randint(low = 0, high = 2, size=num_train) # 0/1 flag: 1 -> the read starts with human seq
for i in range(num_train):
    vread = v_ref_reads[vreads[i]]
    hread = hum_train[hreads[i]]
    GT = np.zeros(150)
    if p_hstart[i]:
        # human prefix, viral suffix
        head, tail = hread, vread
        GT[pos[i]:] = 1
    else:
        # viral prefix, human suffix
        head, tail = vread, hread
        GT[:pos[i]] = 1
    X2_train[i] = np.vstack((head[:pos[i]], tail[pos[i]:]))
    Y2_train[i] = GT

In [39]:
# Persist the chimeric training reads and their per-base labels under root_dir.
for _name, _array in (('integ_train_set.npy', X2_train),
                      ('integ_train_set_label.npy', Y2_train)):
    np.save(root_dir + _name, _array)

In [None]:
# Spot check around the cut point of one chimeric read. Relies on `i`, `pos`,
# `vread`, `hread` and `X2_train` left in the kernel by a generation loop.
_cut = pos[i]
print(vread[_cut], ' --', hread[_cut-2:_cut], ' --', X2_train[i][_cut-2:_cut+1])

### Generate test set for integrated sequences

In [42]:
# Build chimeric test reads: every sample joins one human and one viral read at
# a random cut point and carries a per-base label (1 = viral, 0 = human).
# All sizes derive from num_test; the cell previously reused num_train from the
# training cell and only worked because both happened to be 1000000.
num_test = 1000000
num_vreads = viral_test.shape[0]
num_hreads = hum_test.shape[0]
X2_test = np.empty([num_test, 150, 5, 1])
Y2_test = np.empty([num_test, 150])
# randint excludes `high`, so the cut points span 0..148.
pos = np.random.randint(low = 0, high = 149, size= num_test)
vreads = np.random.randint(low = 0, high = num_vreads, size=num_test)
hreads = np.random.randint(low = 0, high = num_hreads, size=num_test)
p_hstart = np.random.randint(low = 0, high = 2, size=num_test) # 0/1 flag: 1 -> the read starts with human seq
for i in range(num_test):
    vread = viral_test[vreads[i]]
    hread = hum_test[hreads[i]]
    GT = np.zeros(150)
    if(p_hstart[i]):
        # human prefix, viral suffix
        X2_test[i] = np.vstack((hread[:pos[i]], vread[pos[i]:]))
        GT[pos[i]:] = 1
    else:
        # viral prefix, human suffix
        X2_test[i] = np.vstack((vread[:pos[i]], hread[pos[i]:]))
        GT[:pos[i]] = 1
    Y2_test[i] = GT

In [43]:
# Persist the chimeric test reads and their per-base labels under root_dir.
for _name, _array in (('integ_test_set.npy', X2_test),
                      ('integ_test_set_label.npy', Y2_test)):
    np.save(root_dir + _name, _array)