# Imports

In [1]:
import os, sys
import pandas as pd
import numpy as np

from Bio import SeqIO

import tensorflow as tf

## Set variables

In [2]:
# Template for one FASTA record: a ">" header holding two space-separated
# fields, followed by the sequence on its own line.
FASTA_STRING = ">{} {}\n{}\n"

# Root of the data tree and the sub-directory every path below is built on.
BASE_DIR = "../../data/Combined_data"
pub = "published" # original note: "un-published is all"

# Shared "<BASE_DIR>/<pub>" prefix, built once and reused for each path.
_pub_dir = "/".join((BASE_DIR, pub))

# Directory with one FASTA file per COG.
path_COG = _pub_dir + "/OGS"

# FASTA file holding the full, unsplit data set.
raw_fasta = _pub_dir + "/OGT_IMG_published_cd.fasta"

# COGs that will be included in test set
test_fasta = [
    "COG0039.fasta",
    "COG2032.fasta",
]

# Output files of the train/test split.
train_set = _pub_dir + "/train.fasta"
test_set = _pub_dir + "/test.fasta"

# Group file name pattern, filled with three values when the groups are written.
NAME_FORMAT = "{}_{}_{}.fasta"

# Number of temperature bin edges; n_groups - 1 group files are produced.
n_groups = 6

# Split data in train and test


In [2]:

# Ids of every record written to the test set. A set is used so that the
# membership check in the train pass below is O(1) per record instead of a
# scan over a list for each record of the raw FASTA.
ids_test = set()


# Write test set: every record of the COG files listed in test_fasta.
with open(test_set, "w") as file_writer:
    for cog in test_fasta:
        for rec in SeqIO.parse(os.path.join(path_COG, cog), "fasta"):
            # The last whitespace-separated token of the header is kept as
            # the second header field of the written record.
            file_writer.write(FASTA_STRING.format(rec.id, rec.description.split()[-1], rec.seq))
            ids_test.add(rec.id)

# Write train set: every record of the raw FASTA whose id is not in the test set.
with open(train_set, "w") as file_writer:
    for rec in SeqIO.parse(raw_fasta, "fasta"):
        if rec.id not in ids_test:
            file_writer.write(FASTA_STRING.format(rec.id, rec.description.split()[-1], rec.seq))

# Make ranged sets

## fasta

In [24]:
# Load all training data to get the temperature range.

data_train_all = {}
temp_train_all = []
for rec in SeqIO.parse(train_set, "fasta"):
    temp_field = rec.description.split()[-1]
    data_train_all[rec.id] = (temp_field, rec.seq)
    temp_train_all.append(float(temp_field))

# Count the records directly: the last enumerate() index would be one short
# of the number of sequences and undefined for an empty file.
n_sequences_all = len(temp_train_all)
if n_sequences_all == 0:
    raise ValueError("no sequences found in {}".format(train_set))

temp_train_all = np.array(temp_train_all)
min_temp = min(temp_train_all)
max_temp = max(temp_train_all)

# n_groups evenly spaced edges -> n_groups - 1 temperature bins.
ranges = np.linspace(min_temp, max_temp, n_groups)

# Per-bin factor: total number of sequences divided by the number of
# training sequences whose temperature lies in [ranges[i], ranges[i+1]).
# An empty bin gets factor 0 instead of dividing by zero.
up_sample = []
for i in range(n_groups-1):
    n_in_bin = np.sum(temp_train_all < ranges[i+1]) - np.sum(temp_train_all < ranges[i])
    up_sample.append(n_sequences_all // n_in_bin if n_in_bin else 0)


file_writers = []
try:
    # One output file per bin, named "<low>_<high>_<factor>.fasta".
    for i in range(n_groups-1):
        temp_low = int(ranges[i])
        temp_high = int(ranges[i+1])
        upsample = int(up_sample[i])
        name = os.path.join(BASE_DIR, pub, "Groups_5/FASTA", NAME_FORMAT.format(temp_low,temp_high,upsample))
        file_writers.append(open(name, "w"))

    # NOTE(review): records are read from raw_fasta, not train_set, so any
    # record that was moved to the test set is written to a group as well.
    # Confirm this is intended.
    for rec in SeqIO.parse(raw_fasta, "fasta"):
        temp = float(rec.description.split()[-1])
        # Index of the last edge lying below temp (with a 0.01 tolerance).
        idx = np.sum(ranges < temp+0.01)-1
        # Clamp into the valid bin range: the top edge belongs to the last
        # bin, and a temperature below the lowest edge goes to the first bin
        # rather than wrapping around to the last file through index -1.
        idx = min(max(idx, 0), n_groups-2)
        file_writers[idx].write(FASTA_STRING.format(rec.id, temp, rec.seq))
finally:
    # Close every file that was opened, even if parsing or writing failed.
    for w in file_writers:
        w.close()

## tfrecords

### Helper functions

In [18]:
def _float_feature(value):
  """Wrap one float / double in a tf.train.Feature holding a one-element float_list."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Wrap a sequence of integers in a tf.train.Feature holding an int64_list.

  Unlike _float_feature, `value` is passed through as-is, so it must already
  be a sequence rather than a single scalar.
  """
  int64_list = tf.train.Int64List(value=value)
  return tf.train.Feature(int64_list=int64_list)

# Alphabet used for the integer encoding. 'X' comes first, so it maps to -1,
# the same value that is used for padding; the other letters map to 0..19.
aas = 'XACDEFGHIKLMNPQRSTVWY'
table = {aa: i-1 for i, aa in enumerate(aas)}
# 'U' is encoded like 'X'.
table['U'] = -1
def to_int(seq, max_len=512):
    """Encode a sequence as a fixed-length int64 vector.

    Args:
        seq: The sequence to encode; it is converted with ``str()`` and
            upper-cased before the lookup in ``table``.
        max_len: Length of the returned vector (default 512).

    Returns:
        A ``numpy`` array of shape ``(max_len,)`` and dtype ``int64``. The
        first ``len(seq)`` entries hold the codes from ``table``, the rest
        is padded with -1.

    Raises:
        KeyError: If ``seq`` contains a character that is not in ``table``.
        ValueError: If ``seq`` is longer than ``max_len``.
    """
    # Encode the argument itself rather than a record taken from the
    # enclosing module scope.
    encoded = [table[aa] for aa in str(seq).upper()]
    if len(encoded) > max_len:
        raise ValueError(
            "sequence of length {} does not fit into {} positions".format(len(encoded), max_len)
        )
    tmp = -np.ones((max_len,), dtype = np.int64)
    tmp[:len(encoded)] = encoded
    return tmp

def parse_ofset(name):
    """Return the midpoint of the range encoded in a "<low>_<high>_..." name.

    The first two underscore-separated fields of `name` are read as integers
    and the value halfway between them is returned as a float.
    """
    fields = name.split('_')
    temp_low = int(fields[0])
    temp_high = int(fields[1])
    half_width = (temp_high - temp_low)/2
    return temp_low + half_width

### Load files

In [9]:
# Read every group FASTA file into memory as ({"id", "seq", "temp"}, group name).
dir_ = os.path.join(BASE_DIR, pub, "Groups_5/FASTA")
data_list = []
for fname in os.listdir(dir_):
    # Entries whose name starts with a dot are skipped.
    if fname[0] != '.':
        ids, seqs, temps = [], [], []
        for rec in SeqIO.parse(os.path.join(dir_, fname), "fasta"):
            ids.append(rec.id)
            seqs.append(to_int( str(rec.seq).upper()))
            # The last whitespace-separated header token is parsed as a float.
            temps.append(float(rec.description.split()[-1]))
        # The group name is the file name up to its first dot.
        group_name = fname.split('.')[0]
        data_list.append(({"id": ids, "seq": seqs, "temp": temps}, group_name))


### Write records

In [19]:

# Serialize each loaded group into its own "<group name>.tfrecord" file.
for group, group_name in data_list:
    record_path = os.path.join(BASE_DIR, pub, "Groups_5/RECORDS", group_name+".tfrecord")
    # Midpoint of the range encoded in the group name; it is subtracted from
    # every temperature before the value is stored.
    ofset = parse_ofset(group_name)
    with tf.io.TFRecordWriter(record_path) as tfrecord:
        for temp, seq in zip(group["temp"], group["seq"]):
            features = {
              'temp': _float_feature(temp-ofset),
              'seq': _int64_feature(seq)
              }
            example = tf.train.Example(features = tf.train.Features(feature=features))
            tfrecord.write(example.SerializeToString())

## Decoding

In [8]:
# Make the project's utils directory importable before pulling in the loaders.
sys.path.append("../../utils/")

from preprocessing import load_data_class, load_data_reg

# Settings handed to both loaders; the file names refer to the record files
# in the RECORDS directory.
records_dir = os.path.join(BASE_DIR,pub,"Groups_5", "RECORDS")
config = {
    "base_dir": records_dir,
    "file_in": "82_103_105.tfrecord",
    "file_out": [
        "22_42_1.tfrecord",
        "42_62_13.tfrecord",
        "62_82_34.tfrecord",
        "82_103_105.tfrecord",
    ],
    "chards": 2,
}

# Classification loader: skip 495 elements of the first training dataset and
# print the last row of the first component of the next 10.
data_train, data_val = load_data_class(config)
sample = data_train[0].skip(495).take(10)
for d in sample:
    print(d[0][-1,:])

# Regression loader: print the second component of 10 validation elements.
data_train, data_val = load_data_reg(config)
for d in data_val.take(10):
    print(d[1])
# Print 

tf.Tensor([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(21,), dtype=float32)
tf.Tensor([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(21,), dtype=float32)
tf.Tensor([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(21,), dtype=float32)
tf.Tensor([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(21,), dtype=float32)
tf.Tensor([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(21,), dtype=float32)
tf.Tensor([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(21,), dtype=float32)
tf.Tensor([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(21,), dtype=float32)
tf.Tensor([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(21,), dtype=float32)
tf.Tensor([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(21,), dtype=float32)
tf.Tensor([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

92.5
12.0
32.0
52.0
72.0


In [12]:
# Show the entries of the published data directory.
published_dir = "../../data/Combined_data/published/"
print(os.listdir(published_dir))

FileNotFoundError: [Errno 2] No such file or directory: '../../data/Combined_data/Groups_5/RECORDS'