# Imports

In [1]:
import os, sys
import pandas as pd
import numpy as np

from Bio import SeqIO

import tensorflow as tf

## Set var

In [2]:
FASTA_STRING = ">{} {}\n{}\n"

BASE_DIR = "../../data/Combined_data"
pub = "published" # un-published is all

path_COG = "{}/{}/OGS".format(BASE_DIR, pub)

raw_fasta  = "{}/{}/OGT_IMG_published_cd.fasta".format(BASE_DIR, pub)

test_fasta =  ["COG0039.fasta" , "COG2032.fasta"] # COGs that will be included in test set

train_set = "{}/{}/train.fasta".format(BASE_DIR, pub)
test_set = "{}/{}/test.fasta".format(BASE_DIR, pub)

NAME_FORMAT = "{}_{}_{}.fasta"

n_groups = 6 # one more then the sved groups. If n_groups = 6, five groupos will be stored.

# Split data in train and test


In [2]:

ids_test = []


# Write test set
with open(test_set, "w") as file_writer:
    for cog in test_fasta:
        for rec in SeqIO.parse(os.path.join(path_COG, cog), "fasta"):
            file_writer.write(FASTA_STRING.format(rec.id, rec.description.split()[-1], rec.seq))
            ids_test.append(rec.id)
            
# Write train set
with open(train_set, "w") as file_writer:
    for rec in SeqIO.parse(raw_fasta,"fasta"):
        if rec.id not in ids_test:
            file_writer.write(FASTA_STRING.format(rec.id, rec.description.split()[-1], rec.seq))

# Make ranged sets

## fasta

In [24]:
# Load all data to get temp range

data_train_all = {}
temp_train_all = []
for i, rec in enumerate(SeqIO.parse(train_set, "fasta")):
    data_train_all[rec.id] = (rec.description.split()[-1], rec.seq)
    temp_train_all.append(float(rec.description.split()[-1]))

n_sequences_all = i
temp_train_all = np.array(temp_train_all)
min_temp = np.min(temp_train_all)
max_temp = np.max(temp_train_all)

ranges = np.linspace(min_temp, max_temp, n_groups)   
up_sample = [n_sequences_all // (np.sum(temp_train_all < ranges[i+1]) - np.sum(temp_train_all < ranges[i])) for i in range(n_groups-1)]


file_writers = []
for i in range(n_groups-1):
    temp_low = int(ranges[i])
    temp_high = int(ranges[i+1])
    upsample = int(up_sample[i])
    name = os.path.join(BASE_DIR, pub, "Groups_{}/FASTA".format(n_groups-1), NAME_FORMAT.format(temp_low,temp_high,upsample))
    file_writers.append(open(name, "w"))

for i, rec in enumerate(SeqIO.parse(raw_fasta, "fasta")):
    temp = float(rec.description.split()[-1])
    idx = np.sum(ranges < temp+0.01)-1
    if idx == n_groups-1:
        idx = n_groups-2
    file_writers[idx].write(FASTA_STRING.format(rec.id, temp, rec.seq))
    

for w in file_writers:
    w.close()

## tfrecords

### Helper functions

In [3]:
def _float_feature(value):
  """Returns a float_list from a float / double."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
  """Returns an int64_list from a bool / enum / int / uint."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

aas = 'XACDEFGHIKLMNPQRSTVWY'
table = {aa: i-1 for i, aa in enumerate(aas)}
table['U'] = -1
def to_int(seq): 
    tmp = -np.ones((512,), dtype = np.int64)
    seq = [ table[aa] for aa in str(rec.seq).upper()]
    tmp[:len(seq)] = seq
    return tmp

def parse_ofset(name):
    temp_low = int(name.split('_')[0])
    temp_high= int(name.split('_')[1])
    return temp_low + (temp_high - temp_low)/2

## Multiple sets
### Load files

In [77]:
dir_ = os.path.join(BASE_DIR, pub, "Groups_{}/FASTA".format(n_groups-1))
data_list = []
for file in os.listdir(dir_):
    if file[0] == '.':
        continue
    data = {"id": [], "seq": [], "temp": []}
    for rec in SeqIO.parse(os.path.join(dir_, file), "fasta"):
        data["id"].append(rec.id)
        data["seq"].append(to_int( str(rec.seq).upper()))
        data["temp"].append(float(rec.description.split()[-1]))
    data_list.append((data, file.split('.')[0]))


### Write records

In [78]:

for data in data_list:    
    name = os.path.join(BASE_DIR, pub, "Groups_{}/RECORDS_REG_GLOBAL".format(n_groups-1), data[1]+".tfrecord")
    #ofset = parse_ofset(data[1]) # local temperature ofset
    ofset = 103 - 2 # Global temperature ofset
    with tf.io.TFRecordWriter(name) as tfrecord:
        for i in range(len(data[0]["id"])):
            features = {
              'temp': _float_feature(data[0]["temp"][i]-ofset),
              'seq': _int64_feature(data[0]["seq"][i])
              }
            element = tf.train.Example(features = tf.train.Features(feature=features))
            tfrecord.write(element.SerializeToString())

## Single sets classifier
### Load files

In [52]:
dir_ = os.path.join(BASE_DIR, pub, "Groups_{}/FASTA".format(n_groups-1))

data_list = {}


temp_range_dict = {}
count_all = 0
for file in os.listdir(dir_):
    count = 0
    if file[0] == '.':
        continue
    data = {}
    for rec in SeqIO.parse(os.path.join(dir_, file), "fasta"):
        data[rec.id] = to_int(str(rec.seq).upper())
        count += 1
    count_all += count * int(file.split('.')[0].split('_')[-1])
    print("{} Sequences in file {}".format(count * int(file.split('.')[0].split('_')[-1]), file))

    data_list[file.split('.')[0]] = (data, file.split('.')[0])
print("{} sequences after upscaling ".format(count_all))
    

temp_range_dict = {}

for file in os.listdir(dir_):
    
    list_data_set = []
    if file[0] == '.':
        continue
        
    raw_file_name = file.split('.')[0]
    for idx, data in enumerate(list(data_list.keys())):
        if data == raw_file_name:
            up_scale = int(data.split('_')[-1]) * (n_groups - 2)
            _class = 1
        else:
            up_scale = int(data.split('_')[-1])
            _class = 0
        for ids in list(data_list[data][0].keys()):
            list_data_set += [(data, ids, _class) for _ in range(up_scale)]
        print("file {} Data {} Upscale {} Class {} Len data scale {} Tot len set".format(file, data, up_scale, _class, len(data_list[data][0].keys())*up_scale), len(list_data_set))
    print("\n")
    temp_range_dict[file] = list_data_set       
    

2658810 Sequences in file 82_103_105.fasta
2604197 Sequences in file 2_22_19.fasta
2223288 Sequences in file 22_42_1.fasta
2509273 Sequences in file 42_62_13.fasta
2576894 Sequences in file 62_82_34.fasta
12572462 sequences after upscaling 
file 82_103_105.fasta Data 82_103_105 Upscale 420 Class 1 Len data scale 10635240 Tot len set 10635240
file 82_103_105.fasta Data 2_22_19 Upscale 19 Class 0 Len data scale 2604197 Tot len set 13239437
file 82_103_105.fasta Data 22_42_1 Upscale 1 Class 0 Len data scale 2223288 Tot len set 15462725
file 82_103_105.fasta Data 42_62_13 Upscale 13 Class 0 Len data scale 2509273 Tot len set 17971998
file 82_103_105.fasta Data 62_82_34 Upscale 34 Class 0 Len data scale 2576894 Tot len set 20548892


file 2_22_19.fasta Data 82_103_105 Upscale 105 Class 0 Len data scale 2658810 Tot len set 2658810
file 2_22_19.fasta Data 2_22_19 Upscale 76 Class 1 Len data scale 10416788 Tot len set 13075598
file 2_22_19.fasta Data 22_42_1 Upscale 1 Class 0 Len data scale 22

### Write record

In [75]:

shards = 100

for key in list(temp_range_dict.keys()):  
    index_item = np.random.choice(len(temp_range_dict[key]), size = (len(temp_range_dict[key]),), replace = False)
    shards_range = np.linspace(0,len(temp_range_dict[key]), num = shards)
    
    name_base = key.split('.')[0]
    if not os.path.isdir(os.path.join(BASE_DIR, pub, "Groups_{}/RECORDS_CLASS".format(n_groups-1), name_base)):
        os.mkdir(os.path.join(BASE_DIR, pub, "Groups_{}/RECORDS_CLASS".format(n_groups-1), name_base))
    for shard in range(shards-1):
        name = os.path.join(BASE_DIR, pub, "Groups_{}/RECORDS_CLASS".format(n_groups-1), name_base, str(shard) +".tfrecord")
        print(name)
        print(shard, shards_range[shard], shards_range[shard+1])
        with tf.io.TFRecordWriter(name) as tfrecord:
            for index in index_item[int(shards_range[shard]):int(shards_range[shard+1])] if shard != shard-2 else index_item[int(shards_range[shard]):]:
                idx, ids, _class = temp_range_dict[key][index]
                features = {
                  'class': _float_feature(_class),
                  'seq': _int64_feature(data_list[idx][0][ids])
                  }
                element = tf.train.Example(features = tf.train.Features(feature=features))
                tfrecord.write(element.SerializeToString())

../../data/Combined_data/published/Groups_5/RECORDS_CLASS/82_103_105/0.tfrecord
0 0.0 207564.56565656565
../../data/Combined_data/published/Groups_5/RECORDS_CLASS/82_103_105/1.tfrecord
1 207564.56565656565 415129.1313131313
../../data/Combined_data/published/Groups_5/RECORDS_CLASS/82_103_105/2.tfrecord
2 415129.1313131313 622693.696969697
../../data/Combined_data/published/Groups_5/RECORDS_CLASS/82_103_105/3.tfrecord
3 622693.696969697 830258.2626262626
../../data/Combined_data/published/Groups_5/RECORDS_CLASS/82_103_105/4.tfrecord
4 830258.2626262626 1037822.8282828282
../../data/Combined_data/published/Groups_5/RECORDS_CLASS/82_103_105/5.tfrecord
5 1037822.8282828282 1245387.393939394
../../data/Combined_data/published/Groups_5/RECORDS_CLASS/82_103_105/6.tfrecord
6 1245387.393939394 1452951.9595959596
../../data/Combined_data/published/Groups_5/RECORDS_CLASS/82_103_105/7.tfrecord
7 1452951.9595959596 1660516.5252525252
../../data/Combined_data/published/Groups_5/RECORDS_CLASS/82_103_

## Global Reg

### Load data 

In [11]:
VAL_MOD = 200 
dir_ = os.path.join(BASE_DIR, pub, "Groups_{}/FASTA".format(n_groups-1))
data_list_train = {}
count_all_train = 0
data_list_val = {}
count_all_val = 0
for file in os.listdir(dir_):
    count_train = 0
    count_val = 0
    if file[0] == '.':
        continue
    data_train = {}
    data_val = {}
    for mod, rec in enumerate(SeqIO.parse(os.path.join(dir_, file), "fasta")):
        if mod%VAL_MOD != 0:
            data_train[rec.id] = (to_int(str(rec.seq).upper()), rec.description.split()[-1])
            count_train += 1
        else:
            data_val[rec.id] = (to_int(str(rec.seq).upper()), rec.description.split()[-1])
            count_val += 1
    count_all_train += count_train * int(file.split('.')[0].split('_')[-1])
    count_all_val += count_val * int(file.split('.')[0].split('_')[-1])
    print("{} Train Sequences in file {}".format(count_train * int(file.split('.')[0].split('_')[-1]), file))
    print("{} Val Sequences in file {}".format(count_val * int(file.split('.')[0].split('_')[-1]), file))
    data_list_train[file.split('.')[0]] = (data_train, file.split('.')[0])
    data_list_val[file.split('.')[0]] = (data_val, file.split('.')[0])
    
print("\n")
print("{} Train Sequences in total ".format(count_all_train ))
print("{} Val Sequences in total ".format(count_all_val ))
print("\n")

list_data_set_train = []

for idx, data in enumerate(list(data_list_train.keys())): 
    up_scale = int(data.split('_')[-1])
    for ids in list(data_list_train[data][0].keys()):
        list_data_set_train += [(data, ids) for _ in range(up_scale)]
    print("Train Data {} Upscale {}  Len data scale {} Tot len set".format( data, up_scale, len(data_list_train[data][0].keys())*up_scale), len(list_data_set_train))

list_data_set_val = []
for idx, data in enumerate(list(data_list_val.keys())): 
    up_scale = int(data.split('_')[-1])
    for ids in list(data_list_val[data][0].keys()):
        list_data_set_val+= [(data, ids) for _ in range(up_scale)]
    print("Val Data {} Upscale {}  Len data scale {} Tot len set".format( data,  up_scale, len(data_list_val[data][0].keys())*up_scale), len(list_data_set_val))



2645475 Train Sequences in file 82_103_105.fasta
13335 Val Sequences in file 82_103_105.fasta
2591163 Train Sequences in file 2_22_19.fasta
13034 Val Sequences in file 2_22_19.fasta
2212171 Train Sequences in file 22_42_1.fasta
11117 Val Sequences in file 22_42_1.fasta
2496715 Train Sequences in file 42_62_13.fasta
12558 Val Sequences in file 42_62_13.fasta
2564008 Train Sequences in file 62_82_34.fasta
12886 Val Sequences in file 62_82_34.fasta


12509532 Train Sequences in total 
62930 Val Sequences in total 


Train Data 82_103_105 Upscale 105  Len data scale 2645475 Tot len set 2645475
Train Data 2_22_19 Upscale 19  Len data scale 2591163 Tot len set 5236638
Train Data 22_42_1 Upscale 1  Len data scale 2212171 Tot len set 7448809
Train Data 42_62_13 Upscale 13  Len data scale 2496715 Tot len set 9945524
Train Data 62_82_34 Upscale 34  Len data scale 2564008 Tot len set 12509532
Val Data 82_103_105 Upscale 105  Len data scale 13335 Tot len set 13335
Val Data 2_22_19 Upscale 19  Len 

### Write record

In [21]:
shards = 100
index_item_train = np.random.choice(len(list_data_set_train), size = (len(list_data_set_train),), replace = False)
shards_range_train = np.linspace(0,len(list_data_set_train), num = shards)

# Write training set
if not os.path.isdir(os.path.join(BASE_DIR, pub, "Groups_{}/RECORDS_REG_GLOBAL/train".format(n_groups-1))):
    os.mkdir(os.path.join(BASE_DIR, pub, "Groups_{}/RECORDS_REG_GLOBAL/train".format(n_groups-1)))
for shard in range(shards-1):
    name = os.path.join(BASE_DIR, pub, "Groups_{}/RECORDS_REG_GLOBAL/train".format(n_groups-1), str(shard) +".tfrecord")
    print(name)
    print(shard, shards_range_train[shard], shards_range_train[shard+1])
    with tf.io.TFRecordWriter(name) as tfrecord:
        for index in index_item_train[int(shards_range_train[shard]):int(shards_range_train[shard+1])] if shard != shard-2 else index_item_train[int(shards_range_train[shard]):]:
            idx, ids = list_data_set_train[index]
            features = {
              'class': _float_feature(float(data_list_train[idx][0][ids][1])),
              'seq': _int64_feature(data_list_train[idx][0][ids][0])
              }
            element = tf.train.Example(features = tf.train.Features(feature=features))
            tfrecord.write(element.SerializeToString())



../../data/Combined_data/published/Groups_5/RECORDS_REG_GLOBAL/train/0.tfrecord
0 0.0 126358.90909090909
../../data/Combined_data/published/Groups_5/RECORDS_REG_GLOBAL/train/1.tfrecord
1 126358.90909090909 252717.81818181818
../../data/Combined_data/published/Groups_5/RECORDS_REG_GLOBAL/train/2.tfrecord
2 252717.81818181818 379076.7272727273
../../data/Combined_data/published/Groups_5/RECORDS_REG_GLOBAL/train/3.tfrecord
3 379076.7272727273 505435.63636363635
../../data/Combined_data/published/Groups_5/RECORDS_REG_GLOBAL/train/4.tfrecord
4 505435.63636363635 631794.5454545454
../../data/Combined_data/published/Groups_5/RECORDS_REG_GLOBAL/train/5.tfrecord
5 631794.5454545454 758153.4545454546
../../data/Combined_data/published/Groups_5/RECORDS_REG_GLOBAL/train/6.tfrecord
6 758153.4545454546 884512.3636363636
../../data/Combined_data/published/Groups_5/RECORDS_REG_GLOBAL/train/7.tfrecord
7 884512.3636363636 1010871.2727272727
../../data/Combined_data/published/Groups_5/RECORDS_REG_GLOBAL/

ValueError: not enough values to unpack (expected 3, got 2)

In [14]:
idx, ids = list_data_set_train[0]
print(float(data_list_train[idx][0][ids][1]))

95.0


### Write record val

In [22]:
index_item_val = np.random.choice(len(list_data_set_val), size = (len(list_data_set_val),), replace = False)
shards_range_val = np.linspace(0,len(list_data_set_val), num = shards)

# Write validation set
if not os.path.isdir(os.path.join(BASE_DIR, pub, "Groups_{}/RECORDS_REG_GLOBAL/val".format(n_groups-1))):
    os.mkdir(os.path.join(BASE_DIR, pub, "Groups_{}/RECORDS_REG_GLOBAL/val".format(n_groups-1)))
for shard in range(shards-1):
    name = os.path.join(BASE_DIR, pub, "Groups_{}/RECORDS_REG_GLOBAL/val".format(n_groups-1), str(shard) +".tfrecord")
    print(name)
    print(shard, shards_range_val[shard], shards_range_val[shard+1])
    with tf.io.TFRecordWriter(name) as tfrecord:
        for index in index_item_val[int(shards_range_val[shard]):int(shards_range_val[shard+1])] if shard != shard-2 else index_item_val[int(shards_range_val[shard]):]:
            idx, ids = list_data_set_val[index]
            features = {
              'class': _float_feature(float(data_list_val[idx][0][ids][1])),
              'seq': _int64_feature(data_list_val[idx][0][ids][0])
              }
            element = tf.train.Example(features = tf.train.Features(feature=features))
            tfrecord.write(element.SerializeToString())

../../data/Combined_data/published/Groups_5/RECORDS_REG_GLOBAL/val/0.tfrecord
0 0.0 635.6565656565657
../../data/Combined_data/published/Groups_5/RECORDS_REG_GLOBAL/val/1.tfrecord
1 635.6565656565657 1271.3131313131314
../../data/Combined_data/published/Groups_5/RECORDS_REG_GLOBAL/val/2.tfrecord
2 1271.3131313131314 1906.969696969697
../../data/Combined_data/published/Groups_5/RECORDS_REG_GLOBAL/val/3.tfrecord
3 1906.969696969697 2542.626262626263
../../data/Combined_data/published/Groups_5/RECORDS_REG_GLOBAL/val/4.tfrecord
4 2542.626262626263 3178.2828282828286
../../data/Combined_data/published/Groups_5/RECORDS_REG_GLOBAL/val/5.tfrecord
5 3178.2828282828286 3813.939393939394
../../data/Combined_data/published/Groups_5/RECORDS_REG_GLOBAL/val/6.tfrecord
6 3813.939393939394 4449.59595959596
../../data/Combined_data/published/Groups_5/RECORDS_REG_GLOBAL/val/7.tfrecord
7 4449.59595959596 5085.252525252526
../../data/Combined_data/published/Groups_5/RECORDS_REG_GLOBAL/val/8.tfrecord
8 5085

## Decoding

In [5]:
sys.path.append("../../utils/")

from preprocessing import load_data

config ={
    "base_dir": os.path.join(BASE_DIR,pub,"Groups_5", "RECORDS_REG_GLOBAL"),
    "train_dir": "train/*.tfrecord",
    "val_dir": "val/*.tfrecord"
}

data_train, data_val = load_data(config)
for d in data_train.take(10):
    print(d[0][:10,:], d[1])


# Print 

tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]], shape=(10, 21), dtype=float32) tf.Tensor(50.0, shape=(), dtype=float32)
tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

92.5
12.0
32.0
52.0
72.0


In [12]:
print(os.listdir("../../data/Combined_data/published/"))

FileNotFoundError: [Errno 2] No such file or directory: '../../data/Combined_data/Groups_5/RECORDS'