In [1]:
import os
import sys
import click
import pickle
import argparse
import numpy as np
import pandas as pd

from keras.preprocessing.sequence import pad_sequences

from quantiprot.utils.mapping import simplify
from quantiprot.utils.io import load_fasta_file
from quantiprot.metrics.aaindex import get_aaindex_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.utils.sequence import SequenceSet, subset, columns

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def get_feature_map(index='JOND920101'):
    """ Build a quantiprot Feature from an amino acid index.

        The index named by `index` is loaded with get_aaindex_file,
        the '-' character is added to its mapping with value 0.0,
        and the result is wrapped in a Feature object.
    """
    index_mapping = get_aaindex_file(index)

    # Give '-' a value of 0.0 in the index mapping
    index_mapping.mapping['-'] = 0.0

    return Feature(index_mapping)

In [3]:
# Load one FASTA file from data/prep (printed below as a quantiprot SequenceSet)
f = load_fasta_file('data/prep/Seg1p1_prep')
# Build the feature mapper with the default amino acid index (JOND920101)
feat_map = get_feature_map()

In [4]:
# Show the text summaries of the feature mapper and the loaded sequence set.
# The parenthesised single-argument form prints the same text under
# Python 2 and is also valid Python 3.
print(feat_map)
print(f)

Quantiprot Feature object
  'name': JOND920101
  'function': JOND920101
  scanning 'window': 0
Quantiprot SequenceSet object
  'name': data/prep/Seg1p1_prep
  'unique': True
  number of sequences: 215


In [5]:
# Collect the sequences, then their identifiers, as two parallel lists
dataset = [f[idx] for idx in range(len(f))]
names = [record.identifier for record in dataset]

In [6]:
# Map each sequence identifier to the feature values of that sequence
enc = {record.identifier: feat_map(record).data for record in dataset}

In [7]:
# Display the encoded values stored for one sequence identifier
enc['tr|ASSEM0001_DQ208309p1_A/BrevigMission/1/1918(H1N1)_Seg1p1']

[0.0,
 0.0,
 0.024,
 0.062,
 0.051,
 0.053,
 0.059,
 0.062,
 0.091,
 0.051,
 0.052,
 0.091,
 0.024,
 0.069,
 0.041,
 0.069,
 0.051,
 0.059,
 0.051,
 0.062,
 0.053,
 0.091,
 0.059,
 0.059,
 0.059,
 0.059,
 0.066,
 0.052,
 0.023,
 0.024,
 0.077,
 0.053,
 0.053,
 0.059,
 0.059,
 0.032,
 0.059,
 0.069,
 0.074,
 0.051,
 0.041,
 0.062,
 0.059,
 0.043,
 0.051,
 0.077,
 0.091,
 0.051,
 0.024,
 0.059,
 0.014,
 0.024,
 0.024,
 0.077,
 0.024,
 0.059,
 0.032,
 0.051,
 0.053,
 0.059,
 0.077,
 0.052,
 0.059,
 0.051,
 0.053,
 0.024,
 0.062,
 0.024,
 0.053,
 0.051,
 0.062,
 0.051,
 0.043,
 0.062,
 0.041,
 0.074,
 0.041,
 0.059,
 0.091,
 0.014,
 0.069,
 0.059,
 0.059,
 0.043,
 0.052,
 0.077,
 0.074,
 0.069,
 0.052,
 0.051,
 0.066,
 0.024,
 0.066,
 0.069,
 0.051,
 0.091,
 0.077,
 0.066,
 0.059,
 0.014,
 0.014,
 0.043,
 0.051,
 0.043,
 0.074,
 0.051,
 0.059,
 0.059,
 0.069,
 0.077,
 0.066,
 0.023,
 0.032,
 0.051,
 0.059,
 0.053,
 0.032,
 0.059,
 0.059,
 0.032,
 0.04,
 0.062,
 0.059,
 0.066,
 0.062,
 0.05

In [8]:
# Length of every encoded sequence
lens = list(map(len, enc.values()))

In [9]:
# Length of the longest encoded sequence
max(lens)

772

In [10]:
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [11]:
# Pad every encoding at the front with 0.0 up to the longest encoded length.
# The length is derived from the data instead of being hard-coded, so a
# different input with longer sequences is not silently truncated.
pad_len = max(len(val) for val in enc.values()) if enc else 0

test = {}
for key in enc.keys():
    val = enc[key]
    # pad_sequences is given a batch, so wrap the single sequence as one row
    val = np.reshape(val, (1, len(val)))
    val = pad_sequences(val, maxlen=pad_len, dtype='float32', padding='pre', truncating='pre', value=0.0)
    val = val[0]
    test[key] = val

In [12]:
# Number of padded encodings
len(test)

215

In [13]:
# Shape of the padded encoding stored for one sequence identifier
test['tr|ASSEM0001_DQ208309p1_A/BrevigMission/1/1918(H1N1)_Seg1p1'].shape

(772,)

In [15]:
# Every padded encoding must have exactly 772 values
assert all(len(padded) == 772 for padded in test.values())

In [16]:
def pad_encoding(enc, pad_len):
    """ Pad each encoded sequence in `enc` to `pad_len` values.

        Padding (with 0.0) and truncation both happen at the front of
        the sequence. The dictionary is modified in place: each value
        is replaced by the padded float32 array. The same dictionary
        is returned.
    """
    for identifier in enc.keys():
        values = enc[identifier]
        # pad_sequences is given a batch, so wrap the sequence as one row
        batch = np.reshape(values, (1, len(values)))
        padded = pad_sequences(
            batch,
            maxlen=pad_len,
            dtype='float32',
            padding='pre',
            truncating='pre',
            value=0.0,
        )
        # Unwrap the single padded row again
        enc[identifier] = padded[0]

    return enc

In [21]:
def encoded_seq_from_file(fname, dirname):
    """ Encode every sequence of one FASTA file and pad the encodings
        to a common length.

        Parameters:
            fname: name of the FASTA file inside `dirname`.
            dirname: directory that contains the file.

        Returns:
            A dict mapping each sequence identifier to its encoded
            values, all padded (via pad_encoding) to the length of the
            longest encoding in this file. An empty dict is returned
            when the file holds no sequences.
    """
    import os  # local import keeps the function self-contained

    sequences = load_fasta_file(os.path.join(dirname, fname))
    feat_map = get_feature_map()

    # Encode each sequence under its identifier. The previous version
    # bound `dataset` to a tuple (`[], []`) and appended to a `names`
    # list that was never defined inside the function.
    enc = {}
    for i in range(len(sequences)):
        seq = sequences[i]
        enc[seq.identifier] = feat_map(seq).data

    # Nothing to pad; max() below would raise on an empty collection
    if not enc:
        return enc

    maxlen = max(len(val) for val in enc.values())
    enc = pad_encoding(enc, maxlen)

    # Sanity check: every encoding now has the common length
    for val in enc.values():
        assert len(val) == maxlen

    return enc

In [22]:
# Encode every file found in data/prep.
# NOTE: os.listdir returns entries in arbitrary order, so the position of
# a given file in `encs` is not guaranteed.
encs = [encoded_seq_from_file(fname, 'data/prep')
        for fname in os.listdir('data/prep')]

In [28]:
# Longest padded encoding over all files. Every value is inspected instead of
# indexing `enc.values()[0]`, which only works on Python 2 lists, fails on an
# empty encoding, and silently assumes all values in a file share one length.
global_max_len = max(len(val) for enc in encs for val in enc.values())

In [29]:
# Display the maximum encoding length computed from `encs`
global_max_len

772

In [30]:
# Sequence identifiers of each encoded file, in the same order as `encs`
full_keys = [enc.keys() for enc in encs]

In [33]:
# Number of identifiers in the third entry of full_keys
len(full_keys[2])

215