In [26]:
def encode_sequences_from_file(fname, dirname):
    """ Function to get the encodings for all
        sequences from a fasta file which is aligned.

        This encoding method uses AAIndex to get the
        encodings. The amino acid index used for this
        currently is 'JOND920101' - Relative frequency 
        of occurrence (Jones et al., 1992).

        Parameters
        ----------
        fname: str
            Name of the file from which to read sequences.

        dirname: str
            Name of the file in which 'fname' is present.

        Returns
        -------
        encoded: list
            A list of the encoded sequences.

        log: list
            A log containing any errors encountered.	
    """

    # Define the path to the file
    fpath = dirname + '/' + fname

    # Read file as a set of Quantiprot sequences
    sequences = load_fasta_file(fpath)

    # Create a Feature object
    aa2freq_map = get_aaindex_file("JOND920101")
    aa2freq_map.mapping['-'] = 0.0
    aa2freq_map.mapping['X'] = 0.0
    freq_feat = Feature(aa2freq_map)

    # Encode sequences using relative frequency
    encoded, log = [], []
    for seq in sequences:
        try:
            f = freq_feat(seq)
            encoded.append(f.data)
        except:
            # For logging errors
            e, message, _tb = sys.exc_info()
            log.append(fname + ': line ' + str(e) + ': ' + str(message))
            continue

    return encoded, log

In [1]:
import os
import sys
import click
import pickle
import argparse

from quantiprot.utils.mapping import simplify
from quantiprot.utils.io import load_fasta_file
from quantiprot.metrics.aaindex import get_aaindex_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.utils.sequence import SequenceSet, subset, columns

In [27]:
enc, log = encode_sequences_from_file('Seg2p1_aligned', 'aligned')

In [28]:
np.shape (enc)

(215, 759)

In [29]:
log

[]

In [3]:
sequences = load_fasta_file('aligned/Seg2p2_aligned')

aa2freq_map = get_aaindex_file("JOND920101")
freq_feat = Feature(aa2freq_map)

In [5]:
print freq_feat

Quantiprot Feature object
  'name': JOND920101
  'function': JOND920101
  scanning 'window': 0


In [10]:
m = aa2freq_map

In [14]:
m.mapping['-'] = 0.0

In [15]:
print m

Quantiprot Mapping object
  '__name__': JOND920101
  'mapping': {'A': 0.077, 'C': 0.02, 'E': 0.062, 'D': 0.052, 'G': 0.074, 'F': 0.04, 'I': 0.053, 'H': 0.023, 'K': 0.059, '-': 0.0, 'M': 0.024, 'L': 0.091, 'N': 0.043, 'Q': 0.041, 'P': 0.051, 'S': 0.069, 'R': 0.051, 'T': 0.059, 'W': 0.014, 'V': 0.066, 'Y': 0.032}
  'default': None
  'misc': {'journal': 'CABIOS 8, 275-282 (1992)', 'title': 'The rapid generation of mutation data matrices from protein sequences', 'index_id': 'JOND920101', 'description': 'Relative frequency of occurrence (Jones et al., 1992)', 'authors': 'Jones, D.T., Taylor, W.R. and Thornton, J.M.'}


In [16]:
freq_feat = Feature(m)

In [30]:
x = freq_feat(sequences[99])

In [31]:
print x

Quantiprot Sequence object
  'identifier': tr|ASSEM0018_ID0034p2_A/ma468-G1-1/2014(H5N8)_Seg2p2
  'feature': JOND920101
  'data': [0.024, 0.062, 0.041, 0.062, 0.041, 0.052, 0.059, 0.051, 0.014, 0.059, 0.041, 0.069, 0.059, 0.062, 0.051, 0.053, 0.043, 0.059, 0.041, 0.059, 0.051, 0.062, 0.069, 0.074, 0.041, 0.041, 0.059, 0.051, 0.051, 0.091, 0.062, 0.023, 0.051, 0.043, 0.069, 0.059, 0.041, 0.091, 0.024, 0.052, 0.023, 0.032, 0.091, 0.051, 0.053, 0.059, 0.069, 0.041, 0.077, 0.052, 0.024, 0.023, 0.059, 0.051, 0.053, 0.066, 0.02, 0.014, 0.059, 0.041, 0.014, 0.091, 0.04, 0.091, 0.059, 0.069, 0.051, 0.059, 0.041, 0.074, 0.069, 0.091, 0.059, 0.059, 0.023, 0.066, 0.091, 0.059, 0.041, 0.014, 0.059, 0.091, 0.04, 0.043, 0.059, 0.041, 0.062, 0.014, 0.059, 0.043]


In [34]:
directory = 'aligned'

files = os.listdir(directory)

# Encode all sequences in all files
# in the given directory
full_log, enc_seqs = [], []
for fname in files:
    enc, log = encode_sequences_from_file(fname, directory)
    enc_seqs.append(enc)
    full_log += log

In [35]:
full_log

["Seg5p1_aligned: line <type 'exceptions.KeyError'>: 'B'"]

In [36]:
np.shape(enc_seqs)

(13,)

In [37]:
def pad_sequences(seqs):
	lengths = [len(x) for x in seqs]
	m = max(lengths)

	for i in range(len(seqs)):
		while (len(seqs[i]) < max):
			seqs[i].append(0.)

	return seqs

In [38]:
padded = pad_sequences(enc_seqs)

KeyboardInterrupt: 

In [39]:
for a in enc_seqs:
    print np.shape(a)

MemoryError: 