## Feature Generation

To generate the features from the Influenza virus strains, the following steps are taken:
1. Extract the specific segments from each of the different viruses and store them into separate segment files.
2. Align each segment file using Clustal Omega and store back as a fasta file.
3. Preprocess each alignment file by replacing any unknown symbols except '-' with an 'X'.
4. For each alignment file, encode the sequences from the file according to AAIndex (Relative frequency of occurrence) using Quantiprot.
5. Gather the encoded sequences from each alignment file into a dictionary whose keys are the sequence identifiers in the file, i.e. the Influenza strains.
6. Recombine the sequences corresponding to each segment for each Influenza strain.
7. For each virus strain, concatenate its sequences to form a large embedding (~10e4).
8. Use an autoencoder to reduce the dimensionality to around 100 features: train it once on all samples, then use its encoder to encode these sequences.

**Setting the environment**

In [1]:
import os
import sys
import click
import pickle
import argparse
import numpy as np

from quantiprot.utils.mapping import simplify
from quantiprot.utils.io import load_fasta_file
from quantiprot.metrics.aaindex import get_aaindex_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.utils.sequence import SequenceSet, subset, columns

In [2]:
def encode_sequences_from_file(fname, dirname):
    """ Function to get the encodings for all
        sequences from a fasta file which is aligned.

        This encoding method uses AAIndex to get the
        encodings. The amino acid index used for this
        currently is 'JOND920101' - Relative frequency
        of occurrence (Jones et al., 1992). The gap
        symbol '-' and the unknown symbol 'X' are both
        encoded as 0.0.

        Parameters
        ----------
        fname: str
            Name of the file from which to read sequences.

        dirname: str
            Name of the directory in which 'fname' is present.

        Returns
        -------
        encoded: list
            A list of the encoded sequences. Sequences that
            fail to encode are left out.

        log: list
            A log containing any errors encountered, one
            message per failed sequence in the form
            '<fname>: <sequence id>: <error type>: <error>'.
    """

    # Define the path to the file
    fpath = os.path.join(dirname, fname)

    # Read file as a set of Quantiprot sequences
    sequences = load_fasta_file(fpath)

    # Create a Feature object; '-' and 'X' are added to the
    # mapping by hand so that they encode as 0.0
    aa2freq_map = get_aaindex_file("JOND920101")
    aa2freq_map.mapping['-'] = 0.0
    aa2freq_map.mapping['X'] = 0.0
    freq_feat = Feature(aa2freq_map)

    # Encode sequences using relative frequency
    encoded, log = [], []
    for seq in sequences:
        try:
            encoded.append(freq_feat(seq).data)
        except Exception as err:
            # Best effort: record which sequence failed and why,
            # then carry on with the next one. Only Exception is
            # caught so Ctrl-C / SystemExit still stop the run.
            seq_id = getattr(seq, 'identifier', '<unknown>')
            log.append(fname + ': ' + str(seq_id) + ': ' +
                       type(err).__name__ + ': ' + str(err))

    return encoded, log

Encode the sequences for each alignment file.

In [3]:
dirname = 'aligned'
files = os.listdir(dirname)

# Walk over every alignment file in the directory, keeping one
# list of encodings per file and pooling all error messages
enc_seqs = []
full_log = []
for fname in files:
    enc, log = encode_sequences_from_file(fname, dirname)
    full_log.extend(log)
    enc_seqs.append(enc)

In [None]:
# NOTE(review): `log` is the value left over from the last file
# processed in the loop above; the combined log is `full_log`.
np.shape(log)

In [None]:
# Display the error log of the last file processed (see `full_log`
# for the errors of all files).
log

In [None]:
# Inspect the overall shape of the collected encodings
# (enc_seqs holds one entry per alignment file).
np.shape(enc_seqs)

In [None]:
# Show the shape of each file's list of encodings
for per_file in enc_seqs:
    print(np.shape(per_file))

In [None]:
# Display the error messages pooled from all files
full_log

In [None]:
# Turn each file's list of encodings into a NumPy array, in place
for i, per_file in enumerate(enc_seqs):
    enc_seqs[i] = np.array(per_file)

In [None]:
# Display the encoded array of the first file
enc_seqs[0]

In [None]:
# Second-dimension size of every file's array
sizes = [a.shape[1] for a in enc_seqs]

In [None]:
# Largest second-dimension size found over all files
m = max(sizes)

In [None]:
# Display the maximum size
m

In [None]:
# Keep a reference to the first file's array as a sample to experiment on
samp = enc_seqs[0]

In [None]:
# Display the sample array
samp

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Right-pad the sample with zeros up to the widest alignment (m, computed
# above) instead of a hard-coded width: pad_sequences truncates anything
# longer than maxlen, so a stale literal would silently drop data.
s = pad_sequences(samp, maxlen=m, dtype='float32', padding='post', value=0.0)

In [None]:
# Display the padded sample
s

In [None]:
# Right-pad every file's array with zeros to the common width m (the
# widest alignment, computed above). Using m instead of a hard-coded
# width guarantees pad_sequences never truncates a longer alignment.
for i, per_file in enumerate(enc_seqs):
    enc_seqs[i] = pad_sequences(per_file, maxlen=m, dtype='float32',
                                padding='post', value=0.0)

In [None]:
# Display samp again after the padding loop. NOTE(review): the loop rebinds
# enc_seqs[0], so samp presumably still shows the unpadded array - confirm.
samp

In [None]:
# Inspect the overall shape of enc_seqs after padding
np.shape(enc_seqs)

In [None]:
# Display the padded array at index 8; which file this corresponds to
# depends on the order returned by os.listdir above.
enc_seqs[8]

In [None]:
# Show the shape of every padded array
for a in enc_seqs:
    print(a.shape)

In [None]:
# Shape of the second entry of the first file's array
np.shape(enc_seqs[0][1])

In [None]:
# Scratch cell: build a plain Python list of 772 zeros
list(np.zeros((772,)))

In [None]:
# Load one alignment file to look at its sequence objects directly
sequences = load_fasta_file('aligned/Seg2p1_aligned')

In [None]:
# Print the loaded sequence set
print(sequences)

In [None]:
# Print the first sequence of the set
print(sequences[0])

In [None]:
# Collect the identifier of every loaded sequence
ids = [s.identifier for s in sequences]

In [None]:
# Display the raw identifiers
ids

In [None]:
def get_identifiers(fpath):
    """ Function to get the trimmed identifiers of all
        sequences in a fasta file.

        Each identifier is trimmed in two steps: the last
        '_'-separated token is dropped, then the first
        '|'-separated field is dropped and the remaining
        fields are joined without a separator.
        NOTE(review): presumably this removes a segment
        suffix and a leading tag so that only the strain
        name is left - confirm against the fasta headers.

        The loaded sequence set is also printed.

        Parameters
        ----------
        fpath: str
            Path of the fasta file to read.

        Returns
        -------
        ids: list
            A list with one trimmed identifier per sequence.
    """
    sequences = load_fasta_file(fpath)

    print(sequences)

    ids = []
    for record in sequences:
        # Everything before the last '_' (empty if there is none)
        head = '_'.join(record.identifier.split('_')[:-1])
        # Everything after the first '|', with further '|' removed
        ids.append(''.join(head.split('|')[1:]))

    return ids

In [None]:
# Trimmed identifiers of the Seg2p1 alignment
i1 = get_identifiers('aligned/Seg2p1_aligned')

In [None]:
# Display the Seg2p1 identifiers
i1

In [None]:
# Trimmed identifiers of the Seg3p1 alignment
i2 = get_identifiers('aligned/Seg3p1_aligned')

In [None]:
# Display the Seg3p1 identifiers
i2

In [None]:
# Identifiers that occur in both files
set(i1).intersection(i2)

In [None]:
# First identifier of the Seg2p1 file
i1[0]

In [None]:
# First identifier of the Seg3p1 file
i2[0]

In [None]:
# Check whether the two files start with the same identifier
i1[0] == i2[0]

In [None]:
# Gather the trimmed identifiers of every file in 'aligned',
# printing each file name as it is processed
r = []
for entry in os.listdir('aligned'):
    print(entry)
    r.append(get_identifiers('aligned/' + entry))

In [None]:
# Inspect the overall shape of the per-file identifier lists
np.shape(r)

In [None]:
# Show how many identifiers each file contributed
for id_list in r:
    print(np.shape(id_list))

In [None]:
# Display the Seg2p1 identifiers again
i1


In [None]:
# Display the Seg3p1 identifiers again
i2