In [10]:
import os
import sys
import click
import pickle
import argparse
import numpy as np
import pandas as pd

from quantiprot.utils.mapping import simplify
from quantiprot.utils.io import load_fasta_file
from quantiprot.metrics.aaindex import get_aaindex_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.utils.sequence import SequenceSet, subset, columns

In [2]:
def get_feature_map(index='JOND920101'):
    """Build a quantiprot ``Feature`` from an AAindex entry.

    The entry named by ``index`` is fetched with ``get_aaindex_file``. Before
    it is wrapped in a ``Feature``, the '-' symbol (presumably the alignment
    gap character -- confirm against the input data) is registered in the
    entry's mapping with the value 0.0.

    Returns the resulting ``Feature`` object.
    """
    index_entry = get_aaindex_file(index)

    # Register '-' on the fetched entry's mapping before wrapping it.
    index_entry.mapping['-'] = 0.0

    return Feature(index_entry)

In [20]:
# Load the FASTA file (path is relative to the working directory); the printed
# summary below shows the result is a quantiprot SequenceSet.
f = load_fasta_file('data/prep/Seg1p1_prep')
# Build the feature encoder using get_feature_map's default AAindex entry.
feat_map = get_feature_map()

In [8]:
# Show the text summaries of the feature encoder and of the loaded sequences.
# The parenthesised single-argument form prints the same text under Python 2
# and also runs under Python 3, unlike the bare print statement.
print(feat_map)
print(f)

Quantiprot Feature object
  'name': JOND920101
  'function': JOND920101
  scanning 'window': 0
Quantiprot SequenceSet object
  'name': data/prep/Seg1p1_prep
  'unique': True
  number of sequences: 215


In [24]:
# Pull every sequence out of the set by position, then read off its identifier
dataset = [f[pos] for pos in range(len(f))]
names = [record.identifier for record in dataset]

In [30]:
# Keep the first sequence as a sample for spot-checking the encoder.
samp = dataset[0]

In [15]:
# Ensure no anomalies exist in dataframe
# NOTE(review): `df` is not assigned in any cell visible in this notebook, and
# this cell's execution count (15) is out of sequence with its neighbours. On a
# fresh kernel this likely raises NameError -- confirm where `df` comes from.
from preprocess_align import check_anomaly
assert not check_anomaly(df)

In [35]:
# Encode the sample sequence and show the resulting per-position values.
# Parenthesised single-argument print works under both Python 2 and Python 3.
print(feat_map(samp).data)

[0.024, 0.059, 0.091, 0.077, 0.059, 0.053, 0.062, 0.091, 0.091, 0.059, 0.041, 0.091, 0.091, 0.051, 0.052, 0.043, 0.062, 0.077, 0.059, 0.059, 0.066, 0.091, 0.059, 0.041, 0.059, 0.059, 0.066, 0.052, 0.041, 0.032, 0.043, 0.053, 0.053, 0.051, 0.059, 0.04, 0.043, 0.059, 0.069, 0.051, 0.053, 0.062, 0.059, 0.043, 0.051, 0.069, 0.091, 0.051, 0.024, 0.059, 0.014, 0.077, 0.024, 0.02, 0.069, 0.043, 0.04, 0.051, 0.091, 0.077, 0.091, 0.059, 0.059, 0.074, 0.052, 0.024, 0.077, 0.043, 0.051, 0.053, 0.051, 0.091, 0.062, 0.032, 0.059, 0.074, 0.053, 0.041, 0.091, 0.059, 0.059, 0.043, 0.077, 0.062, 0.052, 0.053, 0.074, 0.059, 0.059, 0.074, 0.041, 0.024, 0.02, 0.069, 0.053, 0.077, 0.077, 0.066, 0.059, 0.014, 0.014, 0.043, 0.059, 0.032, 0.074, 0.051, 0.053, 0.074, 0.052, 0.059, 0.062, 0.074, 0.04, 0.062, 0.051, 0.066, 0.032, 0.062, 0.069, 0.04, 0.04, 0.091, 0.051, 0.059, 0.024, 0.051, 0.091, 0.052, 0.043, 0.077, 0.059, 0.014, 0.074, 0.051, 0.053, 0.059, 0.04, 0.074, 0.051, 0.066, 0.062, 0.051, 0.066, 0.051,

In [38]:
# Encode every sequence; results are keyed by the sequence identifier.
enc = {record.identifier: feat_map(record).data for record in dataset}

In [43]:
# Inspect the encoded values of one sequence, looked up by its identifier.
enc['tr|ASSEM0001_DQ208309p1_A/BrevigMission/1/1918(H1N1)_Seg1p1']

[0.0,
 0.0,
 0.024,
 0.062,
 0.051,
 0.053,
 0.059,
 0.062,
 0.091,
 0.051,
 0.052,
 0.091,
 0.024,
 0.069,
 0.041,
 0.069,
 0.051,
 0.059,
 0.051,
 0.062,
 0.053,
 0.091,
 0.059,
 0.059,
 0.059,
 0.059,
 0.066,
 0.052,
 0.023,
 0.024,
 0.077,
 0.053,
 0.053,
 0.059,
 0.059,
 0.032,
 0.059,
 0.069,
 0.074,
 0.051,
 0.041,
 0.062,
 0.059,
 0.043,
 0.051,
 0.077,
 0.091,
 0.051,
 0.024,
 0.059,
 0.014,
 0.024,
 0.024,
 0.077,
 0.024,
 0.059,
 0.032,
 0.051,
 0.053,
 0.059,
 0.077,
 0.052,
 0.059,
 0.051,
 0.053,
 0.024,
 0.062,
 0.024,
 0.053,
 0.051,
 0.062,
 0.051,
 0.043,
 0.062,
 0.041,
 0.074,
 0.041,
 0.059,
 0.091,
 0.014,
 0.069,
 0.059,
 0.059,
 0.043,
 0.052,
 0.077,
 0.074,
 0.069,
 0.052,
 0.051,
 0.066,
 0.024,
 0.066,
 0.069,
 0.051,
 0.091,
 0.077,
 0.066,
 0.059,
 0.014,
 0.014,
 0.043,
 0.051,
 0.043,
 0.074,
 0.051,
 0.059,
 0.059,
 0.069,
 0.077,
 0.066,
 0.023,
 0.032,
 0.051,
 0.059,
 0.053,
 0.032,
 0.059,
 0.059,
 0.032,
 0.04,
 0.062,
 0.059,
 0.066,
 0.062,
 0.05

In [44]:
# Length of each encoded sequence.
lens = list(map(len, enc.values()))

In [45]:
# Longest encoded sequence length.
max(lens)

772

In [46]:
from keras.preprocessing.sequence import pad_sequences

In [51]:
# Pad every encoded sequence to one common length.
# The target length is derived from the data rather than hard-coded: 772 was
# just the longest sequence in this particular input, so a literal would
# silently truncate (or over-pad) when the notebook is run on another file.
max_len = max([len(values) for values in enc.values()] or [0])

test = {}
for key, values in enc.items():
    # pad_sequences works on a batch, so wrap the single sequence in a list
    # and take the single padded row back out.
    padded = pad_sequences([values], maxlen=max_len, dtype='float32',
                           padding='pre', truncating='pre', value=0.0)
    test[key] = padded[0]

In [52]:
# Number of padded sequences.
len(test)

215

In [54]:
# Check the shape of the padded vector for one sequence.
test['tr|ASSEM0001_DQ208309p1_A/BrevigMission/1/1918(H1N1)_Seg1p1'].shape

(772,)