# Create Feature Matrix
This notebook serves as a guide to compiling a complete feature matrix using various computational tools.

**TODO:** Insert figure from paper here.

In [2]:
%matplotlib inline
import pandas as pd
import matplotlib as plt
import numpy as np
import os
import sys
import re
import subprocess
import itertools
from tqdm import tqdm
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

In [3]:
# ROOT_DIR - root directory.
# Derived from the current working directory at the time this cell runs, so
# run it before anything changes the working directory.
ROOT_DIR = os.getcwd()+'/'

# DATA_DIR - directory where feature dataframes are saved
DATA_DIR = ROOT_DIR + 'dataframes/'

# FASTA_DIR - directory where fasta files are saved
FASTA_DIR = ROOT_DIR + 'fasta_files/'

# MISC_DIR - directory where all other files are saved
MISC_DIR = ROOT_DIR + 'misc_files/'

# SOFTWARE_DIR - directory where all software is stored
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
SOFTWARE_DIR = '/home/anand/Documents/HT_Expression/prediction_software/'

# Table of Contents

1. [Initialize Data](#Initialize-Data)
2. [Generate Features](#Generate-Features)
    1. [mRNA Properties](#mRNA-Properties)
        1. [Codon Counts](#Codon-Counts)
        2. [SD Sequences](#SD-Sequences)
        3. [tRNA Adaptation Index](#tRNA-Adaptation-Index)
        4. [Folding Energy](#Folding-Energy)
    2. [Physical Properties](#Physical-Properties)
3. [Compile Features](#Compile-Features)

# Initialize Data 
[Back to Top](#Table-of-Contents)

In [3]:
prest_file = DATA_DIR+'prEST_conc_data.csv'
DF_raw_prest = pd.read_csv(prest_file,index_col=0)
print 'Number of entries:', len(DF_raw_prest)
DF_raw_prest.head()

Number of entries: 46521


Unnamed: 0,prest_id,uniprot_id,conc_cf,aa_seq,nt_seq,aa_len
0,140095,G3V3N0,4.3075,IMTAPSSFEQFKVAMNYLQLYNVPDCLEDIQDADCSSSKCSSSASS...,GACAAGCTTGCGGCCGCAATTATGACAGCTCCCTCCAGTTTTGAGC...,139
1,140099,G3V537,2.9154,TYYAWKHELLGSGTCPALPPREVLGMEELEKLPEEQVAEEELECSA...,GACAAGCTTGCGGCCGCAACCTACTATGCCTGGAAGCATGAGCTGC...,144
2,140225,P12724,1.4877,SLHARPPQFTRAQWFAIQHISLNPPRCTIAMRAINNYRWRCKNQNT...,GACAAGCTTGCGGCCGCATCACTCCATGCCAGACCCCCACAGTTTA...,136
3,140235,H0YH02,6.7224,ARALNESKRVNNGNTAPEDSSPAKKTRRCQRQESKKMPVAGGKANK...,GACAAGCTTGCGGCCGCAGCGAGAGCATTAAATGAAAGCAAAAGAG...,123
4,140309,F5GYC5,3.3848,HRKEPGARLEATRGAARPHKQGTKPMITRPSVSQLGEGKCPSSQHL...,GACAAGCTTGCGGCCGCACATCGGAAAGAGCCTGGGGCAAGGCTGG...,124


Some prESTs were expressed in duplicate experiments. Merge these results so each sequence has a single concentration.

In [4]:
print 'Number of total entries:',len(DF_raw_prest)
print 'Number of unique prESTs:',len(DF_raw_prest.aa_seq.unique())

Number of total entries: 46521
Number of unique prESTs: 45206


In [5]:
# Collapse rows that share an amino-acid sequence: the first row of each group
# is kept and receives the group's mean concentration, the rest are dropped.
DF_prest = DF_raw_prest.copy()
dropped = []
for _, duplicates in DF_raw_prest.groupby('aa_seq'):
    if len(duplicates) < 2:
        # Sequence measured only once: nothing to merge
        continue
    keep_label = duplicates.index[0]
    extra_labels = duplicates.index[1:]
    DF_prest.loc[keep_label,'conc_cf'] = duplicates['conc_cf'].mean()
    dropped.extend(extra_labels.tolist())

# Remove the redundant rows and renumber the index from 0
DF_prest = DF_prest.drop(dropped).reset_index(drop=True)

In [6]:
print 'Final number of prESTs:',len(DF_prest)
DF_prest.head()

Final number of prESTs: 45206


Unnamed: 0,prest_id,uniprot_id,conc_cf,aa_seq,nt_seq,aa_len
0,140095,G3V3N0,4.3075,IMTAPSSFEQFKVAMNYLQLYNVPDCLEDIQDADCSSSKCSSSASS...,GACAAGCTTGCGGCCGCAATTATGACAGCTCCCTCCAGTTTTGAGC...,139
1,140099,G3V537,2.9154,TYYAWKHELLGSGTCPALPPREVLGMEELEKLPEEQVAEEELECSA...,GACAAGCTTGCGGCCGCAACCTACTATGCCTGGAAGCATGAGCTGC...,144
2,140225,P12724,1.4877,SLHARPPQFTRAQWFAIQHISLNPPRCTIAMRAINNYRWRCKNQNT...,GACAAGCTTGCGGCCGCATCACTCCATGCCAGACCCCCACAGTTTA...,136
3,140235,H0YH02,6.7224,ARALNESKRVNNGNTAPEDSSPAKKTRRCQRQESKKMPVAGGKANK...,GACAAGCTTGCGGCCGCAGCGAGAGCATTAAATGAAAGCAAAAGAG...,123
4,140309,F5GYC5,3.3848,HRKEPGARLEATRGAARPHKQGTKPMITRPSVSQLGEGKCPSSQHL...,GACAAGCTTGCGGCCGCACATCGGAAAGAGCCTGGGGCAAGGCTGG...,124


Save the dataframe

In [7]:
# Write the merged table (index included) so later sections can reload it
# with index_col=0 without re-running the merge.
DF_prest.to_csv(DATA_DIR+'DF_prest.csv')

# Generate Features
[Back to Top](#Table-of-Contents)

## mRNA Properties
[Back to Top](#Table-of-Contents)

In [4]:
# Start the mRNA feature table from the saved, de-duplicated prEST table;
# the first CSV column is used as the index.
DF_mRNA_features = pd.read_csv(DATA_DIR+'DF_prest.csv',index_col=0)

In [5]:
# Directory where MFold outputs are stored
MFOLD_DIR = MISC_DIR+'mfold_out/'

# Directory that contains the tAI software.
# Built from SOFTWARE_DIR so the machine-specific software location is defined
# in exactly one place (the resulting path is unchanged).
TAI_DIR = SOFTWARE_DIR+'tAI/codonR/'

In [6]:
# Export fasta files for all mRNA sequences and their first 40 nts

nt_fasta_dir = FASTA_DIR+'nt_sequences/'

# Make sure the output directory exists before writing into it
if not os.path.isdir(nt_fasta_dir):
    os.makedirs(nt_fasta_dir)

def _write_fasta_if_missing(records, path):
    """Write one SeqRecord (or a list of them) to ``path`` in FASTA format.

    Existing files are left untouched so the cell can be re-run cheaply.
    """
    if not os.path.isfile(path):
        with open(path,'w') as f:
            SeqIO.write(records,f,'fasta')

sequences = []
sequences40 = []

for i,row in tqdm(DF_mRNA_features.iterrows(), total=len(DF_mRNA_features)):

    prest_id = str(row.prest_id)
    mrna = Seq(row.nt_seq).transcribe()

    seq = SeqRecord(mrna, id=prest_id, description='prEST #'+prest_id)
    seq40 = SeqRecord(mrna[:40], id=prest_id, description='prEST #'+prest_id)

    # Always collect the records, even when the per-sequence file already
    # exists: the combined FASTA files below must contain every prEST in
    # dataframe order, otherwise a re-run would produce incomplete files.
    sequences.append(seq)
    sequences40.append(seq40)

    # Export FASTA files for all mRNA sequences
    _write_fasta_if_missing(seq, nt_fasta_dir+prest_id+'.fasta')

    # Export FASTA files for first 40 nts of all mRNA sequences
    _write_fasta_if_missing(seq40, nt_fasta_dir+prest_id+'_40nt.fasta')

# Combined files (one record per prEST, same order as DF_mRNA_features)
_write_fasta_if_missing(sequences, FASTA_DIR+'all_nt.fasta')
_write_fasta_if_missing(sequences40, FASTA_DIR+'all_nt_40.fasta')



### Codon Counts
[Back to Top](#Table-of-Contents)

The RNA sequences have some extra nucleotides at their head/tail, which need to be removed before selecting the codons

In [7]:
# For every prEST, locate its amino-acid sequence inside the frame-0
# translation of the nucleotide sequence and keep only the matching codons.
seqs = []
unmatched = []
for i,row in tqdm(DF_mRNA_features.iterrows(), total=len(DF_mRNA_features)):
    trans_seq = Seq(row.nt_seq).translate()
    pos = trans_seq.find(row.aa_seq)
    if pos < 0:
        # find() returns -1 when aa_seq is absent; using that as a slice
        # offset would silently yield a wrong sequence, so record it instead.
        unmatched.append(row.prest_id)
        seqs.append(None)
        continue
    # Amino-acid position -> nucleotide position (3 nt per codon)
    seqs.append(row.nt_seq[pos*3:(pos+len(row.aa_seq))*3])

if unmatched:
    raise ValueError('aa_seq not found in the translation of nt_seq for %d prEST(s), e.g. prest_id %s'
                     % (len(unmatched), unmatched[:10]))

DF_mRNA_features.loc[:,'true_nt_seq'] = seqs



In [8]:
# Preview the first rows, now including the true_nt_seq column
DF_mRNA_features.head()

Unnamed: 0,prest_id,uniprot_id,conc_cf,aa_seq,nt_seq,aa_len,true_nt_seq
0,140095,G3V3N0,4.3075,IMTAPSSFEQFKVAMNYLQLYNVPDCLEDIQDADCSSSKCSSSASS...,GACAAGCTTGCGGCCGCAATTATGACAGCTCCCTCCAGTTTTGAGC...,139,ATTATGACAGCTCCCTCCAGTTTTGAGCAGTTTAAAGTGGCAATGA...
1,140099,G3V537,2.9154,TYYAWKHELLGSGTCPALPPREVLGMEELEKLPEEQVAEEELECSA...,GACAAGCTTGCGGCCGCAACCTACTATGCCTGGAAGCATGAGCTGC...,144,ACCTACTATGCCTGGAAGCATGAGCTGCTGGGCTCTGGCACCTGCC...
2,140225,P12724,1.4877,SLHARPPQFTRAQWFAIQHISLNPPRCTIAMRAINNYRWRCKNQNT...,GACAAGCTTGCGGCCGCATCACTCCATGCCAGACCCCCACAGTTTA...,136,TCACTCCATGCCAGACCCCCACAGTTTACGAGGGCTCAGTGGTTTG...
3,140235,H0YH02,6.7224,ARALNESKRVNNGNTAPEDSSPAKKTRRCQRQESKKMPVAGGKANK...,GACAAGCTTGCGGCCGCAGCGAGAGCATTAAATGAAAGCAAAAGAG...,123,GCGAGAGCATTAAATGAAAGCAAAAGAGTTAATAATGGCAACACGG...
4,140309,F5GYC5,3.3848,HRKEPGARLEATRGAARPHKQGTKPMITRPSVSQLGEGKCPSSQHL...,GACAAGCTTGCGGCCGCACATCGGAAAGAGCCTGGGGCAAGGCTGG...,124,CATCGGAAAGAGCCTGGGGCAAGGCTGGAGGCCACAAGAGGAGCTG...


In [9]:
# Generate a list of all codons

all_codons = [''.join(a) for a in itertools.product('ATCG',repeat=3)]

def _codon_fractions(nt_seq):
    """Return, for each codon in ``all_codons``, its fraction among the
    consecutive 3-nt groups of ``nt_seq`` (same order as ``all_codons``)."""
    # Split the sequence into groups of 3
    seq_codon = [nt_seq[j:j+3] for j in range(0, len(nt_seq), 3)]
    # Single pass tally instead of one list.count() scan per codon
    tally = dict.fromkeys(all_codons, 0)
    for codon in seq_codon:
        if codon in tally:
            tally[codon] += 1
    return [np.true_divide(tally[codon], len(seq_codon)) for codon in all_codons]

# Calculate the fraction of each codon appearing in each sequence
# (rows: sequences, columns: codons in all_codons order)

codon_fracs = np.array([_codon_fractions(seq) for seq in tqdm(DF_mRNA_features.true_nt_seq)],
                       dtype=float).reshape(-1, len(all_codons))

# Add the data to the feature matrix, one float column per codon.
# Assigning the numeric arrays directly avoids first creating the columns
# with None, which makes them object-dtype.

for k, codon in enumerate(all_codons):
    DF_mRNA_features[codon] = codon_fracs[:, k]



In [10]:
# Preview the first rows, now including the per-codon fraction columns
DF_mRNA_features.head()

Unnamed: 0,prest_id,uniprot_id,conc_cf,aa_seq,nt_seq,aa_len,true_nt_seq,AAA,AAT,AAC,...,GTC,GTG,GCA,GCT,GCC,GCG,GGA,GGT,GGC,GGG
0,140095,G3V3N0,4.3075,IMTAPSSFEQFKVAMNYLQLYNVPDCLEDIQDADCSSSKCSSSASS...,GACAAGCTTGCGGCCGCAATTATGACAGCTCCCTCCAGTTTTGAGC...,139,ATTATGACAGCTCCCTCCAGTTTTGAGCAGTTTAAAGTGGCAATGA...,0.057554,0.043165,0.021583,...,0.0,0.028777,0.021583,0.028777,0.007194,0.0,0.007194,0.007194,0.014388,0.014388
1,140099,G3V537,2.9154,TYYAWKHELLGSGTCPALPPREVLGMEELEKLPEEQVAEEELECSA...,GACAAGCTTGCGGCCGCAACCTACTATGCCTGGAAGCATGAGCTGC...,144,ACCTACTATGCCTGGAAGCATGAGCTGCTGGGCTCTGGCACCTGCC...,0.006944,0.013889,0.013889,...,0.006944,0.020833,0.027778,0.027778,0.041667,0.013889,0.006944,0.006944,0.027778,0.034722
2,140225,P12724,1.4877,SLHARPPQFTRAQWFAIQHISLNPPRCTIAMRAINNYRWRCKNQNT...,GACAAGCTTGCGGCCGCATCACTCCATGCCAGACCCCCACAGTTTA...,136,TCACTCCATGCCAGACCCCCACAGTTTACGAGGGCTCAGTGGTTTG...,0.007353,0.051471,0.058824,...,0.0,0.014706,0.036765,0.014706,0.014706,0.0,0.007353,0.014706,0.0,0.0
3,140235,H0YH02,6.7224,ARALNESKRVNNGNTAPEDSSPAKKTRRCQRQESKKMPVAGGKANK...,GACAAGCTTGCGGCCGCAGCGAGAGCATTAAATGAAAGCAAAAGAG...,123,GCGAGAGCATTAAATGAAAGCAAAAGAGTTAATAATGGCAACACGG...,0.04065,0.056911,0.04065,...,0.01626,0.04065,0.00813,0.04065,0.03252,0.00813,0.02439,0.0,0.01626,0.00813
4,140309,F5GYC5,3.3848,HRKEPGARLEATRGAARPHKQGTKPMITRPSVSQLGEGKCPSSQHL...,GACAAGCTTGCGGCCGCACATCGGAAAGAGCCTGGGGCAAGGCTGG...,124,CATCGGAAAGAGCCTGGGGCAAGGCTGGAGGCCACAAGAGGAGCTG...,0.032258,0.008065,0.008065,...,0.0,0.008065,0.016129,0.024194,0.032258,0.0,0.024194,0.016129,0.0,0.024194


### SD Sequences
[Back to Top](#Table-of-Contents)

Count the number of Shine-Dalgarno (SD) and SD-like sequences on both the forward and reverse strands.

In [11]:
def complement(seq):
    """Return the base-by-base DNA complement of ``seq`` (A<->T, C<->G).

    Raises KeyError for any character other than A, C, G or T.
    """
    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    return ''.join(pairing[base] for base in seq)

def reverse_complement(s):
    """Return the reverse complement of the DNA string ``s``."""
    return complement(s)[::-1]

def count_sd_like(seq):
    """Count SD-like trinucleotides (overlaps included) on both strands.

    Returns ``[forward_count, reverse_complement_count]``.
    """
    # Zero-width lookahead so that overlapping occurrences are all counted
    motif = re.compile(r'(?=(AGG)|(GGA)|(GAG)|(GGG)|(GGT)|(GTG))')
    strands = (seq, reverse_complement(seq))
    return [sum(1 for _ in motif.finditer(strand)) for strand in strands]

def count_sd(seq):
    """Count matches of the pattern ``GG.GG.`` (overlaps included) on both strands.

    Returns ``[forward_count, reverse_complement_count]``.
    """
    # Zero-width lookahead so that overlapping occurrences are all counted
    motif = re.compile(r'(?=(GG.GG.))')
    strands = (seq, reverse_complement(seq))
    return [sum(1 for _ in motif.finditer(strand)) for strand in strands]

In [13]:
# Sequence lengths in nucleotides, used to normalise the raw motif counts
nt_lengths = [len(seq) for seq in DF_mRNA_features.true_nt_seq]

# SD-like motifs: raw counts per strand, then counts per nucleotide
sd_like = list(map(count_sd_like, DF_mRNA_features.true_nt_seq))
DF_mRNA_features['sd_like_fwd'] = [fwd for fwd, _ in sd_like]
DF_mRNA_features['sd_like_rev'] = [rev for _, rev in sd_like]
DF_mRNA_features['sd_like_fwd_frac'] = np.true_divide(DF_mRNA_features['sd_like_fwd'], nt_lengths)
DF_mRNA_features['sd_like_rev_frac'] = np.true_divide(DF_mRNA_features['sd_like_rev'], nt_lengths)

# SD motifs: raw counts per strand, then counts per nucleotide
sd = list(map(count_sd, DF_mRNA_features.true_nt_seq))
DF_mRNA_features['sd_seq_fwd'] = [fwd for fwd, _ in sd]
DF_mRNA_features['sd_seq_rev'] = [rev for _, rev in sd]
DF_mRNA_features['sd_seq_fwd_frac'] = np.true_divide(DF_mRNA_features['sd_seq_fwd'], nt_lengths)
DF_mRNA_features['sd_seq_rev_frac'] = np.true_divide(DF_mRNA_features['sd_seq_rev'], nt_lengths)

In [14]:
# Preview the first rows, now including the sd_like_* and sd_seq_* columns
DF_mRNA_features.head()

Unnamed: 0,prest_id,uniprot_id,conc_cf,aa_seq,nt_seq,aa_len,true_nt_seq,AAA,AAT,AAC,...,GGC,GGG,sd_like_fwd,sd_like_rev,sd_like_fwd_frac,sd_like_rev_frac,sd_seq_fwd,sd_seq_rev,sd_seq_fwd_frac,sd_seq_rev_frac
0,140095,G3V3N0,4.3075,IMTAPSSFEQFKVAMNYLQLYNVPDCLEDIQDADCSSSKCSSSASS...,GACAAGCTTGCGGCCGCAATTATGACAGCTCCCTCCAGTTTTGAGC...,139,ATTATGACAGCTCCCTCCAGTTTTGAGCAGTTTAAAGTGGCAATGA...,0.057554,0.043165,0.021583,...,0.014388,0.014388,32,28,0.076739,0.067146,2,1,0.004796,0.002398
1,140099,G3V537,2.9154,TYYAWKHELLGSGTCPALPPREVLGMEELEKLPEEQVAEEELECSA...,GACAAGCTTGCGGCCGCAACCTACTATGCCTGGAAGCATGAGCTGC...,144,ACCTACTATGCCTGGAAGCATGAGCTGCTGGGCTCTGGCACCTGCC...,0.006944,0.013889,0.013889,...,0.027778,0.034722,77,56,0.178241,0.12963,10,3,0.023148,0.006944
2,140225,P12724,1.4877,SLHARPPQFTRAQWFAIQHISLNPPRCTIAMRAINNYRWRCKNQNT...,GACAAGCTTGCGGCCGCATCACTCCATGCCAGACCCCCACAGTTTA...,136,TCACTCCATGCCAGACCCCCACAGTTTACGAGGGCTCAGTGGTTTG...,0.007353,0.051471,0.058824,...,0.0,0.0,34,52,0.083333,0.127451,1,4,0.002451,0.009804
3,140235,H0YH02,6.7224,ARALNESKRVNNGNTAPEDSSPAKKTRRCQRQESKKMPVAGGKANK...,GACAAGCTTGCGGCCGCAGCGAGAGCATTAAATGAAAGCAAAAGAG...,123,GCGAGAGCATTAAATGAAAGCAAAAGAGTTAATAATGGCAACACGG...,0.04065,0.056911,0.04065,...,0.01626,0.00813,40,21,0.108401,0.056911,2,0,0.00542,0.0
4,140309,F5GYC5,3.3848,HRKEPGARLEATRGAARPHKQGTKPMITRPSVSQLGEGKCPSSQHL...,GACAAGCTTGCGGCCGCACATCGGAAAGAGCCTGGGGCAAGGCTGG...,124,CATCGGAAAGAGCCTGGGGCAAGGCTGGAGGCCACAAGAGGAGCTG...,0.032258,0.008065,0.008065,...,0.0,0.024194,41,29,0.110215,0.077957,1,0,0.002688,0.0


### tRNA Adaptation Index
[Back to Top](#Table-of-Contents)

Run the tAI code on all 45206 prESTs

In [15]:
command1 = ['perl', TAI_DIR+'codonM', FASTA_DIR+'all_nt.fasta',MISC_DIR+'tAI_files/prEST_tAI.m']
print 'Running Command:'+' '.join(command1) + '\n'

print subprocess.check_output(command1)

command2 = ['Rscript',MISC_DIR+'tAI_files/calc_tAI.R',MISC_DIR+'tAI_files/',TAI_DIR]
print 'Running Command:'+' '.join(command2) + '\n'

print subprocess.check_output(command2)

Running Command:perl /home/anand/Documents/HT_Expression/prediction_software/tAI/codonR/codonM /home/anand/Documents/HT_Expression/ML_HPA/fasta_files/all_nt.fasta /home/anand/Documents/HT_Expression/ML_HPA/misc_files/tAI_files/prEST_tAI.m

Analysing /home/anand/Documents/HT_Expression/ML_HPA/fasta_files/all_nt.fasta, please be patient ...
	DONE

Running Command:Rscript /home/anand/Documents/HT_Expression/ML_HPA/misc_files/tAI_files/calc_tAI.R /home/anand/Documents/HT_Expression/ML_HPA/misc_files/tAI_files/ /home/anand/Documents/HT_Expression/prediction_software/tAI/codonR/

[1] "tAI calculated and stored in prEST_tAI.csv"



Now add the tAI as a feature

In [16]:
# Load the tAI values (column 'x' of the CSV) and attach them to the feature
# table. The values are matched to prESTs by row label only, so a CSV with a
# different number of rows (e.g. produced from a stale or incomplete
# all_nt.fasta) would silently give missing or misaligned values.
tAI_values = pd.read_csv(MISC_DIR+'tAI_files/prEST_tAI.csv')['x']
if len(tAI_values) != len(DF_mRNA_features):
    raise ValueError('prEST_tAI.csv has %d rows but the feature table has %d; '
                     'regenerate all_nt.fasta and re-run the tAI step'
                     % (len(tAI_values), len(DF_mRNA_features)))
DF_mRNA_features.loc[:,'tAI'] = tAI_values

### Folding Energy
[Back to Top](#Table-of-Contents)

Calculation of all folding energies may take up to 15 hours.

In [18]:
print 'Command: \'mfold SEQ=\''+FASTA_DIR+'nt_sequences/PREST_ID.fasta\' MAX=1\''

os.chdir(MFOLD_DIR)

energy = []
for prest_id in tqdm(DF_mRNA_features.prest_id):

    # Run mRNA folding energy prediction software on full mRNA
    if not os.path.isfile(MFOLD_DIR+str(prest_id)+'.ct'):
        command = 'mfold SEQ=\''+FASTA_DIR+'nt_sequences/'+str(prest_id)+'.fasta\' MAX=1'
        os.system(command)
    
    # Run mRNA folding energy prediction software on first 40 nucleotides
    if not os.path.isfile(MFOLD_DIR+str(prest_id)+'_40nt'+'.ct'):
        command = 'mfold SEQ=\''+FASTA_DIR+'nt_sequences/'+str(prest_id)+'_40nt.fasta\' MAX=1'
        os.system(command)
    
    # Remove non-essential files
    for f in os.listdir(MFOLD_DIR):
        if not f.endswith('.ct') or f.endswith('_1.ct'):
            os.remove(f)

  0%|          | 0/45206 [00:00<?, ?it/s]

Command: 'mfold SEQ='/home/anand/Documents/HT_Expression/ML_HPA/fasta_files/nt_sequences/PREST_ID.fasta' MAX=1'


In [24]:
def _read_mfold_energy(ct_path):
    """Return the folding energy stored in an mfold .ct file as a float.

    Returns NaN when the file does not exist or no energy can be parsed.
    """
    if not os.path.isfile(ct_path):
        return np.nan
    with open(ct_path,'r') as f:
        result = f.read()

    # Preferred: the explicitly labelled value on the first line, e.g.
    # "dG = -111.56 [initially -115.2]". Taking the labelled value avoids
    # picking up the "[initially ...]" number when the energy itself is
    # non-negative or printed without a decimal point.
    header = result.split('\n', 1)[0]
    labelled = re.search(r'(?:dG|ENERGY)\s*=\s*(-?\d+(?:\.\d+)?)', header, re.IGNORECASE)
    if labelled:
        return float(labelled.group(1))

    # Fallback for unrecognised headers: first negative decimal in the file,
    # otherwise the first decimal of any sign.
    energy = re.search(r'-\d*\.\d*',result)
    if not energy:
        energy = re.search(r'\d*\.\d*',result)
    if energy is None:
        return np.nan
    try:
        return float(energy.group())
    except ValueError:
        # e.g. the pattern matched a lone '.'
        return np.nan

# Parse results: folding energy of the full mRNA
DF_mRNA_features['RNA_folding_energy'] = [_read_mfold_energy(MFOLD_DIR+str(prest_id)+'.ct')
                                          for prest_id in tqdm(DF_mRNA_features.prest_id)]

# Parse results: folding energy of the first 40 nucleotides
DF_mRNA_features['RNA_40_energy'] = [_read_mfold_energy(MFOLD_DIR+str(prest_id)+'_40nt.ct')
                                     for prest_id in tqdm(DF_mRNA_features.prest_id)]



In [25]:
# Show only the first rows; displaying the bare frame renders the whole table
DF_mRNA_features.head(10)

Unnamed: 0,prest_id,uniprot_id,conc_cf,aa_seq,nt_seq,aa_len,true_nt_seq,AAA,AAT,AAC,...,sd_like_rev,sd_like_fwd_frac,sd_like_rev_frac,sd_seq_fwd,sd_seq_rev,sd_seq_fwd_frac,sd_seq_rev_frac,tAI,RNA_folding_energy,RNA_40_energy
0,140095,G3V3N0,4.30750,IMTAPSSFEQFKVAMNYLQLYNVPDCLEDIQDADCSSSKCSSSASS...,GACAAGCTTGCGGCCGCAATTATGACAGCTCCCTCCAGTTTTGAGC...,139,ATTATGACAGCTCCCTCCAGTTTTGAGCAGTTTAAAGTGGCAATGA...,0.057554,0.043165,0.021583,...,28,0.076739,0.067146,2,1,0.004796,0.002398,0.308574,-111.56,-3.70
1,140099,G3V537,2.91540,TYYAWKHELLGSGTCPALPPREVLGMEELEKLPEEQVAEEELECSA...,GACAAGCTTGCGGCCGCAACCTACTATGCCTGGAAGCATGAGCTGC...,144,ACCTACTATGCCTGGAAGCATGAGCTGCTGGGCTCTGGCACCTGCC...,0.006944,0.013889,0.013889,...,56,0.178241,0.129630,10,3,0.023148,0.006944,0.252830,-192.52,-10.00
2,140225,P12724,1.48770,SLHARPPQFTRAQWFAIQHISLNPPRCTIAMRAINNYRWRCKNQNT...,GACAAGCTTGCGGCCGCATCACTCCATGCCAGACCCCCACAGTTTA...,136,TCACTCCATGCCAGACCCCCACAGTTTACGAGGGCTCAGTGGTTTG...,0.007353,0.051471,0.058824,...,52,0.083333,0.127451,1,4,0.002451,0.009804,0.230369,-113.76,-4.00
3,140235,H0YH02,6.72240,ARALNESKRVNNGNTAPEDSSPAKKTRRCQRQESKKMPVAGGKANK...,GACAAGCTTGCGGCCGCAGCGAGAGCATTAAATGAAAGCAAAAGAG...,123,GCGAGAGCATTAAATGAAAGCAAAAGAGTTAATAATGGCAACACGG...,0.040650,0.056911,0.040650,...,21,0.108401,0.056911,2,0,0.005420,0.000000,0.356271,-97.22,-5.74
4,140309,F5GYC5,3.38480,HRKEPGARLEATRGAARPHKQGTKPMITRPSVSQLGEGKCPSSQHL...,GACAAGCTTGCGGCCGCACATCGGAAAGAGCCTGGGGCAAGGCTGG...,124,CATCGGAAAGAGCCTGGGGCAAGGCTGGAGGCCACAAGAGGAGCTG...,0.032258,0.008065,0.008065,...,29,0.110215,0.077957,1,0,0.002688,0.000000,0.296122,-107.98,-9.00
5,140325,O43506,1.50290,FVGWWTHQRFVELVVVVDNIRYLFSQSNATTVQHEVFNVVNIVDSF...,GACAAGCTTGCGGCCGCATTTGTGGGCTGGTGGACCCATCAGCGGT...,137,TTTGTGGGCTGGTGGACCCATCAGCGGTTTGTTGAGCTGGTAGTGG...,0.014599,0.072993,0.021898,...,23,0.092457,0.055961,2,0,0.004866,0.000000,0.284962,-101.95,-12.10
6,140342,H0YJF0,3.52970,ASHGPMSLGELELEPNSKLVLPTTLLTAQENDVNLPVAAEDFSQYQ...,GACAAGCTTGCGGCCGCAGCTAGTCATGGTCCAATGAGTTTGGGAG...,146,GCTAGTCATGGTCCAATGAGTTTGGGAGAATTGGAGTTGGAGCCAA...,0.020548,0.041096,0.006849,...,44,0.089041,0.100457,2,1,0.004566,0.002283,0.296340,-114.72,-10.90
7,140345,Q96BY2,3.40160,RKALLIAGISQSCSVAEIEEALQAGLAPLGEYRLLGRMFRRDENRK...,GACAAGCTTGCGGCCGCACGGAAAGCGCTATTGATTGCCGGCATCT...,141,CGGAAAGCGCTATTGATTGCCGGCATCTCCCAGAGCTGCAGTGTGG...,0.021277,0.021277,0.007092,...,38,0.165485,0.089835,4,1,0.009456,0.002364,0.286408,-151.90,-6.70
8,140354,O95072,0.92003,RRRLLFWDKETQISPEKFQEQLQTRAHCWECPMVQPPERTIRGPAE...,GACAAGCTTGCGGCCGCACGTCGCCGGTTACTGTTCTGGGACAAGG...,134,CGTCGCCGGTTACTGTTCTGGGACAAGGAGACTCAGATCTCCCCGG...,0.014925,0.000000,0.000000,...,59,0.166667,0.146766,3,4,0.007463,0.009950,0.255174,-148.58,-10.60
9,140443,H0YJ73,6.01710,IKSCCGGCFYGETEKHNFSVERDFKAAVPNSQNATISVPPLTSVSV...,GACAAGCTTGCGGCCGCAATAAAGAGTTGCTGTGGAGGATGTTTCT...,139,ATAAAGAGTTGCTGTGGAGGATGTTTCTATGGTGAAACAGAAAAAC...,0.028777,0.014388,0.014388,...,33,0.071942,0.079137,1,1,0.002398,0.002398,0.255403,-108.15,-8.50


In [27]:
# Save the complete mRNA feature table (index included) to DATA_DIR
DF_mRNA_features.to_csv(DATA_DIR+'mRNA_features.csv')

## Physical Properties
[Back to Top](#Table-of-Contents)

In [None]:
# Start the physical-property feature table from the saved, de-duplicated
# prEST table; the first CSV column is used as the index.
DF_physical_features = pd.read_csv(DATA_DIR+'DF_prest.csv',index_col=0)

The Biopython `ProtParam` module quickly calculates many physical properties of a peptide.

In [None]:
from Bio.SeqUtils import ProtParam

# One tuple per peptide:
# (molecular weight, isoelectric point, aromaticity, instability index, GRAVY)
properties = []
for peptide in tqdm(DF_physical_features.aa_seq):
    analysis = ProtParam.ProteinAnalysis(peptide)
    properties.append((analysis.molecular_weight(),
                       analysis.isoelectric_point(),
                       analysis.aromaticity(),
                       analysis.instability_index(),
                       analysis.gravy()))

# Unpack the tuples into one feature column per property
DF_physical_features['bio_pI'] = [entry[1] for entry in properties]
DF_physical_features['bio_mW'] = [entry[0] for entry in properties]
DF_physical_features['bio_aromaticity'] = [entry[2] for entry in properties]
DF_physical_features['bio_instability'] = [entry[3] for entry in properties]
DF_physical_features['bio_gravy'] = [entry[4] for entry in properties]

In [None]:
# Fraction of each of the 20 listed amino acids in every peptide,
# stored as one 'list_comp_<letter>' column per amino acid
amino_acids = 'ARNDCQEGHILKMFPSTWYV'
for aa in amino_acids:
    DF_physical_features['list_comp_'+aa] = [np.true_divide(seq.count(aa),len(seq))
                                             for seq in DF_physical_features.aa_seq]

In [None]:
# Peptide lengths, used to turn the net charge into a per-residue average
aa_lengths = [len(seq) for seq in DF_physical_features['aa_seq']]

# Net charge: count of K and R residues minus count of D and E residues
DF_physical_features['charge'] = [(seq.count('K')+seq.count('R'))-(seq.count('D')+seq.count('E'))
                                  for seq in DF_physical_features.aa_seq]
DF_physical_features['abs_charge'] = DF_physical_features['charge'].abs()
DF_physical_features['avg_charge'] = np.true_divide(DF_physical_features['charge'], aa_lengths)
DF_physical_features['abs_avg_charge'] = DF_physical_features['avg_charge'].abs()

In [None]:
# Each feature is the summed composition fraction of the listed residues.
# Kept as an ordered list so the columns are created in a fixed order.
residue_classes = [
    ('frac_aliphatic', 'AGILPV'),
    ('frac_aromatic', 'FWY'),
    ('frac_uncharged_polar', 'STNQ'),
    ('frac_polar', 'QNHSTYCMW'),
    ('frac_hydrophobic', 'AGILPVF'),
    ('frac_positive', 'HKR'),
    ('frac_sulfur', 'CM'),
    ('frac_negative', 'DE'),
    ('frac_amide', 'NQ'),
    ('frac_alcohol', 'ST'),
]

for column, residues in residue_classes:
    DF_physical_features[column] = sum([DF_physical_features['list_comp_'+aa] for aa in residues])

In [None]:
# Save the physical-property feature table (index included) to DATA_DIR
DF_physical_features.to_csv(DATA_DIR+'physical_features.csv')