# Create Feature Matrix
This notebook is a step-by-step guide to compiling the complete feature matrix for the prEST dataset using several external computational tools.

**TODO:** Insert figure from paper here?

In [33]:
%matplotlib inline
import pandas as pd
import matplotlib as plt
import numpy as np
import os
import sys
import re
import subprocess
from tqdm import tqdm

In [2]:
# Directory layout used by the rest of the notebook. Every *_DIR value ends
# with '/' so that file names can be appended to it directly.

# ROOT_DIR - current working directory at the time this cell is run
ROOT_DIR = os.getcwd() + '/'

# DATA_DIR - directory where feature dataframes are saved
DATA_DIR = os.path.join(ROOT_DIR, 'dataframes/')

# FASTA_DIR - directory where fasta files are saved
FASTA_DIR = os.path.join(ROOT_DIR, 'fasta_files/')

# MISC_DIR - directory where all other files are saved
MISC_DIR = os.path.join(ROOT_DIR, 'misc_files/')

# SOFTWARE_DIR - directory where all software is stored
# NOTE(review): machine-specific absolute path; edit before running elsewhere.
SOFTWARE_DIR = '/home/anand/Documents/HT_Expression/prediction_software/'

# Table of Contents

1. [Initialize Data](#Initialize-Data)
2. [Generate Features](#Generate-Features)
    1. [mRNA Properties](#mRNA-Properties)
    2. [Physical Properties](#Physical-Properties)
3. [Compile Features](#Compile-Features)

# Initialize Data 
[Back to Top](#Table-of-Contents)

In [3]:
# Load the raw prEST concentration table; the first CSV column becomes the index.
prest_file = os.path.join(DATA_DIR, 'prEST_conc_data.csv')
DF_raw_prest = pd.read_csv(prest_file, index_col=0)

# Report the table size, then show the first rows as the cell output.
print(' '.join(['Number of entries:', str(len(DF_raw_prest))]))
DF_raw_prest.head()

Number of entries: 46521


Unnamed: 0,prest_id,uniprot_id,conc_cf,aa_seq,nt_seq,aa_len
0,140095,G3V3N0,4.3075,IMTAPSSFEQFKVAMNYLQLYNVPDCLEDIQDADCSSSKCSSSASS...,GACAAGCTTGCGGCCGCAATTATGACAGCTCCCTCCAGTTTTGAGC...,139
1,140099,G3V537,2.9154,TYYAWKHELLGSGTCPALPPREVLGMEELEKLPEEQVAEEELECSA...,GACAAGCTTGCGGCCGCAACCTACTATGCCTGGAAGCATGAGCTGC...,144
2,140225,P12724,1.4877,SLHARPPQFTRAQWFAIQHISLNPPRCTIAMRAINNYRWRCKNQNT...,GACAAGCTTGCGGCCGCATCACTCCATGCCAGACCCCCACAGTTTA...,136
3,140235,H0YH02,6.7224,ARALNESKRVNNGNTAPEDSSPAKKTRRCQRQESKKMPVAGGKANK...,GACAAGCTTGCGGCCGCAGCGAGAGCATTAAATGAAAGCAAAAGAG...,123
4,140309,F5GYC5,3.3848,HRKEPGARLEATRGAARPHKQGTKPMITRPSVSQLGEGKCPSSQHL...,GACAAGCTTGCGGCCGCACATCGGAAAGAGCCTGGGGCAAGGCTGG...,124


Some prESTs were expressed in duplicate experiments. Merge these results by averaging the measured concentrations, so that each unique amino-acid sequence has a single concentration.

In [4]:
# A gap between the row count and the number of distinct amino-acid sequences
# means some prESTs were measured more than once.
print(' '.join(['Number of total entries:', str(len(DF_raw_prest))]))
print(' '.join(['Number of unique prESTs:', str(len(DF_raw_prest['aa_seq'].unique()))]))

Number of total entries: 46521
Number of unique prESTs: 45206


In [5]:
# Collapse replicate measurements: for every amino-acid sequence that occurs
# more than once, keep its first row, overwrite that row's conc_cf with the
# mean over all replicates, and discard the remaining rows.
DF_prest = DF_raw_prest.copy()
dropped = []  # index labels of the replicate rows that get removed

for _, replicates in DF_raw_prest.groupby('aa_seq'):
    if len(replicates) <= 1:
        continue
    first_label = replicates.index[0]
    DF_prest.loc[first_label, 'conc_cf'] = replicates['conc_cf'].mean()
    dropped.extend(replicates.index[1:].tolist())

# Renumber rows 0..n-1 once the replicates are gone.
DF_prest = DF_prest.drop(dropped).reset_index(drop=True)

In [6]:
# After merging, the row count should equal the number of unique sequences.
print(' '.join(['Final number of prESTs:', str(len(DF_prest))]))
DF_prest.head()

Final number of prESTs: 45206


Unnamed: 0,prest_id,uniprot_id,conc_cf,aa_seq,nt_seq,aa_len
0,140095,G3V3N0,4.3075,IMTAPSSFEQFKVAMNYLQLYNVPDCLEDIQDADCSSSKCSSSASS...,GACAAGCTTGCGGCCGCAATTATGACAGCTCCCTCCAGTTTTGAGC...,139
1,140099,G3V537,2.9154,TYYAWKHELLGSGTCPALPPREVLGMEELEKLPEEQVAEEELECSA...,GACAAGCTTGCGGCCGCAACCTACTATGCCTGGAAGCATGAGCTGC...,144
2,140225,P12724,1.4877,SLHARPPQFTRAQWFAIQHISLNPPRCTIAMRAINNYRWRCKNQNT...,GACAAGCTTGCGGCCGCATCACTCCATGCCAGACCCCCACAGTTTA...,136
3,140235,H0YH02,6.7224,ARALNESKRVNNGNTAPEDSSPAKKTRRCQRQESKKMPVAGGKANK...,GACAAGCTTGCGGCCGCAGCGAGAGCATTAAATGAAAGCAAAAGAG...,123
4,140309,F5GYC5,3.3848,HRKEPGARLEATRGAARPHKQGTKPMITRPSVSQLGEGKCPSSQHL...,GACAAGCTTGCGGCCGCACATCGGAAAGAGCCTGGGGCAAGGCTGG...,124


Save the de-duplicated dataframe to `dataframes/DF_prest.csv`.

In [7]:
# Persist the merged table; it is read back at the start of the next section.
DF_prest.to_csv(os.path.join(DATA_DIR, 'DF_prest.csv'))

# Generate Features
[Back to Top](#Table-of-Contents)

In [9]:
# Start the feature table from the saved, de-duplicated prEST table
# (the first CSV column is the row index written by to_csv).
DF_prest_features = pd.read_csv(os.path.join(DATA_DIR, 'DF_prest.csv'), index_col=0)

In [10]:
# Confirm the reloaded table has the expected number of rows and preview it.
print(' '.join(['Number of entries:', str(len(DF_prest_features))]))
DF_prest_features.head()

Number of entries: 45206


Unnamed: 0,prest_id,uniprot_id,conc_cf,aa_seq,nt_seq,aa_len
0,140095,G3V3N0,4.3075,IMTAPSSFEQFKVAMNYLQLYNVPDCLEDIQDADCSSSKCSSSASS...,GACAAGCTTGCGGCCGCAATTATGACAGCTCCCTCCAGTTTTGAGC...,139
1,140099,G3V537,2.9154,TYYAWKHELLGSGTCPALPPREVLGMEELEKLPEEQVAEEELECSA...,GACAAGCTTGCGGCCGCAACCTACTATGCCTGGAAGCATGAGCTGC...,144
2,140225,P12724,1.4877,SLHARPPQFTRAQWFAIQHISLNPPRCTIAMRAINNYRWRCKNQNT...,GACAAGCTTGCGGCCGCATCACTCCATGCCAGACCCCCACAGTTTA...,136
3,140235,H0YH02,6.7224,ARALNESKRVNNGNTAPEDSSPAKKTRRCQRQESKKMPVAGGKANK...,GACAAGCTTGCGGCCGCAGCGAGAGCATTAAATGAAAGCAAAAGAG...,123
4,140309,F5GYC5,3.3848,HRKEPGARLEATRGAARPHKQGTKPMITRPSVSQLGEGKCPSSQHL...,GACAAGCTTGCGGCCGCACATCGGAAAGAGCCTGGGGCAAGGCTGG...,124


## mRNA Properties
[Back to Top](#Table-of-Contents)

In [55]:
# Directory where MFold outputs are stored
MFOLD_DIR = MISC_DIR + 'mfold_out/'

# Directory that contains the tAI software (the codonM script is run from here).
# Derived from SOFTWARE_DIR so the machine-specific prefix is configured in one
# place only; the resulting value is unchanged.
TAI_DIR = SOFTWARE_DIR + 'tAI/codonR/'

In [17]:
# Export fasta files for all mRNA sequences and their first 40 nts.
#
# For every prEST this writes two single-record files into
# FASTA_DIR/nt_sequences/ (<prest_id>.fasta and <prest_id>_40nt.fasta), and then
# two combined files (all_nt.fasta, all_nt_40.fasta) directly in FASTA_DIR.

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# Create the per-sequence output folder on first use instead of failing inside
# the loop with "No such file or directory".
nt_fasta_dir = FASTA_DIR + 'nt_sequences/'
if not os.path.isdir(nt_fasta_dir):
    os.makedirs(nt_fasta_dir)

sequences = []
sequences40 = []

# iterrows() has no length, so pass the total explicitly to get a percentage/ETA.
for i, row in tqdm(DF_prest_features.iterrows(), total=len(DF_prest_features)):

    prest_label = str(row.prest_id)
    name = nt_fasta_dir + prest_label + '.fasta'
    name40 = nt_fasta_dir + prest_label + '_40nt.fasta'

    # Transcribe the DNA sequence once and reuse it for both records.
    mrna = Seq(row.nt_seq).transcribe()

    # Export FASTA files for all mRNA sequences
    seq = SeqRecord(mrna, id=prest_label, description='prEST #' + prest_label)
    sequences.append(seq)
    with open(name, 'w') as f:
        SeqIO.write(seq, f, 'fasta')

    # Export FASTA files for first 40 nts of all mRNA sequences
    seq40 = SeqRecord(mrna[:40], id=prest_label, description='prEST #' + prest_label)
    sequences40.append(seq40)
    with open(name40, 'w') as f:
        SeqIO.write(seq40, f, 'fasta')

# Combined multi-record files
with open(FASTA_DIR + 'all_nt.fasta', 'w') as f:
    SeqIO.write(sequences, f, 'fasta')
with open(FASTA_DIR + 'all_nt_40.fasta', 'w') as f:
    SeqIO.write(sequences40, f, 'fasta')



### Folding Energy
[Back to Top](#Table-of-Contents)

Calculation of all folding energies takes 15-20 hours.

In [None]:
# Run mfold on every exported mRNA (full length and first 40 nt), skipping any
# sequence whose '.ct' result already exists in MFOLD_DIR, and delete every
# other file from MFOLD_DIR afterwards.
print("Command: 'mfold SEQ='" + FASTA_DIR + "nt_sequences/PREST_ID.fasta' MAX=1'")


def _run_mfold(fasta_path):
    """Run `mfold SEQ=<fasta_path> MAX=1` and return its exit status.

    The arguments are passed as a list, so no shell is involved and the path
    needs no quoting. A non-zero exit status is returned, not raised, so one
    failing sequence does not abort the whole run.
    """
    return subprocess.call(['mfold', 'SEQ=' + fasta_path, 'MAX=1'])


def _clean_mfold_dir(directory):
    """Delete every regular file in `directory` except the primary '.ct' results.

    A file is removed when its name does not end in '.ct', or when it ends in
    '_1.ct'. Sub-directories are left untouched.
    """
    for entry in os.listdir(directory):
        if not entry.endswith('.ct') or entry.endswith('_1.ct'):
            entry_path = os.path.join(directory, entry)
            if os.path.isfile(entry_path):
                os.remove(entry_path)


# NOTE(review): presumably mfold writes its output into the working directory,
# hence the chdir. The notebook's working directory stays changed after this
# cell; re-running the path configuration cell afterwards would pick up
# MFOLD_DIR as ROOT_DIR.
os.chdir(MFOLD_DIR)

energy = []

# Clear leftovers from an earlier, interrupted run once up front.
_clean_mfold_dir(MFOLD_DIR)

for prest_id in tqdm(DF_prest_features.prest_id):

    ran_mfold = False
    # '' -> full mRNA, '_40nt' -> first 40 nucleotides
    for suffix in ('', '_40nt'):
        stem = str(prest_id) + suffix
        if not os.path.isfile(MFOLD_DIR + stem + '.ct'):
            _run_mfold(FASTA_DIR + 'nt_sequences/' + stem + '.fasta')
            ran_mfold = True

    # Remove non-essential files. Only needed when mfold actually produced
    # something; this avoids re-listing the whole directory for every
    # sequence that is already done.
    if ran_mfold:
        _clean_mfold_dir(MFOLD_DIR)

  0%|          | 10/45206 [00:10<13:25:40,  1.07s/it]

In [7]:
def _read_ct_energy(ct_path):
    """Return the folding energy stored in an mfold '.ct' file as a float.

    The value is taken from the first number that follows an '=' on the first
    line of the file, so zero or positive energies are read correctly.
    NOTE(review): this assumes a header of the form
    '<length> dG = <energy> ...' - confirm against the mfold version in use.
    If the first line has no such number, the first negative number anywhere
    in the file is used instead. If neither is found, NaN is returned rather
    than raising.
    """
    with open(ct_path, 'r') as handle:
        ct_text = handle.read()

    header = ct_text.split('\n', 1)[0]
    found = re.search(r'=\s*([-+]?(?:\d+\.?\d*|\.\d+))', header)
    if found is None:
        # Fallback: first negative number in the file, with the decimal point
        # escaped and at least one digit required.
        found = re.search(r'(-(?:\d+\.?\d*|\.\d+))', ct_text)
    if found is None:
        return float('nan')
    return float(found.group(1))


# Parse results: one column for the full mRNA, one for the first 40 nt.
# Rows whose '.ct' file does not exist are left untouched.
for i, row in tqdm(DF_prest_features.iterrows(), total=len(DF_prest_features)):
    for ct_suffix, column in (('.ct', 'RNA_folding_energy'),
                              ('_40nt.ct', 'RNA_40_energy')):
        ct_path = MFOLD_DIR + str(row.prest_id) + ct_suffix
        if os.path.isfile(ct_path):
            DF_prest_features.loc[i, column] = _read_ct_energy(ct_path)



In [13]:
# Preview the first rows of the feature table.
DF_prest_features.head()

Unnamed: 0,prest_id,uniprot_id,conc_cf,aa_seq,nt_seq,aa_len
0,140095,G3V3N0,4.3075,IMTAPSSFEQFKVAMNYLQLYNVPDCLEDIQDADCSSSKCSSSASS...,GACAAGCTTGCGGCCGCAATTATGACAGCTCCCTCCAGTTTTGAGC...,139
1,140099,G3V537,2.9154,TYYAWKHELLGSGTCPALPPREVLGMEELEKLPEEQVAEEELECSA...,GACAAGCTTGCGGCCGCAACCTACTATGCCTGGAAGCATGAGCTGC...,144
2,140225,P12724,1.4877,SLHARPPQFTRAQWFAIQHISLNPPRCTIAMRAINNYRWRCKNQNT...,GACAAGCTTGCGGCCGCATCACTCCATGCCAGACCCCCACAGTTTA...,136
3,140235,H0YH02,6.7224,ARALNESKRVNNGNTAPEDSSPAKKTRRCQRQESKKMPVAGGKANK...,GACAAGCTTGCGGCCGCAGCGAGAGCATTAAATGAAAGCAAAAGAG...,123
4,140309,F5GYC5,3.3848,HRKEPGARLEATRGAARPHKQGTKPMITRPSVSQLGEGKCPSSQHL...,GACAAGCTTGCGGCCGCACATCGGAAAGAGCCTGGGGCAAGGCTGG...,124


### tRNA adaptation index

In [69]:
# Two external steps, run in order:
#   command1 - perl codonM on the combined FASTA file, writing prEST_tAI.m
#   command2 - Rscript calc_tAI.R, given the tAI_files folder and TAI_DIR
command1 = ['perl', TAI_DIR+'codonM', FASTA_DIR+'all_nt.fasta',MISC_DIR+'tAI_files/prEST_tAI.m']
command2 = ['Rscript',MISC_DIR+'tAI_files/calc_tAI.R',MISC_DIR+'tAI_files/',TAI_DIR]

for tai_command in (command1, command2):
    print('Running Command:' + ' '.join(tai_command) + '\n')

    # check_output raises CalledProcessError on a non-zero exit status, so the
    # second step is never run if the first one fails.
    print(subprocess.check_output(tai_command))

perl /home/anand/Documents/HT_Expression/prediction_software/tAI/codonR/codonM /home/anand/Documents/HT_Expression/ExpressPro/fasta_files/all_nt.fasta /home/anand/Documents/HT_Expression/ExpressPro/misc_files/tAI_files/prEST_tAI.m

Analysing /home/anand/Documents/HT_Expression/ExpressPro/fasta_files/all_nt.fasta, please be patient ...
	DONE

Rscript /home/anand/Documents/HT_Expression/ExpressPro/misc_files/tAI_files/calc_tAI.R /home/anand/Documents/HT_Expression/ExpressPro/misc_files/tAI_files/ /home/anand/Documents/HT_Expression/prediction_software/tAI/codonR/

[1] "tAI calculated and stored in prEST_tAI.csv"

