## Generate features for each protein sequence
With each protein sequence retrieved and compiled using Bio.Entrez, the following script generates the corresponding features used to train the prediction model and to visualize each protein's gene expression. 

The next step after this notebook is WIDI's model.

In [5]:
# import relevant libraries

import Bio 
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import pandas as pd

In [12]:
# Smoke-test Biopython's ProteinAnalysis on one hard-coded protein sequence
ProtA = ProteinAnalysis(
    "MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGT"
    "RDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEEC"
    "LFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILF"
    "LPLPV"
)

# Compute every descriptor once, in the order it is reported below
MW, count_AA, arom, iso_el = (
    ProtA.molecular_weight(),
    ProtA.count_amino_acids(),
    ProtA.aromaticity(),
    ProtA.isoelectric_point(),
)

# One value per line: molecular weight, residue counts, aromaticity, isoelectric point
print(MW, count_AA, arom, iso_el, sep="\n")

17103.1617
{'A': 6, 'C': 3, 'D': 5, 'E': 12, 'F': 6, 'G': 14, 'H': 5, 'I': 5, 'K': 12, 'L': 18, 'M': 2, 'N': 7, 'P': 8, 'Q': 6, 'R': 6, 'S': 10, 'T': 13, 'V': 5, 'W': 1, 'Y': 8}
0.09868421052631579
7.7224523544311525


PSEUDOCODE: 

take 1 protein sequence from dataframe (1 column, iterate through rows):
    for each protein sequence
        run MW
        run count AA
        run arom
        run iso_el
            output to new columns

In [6]:
# generate overall structure for features generation
#
# Reads the compiled dataset and appends three Biopython-derived columns
# ('MW', 'AROM', 'ISO_E') computed from each entry of the 'SEQUENCE' column.

d = pd.read_csv('compiled_features.csv')
df = pd.DataFrame(d)

# Build one ProteinAnalysis object per sequence so every descriptor below
# reuses the same analysis instead of re-parsing the sequence.
analyses = [ProteinAnalysis(seq) for seq in df['SEQUENCE'].values]

MW_features = [analysis.molecular_weight() for analysis in analyses]
arom_features = [analysis.aromaticity() for analysis in analyses]
iso_features = [analysis.isoelectric_point() for analysis in analyses]

# The lists are in row order, so they line up with df's rows on assignment.
df['MW'] = MW_features
df['AROM'] = arom_features
df['ISO_E'] = iso_features

In [7]:
def create_BioPy_features(df):
    """
    Add Biopython-derived features for each protein sequence in a dataframe.

    For every entry of the 'SEQUENCE' column a ProteinAnalysis object is built
    and three descriptors are computed from it. They are intended as extra
    inputs for a machine learning model that predicts log2FC.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'SEQUENCE' column holding the protein sequences.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (it is modified in place, not
        copied), with these columns added or overwritten:
        'MW'    - result of ProteinAnalysis.molecular_weight()
        'AROM'  - result of ProteinAnalysis.aromaticity()
        'ISO_E' - result of ProteinAnalysis.isoelectric_point()

    Raises
    ------
    KeyError
        If the dataframe has no 'SEQUENCE' column.
    """

    # One analysis object per sequence, reused for all three descriptors
    analyses = [ProteinAnalysis(seq) for seq in df['SEQUENCE'].values]

    # Compute every feature list before touching df, so a failure on any
    # sequence leaves the input frame without partially added columns.
    mol_weights = [analysis.molecular_weight() for analysis in analyses]
    aromaticities = [analysis.aromaticity() for analysis in analyses]
    isoelectric_points = [analysis.isoelectric_point() for analysis in analyses]

    df['MW'] = mol_weights
    df['AROM'] = aromaticities
    df['ISO_E'] = isoelectric_points

    return df

In [8]:
# testing new function
# The dataset is read again here so the function runs on a freshly loaded frame.

d = pd.read_csv('compiled_features.csv')
df = pd.DataFrame(d)

# create_BioPy_features assigns 'MW', 'AROM' and 'ISO_E' onto the frame it is
# given and returns that same object, so `new_df` and `df` are one frame here.
new_df = create_BioPy_features(df)
new_df

Unnamed: 0.1,Unnamed: 0,GENENAME,ORG,ENTREZID,REFSEQ,LOG2FC,True_EID,SEQUENCE,AA_NP,AA_POS,AA_POL,AA_NEG,MW,AROM,ISO_E
0,0,ABC_transporter,PA,879411,NP_248876,-0.038469,15595384,mkaltssllglfaapvlagllgayvplasaappkeiriavpdvsag...,59.773371,11.614731,17.847025,10.764873,37193.0756,0.070822,6.871061
1,1,ABC_transporter,PA,883108,NP_248894,-0.207718,15595400,mhqriasiglgltlalggsaqaagqlnvvswsgyfspqllekfeke...,54.941860,11.337209,22.674419,11.046512,37859.8560,0.093023,6.033937
2,2,ABC_transporter,PA,878380,NP_249014,-0.187309,15595520,mtyrtpltllfaaglalggqaraegtlhfanwsdyyppellkkfek...,52.449568,14.121037,20.461095,12.968300,38901.0268,0.112392,6.350359
3,3,ABC_transporter,PA,880771,NP_249293,0.085173,15595799,mlpamrtgllcallgvtapawaeyvtvisfggankeaqetafykpf...,59.593023,11.918605,17.732558,10.755814,37832.8132,0.116279,6.919602
4,4,ABC_transporter,PA,879023,NP_249295,0.038834,15595801,mskslkaaslkfatlaaglacaaqamavdltvvsfgganksaqika...,54.310345,11.494253,22.413793,11.781609,38166.9526,0.112069,5.869103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431,431,RNA_polymerase,BS,939937,NP_388354,-0.055848,728883360,mtqpskttkltkdevdrlisdyqtkqdeqaqetlvrvytnlvdmla...,42.748092,16.030534,24.809160,16.412214,29900.8971,0.049618,5.545803
432,432,RNA_polymerase,BS,936362,NP_391300,-0.140629,728886343,mdmklqqvqvlkpqltqelrqaitllgyhsaelaeyidelslenpl...,43.807339,15.596330,27.752294,12.844037,49700.0669,0.064220,7.716086
433,433,RNA_polymerase,BS,938729,NP_390226,-0.528350,728885268,mdvevkkngknaqlkdhevkelikqsqngdqqardllieknmrlvw...,43.137255,16.862745,21.568627,18.431373,29372.0675,0.058824,5.252059
434,434,RNA_polymerase,BS,939953,NP_389416,0.303125,728884442,msrnkveicgvdtsklpvlkneemrklfrqlqdegddsareklvng...,45.384615,16.153846,21.923077,16.538462,30073.0463,0.061538,5.632312


In [13]:
# Persist the feature-augmented frame to CSV, then display it.
# NOTE(review): to_csv writes the row index as an extra unnamed column by
# default; confirm whether downstream readers expect that column before
# switching to index=False.
df.to_csv('compiled_features_complete.csv')

df

Unnamed: 0,GENENAME,ORG,ENTREZID,REFSEQ,LOG2FC,True_EID,SEQUENCE,AA_NP,AA_POS,AA_POL,AA_NEG,MW,AROM,ISO_E
0,ABC_transporter,PA,879411,NP_248876,-0.038469,15595384,mkaltssllglfaapvlagllgayvplasaappkeiriavpdvsag...,59.773371,11.614731,17.847025,10.764873,37193.0756,0.070822,6.871061
1,ABC_transporter,PA,883108,NP_248894,-0.207718,15595400,mhqriasiglgltlalggsaqaagqlnvvswsgyfspqllekfeke...,54.941860,11.337209,22.674419,11.046512,37859.8560,0.093023,6.033937
2,ABC_transporter,PA,878380,NP_249014,-0.187309,15595520,mtyrtpltllfaaglalggqaraegtlhfanwsdyyppellkkfek...,52.449568,14.121037,20.461095,12.968300,38901.0268,0.112392,6.350359
3,ABC_transporter,PA,880771,NP_249293,0.085173,15595799,mlpamrtgllcallgvtapawaeyvtvisfggankeaqetafykpf...,59.593023,11.918605,17.732558,10.755814,37832.8132,0.116279,6.919602
4,ABC_transporter,PA,879023,NP_249295,0.038834,15595801,mskslkaaslkfatlaaglacaaqamavdltvvsfgganksaqika...,54.310345,11.494253,22.413793,11.781609,38166.9526,0.112069,5.869103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431,RNA_polymerase,BS,939937,NP_388354,-0.055848,728883360,mtqpskttkltkdevdrlisdyqtkqdeqaqetlvrvytnlvdmla...,42.748092,16.030534,24.809160,16.412214,29900.8971,0.049618,5.545803
432,RNA_polymerase,BS,936362,NP_391300,-0.140629,728886343,mdmklqqvqvlkpqltqelrqaitllgyhsaelaeyidelslenpl...,43.807339,15.596330,27.752294,12.844037,49700.0669,0.064220,7.716086
433,RNA_polymerase,BS,938729,NP_390226,-0.528350,728885268,mdvevkkngknaqlkdhevkelikqsqngdqqardllieknmrlvw...,43.137255,16.862745,21.568627,18.431373,29372.0675,0.058824,5.252059
434,RNA_polymerase,BS,939953,NP_389416,0.303125,728884442,msrnkveicgvdtsklpvlkneemrklfrqlqdegddsareklvng...,45.384615,16.153846,21.923077,16.538462,30073.0463,0.061538,5.632312


In [43]:
# NOTE(review): the saved output below holds a single 'PROT_SEQ' entry rather
# than a row of the feature table built above — this looks like a leftover
# scratch cell run against a different `df`; confirm and re-run or remove.
df.iloc[0]

PROT_SEQ    MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTV...
Name: 0, dtype: object

In [44]:
# NOTE(review): the frames displayed earlier have no 'PROT_SEQ' column (their
# sequences are in 'SEQUENCE'), so this lookup presumably relied on a `df`
# from another session — confirm before keeping this cell.
df['PROT_SEQ']

0    MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTV...
Name: PROT_SEQ, dtype: object

In [45]:
# NOTE(review): the saved output is a 1x1 object array holding one sequence,
# which does not match the multi-row feature table shown earlier — likely a
# stale scratch cell; confirm before keeping.
df.values

array([['MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPV']],
      dtype=object)