# Amino Acid Grouping by Gene Type

In [1]:
# Imports

import math
import pandas as pd
import matplotlib.pyplot as plt

In [7]:
data = pd.read_csv('compiled_features_complete.csv')
data

Unnamed: 0.1,Unnamed: 0,GENENAME,ORG,ENTREZID,REFSEQ,LOG2FC,True_EID,SEQUENCE,AA_NP,AA_POS,AA_POL,AA_NEG,MW,AROM,ISO_E
0,0,ABC_transporter,PA,879411,NP_248876,-0.038469,15595384,mkaltssllglfaapvlagllgayvplasaappkeiriavpdvsag...,59.773371,11.614731,17.847025,10.764873,37193.0756,0.070822,6.871061
1,1,ABC_transporter,PA,883108,NP_248894,-0.207718,15595400,mhqriasiglgltlalggsaqaagqlnvvswsgyfspqllekfeke...,54.941860,11.337209,22.674419,11.046512,37859.8560,0.093023,6.033937
2,2,ABC_transporter,PA,878380,NP_249014,-0.187309,15595520,mtyrtpltllfaaglalggqaraegtlhfanwsdyyppellkkfek...,52.449568,14.121037,20.461095,12.968300,38901.0268,0.112392,6.350359
3,3,ABC_transporter,PA,880771,NP_249293,0.085173,15595799,mlpamrtgllcallgvtapawaeyvtvisfggankeaqetafykpf...,59.593023,11.918605,17.732558,10.755814,37832.8132,0.116279,6.919602
4,4,ABC_transporter,PA,879023,NP_249295,0.038834,15595801,mskslkaaslkfatlaaglacaaqamavdltvvsfgganksaqika...,54.310345,11.494253,22.413793,11.781609,38166.9526,0.112069,5.869103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431,431,RNA_polymerase,BS,939937,NP_388354,-0.055848,728883360,mtqpskttkltkdevdrlisdyqtkqdeqaqetlvrvytnlvdmla...,42.748092,16.030534,24.809160,16.412214,29900.8971,0.049618,5.545803
432,432,RNA_polymerase,BS,936362,NP_391300,-0.140629,728886343,mdmklqqvqvlkpqltqelrqaitllgyhsaelaeyidelslenpl...,43.807339,15.596330,27.752294,12.844037,49700.0669,0.064220,7.716086
433,433,RNA_polymerase,BS,938729,NP_390226,-0.528350,728885268,mdvevkkngknaqlkdhevkelikqsqngdqqardllieknmrlvw...,43.137255,16.862745,21.568627,18.431373,29372.0675,0.058824,5.252059
434,434,RNA_polymerase,BS,939953,NP_389416,0.303125,728884442,msrnkveicgvdtsklpvlkneemrklfrqlqdegddsareklvng...,45.384615,16.153846,21.923077,16.538462,30073.0463,0.061538,5.632312


In [9]:
# Get column names
column_names = list(data.columns.values)

### Sort data by gene type
Types of genes are:
- ABC_transporter
- cytochrome
- DNA_polymerase
- efflux_transporter
- electron_transfer
- flagellar_motility
- NADH_NADPH
- RNA_polymerase

In [15]:
# Initialize dataframe

ABC_transporter_df = pd.DataFrame(columns=column_names)
cytochrome_df = pd.DataFrame(columns=column_names)
DNA_polymerase_df = pd.DataFrame(columns=column_names)
RNA_polymerase_df = pd.DataFrame(columns=column_names)
efflux_transporter_df = pd.DataFrame(columns=column_names)
electron_transfer_df = pd.DataFrame(columns=column_names)
flagellar_motility_df = pd.DataFrame(columns=column_names)
NADH_NADPH_df = pd.DataFrame(columns=column_names)

In [19]:
# Sort rows in data dataframe by gene name
for i in range(len(data)):
    if data['GENENAME'][i] == 'ABC_transporter':
        ABC_transporter_df.append(data.loc[i], ignore_index=True)
    elif data['GENENAME'][i] == 'cytochrome':
        cytochrome_df.append(data.loc[i], ignore_index=True)
    elif data['GENENAME'][i] == 'DNA_polymerase':
        DNA_polymerase_df.append(data.loc[i], ignore_index=True)
    elif data['GENENAME'][i] == 'RNA_polymerase':
        RNA_polymerase_df.append(data.loc[i], ignore_index=True)
    elif data['GENENAME'][i] == 'efflux_transporter':
        efflux_transporter_df.append(data.loc[i], ignore_index=True)
    elif data['GENENAME'][i] == 'electron_transfer':
        electron_transfer_df.append(data.loc[i], ignore_index=True)
    elif data['GENENAME'][i] == 'flagellar_motility':
        flagellar_motility_df.append(data.loc[i], ignore_index=True)
    elif data['GENENAME'][i] == 'NADH_NADPH':
        NADH_NADPH_df.append(data.loc[i], ignore_index=True)

In [29]:
ABC_transporter_df

Unnamed: 0.1,Unnamed: 0,GENENAME,ORG,ENTREZID,REFSEQ,LOG2FC,True_EID,SEQUENCE,AA_NP,AA_POS,AA_POL,AA_NEG,MW,AROM,ISO_E


In [20]:
# List of dataframes
dfs = [ABC_transporter_df, cytochrome_df, DNA_polymerase_df, RNA_polymerase_df, 
       efflux_transporter_df, electron_transfer_df, flagellar_motility_df, NADH_NADPH_df]

In [27]:
# Determine average amino acid group percentages per gene type

nonpolar = []
positive = []
polar = []
negative = []

for i in range(len(dfs)):
    dat_frm = dfs[i]
    nonpolar.append(dat_frm['AA_NP'].mean())
    positive.append(dat_frm['AA_POS'].mean())
    polar.append(dat_frm['AA_POL'].mean())
    negative.append(dat_frm['AA_NEG'].mean())

In [28]:
dfs[i]['AA_NP']

Series([], Name: AA_NP, dtype: object)

### Visualization function from Amino_Group_Scoring_JK notebook

In [22]:
def viz_aa_groups(nonpolar, positive, polar, negative):
    """
    Visualization for the amino acid groups in a sequence.  Inputs should be as a percentage.
    """
    
    labels = 'Nonpolar', 'Positive', 'Polar', 'Negative'
    sizes = [nonpolar, positive, polar, negative]
    fig, ax = plt.subplots()
    ax.pie(sizes, labels=labels, autopct='%1.0f%%', colors=['cornflowerblue', 'mediumaquamarine', 
                                                            'yellowgreen', 'palegoldenrod'])
    ax.axis('equal')
    plt.rcParams.update({'font.size': 14})
    plt.show()
    
    return fig

In [23]:
for i in range(len(dfs)):
    np = nonpolar[i]
    pos = positive[i]
    pol = polar[i]
    neg = negative[i]
    
    viz_aa_groups(np, pos, pol, neg)

ValueError: cannot convert float NaN to integer

posx and posy should be finite values
posx and posy should be finite values


ValueError: need at least one array to concatenate

<Figure size 432x288 with 1 Axes>