In [1]:
import os

import pandas as pd
import numpy as np

import itertools
from collections import Counter

from featureextraction import aminos, utils, startextraction



In [2]:
# Bind the amino-acid collection and the residue property table from the
# project's `aminos` module to short names used by the cells below.
amino_acids, properties = aminos.amino_acids, aminos.properties

In [3]:
# Read the raw sequence table from the CSV file in the working directory,
# then preview its first five rows.
data = pd.read_csv(filepath_or_buffer='Sequence_Data.csv')
data.head(n=5)

Unnamed: 0,SeqNo,family_name,Sequence
0,1,G-protein coupled receptor 1 family,MALEQNQSTDYYYEENEMNGTYDYSQYELICIKEDVREFAKVFLPV...
1,2,Actin family,MSGGVYGGDEVGALVFDIGSFSVRAGYAGEDCPKADFPTTVGLLAA...
2,3,Ligand-gated ion channel (TC 1.A.9) family,MEGGWPARQSALLCLTVSLLLQGRGDAFTINCSGFDQHGVDPAVFQ...
3,4,G-protein coupled receptor 1 family,MPIMGSSVYITVELAIAVLAILGNVLVCWAVWLNSNLQNVTNYFVV...
4,5,Mitochondrial carrier (TC 2.A.29) family,MHREPAKKKAEKRLFDASSFGKDLLAGGVAAAVSKTAVAPIERVKL...


In [4]:
# Discard the SeqNo column (it appears to be just a running row number in the
# preview above) and show the first five rows of what remains.
data = data.drop(columns=['SeqNo'])
data.head(n=5)

Unnamed: 0,family_name,Sequence
0,G-protein coupled receptor 1 family,MALEQNQSTDYYYEENEMNGTYDYSQYELICIKEDVREFAKVFLPV...
1,Actin family,MSGGVYGGDEVGALVFDIGSFSVRAGYAGEDCPKADFPTTVGLLAA...
2,Ligand-gated ion channel (TC 1.A.9) family,MEGGWPARQSALLCLTVSLLLQGRGDAFTINCSGFDQHGVDPAVFQ...
3,G-protein coupled receptor 1 family,MPIMGSSVYITVELAIAVLAILGNVLVCWAVWLNSNLQNVTNYFVV...
4,Mitochondrial carrier (TC 2.A.29) family,MHREPAKKKAEKRLFDASSFGKDLLAGGVAAAVSKTAVAPIERVKL...


In [5]:
# Print a summary of the frame: number of entries, columns, non-null counts
# and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39408 entries, 0 to 39407
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   family_name  39408 non-null  object
 1   Sequence     39408 non-null  object
dtypes: object(2)
memory usage: 615.9+ KB


In [6]:
# Display the table returned by the project helper for the raw data; the
# rendered output lists each family_name with its #sequences.
utils.get_List_of_Families(data)

Unnamed: 0,family_name,#sequences
0,AGC Ser/Thr protein kinase family,94
1,Actin family,105
2,Ammonium transporter (TC 2.A.49) family,337
3,Annexin family,67
4,Cation transport ATPase (P-type) (TC 3.A.3) fa...,130
5,Complex I subunit 1 family,32
6,Complex I subunit 4 family,643
7,Complex I subunit 5 family,1553
8,Cyclic nucleotide phosphodiesterase family,96
9,Cyclin family,104


In [7]:
# Keep only the sequences whose every residue is listed in `amino_acids`.
#
# The validity of each row is computed up front as a boolean mask and applied
# once. Dropping rows in place from the frame that `iterrows()` is walking is
# not guaranteed to be safe (pandas advises against modifying what you iterate
# over) and repeats a full-frame drop for every rejected row.
is_valid = data['Sequence'].map(
    lambda sequence: all(residue in amino_acids for residue in sequence)
).astype(bool)  # explicit bool dtype so `~` also works on an empty frame

# Number of rejected rows, reported below.
count = int((~is_valid).sum())

# .copy() yields an independent frame (no SettingWithCopy warnings on later
# edits); the original row labels are kept, exactly as a drop would keep them.
data = data.loc[is_valid].copy()

print(f'{count} data dropped')
    

2304 data dropped


In [8]:
# Display the family table again, now that rows with invalid sequences have
# been removed, to see how the per-family counts changed.
utils.get_List_of_Families(data)

Unnamed: 0,family_name,#sequences
0,AGC Ser/Thr protein kinase family,93
1,Actin family,102
2,Ammonium transporter (TC 2.A.49) family,335
3,Annexin family,66
4,Cation transport ATPase (P-type) (TC 3.A.3) fa...,122
5,Complex I subunit 1 family,32
6,Complex I subunit 4 family,345
7,Complex I subunit 5 family,1153
8,Cyclic nucleotide phosphodiesterase family,93
9,Cyclin family,101


In [9]:
# Collapse closely related sub-families into one label each.
#
# Writing to the `row` objects yielded by `iterrows()` is unreliable: pandas
# may hand back a copy of the row, in which case the assignment never reaches
# the frame. Relabelling the column itself always takes effect.
family_merges = {
    'Complex I subunit 1 family': 'Complex I family',
    'Complex I subunit 4 family': 'Complex I family',
    'Complex I subunit 5 family': 'Complex I family',
    'Cytochrome c oxidase subunit 2 family': 'Cytochrome c oxidase family',
    'Cytochrome c oxidase subunit 3 family': 'Cytochrome c oxidase family',
    'G-protein coupled receptor 1 family': 'G-protein coupled family',
    'G-protein coupled receptor 2 family': 'G-protein coupled family',
    'MHC class I family': 'MHC family',
    'MHC class II family': 'MHC family',
}

# Exact-value replacement; names that are not keys of the mapping are left
# unchanged.
data['family_name'] = data['family_name'].replace(family_merges)

utils.get_List_of_Families(data)

Unnamed: 0,family_name,#sequences
0,AGC Ser/Thr protein kinase family,93
1,Actin family,102
2,Ammonium transporter (TC 2.A.49) family,335
3,Annexin family,66
4,Cation transport ATPase (P-type) (TC 3.A.3) fa...,122
5,Complex I family,1530
6,Cyclic nucleotide phosphodiesterase family,93
7,Cyclin family,101
8,Cytochrome P450 family,234
9,Cytochrome b family,1391


In [10]:
# Collect the names of families whose sequence count is below 80 or above
# 2000; these are treated as noisy and removed in the next cell.
families = utils.get_List_of_Families(data)

noisy_families = [
    family
    for family, n_sequences in zip(families['family_name'], families['#sequences'])
    if n_sequences < 80 or n_sequences > 2000
]

noisy_families

['Annexin family', 'ETS family', 'Globin family', 'MHC family']

In [11]:
# Remove every row that belongs to one of the noisy families.
#
# A single membership mask replaces the row-by-row in-place drop, which
# modified the frame while it was being iterated and rebuilt it once per
# removed row.
is_noisy = data['family_name'].isin(noisy_families)

# .copy() keeps the result independent of the previous frame; row labels of
# the surviving rows are preserved.
data = data.loc[~is_noisy].copy()

utils.get_List_of_Families(data)

Unnamed: 0,family_name,#sequences
0,AGC Ser/Thr protein kinase family,93
1,Actin family,102
2,Ammonium transporter (TC 2.A.49) family,335
3,Cation transport ATPase (P-type) (TC 3.A.3) fa...,122
4,Complex I family,1530
5,Cyclic nucleotide phosphodiesterase family,93
6,Cyclin family,101
7,Cytochrome P450 family,234
8,Cytochrome b family,1391
9,Cytochrome c oxidase family,666


In [12]:
# Banner text printed ahead of each summary.
info_banner = 'Viewing Database Information:\n\n'
description_banner = '\n\n\n\nViewing Database Description\n\n'

# Structural summary (entries, columns, dtypes) of the cleaned frame.
print(info_banner)
data.info()

# Descriptive statistics; as the cell's last expression it is rendered as a table.
print(description_banner)
data.describe()

Viewing Database Information:


<class 'pandas.core.frame.DataFrame'>
Int64Index: 11414 entries, 0 to 39407
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   family_name  11414 non-null  object
 1   Sequence     11414 non-null  object
dtypes: object(2)
memory usage: 525.6+ KB




Viewing Database Description




Unnamed: 0,family_name,Sequence
count,11414,11414
unique,34,10899
top,G-protein coupled family,MTHQSHAYHMVKPSPWPLTGALSALLMTSGLAMWFHFHSMTLLMLG...
freq,2117,3


In [13]:
# Display the amino-acid property table taken from the `aminos` module
# (one row per residue tag, as rendered below).
properties

Unnamed: 0,Tag,Protein_Name,Molecular_Weight,IsoElectric_Point,Hydropathy_Property,Hydropathy_Label,6_Letter_Encoding
0,A,Alanine,89.1,6.0,Hydrophobic,-1,e4
1,C,Cysteine,121.16,5.07,Hydrophobic,-1,e3
2,D,Aspartic Acid,133.11,2.77,Hydrophilic,1,e2
3,E,Glutamic Acid,147.13,3.22,Neutral,0,e2
4,F,Phenylalanine,165.19,5.48,Very Hydrophobic,-2,e6
5,G,Glycine,75.07,5.97,Neutral,0,e4
6,H,Histidine,155.16,7.59,Hydrophilic,1,e1
7,I,Isoleucine,131.18,6.02,Very Hydrophobic,-2,e5
8,K,Lysine,146.19,9.74,Hydrophilic,1,e1
9,L,Leucine,131.18,5.98,Very Hydrophobic,-2,e5


In [14]:
# Column layout for the frequency table: the two existing columns followed by
# one column per entry of `amino_acids`.
cols = ['family_name', 'Sequence', *amino_acids]

# Rebuilding the frame with this column list adds the new columns, which start
# out empty and are filled in by the next cell.
data = pd.DataFrame(data, columns=cols)

# Column labels in order; everything from position 2 onward is an amino-acid column.
labels = data.columns.tolist()


In [15]:

# Fill the per-amino-acid columns of every row with the values returned by the
# project helper `utils.get_Frequencies` for that row's sequence.
for index, row in data.iterrows():
    freq_count = utils.get_Frequencies(row['Sequence'])

    # Named `row_values` rather than `list`: rebinding the builtin `list` at
    # notebook level would break every later `list(...)` call in the kernel.
    row_values = [row['family_name'], row['Sequence']]

    # labels[2:] are the amino-acid columns, in the same order as the frame.
    row_values.extend(freq_count[amino] for amino in labels[2:])

    # Overwrite the whole row so the values line up with the column order.
    data.loc[index] = row_values

data.head()
    

Unnamed: 0,family_name,Sequence,A,C,D,E,F,G,H,I,...,M,N,P,Q,R,S,T,V,W,Y
0,G-protein coupled family,MALEQNQSTDYYYEENEMNGTYDYSQYELICIKEDVREFAKVFLPV...,22.0,12.0,10.0,13.0,22.0,14.0,2.0,35.0,...,15.0,14.0,15.0,12.0,12.0,22.0,20.0,34.0,5.0,20.0
1,Actin family,MSGGVYGGDEVGALVFDIGSFSVRAGYAGEDCPKADFPTTVGLLAA...,28.0,8.0,21.0,31.0,16.0,41.0,10.0,23.0,...,16.0,16.0,26.0,17.0,16.0,28.0,23.0,28.0,6.0,13.0
2,Ligand-gated ion channel (TC 1.A.9) family,MEGGWPARQSALLCLTVSLLLQGRGDAFTINCSGFDQHGVDPAVFQ...,23.0,9.0,21.0,17.0,26.0,27.0,6.0,25.0,...,16.0,19.0,29.0,16.0,16.0,32.0,33.0,27.0,11.0,13.0
3,G-protein coupled family,MPIMGSSVYITVELAIAVLAILGNVLVCWAVWLNSNLQNVTNYFVV...,42.0,15.0,9.0,15.0,20.0,34.0,11.0,29.0,...,8.0,18.0,24.0,16.0,19.0,26.0,14.0,35.0,7.0,11.0
4,Mitochondrial carrier (TC 2.A.29) family,MHREPAKKKAEKRLFDASSFGKDLLAGGVAAAVSKTAVAPIERVKL...,28.0,5.0,13.0,11.0,25.0,32.0,3.0,17.0,...,7.0,6.0,11.0,14.0,20.0,20.0,11.0,24.0,3.0,12.0


In [16]:
# Write the cleaned table next to the notebook.
# os.path.join picks the correct separator for the current platform, so no
# separate macOS and Windows path strings are needed.
path = os.path.join(os.getcwd(), 'cleaned_data.csv')

# Old name kept as an alias so any existing reference to it still resolves.
path_mac = path

# index=False: the row labels are not written to the file.
data.to_csv(path, index=False)

#### Option definition
- All Features = 'a'
- Behavioural Features = 'b'
- Structural Features = 's'

`extractFeatures(int num_rows = 0, int min_n_gram_value = 2, int max_n_gram_value = 5, str option = 'a')`

In [None]:
# Run the project's feature extraction with its default arguments and wrap
# whatever it returns in a DataFrame, then preview the first five rows.
extracted_features = pd.DataFrame(startextraction.extractFeatures())
extracted_features.head(n=5)

In [None]:
# Preview the first rows of the extracted feature table.
extracted_features.head()

In [None]:
# Write the extracted features next to the notebook.
# os.path.join picks the correct separator for the current platform, so one
# path serves macOS/Linux and Windows alike; both old names are kept as
# aliases of it.
path_mac = path_windows = os.path.join(os.getcwd(), 'extracted_data.csv')

# index=False: the row labels are not written to the file.
extracted_features.to_csv(path_mac, index=False)