In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix

import itertools
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# One-letter codes of the 20 amino acids this notebook recognises. The list is
# used below both to validate sequences and as the per-residue column names.
amino_acids = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']

In [None]:
def get_List_of_Families(frame=None):
    """Count the distinct sequences recorded for each protein family.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table with ``family_name`` and ``Sequence`` columns. When omitted, the
        notebook-level ``data`` frame is used, so zero-argument calls behave
        as before.

    Returns
    -------
    pandas.DataFrame
        One row per family with the columns ``family_name`` and
        ``#sequences`` (number of unique ``Sequence`` values), sorted by
        family name.
    """
    if frame is None:
        frame = data

    # Select 'Sequence' explicitly. Aggregating every column produces one
    # output column per input column, so relabelling the result with exactly
    # two names fails as soon as the frame carries any additional column.
    return (
        frame.groupby('family_name')['Sequence']
        .nunique()
        .reset_index(name='#sequences')
    )

In [None]:
# Zero-initialised tally with one key per one-letter code in amino_acids.
count_freq = dict.fromkeys(amino_acids, 0)
        

In [None]:
def get_Frequencies(sequence):
    """Tally how often each known amino acid occurs in ``sequence``.

    The result is a dict with one key per entry of the notebook-level
    ``amino_acids`` list, in that order; residues that never occur map to 0.
    A character that is not in ``amino_acids`` raises ``KeyError``.
    """
    tally = dict.fromkeys(amino_acids, 0)

    for residue in sequence:
        tally[residue] = tally[residue] + 1

    return tally

In [None]:
# Read the sequence table; the path is relative to the kernel's working
# directory. head() previews the first rows as the cell output.
data = pd.read_csv('Sequence_Data.csv')
data.head()

In [None]:
# Remove the SeqNo column and preview what is left.
data = data.drop(columns=['SeqNo'])
data.head()

In [None]:
# Print the column summary of the table as it stands after dropping SeqNo.
data.info()

In [None]:
# Unique-sequence count per family, before any rows have been filtered out.
get_List_of_Families()

In [None]:
# Keep only the sequences made up entirely of residues listed in amino_acids.
#
# Rows are selected with one boolean mask instead of being dropped one at a
# time inside an iterrows() loop: that avoids modifying the frame while it is
# being iterated and avoids paying a full drop for every rejected row.
valid_residues = set(amino_acids)
is_valid = data['Sequence'].map(
    # Non-string entries (for example NaN from an empty CSV field) cannot be
    # checked residue by residue, so they are rejected rather than raising.
    lambda seq: isinstance(seq, str) and valid_residues.issuperset(seq)
).astype(bool)  # keeps the mask boolean even when the frame is empty

count = int((~is_valid).sum())

# Original index labels are kept (no reset). The copy makes the result an
# independent frame so later in-place edits do not act on a slice.
data = data.loc[is_valid].copy()

print(f'{count} data dropped')
    

In [None]:
# Family counts again, now that sequences containing characters outside
# amino_acids have been removed.
get_List_of_Families()

In [None]:
# Collapse numbered sub-families into their parent family.
#
# The relabelling is applied to the column itself. Assigning to the `row`
# yielded by DataFrame.iterrows() is not a reliable way to update the frame:
# the row can be a copy, in which case the write is silently discarded and
# the families are never merged.
family_merges = {
    'Complex I subunit 1 family': 'Complex I family',
    'Complex I subunit 4 family': 'Complex I family',
    'Complex I subunit 5 family': 'Complex I family',
    'Cytochrome c oxidase subunit 2 family': 'Cytochrome c oxidase family',
    'Cytochrome c oxidase subunit 3 family': 'Cytochrome c oxidase family',
    'G-protein coupled receptor 1 family': 'G-protein coupled family',
    'G-protein coupled receptor 2 family': 'G-protein coupled family',
    'MHC class I family': 'MHC family',
    'MHC class II family': 'MHC family',
}

# Names that are not keys of the mapping are left exactly as they are.
data['family_name'] = data['family_name'].replace(family_merges)

get_List_of_Families()

In [None]:
# Families with fewer than 80 or more than 2000 sequences are collected as
# "noisy"; the list keeps the order of the family table.
families = get_List_of_Families()

sequence_counts = families['#sequences']
outside_range = (sequence_counts < 80) | (sequence_counts > 2000)
noisy_families = families.loc[outside_range, 'family_name'].tolist()

noisy_families

In [None]:
# Remove every sequence whose family is listed in noisy_families.
#
# One boolean mask does the whole job. Dropping rows in place from inside a
# data.iterrows() loop modifies the frame while it is being iterated and pays
# a full drop for every removed row.
is_noisy = data['family_name'].isin(noisy_families)

# Index labels are kept as they are; the copy yields an independent frame.
data = data.loc[~is_noisy].copy()

get_List_of_Families()

In [None]:
# Column summary of the filtered table; info() prints its report directly.
print ('Viewing Database Information:\n\n')
data.info()

# describe() is the last expression of the cell, so its summary table is what
# the cell displays as output.
print ('\n\n\n\nViewing Database Description\n\n')
data.describe()

In [None]:
# Per-residue reference table. Every list is aligned position by position
# with amino_acids, which supplies the 'Tag' column.
properties = pd.DataFrame({
    'Tag': amino_acids,
    'Protein_Name': [
        'Alanine', 'Cysteine', 'Aspartic Acid', 'Glutamic Acid', 'Phenylalanine',
        'Glycine', 'Histidine', 'Isoleucine', 'Lysine', 'Leucine',
        'Methionine', 'Asparagine', 'Proline', 'Glutamine', 'Arginine',
        'Serine', 'Threonine', 'Valine', 'Tryptophan', 'Tyrosine',
    ],
    'Molecular_Weight': [
        89.1, 121.16, 133.11, 147.13, 165.19,
        75.07, 155.16, 131.18, 146.19, 131.18,
        149.21, 132.12, 115.13, 146.15, 174.2,
        105.09, 119.12, 117.15, 204.23, 181.19,
    ],
    'IsoElectric_Point': [
        6, 5.07, 2.77, 3.22, 5.48,
        5.97, 7.59, 6.02, 9.74, 5.98,
        5.74, 5.41, 6.3, 5.65, 10.76,
        5.68, 5.6, 5.96, 5.89, 5.66,
    ],
    'Hydropathy_Property': [
        'Hydrophobic', 'Hydrophobic', 'Hydrophilic', 'Neutral', 'Very Hydrophobic',
        'Neutral', 'Hydrophilic', 'Very Hydrophobic', 'Hydrophilic', 'Very Hydrophobic',
        'Very Hydrophobic', 'Neutral', 'Hydrophilic', 'Neutral', 'Hydrophilic',
        'Neutral', 'Neutral', 'Very Hydrophobic', 'Very Hydrophobic', 'Hydrophobic',
    ],
})

properties


In [None]:
# Target layout: the two existing columns followed by one column per residue.
cols = ['family_name', 'Sequence'] + amino_acids

# Rebuilding the frame with the wider column list keeps the existing columns
# and adds the residue columns, which are filled in by a later cell.
data = pd.DataFrame(data, columns=cols)
labels = data.columns.tolist()


In [None]:

# Fill the per-residue count columns (everything after family_name/Sequence).
#
# The counts for all sequences are built first and assigned in one step,
# instead of rewriting the frame one .loc row at a time. No variable is named
# `list` here: rebinding that name at notebook level shadows the built-in for
# every cell executed afterwards.
residue_cols = labels[2:]

freq_table = pd.DataFrame(
    [get_Frequencies(sequence) for sequence in data['Sequence']],
    index=data.index,       # align the counts with the rows they came from
    columns=residue_cols,   # fixes the column set even when there are no rows
)
data[residue_cols] = freq_table

data.head()
    

In [None]:
# For every sequence: multiply each residue's molecular weight (and
# isoelectric point) by its 1-based position, sum the products and divide by
# the sequence length. Results go to list_PAMW and list_PAIW, in row order.
#
# The per-residue property values are looked up from plain dicts built once.
# Filtering the `properties` frame with a boolean mask for every residue of
# every sequence repeats the same table scan over and over.
weight_by_tag = dict(zip(properties['Tag'], properties['Molecular_Weight']))
isoelectric_by_tag = dict(zip(properties['Tag'], properties['IsoElectric_Point']))

list_PAMW = []
list_PAIW = []

for sequence in data['Sequence']:
    pos_MW = 0.0
    pos_IW = 0.0

    for pos, amino in enumerate(sequence, start=1):
        pos_MW += weight_by_tag[amino] * pos
        pos_IW += isoelectric_by_tag[amino] * pos

    # Each entry stays a one-element array, the shape the mask-based `.values`
    # lookup produced, so code that consumes these lists sees no change.
    list_PAMW.append(np.array([pos_MW / len(sequence)]))
    list_PAIW.append(np.array([pos_IW / len(sequence)]))
   
