# Create Feature Matrix
This is a shortened version of the feature construction notebook designed for the limited solubility dataset.

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib as plt
import numpy as np
import os
import sys
import re
import subprocess
import itertools
from tqdm import tqdm
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

In [2]:
# ROOT_DIR: directory the notebook is run from, with a trailing '/' so that
# file names can be appended to it by plain string concatenation.
ROOT_DIR = '{}/'.format(os.getcwd())

# Table of Contents

1. [Initialize Data](#Initialize-Data)
2. [Generate Features](#Generate-Features)
    1. [Physical Properties](#Physical-Properties)


# Initialize Data 
[Back to Top](#Table-of-Contents)

In [3]:
# Load the raw solubility table; the first CSV column becomes the row index.
prest_file = ROOT_DIR+'DF_solubility.csv'
DF_raw_prest = pd.read_csv(prest_file,index_col=0)
# A single pre-formatted string passed to print(...) produces the same text
# under Python 2 and Python 3 (the bare `print a, b` statement is a
# SyntaxError on Python 3).
print('Number of entries: {}'.format(len(DF_raw_prest)))
# Last expression of the cell: preview of the first rows.
DF_raw_prest.head()

Number of entries: 16082


Unnamed: 0,pf_cultivation_id,prest_id,aa_seq,solubility_class_1M
0,255,230050,QLKGRDLLTLKNFTGEEIKYMLWLSADLKFRIKQKGEYLPLLQGKS...,3.0
1,263,230548,LPYAMKPIDYYTETKILQERAVLGANDPEKNFLTTAIRPHGIFGPR...,2.0
2,364,231390,IAELGVPLSQVKSISGTAQDGNTEPLPPDSGDKNLVIQKSKDEAQD...,5.0
3,441,230501,SDSDVGSGGIRPKQPRMLQENTRMDMENEESMMSYEGDGGEASHGL...,5.0
4,521,140193,HPAYNPKNFSNDIMLLQLERKAKWTTAVRPLRLPSSKAQVKPGQLC...,4.5


# Generate Features
[Back to Top](#Table-of-Contents)

## Physical Properties
[Back to Top](#Table-of-Contents)

In [4]:
# Start the feature table from a fresh read of the solubility CSV
# (first CSV column used as the row index); feature columns are appended below.
solubility_csv = ROOT_DIR + 'DF_solubility.csv'
DF_physical_features = pd.read_csv(solubility_csv, index_col=0)

The BioPython `ProtParam` module quickly calculates many physical properties for any peptide sequence.

In [5]:
from Bio.SeqUtils import ProtParam

# Output columns, in the order they are appended to the feature table.
bio_columns = ['bio_pI', 'bio_mW', 'bio_aromaticity', 'bio_instability', 'bio_gravy']

# One list of per-sequence values for each output column.
bio_features = dict((column, []) for column in bio_columns)

# tqdm wraps the iteration to show a progress bar over the sequences.
for peptide in tqdm(DF_physical_features.aa_seq):
    analysis = ProtParam.ProteinAnalysis(peptide)
    bio_features['bio_mW'].append(analysis.molecular_weight())
    bio_features['bio_pI'].append(analysis.isoelectric_point())
    bio_features['bio_aromaticity'].append(analysis.aromaticity())
    bio_features['bio_instability'].append(analysis.instability_index())
    bio_features['bio_gravy'].append(analysis.gravy())

# Attach the collected values as new columns.
for column in bio_columns:
    DF_physical_features[column] = bio_features[column]

100%|██████████| 16082/16082 [00:05<00:00, 3190.63it/s]


Get the fractional composition of each amino acid (its count divided by the sequence length)

In [6]:
# One-letter codes of the 20 amino acids whose composition is recorded.
amino_acids = 'ARNDCQEGHILKMFPSTWYV'

# counts[i] holds, for amino_acids[i], the fraction of each sequence made up
# of that residue (occurrences divided by sequence length).
counts = []
for residue in amino_acids:
    fractions = [np.true_divide(peptide.count(residue), len(peptide))
                 for peptide in DF_physical_features.aa_seq]
    counts.append(fractions)
    DF_physical_features['list_comp_' + residue] = fractions

Calculate the net charge of each prEST, approximated as the number of K and R residues minus the number of D and E residues

In [7]:
sequences = DF_physical_features['aa_seq']

# Net charge from residue counts: K and R count +1 each, D and E count -1 each.
DF_physical_features['charge'] = [
    peptide.count('K') + peptide.count('R') - peptide.count('D') - peptide.count('E')
    for peptide in sequences
]
DF_physical_features['abs_charge'] = abs(DF_physical_features['charge'])

# Divide the net charge by the sequence length to get the charge per residue.
seq_lengths = [len(peptide) for peptide in sequences]
DF_physical_features['avg_charge'] = np.true_divide(DF_physical_features['charge'], seq_lengths)
DF_physical_features['abs_avg_charge'] = abs(DF_physical_features['avg_charge'])

Calculate the fraction of residues in each sequence that belong to various amino acid classes

In [8]:
# (output column, member residues) for each amino acid class, listed in the
# order the columns are appended to the feature table.
residue_classes = [
    ('frac_aliphatic', 'AGILPV'),
    ('frac_aromatic', 'FWY'),
    ('frac_uncharged_polar', 'STNQ'),
    ('frac_polar', 'QNHSTYCMW'),
    ('frac_hydrophobic', 'AGILPVF'),
    ('frac_positive', 'HKR'),
    ('frac_sulfur', 'CM'),
    ('frac_negative', 'DE'),
    ('frac_amide', 'NQ'),
    ('frac_alcohol', 'ST'),
]

# Each class fraction is the sum of the per-residue composition columns
# ('list_comp_<residue>') of its members.
for column, members in residue_classes:
    DF_physical_features[column] = sum(
        [DF_physical_features['list_comp_' + aa] for aa in members]
    )

Get length of peptides

In [9]:
# Number of residues in each sequence.
DF_physical_features['aa_len'] = list(map(len, DF_physical_features['aa_seq']))

In [10]:
# Write the completed feature table next to the input data.
features_csv = ROOT_DIR + 'DF_solubility_features.csv'
DF_physical_features.to_csv(features_csv)