In [1]:
import pandas as pd
from modlamp.descriptors import GlobalDescriptor

In [2]:
def get_mw(sequence):
    """Molecular weight of a peptide sequence, computed with modlamp.

    Builds a single-sequence ``GlobalDescriptor``, calls ``calculate_MW``
    with ``amide=True`` (modlamp's flag for C-terminally amidated sequences)
    and rounds the resulting value to 4 decimal places.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence to evaluate.

    Returns
    -------
    float or None
        The rounded molecular weight, or ``None`` when the descriptor could
        not be computed for this sequence.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.calculate_MW(amide=True)
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Best-effort: a sequence that cannot be processed becomes a missing
        # value instead of aborting a whole ``.apply``. Catching ``Exception``
        # rather than using a bare ``except`` lets KeyboardInterrupt and
        # SystemExit propagate, so the cell can still be interrupted.
        return None

def get_isoelectric_point(sequence):
    """Isoelectric point of a peptide sequence, computed with modlamp.

    Builds a single-sequence ``GlobalDescriptor``, calls
    ``isoelectric_point`` with ``amide=True`` and rounds the resulting value
    to 4 decimal places.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence to evaluate.

    Returns
    -------
    float or None
        The rounded isoelectric point, or ``None`` when the descriptor could
        not be computed for this sequence.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.isoelectric_point(amide=True)
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Failures map to a missing value so one bad sequence does not stop
        # the column computation; ``Exception`` (not a bare ``except``) keeps
        # KeyboardInterrupt/SystemExit propagating.
        return None

def get_charge_density(sequence):
    """Charge density of a peptide sequence, computed with modlamp.

    Builds a single-sequence ``GlobalDescriptor``, calls ``charge_density``
    with ``ph=7`` and ``amide=True`` and rounds the resulting value to
    5 decimal places (one more digit than the other descriptor helpers in
    this notebook).

    Parameters
    ----------
    sequence : str
        Amino-acid sequence to evaluate.

    Returns
    -------
    float or None
        The rounded charge density, or ``None`` when the descriptor could
        not be computed for this sequence.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.charge_density(ph=7, amide=True)
        return round(desc.descriptor[0][0], 5)
    except Exception:
        # Failures map to a missing value so one bad sequence does not stop
        # the column computation; ``Exception`` (not a bare ``except``) keeps
        # KeyboardInterrupt/SystemExit propagating.
        return None

def get_charge(sequence):
    """Charge of a peptide sequence, computed with modlamp.

    Builds a single-sequence ``GlobalDescriptor``, calls
    ``calculate_charge`` with ``ph=7`` and ``amide=True`` and rounds the
    resulting value to 4 decimal places.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence to evaluate.

    Returns
    -------
    float or None
        The rounded charge, or ``None`` when the descriptor could not be
        computed for this sequence.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.calculate_charge(ph=7, amide=True)
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Failures map to a missing value so one bad sequence does not stop
        # the column computation; ``Exception`` (not a bare ``except``) keeps
        # KeyboardInterrupt/SystemExit propagating.
        return None

def get_instability_index(sequence):
    """Instability index of a peptide sequence, computed with modlamp.

    Builds a single-sequence ``GlobalDescriptor``, calls
    ``instability_index`` and rounds the resulting value to 4 decimal places.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence to evaluate.

    Returns
    -------
    float or None
        The rounded instability index, or ``None`` when the descriptor could
        not be computed for this sequence.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.instability_index()
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Failures map to a missing value so one bad sequence does not stop
        # the column computation; ``Exception`` (not a bare ``except``) keeps
        # KeyboardInterrupt/SystemExit propagating.
        return None

def get_aromaticity(sequence):
    """Aromaticity of a peptide sequence, computed with modlamp.

    Builds a single-sequence ``GlobalDescriptor``, calls ``aromaticity`` and
    rounds the resulting value to 4 decimal places.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence to evaluate.

    Returns
    -------
    float or None
        The rounded aromaticity, or ``None`` when the descriptor could not
        be computed for this sequence.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.aromaticity()
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Failures map to a missing value so one bad sequence does not stop
        # the column computation; ``Exception`` (not a bare ``except``) keeps
        # KeyboardInterrupt/SystemExit propagating.
        return None

def get_aliphatic_index(sequence):
    """Aliphatic index of a peptide sequence, computed with modlamp.

    Builds a single-sequence ``GlobalDescriptor``, calls ``aliphatic_index``
    and rounds the resulting value to 4 decimal places.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence to evaluate.

    Returns
    -------
    float or None
        The rounded aliphatic index, or ``None`` when the descriptor could
        not be computed for this sequence.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.aliphatic_index()
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Failures map to a missing value so one bad sequence does not stop
        # the column computation; ``Exception`` (not a bare ``except``) keeps
        # KeyboardInterrupt/SystemExit propagating.
        return None

def get_boman_index(sequence):
    """Boman index of a peptide sequence, computed with modlamp.

    Builds a single-sequence ``GlobalDescriptor``, calls ``boman_index`` and
    rounds the resulting value to 4 decimal places.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence to evaluate.

    Returns
    -------
    float or None
        The rounded Boman index, or ``None`` when the descriptor could not
        be computed for this sequence.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.boman_index()
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Failures map to a missing value so one bad sequence does not stop
        # the column computation; ``Exception`` (not a bare ``except``) keeps
        # KeyboardInterrupt/SystemExit propagating.
        return None

def get_hydrophobic_ratio(sequence):
    """Hydrophobic ratio of a peptide sequence, computed with modlamp.

    Builds a single-sequence ``GlobalDescriptor``, calls
    ``hydrophobic_ratio`` and rounds the resulting value to 4 decimal places.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence to evaluate.

    Returns
    -------
    float or None
        The rounded hydrophobic ratio, or ``None`` when the descriptor could
        not be computed for this sequence.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.hydrophobic_ratio()
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Failures map to a missing value so one bad sequence does not stop
        # the column computation; ``Exception`` (not a bare ``except``) keeps
        # KeyboardInterrupt/SystemExit propagating.
        return None

In [3]:
# Load the peptide table. The rendered output shows an unnamed index column
# plus the `sequence` and `Activity` columns.
df_data = pd.read_csv(
    filepath_or_buffer="../sequence_processing/peptides_avp/antiviral.csv",
)
df_data

Unnamed: 0,sequence,Activity
0,KKKKVVEATYVLV,1
1,GLPVCGESCFGGSCYTPGCSCTWPICTRD,1
2,MQYKINMYAIVVYDVNVSRQNQIREFLRKYLYHVQRSVFEGEISPS...,1
3,KQEGRDHDKSKGHFHMIVIHHKGGQAHHG,1
4,LAHKSRLYERHM,1
...,...,...
86472,RRRRRRRRGNLWAAQRYGRELRRMSDEFVDSFKK,0
86473,VIGGDECNINEHRFLVALYDGLSGTFLCGG,0
86474,MAAHKSFRIKQKLAKKLKQNRSVPQWVRLATGNTIRYNAKRRHWRR...,0
86475,GIACLCDSDGPSVRGNTLSGTYWLAGCPSGWHNCKSSGQLIGACCKQ,0


In [4]:
# Count rows per Activity label to see how the two classes are distributed.
df_data.loc[:, "Activity"].value_counts()

Activity
0    81300
1     5177
Name: count, dtype: int64

In [5]:
from sklearn.utils import shuffle

In [6]:
# Balance the classes: keep every Activity == 1 row and draw an equally sized,
# seeded random subset of the Activity == 0 rows.
df_positivo = df_data.loc[df_data["Activity"].eq(1)]
df_negativo = shuffle(
    df_data.loc[df_data["Activity"].eq(0)],
    n_samples=len(df_positivo),
    random_state=42,
)

# Stack positives first, then the sampled negatives, and confirm the balance.
df_concat = pd.concat([df_positivo, df_negativo])
df_concat.loc[:, "Activity"].value_counts()

Activity
1    5177
0    5177
Name: count, dtype: int64

In [8]:
# Add one descriptor column per helper. The pair order fixes the column order
# written to the CSV.
descriptor_columns = (
    ("MW", get_mw),
    ("isoelectric_point", get_isoelectric_point),
    ("aromaticity", get_aromaticity),
    ("aliphatic_index", get_aliphatic_index),
    ("boman_index", get_boman_index),
    ("charge", get_charge),
    ("charge_density", get_charge_density),
    ("hydrophobic_ratio", get_hydrophobic_ratio),
    ("instability_index", get_instability_index),
)
for column_name, descriptor_fn in descriptor_columns:
    df_concat[column_name] = df_concat["sequence"].apply(descriptor_fn)

# Persist the balanced table with its descriptor columns, without the index.
df_concat.to_csv("estimated_properties.csv", index=False)