In [2]:
import pandas as pd
from modlamp.descriptors import GlobalDescriptor

In [5]:
# One-letter codes of the 20 residues whose per-sequence frequency is
# computed. The order (note 'N' placed before 'K') is kept exactly as-is
# because it determines the order of the residue columns written out.
LIST_RESIDUES = list("ACDEFGHINKLMPQRSTVWY")

In [3]:
def get_mw(sequence):
    """Return the molecular weight of a sequence, or None on failure.

    Args:
        sequence: Amino-acid sequence passed (as a one-element list) to
            modlamp's ``GlobalDescriptor``.

    Returns:
        The first value produced by ``calculate_MW(amide=True)``, rounded
        to 4 decimal places, or ``None`` if the computation raises.
    """
    try:
        desc = GlobalDescriptor([sequence])
        # amide=True is forwarded to modlamp (presumably C-terminal
        # amidation -- confirm against the modlamp documentation).
        desc.calculate_MW(amide=True)
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Best-effort: a sequence that cannot be processed yields a missing
        # value. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        return None

def get_isoelectric_point(sequence):
    """Return the isoelectric point of a sequence, or None on failure.

    Args:
        sequence: Amino-acid sequence passed (as a one-element list) to
            modlamp's ``GlobalDescriptor``.

    Returns:
        The first value produced by ``isoelectric_point(amide=True)``,
        rounded to 4 decimal places, or ``None`` if the computation raises.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.isoelectric_point(amide=True)
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Best-effort: a sequence that cannot be processed yields a missing
        # value. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        return None

def get_charge_density(sequence):
    """Return the charge density of a sequence at pH 7, or None on failure.

    Args:
        sequence: Amino-acid sequence passed (as a one-element list) to
            modlamp's ``GlobalDescriptor``.

    Returns:
        The first value produced by ``charge_density(ph=7, amide=True)``,
        rounded to 5 decimal places (one more digit than the other
        descriptors in this file), or ``None`` if the computation raises.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.charge_density(ph=7, amide=True)
        return round(desc.descriptor[0][0], 5)
    except Exception:
        # Best-effort: a sequence that cannot be processed yields a missing
        # value. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        return None

def get_charge(sequence):
    """Return the charge of a sequence at pH 7, or None on failure.

    Args:
        sequence: Amino-acid sequence passed (as a one-element list) to
            modlamp's ``GlobalDescriptor``.

    Returns:
        The first value produced by ``calculate_charge(ph=7, amide=True)``,
        rounded to 4 decimal places, or ``None`` if the computation raises.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.calculate_charge(ph=7, amide=True)
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Best-effort: a sequence that cannot be processed yields a missing
        # value. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        return None

def get_instability_index(sequence):
    """Return the instability index of a sequence, or None on failure.

    Args:
        sequence: Amino-acid sequence passed (as a one-element list) to
            modlamp's ``GlobalDescriptor``.

    Returns:
        The first value produced by ``instability_index()``, rounded to
        4 decimal places, or ``None`` if the computation raises.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.instability_index()
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Best-effort: a sequence that cannot be processed yields a missing
        # value. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        return None

def get_aromaticity(sequence):
    """Return the aromaticity of a sequence, or None on failure.

    Args:
        sequence: Amino-acid sequence passed (as a one-element list) to
            modlamp's ``GlobalDescriptor``.

    Returns:
        The first value produced by ``aromaticity()``, rounded to
        4 decimal places, or ``None`` if the computation raises.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.aromaticity()
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Best-effort: a sequence that cannot be processed yields a missing
        # value. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        return None

def get_aliphatic_index(sequence):
    """Return the aliphatic index of a sequence, or None on failure.

    Args:
        sequence: Amino-acid sequence passed (as a one-element list) to
            modlamp's ``GlobalDescriptor``.

    Returns:
        The first value produced by ``aliphatic_index()``, rounded to
        4 decimal places, or ``None`` if the computation raises.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.aliphatic_index()
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Best-effort: a sequence that cannot be processed yields a missing
        # value. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        return None

def get_boman_index(sequence):
    """Return the Boman index of a sequence, or None on failure.

    Args:
        sequence: Amino-acid sequence passed (as a one-element list) to
            modlamp's ``GlobalDescriptor``.

    Returns:
        The first value produced by ``boman_index()``, rounded to
        4 decimal places, or ``None`` if the computation raises.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.boman_index()
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Best-effort: a sequence that cannot be processed yields a missing
        # value. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        return None

def get_hydrophobic_ratio(sequence):
    """Return the hydrophobic ratio of a sequence, or None on failure.

    Args:
        sequence: Amino-acid sequence passed (as a one-element list) to
            modlamp's ``GlobalDescriptor``.

    Returns:
        The first value produced by ``hydrophobic_ratio()``, rounded to
        4 decimal places, or ``None`` if the computation raises.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.hydrophobic_ratio()
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Best-effort: a sequence that cannot be processed yields a missing
        # value. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        return None

In [4]:
def get_frequency_residue(sequence, residue):
    """Return the fraction of ``sequence`` made up of ``residue``.

    Args:
        sequence: String in which occurrences are counted.
        residue: Substring (here a one-letter residue code) to count.

    Returns:
        ``sequence.count(residue) / len(sequence)`` rounded to 4 decimal
        places, or ``0.0`` for an empty sequence.
    """
    if not sequence:
        # An empty sequence contains no residues; avoid ZeroDivisionError.
        return 0.0
    return round(sequence.count(residue) / len(sequence), ndigits=4)

In [6]:
# Output column name -> function computing it from a sequence. Insertion
# order is the order in which the columns are added to each frame.
DESCRIPTOR_COLUMNS = {
    "Molecular Weight": get_mw,
    "Isoelectric point": get_isoelectric_point,
    "Aromaticity": get_aromaticity,
    "Aliphatic index": get_aliphatic_index,
    "Boman index": get_boman_index,
    "Charge": get_charge,
    "Charge density": get_charge_density,
    "Hydrophobic ratio": get_hydrophobic_ratio,
    "Instability index": get_instability_index,
}

# One characterized DataFrame per plastic type; concatenated afterwards.
list_df = []
for plastic in ['PET', 'PHB', 'PHA', 'PLA', 'PCL', 'PU_PUR','NYLON_PA', 'PBAT']:

    df_data = pd.read_csv(f"../../results/generated_dataset/{plastic}/processed_data.csv")
    # Keep only rows labelled 1. The explicit copy makes the column
    # assignments below write to an independent frame instead of a slice
    # of the frame that was read (avoids SettingWithCopyWarning).
    df_data = df_data[df_data["label"] == 1].copy()

    for column, descriptor_function in DESCRIPTOR_COLUMNS.items():
        df_data[column] = df_data["sequence"].apply(descriptor_function)

    # Directory names use '_' where the label uses '/', e.g. PU_PUR -> PU/PUR.
    df_data["Type Plastic"] = plastic.replace("_", "/")

    # One frequency column per residue, named by its one-letter code.
    for residue in LIST_RESIDUES:
        df_data[residue] = df_data["sequence"].apply(
            get_frequency_residue, args=(residue,)
        )
    list_df.append(df_data)

In [10]:
# Stack the per-plastic frames row-wise into a single table and write it
# to disk without the index column.
df_processed = pd.concat(objs=list_df, axis=0)
df_processed.to_csv(
    "../../results/characterized_dataset/dataset_characterized.csv",
    index=False,
)