In [1]:
from datetime import datetime
from tqdm import tqdm
import numpy as np
import pandas as pd
import polars as pl
import utils
import os

In [2]:
# Base directory for every file this notebook reads or writes: the current working directory.
path = os.getcwd()

# Names of the source peptide databases; each one is loaded from `PDBs/<name>.csv`.
pdbs = [
    'APD3',
    'CancerPPD',
    'DBBASP',
    'FermFooDb',
    'MAHMI',
    'PepBDB',
    'THPdb',
]

In [3]:
# Load each source database's CSV into a pandas DataFrame, keyed by database name.
dfs = {
    source_name: pd.read_csv(f'{path}/PDBs/{source_name}.csv')
    for source_name in pdbs
}

In [4]:
# Running number used to build the LBQCPDB IDs; the first appended record gets 1.
count_id = 1

# Column-oriented accumulator for the merged database: one independent,
# initially empty list per output column.
data = {
    column: []
    for column in (
        'LBQCPDB ID',
        'Source ID',
        'Sequence',
        'Activity',
        'Source PDB',
        'DOI',
    )
}

In [5]:
# APD3
# Append every APD3 row to `data`, giving each record the next sequential LBQCPDB ID.
apd3 = dfs['APD3']
for i in tqdm(range(len(apd3))):
    # Positional access: the loop counts positions, so it must not depend on
    # the frame's index labels happening to be 0..n-1.
    line = apd3.iloc[i]
    # Six-digit, zero-padded running number (e.g. LBQCPDB_000001).
    data['LBQCPDB ID'].append('LBQCPDB_{:06d}'.format(count_id))
    data['Source ID'].append(line['APD ID'])
    data['Sequence'].append(line['Sequence'])
    data['Activity'].append(line['Activity'])
    data['Source PDB'].append('APD3')
    data['DOI'].append(line['DOI'])
    count_id += 1

100%|████████████████| 3579/3579 [00:00<00:00, 7129.70it/s]


In [6]:
# CancerPPD
# Append every CancerPPD row to `data`, giving each record the next sequential LBQCPDB ID.
cancerppd = dfs['CancerPPD']
for i in tqdm(range(len(cancerppd))):
    # Positional access: the loop counts positions, so it must not depend on
    # the frame's index labels happening to be 0..n-1.
    line = cancerppd.iloc[i]
    # Six-digit, zero-padded running number (e.g. LBQCPDB_000001).
    data['LBQCPDB ID'].append('LBQCPDB_{:06d}'.format(count_id))
    # No source identifier is read for this database; a placeholder is stored.
    data['Source ID'].append('Not Found')
    data['Sequence'].append(line['SEQUENCE'])
    # The 'NATURE' column is what gets stored as the activity here.
    data['Activity'].append(line['NATURE'])
    data['Source PDB'].append('CancerPPD')
    data['DOI'].append(line['DOI'])
    count_id += 1

100%|████████████████| 5068/5068 [00:00<00:00, 7843.64it/s]


In [7]:
# DBBASP
# Append every DBBASP row to `data`, giving each record the next sequential LBQCPDB ID.
dbbasp = dfs['DBBASP']
for i in tqdm(range(len(dbbasp))):
    # Positional access: the loop counts positions, so it must not depend on
    # the frame's index labels happening to be 0..n-1.
    line = dbbasp.iloc[i]
    # Six-digit, zero-padded running number (e.g. LBQCPDB_000001).
    data['LBQCPDB ID'].append('LBQCPDB_{:06d}'.format(count_id))
    data['Source ID'].append(line['ID'])
    data['Sequence'].append(line['SEQUENCE'])
    # No activity is read for this database; a placeholder is stored.
    data['Activity'].append('Not Found')
    data['Source PDB'].append('DBBASP')
    data['DOI'].append(line['DOI'])
    count_id += 1

100%|██████████████| 20420/20420 [00:02<00:00, 7452.76it/s]


In [8]:
# FermFooDb
# Append every FermFooDb row to `data`, giving each record the next sequential LBQCPDB ID.
fermfoodb = dfs['FermFooDb']
for i in tqdm(range(len(fermfoodb))):
    # Positional access: the loop counts positions, so it must not depend on
    # the frame's index labels happening to be 0..n-1.
    line = fermfoodb.iloc[i]
    # Six-digit, zero-padded running number (e.g. LBQCPDB_000001).
    data['LBQCPDB ID'].append('LBQCPDB_{:06d}'.format(count_id))
    # The source ID is the text between the first and second '=' of 'Link'
    # (presumably an id query parameter in a URL -- TODO confirm against the CSV).
    data['Source ID'].append(line['Link'].split('=')[1])
    data['Sequence'].append(line['Peptide Sequence'])
    data['Activity'].append(line['Activity'])
    data['Source PDB'].append('FermFooDb')
    data['DOI'].append(line['DOI'])
    count_id += 1

100%|████████████████| 2205/2205 [00:00<00:00, 4851.46it/s]


In [9]:
# MAHMI
# Append every MAHMI row to `data`, giving each record the next sequential LBQCPDB ID.
mahmi = dfs['MAHMI']
for i in tqdm(range(len(mahmi))):
    # Positional access: the loop counts positions, so it must not depend on
    # the frame's index labels happening to be 0..n-1.
    line = mahmi.iloc[i]
    # Six-digit, zero-padded running number (e.g. LBQCPDB_000001).
    data['LBQCPDB ID'].append('LBQCPDB_{:06d}'.format(count_id))
    data['Source ID'].append(line['ID'])
    data['Sequence'].append(line['SEQUENCE'])
    data['Activity'].append(line['BIOACTIVITY'])
    data['Source PDB'].append('MAHMI')
    # No DOI is read for this database; a placeholder is stored.
    data['DOI'].append('Not Found')
    count_id += 1

100%|████████████| 276400/276400 [00:37<00:00, 7362.56it/s]


In [10]:
# PepBDB
# Append every PepBDB row to `data`, giving each record the next sequential LBQCPDB ID.
pepbdb = dfs['PepBDB']
for i in tqdm(range(len(pepbdb))):
    # Positional access: the loop counts positions, so it must not depend on
    # the frame's index labels happening to be 0..n-1.
    line = pepbdb.iloc[i]
    # Six-digit, zero-padded running number (e.g. LBQCPDB_000001).
    data['LBQCPDB ID'].append('LBQCPDB_{:06d}'.format(count_id))
    data['Source ID'].append(line['Peptide ID'])
    data['Sequence'].append(line['Sequence'])
    # Neither activity nor DOI is read for this database; placeholders are stored.
    data['Activity'].append('Not Found')
    data['Source PDB'].append('PepBDB')
    data['DOI'].append('Not Found')
    count_id += 1

100%|██████████████| 13301/13301 [00:02<00:00, 6475.14it/s]


In [11]:
# THPdb
# Append every THPdb row to `data`, giving each record the next sequential LBQCPDB ID.
thpdb = dfs['THPdb']
for i in tqdm(range(len(thpdb))):
    # Positional access: the loop counts positions, so it must not depend on
    # the frame's index labels happening to be 0..n-1.
    line = thpdb.iloc[i]
    # Six-digit, zero-padded running number (e.g. LBQCPDB_000001).
    data['LBQCPDB ID'].append('LBQCPDB_{:06d}'.format(count_id))
    data['Source ID'].append(line['ID'])
    data['Sequence'].append(line['Sequence'])
    # Neither activity nor DOI is read for this database; placeholders are stored.
    data['Activity'].append('Not Found')
    data['Source PDB'].append('THPdb')
    data['DOI'].append('Not Found')
    count_id += 1

100%|██████████████████| 852/852 [00:00<00:00, 6982.35it/s]


In [12]:
# Build the merged polars DataFrame from the column lists accumulated in `data`.
# NOTE(review): `nan_to_null` is requested so NaN values become nulls; confirm it
# takes effect for plain Python lists in the polars version in use.
df = pl.DataFrame(data=data, nan_to_null=True)
# Alternative: reload a previously exported database instead of rebuilding it.
#df = pl.read_csv(f'{path}/LBQCPDB.csv')

In [13]:
# Count of Peptides by Database
# Print, for each source database, how many rows of the merged frame came from it.
for source_name in pdbs:
    from_source = df.filter(pl.col('Source PDB') == source_name)
    print(source_name, from_source.height)

APD3 3579
CancerPPD 5068
DBBASP 20420
FermFooDb 2205
MAHMI 276400
PepBDB 13301
THPdb 852


In [14]:
# Substitution table handed to `utils.multi_replace` by `fix_activities`
# (search text -> replacement). The entries are kept in their original order
# in case the helper applies them sequentially -- TODO confirm in utils.py.
replaces = dict([
    (' & Gram-', ', Anti-Gram-'),
    (';', ','),
    ('and ', ','),
    ('CasoxinA', 'Casoxin A'),
    (',,', ','),
    ('Anatgonist', 'Antagonist'),
    ('Antimicrobial', 'Anti-microbial'),
    ('Antihypertensive', 'Anti-hypertensive'),
])

# Every distinct activity name seen so far; filled as a side effect of `fix_activities`.
geral_atv = list()

def fix_activities(activity):
    """Normalize one raw activity string into a list of activity names.

    The text is first passed through `utils.multi_replace` together with the
    module-level `replaces` table (presumably applying each substitution --
    confirm in utils.py). The result is then handled in one of two ways:

    * If it contains 'Attenuate' or 'against', it is treated as a single
      free-text activity: its first character is upper-cased, every ',' in
      the remainder becomes ', ', and it is returned as a one-element list.
    * Otherwise it is split on ',', each piece is stripped, empty pieces are
      dropped, and the first character of each piece is upper-cased.

    Side effect: every activity name produced that is not yet present in the
    module-level `geral_atv` list is appended to it.

    Args:
        activity: Raw activity text of one record.

    Returns:
        list[str]: The normalized activity names, in order of appearance.
    """
    # Call function 'multi_replace' from utils.py
    atv = utils.multi_replace(activity, replaces)

    # Resolve activities described with 'Attenuate ...' / '... against ...'.
    # The check must look at the text being processed (`atv`), not at a
    # leftover global from an earlier cell, and the phrase is returned as one
    # list element: `list(atv)` would break it into single characters.
    if 'Attenuate' in atv or 'against' in atv:
        atv = atv[0].upper() + atv[1:].replace(',', ', ')
        if atv not in geral_atv:
            geral_atv.append(atv)
        return [atv]

    # Resolve other activities
    list_atv = []
    for line_atv in atv.split(','):
        line_atv = line_atv.strip()
        if line_atv:
            # Upper-case only the first character; the rest is left as written.
            line_atv = line_atv[0].upper() + line_atv[1:]
            list_atv.append(line_atv)
            if line_atv not in geral_atv:
                geral_atv.append(line_atv)

    return list_atv

In [15]:
# Fix Activity column in DataFrame
# Run the raw Activity column through `fix_activities`, then put the resulting
# list column back on `df` under the same name. The result of `replace` is not
# reassigned; it is the cell's displayed output.
activities = df['Activity'].apply(fix_activities)
df.replace('Activity', activities)

LBQCPDB ID,Source ID,Sequence,Activity,Source PDB,DOI
str,str,str,list[str],str,str
"""LBQCPDB_000001...","""AP00001""","""GLWSKIKEVGKEAA...","[""Anti-Gram+"", ""Anti-Gram-"", ... ""Anticancer""]","""APD3""","""Not Found"""
"""LBQCPDB_000002...","""AP00002""","""YVPLPNVPQPGRRP...","[""Anti-Gram+"", ""Anti-Gram-""]","""APD3""","""Not Found"""
"""LBQCPDB_000003...","""AP00003""","""DGVKLCDVPSGTWS...","[""Antifungal""]","""APD3""","""Not Found"""
"""LBQCPDB_000004...","""AP00004""","""NLCERASLTWTGNC...","[""Anti-Gram+"", ""Antifungal""]","""APD3""","""Not Found"""
"""LBQCPDB_000005...","""AP00005""","""VFIDILDKVENAIH...","[""Anti-Gram+""]","""APD3""","""Not Found"""
"""LBQCPDB_000006...","""AP00006""","""GNNRPVYIPQPRPP...","[""Anti-Gram+"", ""Anti-Gram-"", ""Anti-sepsis""]","""APD3""","""Not Found"""
"""LBQCPDB_000007...","""AP00007""","""GNNRPVYIPQPRPP...","[""Anti-Gram-""]","""APD3""","""Not Found"""
"""LBQCPDB_000008...","""AP00008""","""RLCRIVVIRVCR""","[""Anti-Gram+"", ""Anti-Gram-"", ... ""Wound healing""]","""APD3""","""Not Found"""
"""LBQCPDB_000009...","""AP00009""","""RFRPPIRRPPIRPP...","[""Anti-Gram-""]","""APD3""","""Not Found"""
"""LBQCPDB_000010...","""AP00010""","""RRIRPRPPRLPRPR...","[""Anti-Gram-"", ""Chemotactic"", ""Anti-sepsis""]","""APD3""","""Not Found"""


In [16]:
# Sorted list of every distinct activity name collected by `fix_activities`.
geral_atv = sorted(geral_atv)

# One zero-initialised counter array per source database, aligned row by row
# with `geral_atv`. The columns follow `pdbs` so the database list is not
# written out a second time here.
data_atv = {'Activity': geral_atv}
for pdb in pdbs:
    data_atv[pdb] = np.full(len(geral_atv), 0)

# Row position of each activity, built once so the counting loop does not run
# a linear `list.index` search for every occurrence. `setdefault` keeps the
# first position of a name, matching what `list.index` returns.
atv_position = {}
for pos, atv in enumerate(geral_atv):
    atv_position.setdefault(atv, pos)

# Count, per source database, how many records list each activity.
for row in df.rows(named=True):
    # Records whose Activity is null contribute nothing.
    if row['Activity'] is not None:
        for atv in row['Activity']:
            data_atv[row['Source PDB']][atv_position[atv]] += 1

In [17]:
# Every column except the activity name holds a per-database count.
count_columns = pl.all().exclude('Activity')

# Row-wise sum of those counts: start from 0 and add one column at a time.
row_total = pl.fold(0, lambda running, column: running + column, count_columns)

# Activity x database count table, with the row-wise sum added as 'Total'.
df_atv = pl.DataFrame(data_atv).with_columns(row_total.alias('Total'))

In [18]:
# Show up to 100 characters of each string value in rendered tables.
pl.Config.set_fmt_str_lengths(100)

# Rank activities by overall count and display the first 20 rows.
ranked_activities = df_atv.sort(by='Total', descending=True)
ranked_activities.head(20)

Activity,APD3,CancerPPD,DBBASP,FermFooDb,MAHMI,PepBDB,THPdb,Total
str,i32,i32,i32,i32,i32,i32,i32,i32
"""Immunomodulatory""",0,0,0,47,183159,0,0,183206
"""Anti-inflammatory""",40,0,0,1,72258,0,0,72299
"""Not Found""",0,0,20420,0,0,13301,852,34573
"""Migration/Adhesion""",0,0,0,0,15131,0,0,15131
"""Cytotoxic""",0,0,0,0,5852,0,0,5852
"""Anticancer""",271,2977,0,0,0,0,0,3248
"""Anti-Gram+""",2650,0,0,0,0,0,0,2650
"""Anti-Gram-""",2417,0,0,0,0,0,0,2417
"""Antifungal""",1283,303,0,0,0,0,0,1586
"""Anti-microbial""",0,1329,0,25,0,0,0,1354


In [23]:
# Date stamp (YYYY_MM_DD) shared by both dated release files; computed once so
# the two file names cannot disagree.
dt = datetime.now()
release_stamp = dt.strftime("%Y_%m_%d")

# Make sure the releases folder exists; otherwise the dated exports below fail
# on a fresh checkout.
releases_dir = f'{path}/PDBs/LBQCPDB_releases'
os.makedirs(releases_dir, exist_ok=True)

# Convert to pandas DataFrame
# NOTE(review): presumably done because 'Activity' is a list column that the
# polars CSV writer does not handle -- confirm.
pd_df = df.to_pandas()

# Export csv from database: a dated release copy plus the "latest" file.
pd_df.to_csv(f'{releases_dir}/LBQCPDB_{release_stamp}.csv', index=False)
pd_df.to_csv(f'{path}/LBQCPDB.csv', index=False)

# Export csv from activities analysis: a dated release copy plus the "latest" file.
df_atv.write_csv(f'{releases_dir}/LBQCPDB_activities_analysis_{release_stamp}.csv')
df_atv.write_csv(f'{path}/LBQCPDB_activities_analysis.csv')