In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import polars as pl
import utils
import os

In [2]:
# Base directory for all inputs and outputs: the notebook's working directory.
path = os.getcwd()

# Names of the source peptide databases; each one is expected as
# '<name>.csv' inside the PDBs_releases folder.
pdbs = [
    'APD3',
    'CancerPPD',
    'DBBASP',
    'FermFooDb',
    'MAHMI',
    'PepBDB',
    'THPdb',
]

In [3]:
# Read every source database release into a pandas DataFrame, keyed by its name.
dfs = {
    pdb: pd.read_csv(f'{path}/PDBs_releases/{pdb}.csv')
    for pdb in pdbs
}

In [4]:
# Running counter used to number the unified LBQCPDB identifiers (starts at 1).
count_id = 1

# Column-oriented accumulator for the merged table: one independent list per
# output column, filled in parallel by the per-database loops.
data = {
    column: []
    for column in (
        'LBQCPDB ID',
        'Source ID',
        'Sequence',
        'Activity',
        'Source PDB',
        'DOI',
    )
}

In [5]:
# APD3: append every APD3 record to the unified column lists.
apd3 = dfs['APD3']
for i in tqdm(range(len(apd3))):
    # range(len(...)) yields row positions, so fetch the row positionally;
    # on the default index produced by read_csv this is the same row .loc gave.
    line = apd3.iloc[i]
    # Six-digit zero-padded running identifier, e.g. LBQCPDB_000001.
    data['LBQCPDB ID'].append('LBQCPDB_{:06d}'.format(count_id))
    data['Source ID'].append(line['APD ID'])
    data['Sequence'].append(line['Sequence'])
    data['Activity'].append(line['Activity'])
    data['Source PDB'].append('APD3')
    data['DOI'].append(line['DOI'])
    count_id += 1

100%|██████████| 3579/3579 [00:00<00:00, 7598.12it/s]


In [6]:
# CancerPPD: append every CancerPPD record to the unified column lists.
cancerppd = dfs['CancerPPD']
for i in tqdm(range(len(cancerppd))):
    # range(len(...)) yields row positions, so fetch the row positionally;
    # on the default index produced by read_csv this is the same row .loc gave.
    line = cancerppd.iloc[i]
    # Six-digit zero-padded running identifier, e.g. LBQCPDB_000001.
    data['LBQCPDB ID'].append('LBQCPDB_{:06d}'.format(count_id))
    data['Source ID'].append(line['CancerPPD ID'])
    data['Sequence'].append(line['SEQUENCE'])
    # The unified 'Activity' column is taken from this source's 'NATURE' column.
    data['Activity'].append(line['NATURE'])
    data['Source PDB'].append('CancerPPD')
    data['DOI'].append(line['DOI'])
    count_id += 1

100%|██████████| 5288/5288 [00:00<00:00, 14125.41it/s]


In [7]:
# DBBASP: append every DBBASP record to the unified column lists.
dbbasp = dfs['DBBASP']
for i in tqdm(range(len(dbbasp))):
    # range(len(...)) yields row positions, so fetch the row positionally;
    # on the default index produced by read_csv this is the same row .loc gave.
    line = dbbasp.iloc[i]
    # Six-digit zero-padded running identifier, e.g. LBQCPDB_000001.
    data['LBQCPDB ID'].append('LBQCPDB_{:06d}'.format(count_id))
    data['Source ID'].append(line['ID'])
    data['Sequence'].append(line['SEQUENCE'])
    # No activity is read from this source; the placeholder is stored instead.
    data['Activity'].append('Not Found')
    data['Source PDB'].append('DBBASP')
    # NOTE(review): a per-row line['DOI'] lookup was disabled here in favour of
    # the placeholder; presumably the release CSV has no usable DOI column - confirm.
    data['DOI'].append('Not Found')
    count_id += 1

100%|██████████| 20523/20523 [00:01<00:00, 15865.75it/s]


In [8]:
# FermFooDb: append every FermFooDb record to the unified column lists.
fermfoodb = dfs['FermFooDb']
for i in tqdm(range(len(fermfoodb))):
    # range(len(...)) yields row positions, so fetch the row positionally;
    # on the default index produced by read_csv this is the same row .loc gave.
    line = fermfoodb.iloc[i]
    # Six-digit zero-padded running identifier, e.g. LBQCPDB_000001.
    data['LBQCPDB ID'].append('LBQCPDB_{:06d}'.format(count_id))
    # The source ID is the text between the first and second '=' of 'Link'.
    data['Source ID'].append(line['Link'].split('=')[1])
    data['Sequence'].append(line['Peptide Sequence'])
    data['Activity'].append(line['Activity'])
    data['Source PDB'].append('FermFooDb')
    data['DOI'].append(line['DOI'])
    count_id += 1

100%|██████████| 2205/2205 [00:00<00:00, 7695.11it/s]


In [9]:
# MAHMI: append every MAHMI record to the unified column lists.
mahmi = dfs['MAHMI']
for i in tqdm(range(len(mahmi))):
    # range(len(...)) yields row positions, so fetch the row positionally;
    # on the default index produced by read_csv this is the same row .loc gave.
    line = mahmi.iloc[i]
    # Six-digit zero-padded running identifier, e.g. LBQCPDB_000001.
    data['LBQCPDB ID'].append('LBQCPDB_{:06d}'.format(count_id))
    data['Source ID'].append(line['ID'])
    data['Sequence'].append(line['SEQUENCE'])
    # The unified 'Activity' column is taken from this source's 'BIOACTIVITY' column.
    data['Activity'].append(line['BIOACTIVITY'])
    data['Source PDB'].append('MAHMI')
    # No DOI is read from this source; the placeholder is stored instead.
    data['DOI'].append('Not Found')
    count_id += 1

100%|██████████| 276400/276400 [00:30<00:00, 9168.05it/s]


In [10]:
# PepBDB: append every PepBDB record to the unified column lists.
pepbdb = dfs['PepBDB']
for i in tqdm(range(len(pepbdb))):
    # range(len(...)) yields row positions, so fetch the row positionally;
    # on the default index produced by read_csv this is the same row .loc gave.
    line = pepbdb.iloc[i]
    # Six-digit zero-padded running identifier, e.g. LBQCPDB_000001.
    data['LBQCPDB ID'].append('LBQCPDB_{:06d}'.format(count_id))
    data['Source ID'].append(line['Peptide ID'])
    data['Sequence'].append(line['Sequence'])
    # Neither activity nor DOI is read from this source; placeholders are stored.
    data['Activity'].append('Not Found')
    data['Source PDB'].append('PepBDB')
    data['DOI'].append('Not Found')
    count_id += 1

100%|██████████| 13301/13301 [00:01<00:00, 8756.36it/s]


In [11]:
# THPdb: append every THPdb record to the unified column lists.
thpdb = dfs['THPdb']
for i in tqdm(range(len(thpdb))):
    # range(len(...)) yields row positions, so fetch the row positionally;
    # on the default index produced by read_csv this is the same row .loc gave.
    line = thpdb.iloc[i]
    # Six-digit zero-padded running identifier, e.g. LBQCPDB_000001.
    data['LBQCPDB ID'].append('LBQCPDB_{:06d}'.format(count_id))
    data['Source ID'].append(line['ID'])
    data['Sequence'].append(line['Sequence'])
    # Neither activity nor DOI is read from this source; placeholders are stored.
    data['Activity'].append('Not Found')
    data['Source PDB'].append('THPdb')
    data['DOI'].append('Not Found')
    count_id += 1

100%|██████████| 852/852 [00:00<00:00, 8989.15it/s]


In [12]:
# Build the unified polars DataFrame from the accumulated column lists.
# NOTE(review): polars documents nan_to_null as acting on numpy-array input
# only; `data` holds plain Python lists, so confirm NaN values really end up
# as nulls here.
df = pl.DataFrame(data, nan_to_null=True)

In [13]:
# Substring substitutions (old -> new) handed to utils.multi_replace by
# fix_activities for every raw activity string.
# NOTE(review): presumably applied in insertion order (e.g. ';' -> ',' before
# ',,' -> ','); confirm against utils.multi_replace before reordering entries.
replaces = {
    ' & Gram-': ', Anti-Gram-',
    ';': ',',
    'and ': ',',
    'CasoxinA': 'Casoxin A',
    ',,': ',',
    'Anatgonist': 'Antagonist',
    'Antimicrobial': 'Anti-microbial',
    'Antihypertensive': 'Anti-hypertensive'
}

def fix_activities(activity):
    """Normalise one raw activity annotation to the unified LBQCPDB format.

    The value is first passed through ``utils.multi_replace`` with the
    module-level ``replaces`` table, then formatted in one of three ways:

    * missing values and null markers ('', 'Not Found', 'NA', 'NaN', 'N.A',
      'N.A.') become ``'Not Found'``;
    * free-text descriptions (containing 'Attenuate' or 'against') are kept
      as a single sentence with ', ' between comma-separated parts and the
      first character upper-cased;
    * everything else is treated as a comma-separated list whose items are
      stripped, given an upper-case first character and joined with '|'.

    Args:
        activity: raw activity string, or None / NaN for a missing value.

    Returns:
        str: the normalised activity string; ``'Not Found'`` when no usable
        activity remains.
    """
    null_markers = ('', 'Not Found', 'NA', 'NaN', 'N.A', 'N.A.')

    # Missing values must be handled before any string processing.
    # (NaN is the only value that is not equal to itself.)
    if activity is None or activity != activity:
        return 'Not Found'

    # Call function 'multi_replace' from utils.py
    line = utils.multi_replace(activity, replaces)

    # Resolve null activities
    if line is None or line.strip() in null_markers:
        return 'Not Found'

    # Split once; strip each part and drop empty ones (e.g. a trailing comma)
    # so indexing the first character below can never fail.
    parts = [part.strip() for part in line.split(',')]
    parts = [part for part in parts if part]
    if not parts:
        return 'Not Found'

    # Resolve activities described as free text (attenuate / "against ...").
    # Joining stripped parts avoids doubling the space after commas that
    # were already followed by one.
    if 'Attenuate' in line or 'against' in line:
        sentence = ', '.join(parts)
        return sentence[0].upper() + sentence[1:]

    # Resolve other activities: capitalise AFTER stripping, so items that
    # followed ", " are capitalised as well.
    list_atv = [atv[0].upper() + atv[1:] for atv in parts]

    return '|'.join(list_atv)

In [14]:
# Fix Activity column in DataFrame: run fix_activities on each value and
# replace the column with the result.
# NOTE(review): polars' Expr.apply skips null values by default, so null
# activities may never reach fix_activities' 'Not Found' branch - confirm.
df = df.with_columns(pl.col('Activity').apply(fix_activities))

In [15]:
# Write the merged table to CSV through the project helper utils.pl_to_csv.
# The recorded cell output lists two files: a dated copy under releases/
# and LBQCPDB.csv in the project directory.
utils.pl_to_csv(df, path, 'LBQCPDB')

Save files in:
/home/pcbio/Área de Trabalho/William/LBQC-PDB/releases/LBQCPDB_2023_03_14.csv
/home/pcbio/Área de Trabalho/William/LBQC-PDB/LBQCPDB.csv


In [16]:
# Display the merged DataFrame as this cell's output for a visual check.
df

LBQCPDB ID,Source ID,Sequence,Activity,Source PDB,DOI
str,str,str,str,str,str
"""LBQCPDB_000001...","""AP00001""","""GLWSKIKEVGKEAA...","""Anti-Gram+|Ant...","""APD3""","""Not Found"""
"""LBQCPDB_000002...","""AP00002""","""YVPLPNVPQPGRRP...","""Anti-Gram+|Ant...","""APD3""","""Not Found"""
"""LBQCPDB_000003...","""AP00003""","""DGVKLCDVPSGTWS...","""Antifungal""","""APD3""","""Not Found"""
"""LBQCPDB_000004...","""AP00004""","""NLCERASLTWTGNC...","""Anti-Gram+|Ant...","""APD3""","""Not Found"""
"""LBQCPDB_000005...","""AP00005""","""VFIDILDKVENAIH...","""Anti-Gram+""","""APD3""","""Not Found"""
"""LBQCPDB_000006...","""AP00006""","""GNNRPVYIPQPRPP...","""Anti-Gram+|Ant...","""APD3""","""Not Found"""
"""LBQCPDB_000007...","""AP00007""","""GNNRPVYIPQPRPP...","""Anti-Gram-""","""APD3""","""Not Found"""
"""LBQCPDB_000008...","""AP00008""","""RLCRIVVIRVCR""","""Anti-Gram+|Ant...","""APD3""","""Not Found"""
"""LBQCPDB_000009...","""AP00009""","""RFRPPIRRPPIRPP...","""Anti-Gram-""","""APD3""","""Not Found"""
"""LBQCPDB_000010...","""AP00010""","""RRIRPRPPRLPRPR...","""Anti-Gram-|Che...","""APD3""","""Not Found"""
