In [11]:
import polars as pl
import numpy as np
import utils
import os

In [2]:
# Load the peptide table from the CSV located in the current working directory
path = os.getcwd()
csv_file = f'{path}/LBQCPDB.csv'
df = pl.read_csv(csv_file)

In [35]:
# Summary statistics (count, null count, min, max, ...) for every column
df.describe()

describe,LBQCPDB ID,Source ID,Sequence,Activity,Source PDB,DOI
str,str,str,str,str,str,str
"""count""","""321825""","""321825""","""321825""","""321825""","""321825""","""321825"""
"""null_count""","""0""","""297672""","""115""","""1147""","""0""","""0"""
"""mean""",,,,,,
"""std""",,,,,,
"""min""","""LBQCPDB_000001...","""148l_S""",""" GAKKGAKKGKKGA...","""Ace-inhibitory...","""APD3""","""Not Found"""
"""max""","""LBQCPDB_321825...","""Not Found""","""yslqmGataikqvk...","""Red blood cell...","""THPdb""","""https://doi.or..."
"""median""",,,,,,


In [41]:
# Count of Peptides by Database (groups kept in order of first appearance)
peptides_per_db = pl.col('Source PDB').count().alias('Number of Peptides')
df.groupby('Source PDB', maintain_order=True).agg(peptides_per_db)

Source PDB,Number of Peptides
str,u32
"""APD3""",3579
"""CancerPPD""",5068
"""DBBASP""",20420
"""FermFooDb""",2205
"""MAHMI""",276400
"""PepBDB""",13301
"""THPdb""",852


In [3]:
# Analysis peptides sequences: collect every peptide whose sequence needs attention
EMPTY_MARKERS = ['', None, 'Not Found', 'NA', 'NaN', 'N.A', 'N.A.']


def _sequence_status(sequence):
    """Return the flag for a peptide sequence, or None when nothing is flagged.

    Checks run in priority order, so a sequence only gets the first flag
    that applies to it.
    """
    # Missing sequence or one of the known "no data" placeholders
    if sequence in EMPTY_MARKERS:
        return 'Empty'
    # Anything other than letters (digits, spaces, punctuation, ...)
    if not sequence.isalpha():
        return 'Not Alpha'
    n_chars = len(sequence)
    # 70 characters or more
    if n_chars >= 70:
        return 'More Than 70'
    # 40 to 69 characters
    if n_chars >= 40:
        return 'More Than 40'
    return None


# Output column -> column of df it is copied from
copied_columns = {
    'ID': 'LBQCPDB ID',
    'Source ID': 'Source ID',
    'Sequence': 'Sequence',
    'Activity': 'Activity',
    'Source PDB': 'Source PDB',
}

data_seq = {
    name: []
    for name in ('ID', 'Source ID', 'Status', 'Sequence', 'Activity', 'Source PDB')
}

for record in df.select(pl.exclude(['DOI'])).rows(named=True):
    status = _sequence_status(record['Sequence'])
    # Unflagged peptides are left out of the report
    if status is None:
        continue

    data_seq['Status'].append(status)
    for out_col, src_col in copied_columns.items():
        data_seq[out_col].append(record[src_col])

In [4]:
# Build the flagged-sequence table and count how many peptides received each flag
df_seqs = pl.DataFrame(data_seq)
peptides_per_status = pl.col('Status').count().alias('Number of Peptides')
df_seqs.groupby('Status', maintain_order=True).agg(peptides_per_status)

Status,Number of Peptides
str,u32
"""More Than 40""",13739
"""More Than 70""",676
"""Not Alpha""",1184
"""Empty""",171


In [12]:
# Export the flagged-sequence table through the project's utils helper.
# Per this cell's printed output, it writes a dated copy under releases/
# and an undated copy in the working directory.
utils.pl_to_csv(df_seqs, path, 'LBQCPDB_sequences_analysis')

Save files in:
D:\WilliamJSS\Projects\LBQC\LBQC-PDB/releases/LBQCPDB_sequences_analysis_2023_03_12.csv
D:\WilliamJSS\Projects\LBQC\LBQC-PDB/LBQCPDB_sequences_analysis.csv


In [9]:
# Let string cells display up to 100 characters before being truncated
pl.Config.set_fmt_str_lengths(100)

# Show only the peptides flagged for containing non-letter characters
is_not_alpha = pl.col('Status') == 'Not Alpha'
df_seqs.filter(is_not_alpha)

ID,Source ID,Status,Sequence,Activity,Source PDB
str,str,str,str,str,str
"""LBQCPDB_003580""","""Not Found""","""Not Alpha""","""(K-Aib-C(CH2CO-2'-Pac))2""","""Anticancer""","""CancerPPD"""
"""LBQCPDB_003581""","""Not Found""","""Not Alpha""","""(K-Aib-C(CH2CO-2'-Pac))3""","""Anticancer""","""CancerPPD"""
"""LBQCPDB_003582""","""Not Found""","""Not Alpha""","""(K-Aib-C(CH2CO-2'-Pac))3""","""Anticancer""","""CancerPPD"""
"""LBQCPDB_003583""","""Not Found""","""Not Alpha""","""(K-Aib-C(CH2CO-2'-Pac))3""","""Anticancer""","""CancerPPD"""
"""LBQCPDB_003584""","""Not Found""","""Not Alpha""","""(K-Aib-C(CH2CO-2'-Pac))3""","""Anticancer""","""CancerPPD"""
"""LBQCPDB_003585""","""Not Found""","""Not Alpha""","""(K-Aib-C(CH2CO-2'-Pac))4""","""Anticancer""","""CancerPPD"""
"""LBQCPDB_003586""","""Not Found""","""Not Alpha""","""(K-Aib-C(CH2CO-2'-Pac))4""","""Anticancer""","""CancerPPD"""
"""LBQCPDB_003587""","""Not Found""","""Not Alpha""","""(K-Aib-C(CH2CO-2'-Pac))4""","""Anticancer""","""CancerPPD"""
"""LBQCPDB_003588""","""Not Found""","""Not Alpha""","""(K-Aib-C(CH2CO-2'-Pac))4""","""Anticancer""","""CancerPPD"""
"""LBQCPDB_003589""","""Not Found""","""Not Alpha""","""2-[[2-(dimethylamino)-3-methylbutanoyl]amino]-N- [3-methoxy-1-[2-[1-methoxy-2-methyl-3-oxo-3-[[2-p...","""Antineoplastic""","""CancerPPD"""


In [62]:
# Save unique activities in list, in order of first appearance.
# A dict serves as an insertion-ordered set, so each membership check is a
# hash lookup instead of a scan over the growing list.
seen_atvs = {}

for activity in df.get_column('Activity').to_list():
    # Rows with a null activity contribute no labels
    if activity is None:
        continue
    # A peptide may carry several activities separated by '|'
    for atv in activity.split('|'):
        seen_atvs.setdefault(atv, None)

atvs = list(seen_atvs)

In [63]:
# Analysis peptides activities: count, per source database, how many peptides
# carry each activity label.
source_dbs = ['APD3', 'CancerPPD', 'DBBASP', 'FermFooDb', 'MAHMI', 'PepBDB', 'THPdb']

# One zero-initialised counter array per database, aligned with `atvs`
data_atv = {'Activity': atvs}
data_atv.update({db: np.full(len(atvs), 0) for db in source_dbs})

# Position of every activity in `atvs`, so each lookup is a hash access rather
# than a list scan (`atvs` holds unique labels, so this matches list.index)
atv_pos = {atv: pos for pos, atv in enumerate(atvs)}

# Only the two columns that are actually used are materialised as Python rows
for activity, source in df.select(['Activity', 'Source PDB']).rows():
    # Rows with a null activity contribute nothing
    if activity is None:
        continue
    # A peptide may carry several activities separated by '|'
    for atv in activity.split('|'):
        data_atv[source][atv_pos[atv]] += 1

In [64]:
# Turn the counters into a table and append a row-wise sum over all database columns
database_columns = pl.all().exclude('Activity')

df_atv = pl.DataFrame(data_atv).with_columns(
    pl.fold(0, lambda running, column: running + column, database_columns).alias('Total')
)

In [77]:
# Activities with more than 1000 peptides overall, most frequent first
is_frequent = pl.col('Total') > 1000
df_atv.filter(is_frequent).sort(by='Total', descending=True)

Activity,APD3,CancerPPD,DBBASP,FermFooDb,MAHMI,PepBDB,THPdb,Total
str,i32,i32,i32,i32,i32,i32,i32,i32
"""Immunomodulatory""",0,0,0,47,183159,0,0,183206
"""Anti-inflammatory""",40,0,0,1,72258,0,0,72299
"""Not Found""",0,0,20420,0,0,13301,852,34573
"""Migration/Adhesion""",0,0,0,0,15131,0,0,15131
"""Cytotoxic""",0,0,0,0,5852,0,0,5852
"""Anticancer""",271,2977,0,0,0,0,0,3248
"""Anti-Gram+""",2650,0,0,0,0,0,0,2650
"""Anti-Gram-""",2417,0,0,0,0,0,0,2417
"""Antifungal""",1283,303,0,0,0,0,0,1586
"""Anti-microbial""",0,1329,0,25,0,0,0,1354


In [75]:
# Peptides with the most activities: count the '|'-separated labels per peptide
# and show the ten highest counts
n_activities = (
    pl.col('Activity')
    .apply(lambda atv: len(atv.split('|')))
    .alias('N. Activities')
)
df.with_columns(n_activities).sort(by='N. Activities', descending=True).limit(10)

LBQCPDB ID,Source ID,Sequence,Activity,Source PDB,DOI,N. Activities
str,str,str,str,str,str,i64
"""LBQCPDB_000310""","""AP00310""","""LLGDFFRKSKEKIGKEFKRIVQRIKDFLRNLVPRTES""","""Anti-Gram+|Anti-Gram-|Antiviral|Antifungal|Candidacidal|Antiparasitic|Spermicidal|Anti-HIV|Chemotac...","""APD3""","""https://doi.org/10.1111/j.1432-1033.1996.0325z.x""",18
"""LBQCPDB_000144""","""AP00144""","""GIGKFLHSAKKFGKAFVGEIMNS""","""Anti-Gram+|Anti-Gram-|Antiviral|Antifungal|Candidacidal|Antiparasitic|Insecticidal|Spermicidal|Anti...","""APD3""","""Not Found""",13
"""LBQCPDB_000176""","""AP00176""","""ACYCRIPACIAGERRYGTCIYQGRLWAFCC""","""Anti-Gram+|Anti-Gram-|Antiviral|Antifungal|Antiparasitic|Anti-HIV|Chemotactic|Anti-MRSA|Anti-toxin|...","""APD3""","""Not Found""",13
"""LBQCPDB_000283""","""AP00283""","""GIINTLQKYYCRVRGGRCAVLSCLPKEEQIGKCSTRGRKCCRRKK""","""Anti-Gram+|Anti-Gram-|Antiviral|Antifungal|Anti-HIV|Chemotactic|Anti-MRSA|Anti-toxin|Synergistic AM...","""APD3""","""Not Found""",12
"""LBQCPDB_000524""","""AP00524""","""GIGDPVTCLKSGAICHPVFCPRRYKQIGTCGLPGTKCCKKP""","""Anti-Gram+|Anti-Gram-|Antiviral|Antifungal|Candidacidal|Anti-HIV|Chemotactic|Anti-toxin|Channel inh...","""APD3""","""Not Found""",12
"""LBQCPDB_000146""","""AP00146""","""GIGAVLKVLTTGLPALISWIKRKRQQ""","""Anti-Gram+|Anti-Gram-|Antiviral|Antifungal|Candidacidal|Antiparasitic|Insecticidal|Anti-HIV|Anti-se...","""APD3""","""Not Found""",11
"""LBQCPDB_000160""","""AP00160""","""ALWMTLLKKVLKAAAKAALNAVLVGANA""","""Anti-Gram+|Anti-Gram-|Antiviral|Antifungal|Candidacidal|Antiparasitic|Spermicidal|Anti-HIV|Anti-sep...","""APD3""","""Not Found""",11
"""LBQCPDB_000366""","""AP00366""","""GRFKRFRKKFKKLFKKLSPVIPLLHLG""","""Anti-Gram+|Anti-Gram-|Antiviral|Antifungal|Candidacidal|Antiparasitic|Anti-HIV|Anti-MRSA|Hemolytic|...","""APD3""","""Not Found""",11
"""LBQCPDB_001161""","""AP01161""","""GRDYRTCLTIVQKLKKMVDKPTQRSVSNAATRVCRTGRSRWRDVCRNFMRRYQSRVTQGLVAGETAQQICEDLR""","""Anti-Gram+|Anti-Gram-|Antifungal|Candidacidal|Antiparasitic|Chemotactic|Anti-TB|Anti-sepsis|Synergi...","""APD3""","""Not Found""",11
"""LBQCPDB_000150""","""AP00150""","""ILPWKWPWWPWRR""","""Anti-Gram+|Anti-Gram-|Antiviral|Antifungal|Anti-HIV|Anti-MRSA|Hemolytic|Antibiofilm|Wound healing|A...","""APD3""","""Not Found""",10
