In [1]:
import polars as pl
import numpy as np
import utils
import os

In [2]:
# Load the peptide database CSV that sits in the current working directory
path = os.getcwd()
csv_file = f'{path}/LBQCPDB.csv'
df = pl.read_csv(csv_file)

In [3]:
# Summary statistics for every column. All columns are read as strings, so
# the numeric rows (mean / std / median) are empty; the useful rows here are
# count, null_count, min and max.
df.describe()

describe,LBQCPDB ID,Source ID,Sequence,Activity,Source PDB,DOI
str,str,str,str,str,str,str
"""count""","""321928""","""321928""","""321928""","""321928""","""321928""","""321928"""
"""null_count""","""0""","""277252""","""115""","""1147""","""0""","""0"""
"""mean""",,,,,,
"""std""",,,,,,
"""min""","""LBQCPDB_000001...","""148l_S""",""" GAKKGAKKGKKGA...","""Ace-inhibitory...","""APD3""","""Not Found"""
"""max""","""LBQCPDB_321928...","""Not Found""","""yslqmGataikqvk...","""calmodulin Ana...","""THPdb""","""https://doi.or..."
"""median""",,,,,,


In [4]:
# Number of peptides contributed by each source database
# (maintain_order=True keeps the groups in order of first appearance)
peptide_count = pl.col('Source PDB').count().alias('Number of Peptides')

df.groupby('Source PDB', maintain_order=True).agg(peptide_count)

Source PDB,Number of Peptides
str,u32
"""APD3""",3579
"""CancerPPD""",5068
"""DBBASP""",20523
"""FermFooDb""",2205
"""MAHMI""",276400
"""PepBDB""",13301
"""THPdb""",852


In [5]:
# Collect every distinct activity label, in order of first appearance.
# Each 'Activity' cell may hold several labels separated by '|'; null cells
# are skipped.  dict.fromkeys de-duplicates while preserving insertion order,
# which avoids the linear `atv in atvs` scan for every label.
atvs = list(dict.fromkeys(
    atv
    for activity in df.get_column('Activity').to_list()
    if activity is not None
    for atv in activity.split('|')
))

In [6]:
# Count, for each source database, how many peptides carry each activity.
# `data_atv` has one 'Activity' column (the labels in `atvs`) plus one integer
# counter array per database, all aligned on the position of the label.
databases = ['APD3', 'CancerPPD', 'DBBASP', 'FermFooDb', 'MAHMI', 'PepBDB', 'THPdb']

data_atv = {'Activity': atvs}
for database in databases:
    data_atv[database] = np.full(len(atvs), 0)

# Resolve label -> row position once instead of calling list.index() for
# every label of every peptide.  setdefault keeps the first position of a
# label, matching what list.index() would return.
atv_pos = {}
for pos, atv in enumerate(atvs):
    atv_pos.setdefault(atv, pos)

# Only the two columns that are actually used are materialised as rows.
for activity, source in df.select(['Activity', 'Source PDB']).rows():
    if activity is not None:
        for atv in activity.split('|'):
            data_atv[source][atv_pos[atv]] += 1

In [7]:
# Turn the per-database counters into a table and append a 'Total' column
# holding the row-wise sum over every column except 'Activity'.
total_expr = pl.fold(
    0,
    lambda acc, value: acc + value,
    pl.all().exclude('Activity'),
).alias('Total')

df_atv = pl.DataFrame(data_atv).with_columns(total_expr)

In [8]:
# Activities whose total count exceeds 1000, largest total first
(
    df_atv
    .filter(pl.col('Total') > 1000)
    .sort(by='Total', descending=True)
)

Activity,APD3,CancerPPD,DBBASP,FermFooDb,MAHMI,PepBDB,THPdb,Total
str,i64,i64,i64,i64,i64,i64,i64,i64
"""Immunomodulato...",0,0,0,10,183159,0,0,183169
"""Anti-inflammat...",0,0,0,1,72258,0,0,72259
"""Not Found""",0,0,20523,0,0,13301,852,34676
"""Migration/Adhe...",0,0,0,0,15131,0,0,15131
"""Cytotoxic""",0,0,0,0,5852,0,0,5852
"""Anticancer""",31,2726,0,0,0,0,0,2757


In [9]:
# The 10 peptides with the most activities
# NOTE(review): the displayed result reports 1 activity even for rows whose
# text contains ', ' or ' & ' -- confirm that '|' is really the delimiter
# used in the 'Activity' column.
def _n_activities(atv):
    """Return the number of '|'-separated entries in an activity string."""
    return len(atv.split('|'))


(
    df
    .with_columns(pl.col('Activity').apply(_n_activities).alias('N. Activities'))
    .sort(by='N. Activities', descending=True)
    .limit(10)
)

LBQCPDB ID,Source ID,Sequence,Activity,Source PDB,DOI,N. Activities
str,str,str,str,str,str,i64
"""LBQCPDB_000001...","""AP00001""","""GLWSKIKEVGKEAA...","""Anti-Gram+ & G...","""APD3""","""Not Found""",1
"""LBQCPDB_000002...","""AP00002""","""YVPLPNVPQPGRRP...","""Anti-Gram+ & G...","""APD3""","""Not Found""",1
"""LBQCPDB_000003...","""AP00003""","""DGVKLCDVPSGTWS...","""Antifungal""","""APD3""","""Not Found""",1
"""LBQCPDB_000004...","""AP00004""","""NLCERASLTWTGNC...","""Anti-Gram+, An...","""APD3""","""Not Found""",1
"""LBQCPDB_000005...","""AP00005""","""VFIDILDKVENAIH...","""Anti-Gram+""","""APD3""","""Not Found""",1
"""LBQCPDB_000006...","""AP00006""","""GNNRPVYIPQPRPP...","""Anti-Gram+ & G...","""APD3""","""Not Found""",1
"""LBQCPDB_000007...","""AP00007""","""GNNRPVYIPQPRPP...","""Anti-Gram-""","""APD3""","""Not Found""",1
"""LBQCPDB_000008...","""AP00008""","""RLCRIVVIRVCR""","""Anti-Gram+ & G...","""APD3""","""Not Found""",1
"""LBQCPDB_000009...","""AP00009""","""RFRPPIRRPPIRPP...","""Anti-Gram-""","""APD3""","""Not Found""",1
"""LBQCPDB_000010...","""AP00010""","""RRIRPRPPRLPRPR...","""Anti-Gram-, Ch...","""APD3""","""Not Found""",1
