In [1]:
import polars as pl
import numpy as np
import utils
import os

In [2]:
# Load the peptide table from a CSV that sits in the current working directory.
# `path` is kept as a notebook-level name in case other cells rely on it.
path = os.getcwd()
csv_file = f'{path}/LBQCPDB.csv'
df = pl.read_csv(csv_file)

In [19]:
# Collect every peptide that fails a basic sanity check. Rows that pass all
# checks are skipped; each rejected row is stored in `data_seq` together with
# the reason for rejection in 'Status'.
data_seq = {
    'ID': [],
    'Source ID': [],
    'Status': [],
    'Sequence': [],
    'Activity': [],
    'Source PDB': []
}

# One-letter codes of the 20 standard amino acids.
amino_codes = 'ARNDCQEGHILKMFPSTWYV'

# Placeholder strings that stand for "no sequence available".
empty_markers = ['', 'Not Found', 'NA', 'NaN', 'N.A', 'N.A.']

for row in df.select(pl.exclude(['DOI'])).rows(named=True):
    raw_seq = row['Sequence']

    # Re-assign `seq` on every iteration. Previously it was only set for
    # non-null sequences, so a null row silently recorded the sequence of the
    # previous row (or raised NameError when the very first row was null).
    seq = None if raw_seq is None else raw_seq.strip()

    # Empty peptides. The check runs on the stripped value so placeholders
    # padded with blanks (e.g. ' NA ') are recognised as empty as well.
    if seq is None or seq in empty_markers:
        status = 'Empty'

    # Peptides with special chars
    elif not seq.isalpha():
        status = 'Not Alpha'

    # Peptides more than 40 chars
    elif len(seq) > 40:
        status = 'More Than 40'

    # Peptides with length equals 1
    elif len(seq) == 1:
        status = 'Length 1'

    # Peptides without any standard amino-acid code.
    # NOTE(review): this only flags sequences in which *no* character is a
    # standard residue (e.g. 'XX'); sequences mixing valid and invalid letters
    # pass. Confirm whether `all(...)` was intended.
    elif not any(code in amino_codes for code in seq.upper()):
        status = 'Not aminocode'

    # Sequence passed every check: nothing to record.
    else:
        continue

    data_seq['Status'].append(status)
    data_seq['ID'].append(row['LBQCPDB ID'])
    data_seq['Sequence'].append(seq)  # None when the source value was null
    data_seq['Source PDB'].append(row['Source PDB'])
    data_seq['Source ID'].append(row['Source ID'])
    data_seq['Activity'].append(row['Activity'])

In [20]:
# Turn the collected rejects into a DataFrame and count how many peptides
# fall under each rejection status (statuses keep their first-seen order).
df_seqs = pl.DataFrame(data_seq)
peptide_count = pl.col('Status').count().alias('Number of Peptides')
df_seqs.groupby('Status', maintain_order=True).agg(peptide_count)

Status,Number of Peptides
str,u32
"""More Than 40""",13146
"""Not Alpha""",1162
"""Length 1""",78
"""Not aminocode""",410
"""Empty""",171


In [21]:
# Raise the global table row limit so the filtered frame is not truncated
# when displayed, then show the peptides flagged as 'Not aminocode'.
pl.Config.set_tbl_rows(410)
is_not_aminocode = pl.col('Status') == 'Not aminocode'
df_seqs.filter(is_not_aminocode)

ID,Source ID,Status,Sequence,Activity,Source PDB
str,str,str,str,str,str
"""LBQCPDB_015733...","""DBBASPS_7307""","""Not aminocode""","""XX""","""Not Found""","""DBBASP"""
"""LBQCPDB_015734...","""DBBASPS_7308""","""Not aminocode""","""XX""","""Not Found""","""DBBASP"""
"""LBQCPDB_015745...","""DBBASPS_7319""","""Not aminocode""","""XX""","""Not Found""","""DBBASP"""
"""LBQCPDB_015746...","""DBBASPS_7320""","""Not aminocode""","""XX""","""Not Found""","""DBBASP"""
"""LBQCPDB_015748...","""DBBASPS_7322""","""Not aminocode""","""XX""","""Not Found""","""DBBASP"""
"""LBQCPDB_015750...","""DBBASPS_7324""","""Not aminocode""","""XX""","""Not Found""","""DBBASP"""
"""LBQCPDB_015751...","""DBBASPS_7325""","""Not aminocode""","""XXX""","""Not Found""","""DBBASP"""
"""LBQCPDB_015752...","""DBBASPS_7326""","""Not aminocode""","""XXX""","""Not Found""","""DBBASP"""
"""LBQCPDB_015766...","""DBBASPS_7340""","""Not aminocode""","""XXX""","""Not Found""","""DBBASP"""
"""LBQCPDB_015767...","""DBBASPS_7341""","""Not aminocode""","""XXX""","""Not Found""","""DBBASP"""


In [91]:
# Inspect the peptides rejected as 'Not Alpha', one source database at a time.
pl.Config.set_fmt_str_lengths(100) # set column length
df_seqs_nalpha = df_seqs.filter(pl.col('Status') == 'Not Alpha')

for row in df_seqs_nalpha.rows(named=True):
    sequence = row['Sequence']
    source = row['Source PDB']

    # CancerPPD: TODO - no handling implemented yet.

    # DBBASP: print the entries whose sequence field contains blanks.
    if source == 'DBBASP':

        # The previous test, `len(sequence.split(' '))`, was always truthy
        # because str.split(sep) returns at least one element, so every DBBASP
        # row was printed regardless of whether it contained a space.
        if ' ' in sequence:
            print(row['ID'], row['Source ID'])
            print(sequence, end='\n\n')

    # FermFooDb: derive candidate sequences from annotated entries.
    # NOTE(review): `seqs` is a str in the '(' branch and a list in the '/'
    # branch, and nothing in this cell consumes it yet.
    elif source == 'FermFooDb':

        if '(' in sequence:
            # Keep the text before the first blank, then drop anything from
            # an 'oxidation' marker onwards.
            seqs = sequence.split(' ')[0]
            seqs = seqs.split('oxidation')[0]

        elif '/' in sequence:
            # NOTE(review): unpacking assumes exactly one '/'; more than one
            # raises ValueError.
            [seq1, seq2] = sequence.split('/')
            seqs = []

            if len(seq1) > len(seq2):
                # Longer part first, then the longer part with its last
                # character replaced by the shorter part.
                seqs.append(seq1)
                seqs.append(seq1[:-1] + seq2)

            elif len(seq1) < len(seq2):
                # Longer part first, then the shorter part followed by the
                # longer part minus its first character.
                seqs.append(seq2)
                seqs.append(seq1 + seq2[1:])

            else:
                seqs = [seq1, seq2]

        # TODO: FermFooDb entries that contain blanks are not handled yet.

LBQCPDB_008648 DBBASPR_1
NLVSGLIEARKYLEQLHRKLKNCKV    ENREVPPGFTALIKTLRKCKII

LBQCPDB_008649 DBBASPR_3
WLNALLHHGLNCAKGVLA    ALLHHGLNCAKGVLA

LBQCPDB_008650 DBBASPS_4
KWLNALLHHGLNCAKGVLA    ALLHHGLNCAKGVLA

LBQCPDB_008651 DBBASPR_6
SNDSLWYGVGQFMGKQANCITNHPVKHMIIPGYCLSKILG    IAPIIVAGLGYLVKDAWDHSDQIISGFKKGWNGGRRK

LBQCPDB_008652 DBBASPR_7
XTCASRCPRPCNAGLCCSIYGYCGSGNAYCGAGNCRCQCRG    XTCASRCPRPCNAGLCCSIYGYCGSGAAYCGAGNCRCQCRG

LBQCPDB_008654 DBBASPR_9
SIWGDIGQGVGKAAYWVGKAMGNMSDVNQASRINRKKKH    GTWDDIGQGIGRVAYWVGKAMGNMSDVNQASRINRKKKH

LBQCPDB_008662 DBBASPR_17
SIWGDIGQGVGKAAYWVGKAMGNMSDVNQASRINRKKKH    KKWGWLAWVEPAGEFLKGFGKGAIKEGNKDKWKNI

LBQCPDB_008663 DBBASPR_18
GTWDDIGQGIGRVAYWVGKAMGNMSDVNQASRINRKKKH    KKWGWLAWVDPAYEFIKGFGKGAIKEGNKDKWKNI

LBQCPDB_008664 DBBASPR_19
GTWDDIGQGIGRVAYWVGKAMGNMSDVNQASRINRKKKH    KKWGWLAWVEPAGEFLKGFGKGAIKEGNKDKWKNI

LBQCPDB_008665 DBBASPR_21
KKWGWLAWVDPAYEFIKGFGKGAIKEGNKDKWKNI    KKWGWLAWVEPAGEFLKGFGKGAIKEGNKDKWKNI

LBQCPDB_008666 DBBASPR_22
KKWGWLAWVDPAYEFIK