In [12]:
import polars as pl
import numpy as np
import utils
import os

In [13]:
# Load the peptide table from the directory the notebook is launched in.
# `path` is kept as a separate name because the export cell reuses it.
path = os.getcwd()
csv_file = f'{path}/LBQCPDB.csv'
df = pl.read_csv(csv_file)

In [34]:
# One-letter codes of the 20 standard amino acids.
amino_codes = 'ARNDCQEGHILKMFPSTWYV'

# Placeholder strings that mean "no sequence available".
EMPTY_MARKERS = ('', 'Not Found', 'NA', 'NaN', 'N.A', 'N.A.')

# Longest peptide (in residues) that is kept.
MAX_PEPTIDE_LENGTH = 40


def classify_sequence(raw_seq, valid_codes=amino_codes):
    """Return the quality label for one raw `Sequence` value.

    Checks run in order and the first one that matches wins.

    Parameters
    ----------
    raw_seq : str or None
        Value of the `Sequence` column for a single row.
    valid_codes : str
        Characters accepted as amino-acid codes (compared upper-cased).

    Returns
    -------
    str
        One of 'Empty', 'Not Alpha', 'More Than 40', 'Length 1',
        'Not aminocode' or 'OK'.
    """
    if raw_seq is None:
        return 'Empty'

    # Strip first so padded placeholders such as ' NA ' count as empty
    # instead of slipping through as a two-letter peptide.
    seq = raw_seq.strip()

    # Empty peptides
    if seq in EMPTY_MARKERS:
        return 'Empty'

    # Peptides with special chars
    if not seq.isalpha():
        return 'Not Alpha'

    # Peptides more than 40 chars
    if len(seq) > MAX_PEPTIDE_LENGTH:
        return 'More Than 40'

    # Peptides with length equals 1
    if len(seq) == 1:
        return 'Length 1'

    # Every residue must be a valid code. `any` would accept a sequence
    # as soon as a single letter is valid (e.g. 'AXZ'), so `all` is used.
    if not all(code in valid_codes for code in seq.upper()):
        return 'Not aminocode'

    return 'OK'


# One label per row, in row order, so it can be attached as a column.
status = [classify_sequence(row['Sequence']) for row in df.rows(named=True)]

In [35]:
# Attach the per-row labels as a 'Status' column.
status_column = pl.Series(name='Status', values=status)
df_seqs = df.with_columns(status_column)

# Count how many peptides fall under each label, smallest group first.
status_counts = df_seqs.groupby('Status').agg(
    pl.col('Status').count().alias('Number of Peptides')
)
status_counts.sort(by='Number of Peptides')

# Remember to run this cell again after filtering the rows

Status,Number of Peptides
str,u32
"""OK""",306961


In [30]:
# Widen the table rendering so long sequences and many rows stay visible.
pl.Config.set_fmt_str_lengths(100)  # characters shown per string cell
pl.Config.set_tbl_rows(1162)  # rows shown per table

# Show the sequence and source id of every row labelled 'Not Alpha'.
not_alpha_rows = df_seqs.filter(pl.col('Status') == 'Not Alpha')
not_alpha_rows.select(pl.col(['Sequence', 'Source ID']))

Sequence,Source ID
str,str
"""(K-Aib-C(CH2CO-2'-Pac))2""","""Not Found"""
"""(K-Aib-C(CH2CO-2'-Pac))3""","""Not Found"""
"""(K-Aib-C(CH2CO-2'-Pac))3""","""Not Found"""
"""(K-Aib-C(CH2CO-2'-Pac))3""","""Not Found"""
"""(K-Aib-C(CH2CO-2'-Pac))3""","""Not Found"""
"""(K-Aib-C(CH2CO-2'-Pac))4""","""Not Found"""
"""(K-Aib-C(CH2CO-2'-Pac))4""","""Not Found"""
"""(K-Aib-C(CH2CO-2'-Pac))4""","""Not Found"""
"""(K-Aib-C(CH2CO-2'-Pac))4""","""Not Found"""
"""2-[[2-(dimethylamino)-3-methylbutanoyl]amino]-N- [3-methoxy-1-[2-[1-methoxy-2-methyl-3-oxo-3-[[2-p...","""Not Found"""


In [None]:
# Second pass over the rows labelled 'Not Alpha': handle them per source
# database and try to extract usable sequences.
# Work in progress: the candidates collected in `seqs` are not written back
# to the frame yet, so every row keeps its current status for now.
status = []

for row in df_seqs.rows(named=True):

    # Keep exactly one status per row so the list stays aligned with df_seqs.
    # TODO: append the re-evaluated status once the repairs below are applied.
    status.append(row['Status'])

    # Only sequences labelled 'Not Alpha' need the per-database handling
    if row['Status'] != 'Not Alpha':
        continue

    source = row['Source PDB']
    sequence = row['Sequence']

    # TODO: CancerPPD handling is not implemented yet.

    # DBBASP: print entries that contain a blank space for manual inspection
    if source == 'DBBASP':

        # Test for the space directly: len(sequence.split(' ')) is never 0,
        # so using it as the condition would print every DBBASP row.
        if ' ' in sequence:
            print(row['ID'], row['Source ID'])
            print(sequence, end='\n\n')

    # FermFooDb
    elif source == 'FermFooDb':

        if '(' in sequence:
            # Keep the text before the first space, then cut everything
            # from the word 'oxidation' onwards.
            seqs = sequence.split(' ')[0]
            seqs = seqs.split('oxidation')[0]

        elif '/' in sequence:
            # NOTE: unpacking raises ValueError when there is more than one '/'.
            seq1, seq2 = sequence.split('/')

            if len(seq1) > len(seq2):
                # Longer part as-is, plus a variant whose last character
                # is replaced by the shorter part.
                seqs = [seq1, seq1[:-1] + seq2]

            elif len(seq1) < len(seq2):
                # Longer part as-is, plus a variant whose first character
                # is replaced by the shorter part.
                seqs = [seq2, seq1 + seq2[1:]]

            else:
                seqs = [seq1, seq2]

        # TODO: FermFooDb sequences that contain a blank space are not
        # handled yet (print ID / Source ID / Sequence to inspect them).

In [33]:
# Keep only the rows labelled 'OK' and drop the helper column.
# Note: this rebinds `df`, replacing the frame read from the CSV.
ok_rows = df_seqs.filter(pl.col('Status') == 'OK')
df = ok_rows.drop('Status')

In [37]:
# Write the filtered table to disk through the project helper.
# NOTE(review): judging by the recorded output, the helper appears to save a
# dated copy under releases/ and an undated copy next to the notebook - confirm in utils.
output_name = 'LBQCPDB_filtered'
utils.pl_to_csv(df, path, output_name)

Save files in:
D:\WilliamJSS\Projects\LBQC\LBQC-PDB/releases/LBQCPDB_filtered_2023_03_14.csv
D:\WilliamJSS\Projects\LBQC\LBQC-PDB/LBQCPDB_filtered.csv
