In [1]:
import polars as pl
import numpy as np
import utils
import os

In [2]:
path = os.getcwd()
df = pl.read_csv(f'{path}/LBQCPDB.csv')

In [11]:
status = []
amino_codes = 'ARNDCQEGHILKMFPSTWYV'

for row in df.rows(named=True):   
    # Delete blank spaces
    if not row['Sequence'] is None:
        seq = row['Sequence'].strip()
    
    # Empty peptides
    if row['Sequence'] in ['', None, 'Not Found', 'NA', 'NaN', 'N.A', 'N.A.']:
        status.append('Empty')
        
    # Peptides with special chars
    elif not seq.isalpha():
        status.append('Not Alpha')
        
    # Peptides more than 40 chars
    elif len(seq) > 40:
        status.append('More Than 40')
    
    # Peptides with length equals 1
    elif len(seq) == 1:
        status.append('Length 1')
        
    # Peptides without correct amino_code
    elif not any(code in amino_codes for code in seq.upper()):
        status.append('Not aminocode')
        
    else:
        status.append('OK')

In [12]:
df_seqs = df.with_columns(pl.Series(name='Status', values=status))
df_seqs.groupby('Status').agg(
    pl.col('Status').count().alias('Number of Peptides')
).sort(by='Status')

# Remember run again after filter the rows

Status,Number of Peptides
str,u32
"""Empty""",1067
"""Length 1""",78
"""More Than 40""",13363
"""Not Alpha""",2
"""Not aminocode""",410
"""OK""",307228


In [None]:
pl.Config.set_fmt_str_lengths(80) # set column length
pl.Config.set_tbl_rows(10) # set rows count
df_seqs.filter(pl.col('Status') == 'Not Alpha')

In [5]:
sequences = []

new_entries = {
    'LBQCPDB ID': [],
    'Source ID': [],
    'Sequence': [],
    'Activity': [],
    'Source PDB': [],
    'DOI': []
}

for row in df_seqs.rows(named=True):
    
    # Ignore sequences not equals 'not alpha'
    if row['Status'] != 'Not Alpha':
        sequences.append(row['Sequence'])
    
    # CancerPPD
    elif row['Source PDB'] == 'CancerPPD':

        # Equals 'Structure Given'
        if row['Sequence'] == 'Structure Given':
            sequences.append(None)
        
        # Contain '*', '-' or ',' chars
        elif any(c in '*-,' for c in row['Sequence']):
            sequences.append(None)
        
        # Contain blank spaces
        elif ' ' in row['Sequence']:
            seq = ''.join(row['Sequence'].split(' '))
            sequences.append(seq)
        
        # Contain break lines
        else:
            seq = ''.join(row['Sequence'].split('\n'))
            sequences.append(seq)
        
    # DBBASP
    elif row['Source PDB'] == 'DBBASP':     
        sequences.append(None)
    
    # FermFooDb
    elif row['Source PDB'] == 'FermFooDb':
                
        if '(' in row['Sequence']:
            seq = row['Sequence'].replace('oxid', ' oxid').split(' ')[0]
            sequences.append(seq.split('oxidation')[0])
            
        elif '/' in row['Sequence']:            
            [seq1, seq2] = row['Sequence'].split('/')

            if len(seq1) > len(seq2):
                sequences.append(seq1)
                new_entries['Sequence'].append(''.join([seq1[:len(seq1)-1], seq2]))

            elif len(seq1) < len(seq2):
                sequences.append(seq2)
                new_entries['Sequence'].append(''.join([seq1, seq2[1:]]))

            else:
                sequences.append(seq1)
                new_entries['Sequence'].append(seq2)
            
            new_entries['LBQCPDB ID'].append(row['LBQCPDB ID'])
            new_entries['Source ID'].append(row['Source ID'])
            new_entries['DOI'].append(row['DOI'])
            new_entries['Activity'].append(row['Activity'])
            new_entries['Source PDB'].append(row['Source PDB'])
            
        elif ' ' in row['Sequence']:
            row_seq = ''.join(row['Sequence'].split(' '))
            
            if row_seq.isalpha() and row_seq.isupper():
                sequences.append(row_seq)
                
            elif row['Sequence'].split(' ')[0].isupper():
                sequences.append(row['Sequence'].split(' ')[0])
            
            elif row['Sequence'].split(' ')[1].isupper():
                sequences.append(row['Sequence'].split(' ')[1])
                
            else:
                sequences.append(None)
            
        else:
            sequences.append(None)
                
    # THPdb
    elif row['Source PDB'] == 'THPdb':

        row_seq = row['Sequence']
        
        # Set row sequence
        if not ':' in row_seq:
            
            if 'chain' in row['Sequence']:
                replaces = {
                    'chain 1': 'chain',
                    'chain': 'chain:'
                }

                row_seq = utils.multi_replace(row['Sequence'], replaces)
                
            elif ')' in row['Sequence']:          
                row_seq = row['Sequence'].replace('|', ')')
                row_seq = f'chain:{row_seq.split(")")[-1]} '

            else:
                replaces = {
                    'Light Chain': 'chain:',
                    'Ranibizumab Heavy Chain': 'Heavy chain:',
                    'alpha-1': 'chain:',
                    'Inteferon alpha-2': 'Heavy chain:',
                    'hormone': 'chain:',
                    ',': '',
                    'Sequence': 'chain:',
                    'Ofatumumab Heavy Chain': 'chain:'
                }

                row_seq = utils.multi_replace(row['Sequence'], replaces)
                
                if not '-' in row_seq and not ':' in row_seq:
                    row_seq = f'chain:{row_seq} '
                    
        # Add row_seq to sequences list
        if len(row_seq.split(':')) == 2:
            sequences.append(row_seq.split(':')[1].split(';')[0].strip())
            
        elif len(row_seq.split(':')) == 3:
            replaces = {
                'Alpha': ';', 'Beta': ';', 'Light': ';', 'Heavy': ';',
                'B-chain': ';', 'B chain': ';', ',': ';',
            }
            
            row_seq = utils.multi_replace(row_seq, replaces)
            sequences.append(row_seq.split(':')[1].split(';')[0].strip())
            new_entries['Sequence'].append(row_seq.split(':')[2].split(';')[0].strip())
            new_entries['LBQCPDB ID'].append(row['LBQCPDB ID'])
            new_entries['Source ID'].append(row['Source ID'])
            new_entries['DOI'].append(row['DOI'])
            new_entries['Activity'].append(row['Activity'])
            new_entries['Source PDB'].append(row['Source PDB'])
            
        else:
            sequences.append(None)

In [10]:
# Change peptides sequences
df = df.with_columns(pl.Series(name='Sequence', values=sequences))

In [6]:
# New entries from duplicates peptides
df_dup = pl.DataFrame(new_entries)
df_dup

LBQCPDB ID,Source ID,Sequence,Activity,Source PDB,DOI
str,str,str,str,str,str
"""LBQCPDB_030377...","""FMDB987""","""VVN""","""Antioxidant""","""FermFooDb""","""Not Found"""
"""LBQCPDB_030379...","""FMDB989""","""LKRP""","""Antioxidant""","""FermFooDb""","""Not Found"""
"""LBQCPDB_030993...","""FMDB1603""","""RL""",,"""FermFooDb""","""Not Found"""
"""LBQCPDB_031000...","""FMDB1610""","""MPL""",,"""FermFooDb""","""Not Found"""
"""LBQCPDB_321406...",,"""CDLPQTHSLGSRRT...","""Not Found""","""THPdb""","""Not Found"""
"""LBQCPDB_321407...",,"""CDLPQTHSLGSRRT...","""Not Found""","""THPdb""","""Not Found"""
"""LBQCPDB_321408...",,"""CDLPQTHSLGSRRT...","""Not Found""","""THPdb""","""Not Found"""
"""LBQCPDB_321409...",,"""CDLPQTHSLGSRRT...","""Not Found""","""THPdb""","""Not Found"""
"""LBQCPDB_321410...",,"""CDLPQTHSLGSRRT...","""Not Found""","""THPdb""","""Not Found"""
"""LBQCPDB_321411...",,"""CDLPQTHSLGSRRT...","""Not Found""","""THPdb""","""Not Found"""


In [13]:
df = df_seqs.filter(pl.col('Status') == 'OK').drop('Status')

In [14]:
utils.pl_to_csv(df, path, 'LBQCPDB_filtered')

Save files in:
/home/pcbio/Área de Trabalho/William/LBQC-PDB/releases/LBQCPDB_filtered_2023_03_14.csv
/home/pcbio/Área de Trabalho/William/LBQC-PDB/LBQCPDB_filtered.csv
