In [18]:
import os, glob
from collections import defaultdict


# Glob pattern for the dataset roots to scan, and the markdown report to write.
dataset_dir = '../raw_data_pdbbind_*'
report_file = 'PDBBind-opt.md'

# Ordered (category, message prefixes) rules. The first rule whose prefix
# matches wins, so the two specific LigandFixException messages must stay ahead
# of the generic 'fix_ligand.LigandFixException:' catch-all.
ERR_PREFIX_RULES = (
    ('Covalent', ('AssertionError: Number of ligand residues',)),
    ('Ligand with rare elements', ('RuntimeError: Rare element in ligand',)),
    ('Ligand too small', ('RuntimeError: Too few',)),
    ('Steric clash', ('AssertionError: Steric clash',)),
    ('Fail to match template SMILES',
     ('fix_ligand.LigandFixException: Number of atoms not match',)),
    ('No reference', ('fix_ligand.LigandFixException: No reference found',)),
    ('Fail to fix ligand', ('fix_ligand.LigandFixException:',)),
    ('Fail to fix protein',
     ('ValueError: No template found',
      'AssertionError: Not a modified amino acid')),
    ('Fail to find ligand', ('AssertionError: No ligands found',)),
)

# Bucket every `<dataset>/<entry>/err` file by the last line of its content.
err_types = defaultdict(list)
pdbids = set()
for err_path in glob.glob(os.path.join(dataset_dir, '*/err')):
    # The name of the directory holding the `err` file is used as the PDB ID.
    pdbid = os.path.basename(os.path.dirname(err_path))
    pdbids.add(pdbid)
    with open(err_path) as err_file:
        tail = err_file.read().strip().split('\n')[-1]
    # str.startswith accepts a tuple of prefixes; anything unmatched is 'Others'.
    category = next(
        (label for label, prefixes in ERR_PREFIX_RULES if tail.startswith(prefixes)),
        'Others',
    )
    err_types[category].append(pdbid)

# Collapse duplicate IDs and sort them within each category; the categories keep
# the order in which they were first encountered.
err_types = {category: sorted(set(members)) for category, members in err_types.items()}

total = 0
for category, members in err_types.items():
    print(category, len(members))
    total += len(members)
print("Total", total)

# One markdown section per category: a heading with the count, then one '+'
# bullet per ID, then a blank separator line.
with open(report_file, 'w') as report:
    for category, members in err_types.items():
        report.write(f'## {category} ({len(members)})\n')
        report.writelines(f'+ {member}\n' for member in members)
        report.write('\n')


Covalent 958
Fail to match template SMILES 1628
Ligand with rare elements 206
Steric clash 165
No reference 383
Others 48
Fail to find ligand 235
Fail to fix protein 77
Fail to fix ligand 81
Ligand too small 1
Total 3782


In [1]:
import pandas as pd


# Distinct PDBID values from the pre-processing CSVs, in first-seen order.
hiq_poly_before = (
    pd.read_csv('../pre_process/hiq_poly.csv')['PDBID']
    .drop_duplicates()
    .tolist()
)
hiq_sm_before = (
    pd.read_csv('../pre_process/hiq_sm.csv')['PDBID']
    .drop_duplicates()
    .tolist()
)

# Distinct PDBID values from the analysis metadata CSVs, in first-seen order.
hiq_poly_after = (
    pd.read_csv('../analysis/hiq_poly_metadata.csv')['PDBID']
    .drop_duplicates()
    .tolist()
)
hiq_sm_after = (
    pd.read_csv('../analysis/hiq_sm_metadata.csv')['PDBID']
    .drop_duplicates()
    .tolist()
)

In [4]:
# IDs that occur in both `hiq_poly_before` and `hiq_sm_before`.
common = set(hiq_poly_before).intersection(hiq_sm_before)

# Local sets give constant-time membership tests; the original lists are left
# untouched for any other cell that uses them.
poly_done = set(hiq_poly_after)
sm_done = set(hiq_sm_after)

# Iterate in sorted order: the iteration order of a set of strings depends on
# the per-process hash seed, so an unsorted loop prints in a different order
# after every kernel restart.
for pdbid in sorted(common):
    print(pdbid)
    # For each ID, report whether it is present in each of the two "after" lists.
    print('Polymer success' if pdbid in poly_done else 'Polymer failed')
    print('SM success' if pdbid in sm_done else 'SM failed')

1o7o
Polymer success
SM success
1o9f
Polymer success
SM success
1ga8
Polyer failed
SM failed
1gx4
Polyer failed
SM success
4d4u
Polyer failed
SM failed
1gwv
Polymer success
SM success


In [13]:
import pandas as pd


# Distinct PDBID values from the pre-processing CSVs, in first-seen order.
pdbbind_opt_poly_before = (
    pd.read_csv('../pre_process/PDBBind_poly.csv')['PDBID']
    .drop_duplicates()
    .tolist()
)
pdbbind_opt_sm_before = (
    pd.read_csv('../pre_process/PDBBind_sm.csv')['PDBID']
    .drop_duplicates()
    .tolist()
)

# Distinct PDBID values from the analysis metadata CSVs, in first-seen order.
pdbbind_opt_poly_after = (
    pd.read_csv('../analysis/pdbbind_opt_poly_metadata.csv')['PDBID']
    .drop_duplicates()
    .tolist()
)
pdbbind_opt_sm_after = (
    pd.read_csv('../analysis/pdbbind_opt_sm_metadata.csv')['PDBID']
    .drop_duplicates()
    .tolist()
)

In [25]:
# Load the whitespace-separated tokens from the file `list`, with every '+ '
# bullet marker removed first, then de-duplicate and sort them.
# NOTE(review): '+ ' looks like the markdown bullet prefix of a generated
# report; confirm where `list` comes from.
with open('list') as list_file:
    ids = sorted(set(list_file.read().replace('+ ', '').split()))

In [26]:
# Show every entry of `ids` as one comma-separated string (the cell's displayed value).
', '.join(ids)

'11gs, 1a07, 1a0t, 1a2c, 1a37, 1a3e, 1abf, 1abt, 1af2, 1agm, 1apb, 1apv, 1apw, 1aqc, 1at5, 1at6, 1atl, 1aze, 1azx, 1b11, 1b2m, 1b40, 1b6j, 1bap, 1bdl, 1bdq, 1bm2, 1bm6, 1bsk, 1bt6, 1bux, 1bzh, 1c5o, 1c5p, 1c5z, 1cka, 1ckb, 1clu, 1cpi, 1cyn, 1czq, 1d4w, 1d6s, 1d8e, 1dkd, 1dmb, 1dva, 1dxp, 1e03, 1e5j, 1eb1, 1ec9, 1eef, 1ej4, 1eoj, 1eol, 1epq, 1eub, 1evh, 1ez9, 1f47, 1f4y, 1f5k, 1ff1, 1fh7, 1fh8, 1fh9, 1fhd, 1fls, 1fwu, 1fwv, 1g42, 1g6g, 1g9r, 1ga8, 1gag, 1gah, 1gai, 1gmy, 1gni, 1gnj, 1gnm, 1gnn, 1gno, 1gny, 1gu3, 1gui, 1gvu, 1gvx, 1gwm, 1gwq, 1gwr, 1gwv, 1gzc, 1h00, 1h07, 1h24, 1h25, 1h26, 1h27, 1h28, 1h2t, 1h2u, 1h5v, 1h6e, 1hc9, 1hgt, 1hkj, 1hkk, 1hkm, 1hps, 1htg, 1i3z, 1i6v, 1i7c, 1i7m, 1i8h, 1i8i, 1idg, 1igj, 1iht, 1ikt, 1ilq, 1iq1, 1is0, 1it6, 1iwq, 1j19, 1j1a, 1j4q, 1jd5, 1jd6, 1jfh, 1jh1, 1jm4, 1jmq, 1jn2, 1jp5, 1jpl, 1juq, 1jvp, 1k1y, 1k9q, 1kat, 1kc5, 1kcs, 1kjr, 1kl3, 1kl5, 1kna, 1kne, 1l6m, 1lek, 1lf8, 1lf9, 1lkk, 1ll4, 1lqe, 1lt5, 1lxh, 1m7d, 1m7i, 1mf4, 1mfa, 1mfd, 1mhw, 1mp

In [None]:


|-- 1a69_FMB_A_240_ligand.pdb
|-- 1a69_FMB_A_240_protein.pdb
|-- 1a69_FMB_A_240_protein_hetatm.pdb
|-- 1a69_FMB_A_240_hetatm.pdb
|-- 1a69_FMB_A_240_ligand_refined.sdf
|-- 1a69_FMB_A_240_protein_refined.pdb