In [1]:
import pandas as pd

# Read the benchmark table and collect the SMILES of the rows whose NPS column equals 1.
file_path = 'benchmark.csv'
df = pd.read_csv(file_path)

filtered_df = df.loc[df['NPS'] == 1]
smiles_list = filtered_df.loc[:, 'smiles'].tolist()

In [2]:
# Complementary subset: rows whose NPS column equals 0.
no_filtered_df = df.loc[df['NPS'] == 0]

In [3]:
# SMILES strings of the NPS == 0 rows, as a plain Python list.
no_smiles_list = no_filtered_df.loc[:, 'smiles'].tolist()

In [4]:
# Echo the SMILES strings taken from the rows with NPS == 1 (the whole list is displayed).
smiles_list

['CCN([C@@H]1CCCC[C@H]1N(C)C)C(=O)C2=CC(=C(C=C2)Cl)Cl',
 'CCCCCN1C=C(C=C1C2=CC=CC=C2)C(=O)C3=CC=CC4=CC=CC=C43',
 'CC1=CC[C@H]2[C@H]3CC4=C5[C@]2([C@H]1OC5=C(C=C4)O)CCN3C',
 'CCCCCN1C=C(C2=CC=CC=C21)C(=O)CC3=CC=CC=C3OC',
 'C1CN(CCN1)C2=CC=C(C=C2)Cl',
 'CC1=CC=C(C=C1)C(=O)C(C)NCC2=CC=CC=C2',
 'CC1=CC(=C(C=C1OC)CCN)OC',
 'COC1=CC(=C(C=C1CCN)OC)C(F)(F)F',
 'CCC(C)N(CC(C1=CC=CN1CC2=CC=CC=C2F)O)C(C)CC',
 'CC1=C(C=C(C(=C1Br)OC)CCN)OC',
 'CCCCCCC(C)(C)C1=CC(=C2[C@@H]3CC(=O)CC[C@H]3C(OC2=C1)(C)C)O',
 'CCC1(C(=O)NC(=O)NC1=O)C2=CC=CC=C2',
 'CCN(CC)C(=O)[C@H]1CN([C@@H]2CC3=CN(C4=CC=CC(=C34)C2=C1)C(=O)C5CC5)C',
 'CCCC(CC1=CC=CC=C1)NCCC',
 'CC1=CC=CC=C1N2CCNCC2',
 'CNC1(CCCCC1=O)C2=CC=CC=C2',
 'CC(C1=CC=C(C=C1)Br)N2CCC(CC2)N3C4=CC=CC=C4NC3=O',
 'CCN(CC)CCN1C2=C(C=C(C=C2)[N+](=O)[O-])N=C1CC3=CC=C(C=C3)F',
 'CCCC(=O)N1CCN(CC1C)C/C=C/C2=CC=CC=C2',
 'CC1=CC=C(C=C1)CCN2CCC(CC2)N(C3=CC=CC=C3)C(=O)C',
 'C1CC[C@@]23CCN([C@@H]([C@@H]2C1)CC4=C3C=C(C=C4)O)CCC5=CC=CC=C5',
 'CCCCCN1C=C(C2=CC=CC=C21)CC(=O)NC3CCCCC

In [5]:
from rdkit import Chem
import re

def preprocess_smiles(smiles_list):
    """Re-write SMILES strings through RDKit, dropping entries that cannot be parsed.

    Args:
        smiles_list: Iterable of SMILES strings. Entries that are not strings
            (for example the NaN pandas uses for a missing cell) are skipped
            rather than handed to RDKit.

    Returns:
        list[str]: ``Chem.MolToSmiles`` output for every entry that
        ``Chem.MolFromSmiles`` could parse, in input order. The result can be
        shorter than the input, so positions do not line up with it.
    """
    processed = []
    for smi in smiles_list:
        # Only real strings are valid parser input; treat anything else the
        # same way as an unparseable SMILES and skip it.
        if not isinstance(smi, str):
            continue
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None.
            continue
        processed.append(Chem.MolToSmiles(mol))
    return processed


# Run the NPS == 1 SMILES through preprocess_smiles; unparseable entries are dropped.
clean_smiles = preprocess_smiles(smiles_list)

In [6]:
# Echo the strings returned by preprocess_smiles (the whole list is displayed).
clean_smiles

['CCN(C(=O)c1ccc(Cl)c(Cl)c1)[C@@H]1CCCC[C@H]1N(C)C',
 'CCCCCn1cc(C(=O)c2cccc3ccccc23)cc1-c1ccccc1',
 'CC1=CC[C@H]2[C@H]3Cc4ccc(O)c5c4[C@@]2(CCN3C)[C@H]1O5',
 'CCCCCn1cc(C(=O)Cc2ccccc2OC)c2ccccc21',
 'Clc1ccc(N2CCNCC2)cc1',
 'Cc1ccc(C(=O)C(C)NCc2ccccc2)cc1',
 'COc1cc(CCN)c(OC)cc1C',
 'COc1cc(C(F)(F)F)c(OC)cc1CCN',
 'CCC(C)N(CC(O)c1cccn1Cc1ccccc1F)C(C)CC',
 'COc1cc(CCN)c(OC)c(Br)c1C',
 'CCCCCCC(C)(C)c1cc(O)c2c(c1)OC(C)(C)[C@@H]1CCC(=O)C[C@@H]21',
 'CCC1(c2ccccc2)C(=O)NC(=O)NC1=O',
 'CCN(CC)C(=O)[C@@H]1C=C2c3cccc4c3c(cn4C(=O)C3CC3)C[C@H]2N(C)C1',
 'CCCNC(CCC)Cc1ccccc1',
 'Cc1ccccc1N1CCNCC1',
 'CNC1(c2ccccc2)CCCCC1=O',
 'CC(c1ccc(Br)cc1)N1CCC(n2c(=O)[nH]c3ccccc32)CC1',
 'CCN(CC)CCn1c(Cc2ccc(F)cc2)nc2cc([N+](=O)[O-])ccc21',
 'CCCC(=O)N1CCN(C/C=C/c2ccccc2)CC1C',
 'CC(=O)N(c1ccccc1)C1CCN(CCc2ccc(C)cc2)CC1',
 'Oc1ccc2c(c1)[C@@]13CCCC[C@H]1[C@@H](C2)N(CCc1ccccc1)CC3',
 'CCCCCn1cc(CC(=O)NC2CCCCC2)c2ccccc21',
 'O=C(CN1CC[C@]23CCCC[C@H]2[C@H]1Cc1ccc(O)cc13)c1ccccc1',
 'CCN',
 'C[C@H](N)[C@H](O)c1c

In [7]:
import re
from rdkit import Chem
from rdkit import RDLogger
from collections import defaultdict

def mine_fragments(smiles_list, min_len=2, max_len=6):
    """Count every substring of the SMILES strings that passes is_valid_fragment.

    Fragments are contiguous character windows of the SMILES text. Every
    occurrence is counted, so a fragment that appears several times in one
    string contributes several counts.

    Args:
        smiles_list: Iterable of SMILES strings.
        min_len: Shortest window length in characters (inclusive).
        max_len: Longest window length in characters (inclusive).

    Returns:
        defaultdict(int) mapping fragment text to its number of occurrences.
    """
    fragment_counts = defaultdict(int)
    # Validity depends only on the fragment text, so each distinct fragment is
    # checked once instead of once per occurrence.
    validity = {}

    RDLogger.DisableLog('rdApp.*')
    try:
        for smi in smiles_list:
            for length in range(min_len, max_len + 1):
                for start in range(len(smi) - length + 1):
                    frag = smi[start:start + length]
                    ok = validity.get(frag)
                    if ok is None:
                        ok = validity[frag] = bool(is_valid_fragment(frag))
                    if ok:
                        fragment_counts[frag] += 1
    finally:
        # Turn RDKit logging back on even if a fragment check raises.
        RDLogger.EnableLog('rdApp.*')
    return fragment_counts

def is_valid_fragment(frag):
    """Decide whether a SMILES substring can stand alone as a fragment.

    Cheap textual checks run first; the fragment is handed to
    ``Chem.MolFromSmarts`` only if all of them pass.

    Args:
        frag: Candidate substring of a SMILES string.

    Returns:
        bool: True when the fragment is non-empty, has balanced brackets and
        correctly nested parentheses, pairs up every ring-closure label, does
        not start or end with a bond/dot symbol, has no run of two or more
        such symbols, and parses to a pattern with at least one atom.
    """
    # An empty string has no first/last character to inspect and is never a fragment.
    if not frag:
        return False

    if frag.count('[') != frag.count(']'):
        return False

    # Parentheses must balance and must never close before they open.
    depth = 0
    for ch in frag:
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth -= 1
            if depth < 0:
                return False
    if depth != 0:
        return False

    # Digits inside a bracket atom are isotopes, hydrogen counts, charges or
    # atom maps, not ring closures, so bracket atoms are removed before the
    # ring-closure labels (single digits or the %10 form) are paired up.
    outside_brackets = re.sub(r'\[[^\]]*\]', '', frag)
    ring_numbers = re.findall(r'%\d+|\d', outside_brackets)
    for label in set(ring_numbers):
        if ring_numbers.count(label) % 2 != 0:
            return False

    # A fragment may not begin or end on a dangling bond or a dot separator.
    bond_symbols = ('-', '=', '#', '.')
    if frag[0] in bond_symbols or frag[-1] in bond_symbols:
        return False

    # Two or more bond/dot symbols in a row are rejected.
    if re.search(r'[-=#.]{2,}', frag):
        return False

    try:
        mol = Chem.MolFromSmarts(frag)
        return mol is not None and mol.GetNumAtoms() > 0
    except Exception:
        # Any parser error means "not a fragment"; KeyboardInterrupt and
        # SystemExit still propagate so a long mining run can be stopped.
        return False


# Mine fragments from the NPS == 1 SMILES and print the raw counts as a plain dict.
fragment_counts = mine_fragments(smiles_list)
print(dict(fragment_counts))

{'CC': 6384, 'CN': 821, 'Cl': 192, 'CCN': 447, 'CCC': 1548, 'C=C': 2957, 'CCCC': 752, 'N(C)': 91, '[C@H]': 254, 'N(C)C': 84, 'C(=O)': 864, '[C@@H]': 225, 'C[C@H]': 66, 'C(=O)C': 477, 'CCCN': 103, 'CC=C': 2067, 'C=CC': 945, 'CCCCC': 348, 'CCCCN': 84, 'CC=CC': 872, 'CCCCCN': 56, 'C=CC=C': 821, 'OC': 683, '[C@]': 77, 'NC': 542, 'NCC': 90, 'C(C)': 270, 'C(C)N': 56, 'C(C)NC': 19, 'CO': 266, 'COC': 175, 'C(F)': 6, 'C=CN': 2, 'CC(C)': 172, 'CC=CN': 1, 'C(C)C': 99, 'CCC(C)': 25, 'CC(C)N': 42, 'C(C)CC': 24, 'Br': 58, 'CCCCCC': 119, 'CC(=O)': 142, 'C(=O)N': 245, 'NC(=O)': 114, 'N(CC)': 36, 'C5CC5': 2, 'CN(CC)': 30, 'N(CC)C': 36, 'NCCC': 25, 'CNC': 172, 'CCNC': 75, 'CNCC': 46, 'CCNCC': 27, 'N=C': 78, '[N+]': 42, '[O-]': 47, 'C/C': 20, 'C/C=C': 12, 'C=C/C': 6, '[C@@]': 61, 'C[C@@]': 27, 'CS': 26, 'C=CS': 20, 'CC=CS': 20, 'NC(C)': 34, 'CNC(C)': 27, 'NC(C)C': 30, 'C=N': 21, 'OC(=O)': 100, 'CNCCC': 16, 'OCC': 51, 'NN': 85, 'CCO': 80, 'CCOC': 67, 'C=C(C)': 1, 'N=CC': 13, 'N=CC=C': 11, 'N[C@H]': 7, 'SC

In [8]:
# Same mining pass over the NPS == 0 SMILES; print the raw counts as a plain dict.
no_fragment_counts = mine_fragments(no_smiles_list)
print(dict(no_fragment_counts))

{'Br': 28, 'NC': 1560, 'CC': 7792, 'CN': 1165, 'BrC': 2, 'NCC': 193, 'CCN': 530, 'C=C': 3517, 'N=C': 240, 'NCCN': 29, 'C=CC': 927, 'NC=C': 82, 'C=CN': 39, 'CN=C': 86, 'NC=CN': 7, 'C=CN=C': 27, 'C#C': 32, 'CCC': 1651, 'C#CC': 10, 'CC=C': 2014, 'C#CCN': 2, 'CC=CC': 697, '[C@@H]': 1159, 'C=CC=C': 706, 'CNC': 251, 'C=N': 177, 'CNCC': 47, 'N=CC': 27, 'CC=N': 60, 'C=NC': 67, 'CC=NC': 17, 'C2C=C2': 1, 'N=CC=N': 2, 'CCCC': 662, 'CNCCN': 5, 'CC#C': 7, 'C(C)': 704, 'CC#CC': 3, 'C(=O)': 1893, 'CC#CCN': 1, 'C(=O)N': 566, 'C(C)=N': 9, 'CCNC': 101, 'C(C)C': 270, 'CCNCC': 25, 'CC(=O)': 254, 'C(=O)C': 669, 'S(=O)': 113, 'CCCCC': 308, 'NC(=O)': 526, 'OC': 1317, '[N+]': 143, '[O-]': 214, 'C(O)': 694, 'C(O)C': 246, 'CO': 732, 'C(I)': 37, 'CC(O)': 225, 'C(I)C': 16, 'CC(O)C': 64, 'C(O)CN': 11, 'C(C)=O': 87, 'NCC(O)': 40, 'C(O)CO': 30, 'N(O)': 5, 'CCCN': 176, 'N(O)C': 5, 'CCCCN': 61, 'CCCNC': 43, 'N(O)CC': 3, 'CCCCCN': 10, 'CCCCNC': 12, 'Cl': 328, 'OCC': 249, 'COC': 417, 'C(Cl)': 203, 'C(Cl)C': 129, 'C=C(O)

In [9]:
def filter_fragments(fragment_counts, min_freq=5, top_k=50):
    """Keep the most frequent fragments.

    Args:
        fragment_counts: Mapping of fragment text to its count.
        min_freq: Smallest count a fragment needs in order to be kept.
        top_k: Maximum number of fragments returned.

    Returns:
        dict of at most ``top_k`` fragments with count >= ``min_freq``, ordered
        by descending count; fragments with equal counts keep the order they
        had in ``fragment_counts``.
    """
    frequent = [
        pair for pair in fragment_counts.items() if pair[1] >= min_freq
    ]
    # list.sort is stable, so ties stay in their original relative order.
    frequent.sort(key=lambda pair: -pair[1])
    return dict(frequent[:top_k])


# Keep the NPS == 1 fragments seen at least 5 times; top_k is left at the function default.
high_freq_fragments = filter_fragments(fragment_counts, min_freq=5)
print(high_freq_fragments)

{'CC': 6384, 'C=C': 2957, 'CC=C': 2067, 'CCC': 1548, 'C=CC': 945, 'CC=CC': 872, 'C(=O)': 864, 'CN': 821, 'C=CC=C': 821, 'CCCC': 752, 'OC': 683, 'NC': 542, 'C(=O)C': 477, 'CCN': 447, 'CCCCC': 348, 'C(C)': 270, 'CO': 266, '[C@H]': 254, 'C(=O)N': 245, '[C@@H]': 225, 'Cl': 192, 'COC': 175, 'CC(C)': 172, 'CNC': 172, 'CC(=O)': 142, 'CCCCCC': 119, 'NC(=O)': 114, 'C(=O)O': 111, 'CCCN': 103, 'OC(=O)': 100, 'C(C)C': 99, 'N(C)': 91, 'NCC': 90, 'NN': 85, 'N(C)C': 84, 'CCCCN': 84, 'CCO': 80, 'N=C': 78, '[C@]': 77, 'CCNC': 75, 'CCOC': 67, 'C[C@H]': 66, '[C@@]': 61, 'Br': 58, 'CCCCCN': 56, 'C(C)N': 56, 'C[C@]': 56, 'OCO': 52, 'OCC': 51, 'CF': 50}


In [10]:
# Same filter for the NPS == 0 fragments, so the two printed dicts can be compared.
no_high_freq_fragments = filter_fragments(no_fragment_counts, min_freq=5)
print(no_high_freq_fragments)

{'CC': 7792, 'C=C': 3517, 'CC=C': 2014, 'C(=O)': 1893, 'CCC': 1651, 'NC': 1560, '[C@H]': 1341, 'OC': 1317, 'CN': 1165, '[C@@H]': 1159, 'C=CC': 927, '[H]': 913, '[C@@]': 831, 'CO': 732, 'C=CC=C': 706, 'C(C)': 704, 'CC=CC': 697, 'C(O)': 694, 'C(=O)C': 669, 'CCCC': 662, '[C@]': 582, 'C(=O)N': 566, 'CCN': 530, 'NC(=O)': 526, 'COC': 417, 'C[C@H]': 388, 'Cl': 328, 'C(O)=O': 326, 'CCCCC': 308, 'C(C)C': 270, 'C[C@]': 260, 'CC(=O)': 254, 'CNC': 251, 'OCC': 249, 'C(O)C': 246, 'N=C': 240, 'OC(=O)': 239, 'C(N)': 238, 'CC(O)': 225, 'CCO': 225, '[O-]': 214, 'C(F)': 214, 'SC': 211, 'N(C)': 211, 'CC(C)': 205, 'C(Cl)': 203, 'C[C@@]': 198, 'N(C)C': 196, 'NCC': 193, 'O[C@H]': 192}
