In [1]:
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem.rdchem import BondType as BT
from src.datasets.dataset_utils import to_nx
import pandas as pd
from joblib import Parallel, delayed
from collections import Counter
import numpy as np

In [4]:
from rdkit import Chem
from tqdm import tqdm

# Read the raw training file, one SMILES string per line. The context manager
# closes the file handle as soon as the lines are loaded.
# NOTE(review): the graph-conversion cell in this notebook reads
# '.../polymers/raw/train.txt' instead — confirm which path is intended.
with open('/data2/chensm22/HRS/data/polymers/train.txt') as smiles_file:
    data = smiles_file.readlines()

# Parse every SMILES string into an RDKit molecule.
mols = [Chem.MolFromSmiles(s.strip()) for s in tqdm(data)]

# Collect every distinct element symbol that occurs in the dataset.
atom_types = set()

# Walk over all molecules and record the symbol of each atom.
for mol in tqdm(mols):
    if mol is not None:  # skip entries for which parsing returned None
        for atom in mol.GetAtoms():
            atom_types.add(atom.GetSymbol())

# Report the full set of element symbols that were found.
print("All unique atom types in the dataset:")
print(atom_types)


  0%|          | 0/76353 [00:00<?, ?it/s]

100%|██████████| 76353/76353 [00:25<00:00, 3034.32it/s]
100%|██████████| 76353/76353 [00:05<00:00, 13299.18it/s]

All unique atom types in the dataset:
{'O', 'P', 'Si', 'C', 'N', 'F', 'S'}





In [7]:
# Atom vocabulary: element symbol -> integer class id.
# A previous encoder/decoder pair was assigned here and immediately
# overwritten; it was dead code and internally inconsistent ('Br' present in
# the encoder but missing from the decoder), so it has been removed together
# with the commented-out loaders for other datasets.
atom_encoder = {'C': 0, 'N': 1, 'O': 2, 'F': 3, 'P': 4, 'S': 5, 'Si': 6}

# Inverse mapping: position i holds the symbol whose class id is i.
atom_decoder = ['C', 'N', 'O', 'F', 'P', 'S', 'Si']

# Guard against the two tables drifting apart when the vocabulary is edited.
assert len(atom_decoder) == len(atom_encoder) and all(
    atom_encoder[symbol] == index for index, symbol in enumerate(atom_decoder)
), "atom_encoder and atom_decoder are out of sync"

# Bond vocabulary: RDKit bond type -> integer class id (None maps to 0).
bonds = {None: 0, BT.SINGLE: 1, BT.DOUBLE: 2, BT.TRIPLE: 3, BT.AROMATIC: 4}

# Load the SMILES strings, one per line; the context manager closes the file.
# Lines are passed on exactly as read (trailing newline included).
with open('/data2/chensm22/HRS/data/polymers/raw/train.txt') as smiles_file:
    smile_list = smiles_file.readlines()

print("Converting smiles...")
# Convert every SMILES line with the project helper `to_nx`, spreading the
# work over all available cores (n_jobs=-1). Later cells call
# `number_of_nodes()` on each result, so `to_nx` presumably returns a
# networkx-style graph — confirm against src.datasets.dataset_utils.
graphs = Parallel(n_jobs=-1, batch_size='auto')(
    delayed(to_nx)(s, atom_encoder, bonds) for s in tqdm(smile_list)
)


Converting smiles...


100%|██████████| 76353/76353 [00:30<00:00, 2540.61it/s]


In [8]:
# Node count of every converted graph, computed once and reused below.
node_counts = [g.number_of_nodes() for g in graphs]

# Summary statistics over the graph sizes.
max_nodes = max(node_counts)
min_nodes = min(node_counts)
avg_nodes = sum(node_counts) / len(graphs)

# Displayed as the cell result: (mean, largest, smallest) node count.
avg_nodes, max_nodes, min_nodes

(51.01616177491389, 122, 5)

In [9]:
# Histogram of graph sizes: node count -> number of graphs with that size.
count = Counter(g.number_of_nodes() for g in graphs)

# Pull the distinct sizes and their frequencies out in matching order.
n = np.array(list(count.keys()))
val = np.array(list(count.values()))

# Reorder both arrays so that the node counts are ascending.
perm = np.argsort(n)
n, val = n[perm], val[perm]

# Displayed as the cell result: (sorted node counts, fraction of graphs with
# each node count).
n, val / sum(val)

(array([  5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,
         18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,
         31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,
         44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,
         57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
         70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,
         83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
         96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
        109, 110, 111, 112, 113, 114, 115, 116, 118, 120, 121, 122]),
 array([1.30970623e-05, 1.30970623e-05, 2.61941247e-05, 2.61941247e-05,
        7.85823740e-05, 1.17873561e-04, 1.04776499e-04, 1.96455935e-04,
        2.75038309e-04, 4.06008932e-04, 5.10785431e-04, 9.03697301e-04,
        9.16794363e-04, 1.17873561e-03, 1.13944442e-03, 1.32280330e-03,
        1.67642398e-03, 2.89445077e-03, 3.45762445e-03, 4.