# CMF  
## 分子综合化特征器  
## Comprehensive Molecular Featurizer   
最初更新：2022.05.24 戴以恒  
贡献者：苏禹铭，陈杨韬，戴以恒
当前版本：5.0  


In [1]:
# Global featurizer parameters:
VERSION = 'V5.0'
import time
import gc
# Input file: should contain two columns; the first column is titled 'smiles',
# the second column (any title) holds the label values
INPUT_NAME = 'hxr_242.csv'
loffitxt='./loffi.txt'  # path of the group-definition text file that is opened and parsed further down
CONTAIN_TITLE_ROW = True
# Set to True if the first row of the file is a title row such as 'smiles', 'values'

# Log file name built from the version, the input file name and the start time.
LOG_NAME = 'CMF'+VERSION+'_'+INPUT_NAME+time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())+'.txt'
SMILES_CHECK = True                       # switch for the SMILES check; per the original note the run is meant to stop when an invalid SMILES is present
VALUES_DIV_MOLWT = False                 # switch: divide the label values by the molar mass
VALUES_LN = False                 # switch: take the logarithm of the label values
PANDAS_DATASET_GENERATE = False            # switch: generate a data set usable by Pandas
SHOWRDKITRELATION=False           # show the RDKit similarity between the groups
nonconjmolcount=True             # compute the non-conjugation descriptors
ALLPROPOUTPUT=True

In [2]:
# =====================RDKit Descriptor part=====================

# Parameters of the RDKit descriptor featurizer:
# Before use, first run the SMILES check on the SMILES strings of the data set
# (the original note breaks off here; presumably: to make sure they all parse).
RDKIT_DESC_SWITCH = True          # master switch of the RDKit descriptor featurizer
# Names of the RDKit descriptors to keep; entries of Descriptors.descList whose
# name is not in this list are ignored by the descriptor module.
allowedDescriptors = ['NOCount', 'VSA_EState4', 'NumHDonors', 'SlogP_VSA12', 'NumRadicalElectrons', 
                      'SlogP_VSA4', 'Kappa2', 'Chi2n', 'PEOE_VSA3', 'PEOE_VSA7', 'PEOE_VSA4', 'Chi1', 
                      'MolWt', 'SMR_VSA1', 'SlogP_VSA9', 'VSA_EState9', 'MaxAbsPartialCharge', 'NumSaturatedHeterocycles', 
                      'MaxPartialCharge', 'VSA_EState1', 'PEOE_VSA6', 'EState_VSA11', 'SMR_VSA4', 'EState_VSA7', 
                      'VSA_EState2', 'NHOHCount', 'SlogP_VSA10', 'SMR_VSA7', 'PEOE_VSA9', 'NumAliphaticRings', 'EState_VSA8', 
                      'PEOE_VSA5', 'BertzCT', 'SlogP_VSA1', 'SlogP_VSA6', 'PEOE_VSA1', 'VSA_EState7', 'MinAbsPartialCharge', 
                      'LabuteASA', 'SlogP_VSA2', 'EState_VSA4', 'MolMR', 'Kappa1', 'NumHAcceptors', 'EState_VSA9', 'MolLogP', 
                      'NumAromaticHeterocycles', 'BalabanJ', 'FractionCSP3', 'SMR_VSA3', 'RingCount', 'NumSaturatedRings', 
                      'PEOE_VSA2', 'MaxAbsEStateIndex', 'Kappa3', 'Chi3n', 'NumRotatableBonds', 'Chi4n', 'VSA_EState3', 
                      'SMR_VSA8', 'MinPartialCharge', 'EState_VSA6', 'SMR_VSA9', 'PEOE_VSA13', 'NumValenceElectrons', 
                      'MaxEStateIndex', 'SMR_VSA6', 'VSA_EState8', 'EState_VSA2', 'NumAromaticCarbocycles', 'SMR_VSA10', 
                      'SlogP_VSA3', 'HallKierAlpha', 'PEOE_VSA14', 'HeavyAtomCount', 'VSA_EState10', 'SlogP_VSA11', 
                      'ExactMolWt', 'MinAbsEStateIndex', 'TPSA', 'PEOE_VSA10', 'SMR_VSA2', 'Chi1v', 'Chi4v', 'PEOE_VSA8', 
                      'EState_VSA5', 'Chi1n', 'VSA_EState5', 'SlogP_VSA7', 'HeavyAtomMolWt', 'MinEStateIndex', 
                      'NumAliphaticHeterocycles', 'VSA_EState6', 'Chi0v', 'SlogP_VSA5', 'SMR_VSA5', 'Chi0', 'Chi2v', 
                      'NumSaturatedCarbocycles', 'NumAromaticRings', 'Chi0n', 'PEOE_VSA12', 'Chi3v', 'NumAliphaticCarbocycles', 
                      'EState_VSA10', 'EState_VSA3', 'EState_VSA1', 'NumHeteroatoms', 'SlogP_VSA8', 'PEOE_VSA11']

In [3]:
# =====================Conjugation Descriptor part=====================

# Parameters of the conjugation featurizer module:
CONJU_DESC_SWITCH = True                  # master switch of the conjugation descriptor featurizer
KEEP_TYPE = 'max'           # how per-structure values are reduced to one value: 'max', 'mean' or 'acc-mean'
# SMARTS of the multiple-bond groups that find_conju tries to attach to conjugated structures.
patt_list_d = ['C=C', 'C#C', 'C#N', 'C=O', 'C=S', 'C=N', 'N=N', '[N+]([O-])=O']
# SMARTS of the single heteroatoms that find_conju tries to attach to conjugated structures.
patt_list_m = ['N', 'O', 'S', 'F', 'Cl', 'Br', 'I', 'P']
# Element symbols for which find_elec_num returns 1.
one_list = ['C']
# Element symbols for which find_elec_num returns 1 or 2 depending on the hybridization.
two_list = ['B','N', 'O', 'S', 'P', 'F', 'Cl', 'Br', 'I','Si']

In [4]:
# Parameters of the additional-solvent module:
import numpy as np
import pandas as pd
SOLVENT_SWITCH =False             # switch: append solvent information
SOLVENT_IN = 'hxr_240_si_final.csv'  # NOTE(review): presumably the solvent input file; not read in the visible part - confirm


In [5]:
import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit import DataStructs
from rdkit.Chem.EState.EState import EStateIndices
import rdkit
import itertools as it

In [6]:
# Create a directory that will hold the generated data packages.
import os
from pathlib import Path
# The directory name carries the featurizer version and the creation time.
_dir_timestamp = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
DIR = 'Loffi_' + VERSION + '_' + _dir_timestamp
os.mkdir(DIR)

In [7]:
# Containers for the output data.
X_out = []        # one feature matrix is appended per featurizer module
title_out = []    # one column-title array is appended per featurizer module
smiles_out = []   # rebound to a column array of SMILES once the data set is read
values_out = []   # rebound to a column array of label values once the data set is read

In [8]:
# Read the data set: column 0 holds the SMILES strings, column 1 the label values.
# comments='!' keeps '#' (the SMILES triple-bond symbol) from starting a comment,
# and ndmin=2 keeps a file with a single data row two-dimensional so that the
# column slicing below still works.
dataset = np.loadtxt(INPUT_NAME, dtype=str, delimiter=',', comments='!',
                     skiprows=1 if CONTAIN_TITLE_ROW else 0, ndmin=2)
smiles_rd = dataset[:, 0].flatten().tolist()
values_rd = dataset[:, 1].astype(float).flatten().tolist()

# SMILES check.
if SMILES_CHECK:
    # Chem.MolFromSmiles reports an unparsable SMILES by returning None (it does
    # not raise), so every result has to be tested explicitly.
    error_list = []
    for smi in smiles_rd:
        if Chem.MolFromSmiles(smi) is None:
            print('error:', smi)
            error_list.append(smi)
    if error_list:
        # Stop here, as documented for SMILES_CHECK: the later modules cannot
        # work with molecules that failed to parse.
        raise ValueError('{} invalid SMILES in {}: {}'.format(
            len(error_list), INPUT_NAME, error_list))
# Divide the label values by the molecular weight.
if VALUES_DIV_MOLWT:
    values_rd_dwt = []
    for smile, value in zip(smiles_rd, values_rd):
        mol_wt = Descriptors.MolWt(Chem.MolFromSmiles(str(smile)))
        v = value / mol_wt
        if v == 0:
            # Diagnostic output for labels that become exactly zero.
            print(value, mol_wt)
        values_rd_dwt.append(v)
    values_dwt_out = np.array(values_rd_dwt).reshape(len(values_rd_dwt), 1)
# Column vectors (n_samples, 1) of the raw SMILES and label values.
smiles_out = np.array(smiles_rd).reshape(len(smiles_rd), 1)
values_out = np.array(values_rd).reshape(len(values_rd), 1)



In [9]:
# Read loffi.txt: every non-empty line has the form "<group name>:<SMARTS>".
with open(loffitxt) as f:
    loffi = f.readlines()
loffi_title=[]   # group names, in file order
loffi_smarts=[]  # SMARTS expressions, aligned with loffi_title
for line_no, raw_line in enumerate(loffi, start=1):
    entry = raw_line.strip()
    if not entry:
        # Tolerate blank lines (e.g. a trailing empty line at the end of the file).
        continue
    # Split at the FIRST colon only: ':' is also valid inside a SMARTS
    # (aromatic bond, atom map number) and must stay part of the pattern.
    # Stripping whitespace instead of cutting the last character keeps the final
    # SMARTS intact when the file does not end with a newline.
    name, sep, smarts = entry.partition(':')
    if not sep:
        raise ValueError('{} line {}: expected "name:SMARTS", got {!r}'.format(
            loffitxt, line_no, entry))
    loffi_title.append(name)
    loffi_smarts.append(smarts)
print(loffi_title)

['Alkene', 'Alkyne', 'Allene', 'Alkylchloride', 'Alkylfluoride', 'Alkylbromide', 'Alkyliodide', 'Alcohol', 'Dialkylether', 'Alkylarylether', 'Diarylether', 'Diarylthioether', 'Oxonium', 'Primary_aliph_amine', 'Secondary_aliph_amine', 'Tertiary_aliph_amine', 'Quaternary_aliph_ammonium', 'Primary_arom_amine', 'Secondary_arom_amine', 'Tertiary_arom_amine', 'Quaternary_arom_ammonium', 'Ammonium', 'Alkylthiol', 'Dialkylthioether', 'Alkylarylthioether', 'Disulfide', '1comma2-Aminoalcohol', '1comma2-Diol', '1comma1-Diol', 'Hydroperoxide', 'Peroxo', 'Organometallic_compounds', 'Aldehyde', 'Ketone', 'Thioaldehyde', 'Thioketone', 'Imine', 'Immonium', 'Oxime', 'Oximether', 'Acetal', 'Hemiacetal', 'Aminal', 'Hemiaminal', 'Thioacetal', 'Thiohemiacetal', 'Chloroalkene', 'Fluoroalkene', 'Bromoalkene', 'Iodoalkene', 'Enol', 'Enamine', 'Acylhalide', 'Carboxylic_acid', 'Carboxylic_ester', 'Lactone', 'Carboxylic_anhydride', 'Thioacetate', 'Ethanethioic', 'Amide', 'Lactam', 'Alpha_aminoacid', 'Alpha_hydro

In [10]:
def find_fused(aroatomrings, ring_id):
    """Return the indices of the rings fused to ring ``ring_id``.

    ``aroatomrings`` is a sequence of rings, each a sequence of atom indices.
    A ring counts as fused to the reference ring when the two rings have
    exactly two atoms in common; the result lists such ring indices in
    ascending order.
    """
    return [
        idx
        for idx, ring in enumerate(aroatomrings)
        if len(set(ring).intersection(aroatomrings[ring_id])) == 2
    ]

In [11]:
# Aromatic-ring descriptors and functional-group counts for every molecule.
loffi_list_aro=[] # fragment SMILES of every fused aromatic system found, accumulated over all molecules
loffi_m_aro=[]  # one dict per molecule: fragment SMILES -> number of occurrences in that molecule
loffi_m_fun=[] # one list per molecule: number of matches of every predefined functional-group SMARTS
# Compile the functional-group SMARTS once instead of once per molecule.
loffi_patts = [Chem.MolFromSmarts(smarts) for smarts in loffi_smarts]
for smi in smiles_rd:
    mol = Chem.MolFromSmiles(smi)
    print(smi)
    if mol is None:
        # MolFromSmiles returns None for an unparsable SMILES; fail with a clear message.
        raise ValueError('Invalid SMILES: {!r}'.format(smi))
    loffi_dict_aro = {}
    # Keep only the rings whose atoms are ALL aromatic.  Testing just the first
    # atom would also keep a saturated ring fused to an aromatic one whenever
    # its first listed atom happens to be one of the shared aromatic atoms.
    aroatomrings = [
        ring for ring in mol.GetRingInfo().AtomRings()
        if all(mol.GetAtomWithIdx(idx).GetIsAromatic() for idx in ring)
    ]
    # Fused-ring detection: rings connected through find_fused() get the same
    # label (1, 2, ...); label 0 means "not visited yet".
    labels = np.zeros(len(aroatomrings))
    num_fused_ring = 1
    for ring_id in range(len(aroatomrings)):
        if labels[ring_id] != 0:
            continue
        labels[ring_id] = num_fused_ring
        pending = [ring_id]
        while pending:
            current_ring = pending.pop()
            for neighbour in find_fused(aroatomrings, current_ring):
                if labels[neighbour] == 0:
                    labels[neighbour] = num_fused_ring
                    pending.append(neighbour)
        num_fused_ring += 1

    # Atom indices of every fused aromatic system, one list per label.
    fuseringlist = []
    for label in range(1, num_fused_ring):
        system_atoms = set()
        for j in np.where(labels == label)[0]:
            system_atoms.update(aroatomrings[j])
        fuseringlist.append(sorted(int(idx) for idx in system_atoms))
    # Name every fused system by its fragment SMILES and count it.
    for hit_at in fuseringlist:
        tempmol = Chem.MolFragmentToSmiles(mol, atomsToUse=tuple(hit_at))
        loffi_list_aro.append(tempmol)
        loffi_dict_aro[tempmol] = loffi_dict_aro.get(tempmol, 0) + 1
    # The dict stays empty for a molecule without aromatic rings.
    loffi_m_aro.append(loffi_dict_aro)

    # SMARTS query of the functional groups.
    used_smarts = [len(mol.GetSubstructMatches(patt)) for patt in loffi_patts]
    loffi_m_fun.append(used_smarts)
# Summary: aromatic-system columns first, then the functional-group columns.
# Sorting makes the column order reproducible between runs (plain set iteration
# order of strings changes with hash randomisation).
loffi_title_aro = sorted(set(loffi_list_aro))
title = loffi_title_aro.copy()
title.extend(loffi_title)


O=C(C1=C(C(O)=O)C=C(C(O)=O)C(C(O)=O)=C1)O
O=C(C1=C(C(O)=O)C=C(C(O)=O)C(C(O)=O)=C1)O
O=C(C1=CC=C(C(O)=O)C(C(O)=O)=C1)O
O=C(C1=CC(C(O)=O)=CC(C(O)=O)=C1)O
O=C(C1=CC=CC(C(O)=O)=C1C(O)=O)O
O=C(C1=C(C(O)=O)C=CC=N1)O
O=C(C1=CC=C(C(O)=O)C=C1)O
CS
Cl[Fe]123<-N4=C5C=CC4=C(c4ccccc4)c4ccc(n41)C(c1ccccc1)=C1C=CC(=N->21)C(c1ccccc1)=c1ccc(n13)=C5c1ccccc1
C1=CC2=N3->[Ni]45<-N6=C(C=CC6=C(c6ccccc6)c6ccc(n64)C(c4ccccc4)=C13)C(c1ccccc1)=c1ccc(n15)=C2c1ccccc1
C1(/C(C2=CC=CC=C2)=C(C=C/3)\NC3=C4\C5=CC=CC=C5)=N/C(C=C1)=C(C6=CC=CC=C6)\C7=CC=C(/C(C8=CC=CC=C8)=C9C=CC4=N/9)N7
Fc1c(F)c(F)c(C2=C3C=CC4=N3->[Fe]35(Cl)<-N6=C(C=CC6=C(c6c(F)c(F)c(F)c(F)c6F)c6ccc2n63)C(c2c(F)c(F)c(F)c(F)c2F)=c2ccc(n25)=C4c2c(F)c(F)c(F)c(F)c2F)c(F)c1F
COc1ccc(C2=C3C=CC4=N3->[Fe]35(Cl)<-N6=C(C=CC6=C(c6ccc(OC)cc6)c6ccc2n63)C(c2ccc(OC)cc2)=c2ccc(n25)=C4c2ccc(OC)cc2)cc1
C1=CC2=N3->[Co]45<-N6=C(C=CC6=C(c6ccccc6)c6ccc(n64)C(c4ccccc4)=C13)C(c1ccccc1)=c1ccc(n15)=C2c1ccccc1
CC1=CC=C(N2C=CC(C3C=CN(C4=CC=C(C)C=C4)C=C3)C=C2)C=C1.CC1=CC=C(N2C(C3C=CC=C



OC(=O)c1ccncc1
O=C(C1=CC=NC=C1)O
N[C@@H](CS)C(O)=O
C1(C2=NC=CC=C2)=CC=CC=N1
c1ccc(cc1)c2ccccn2
Cc1cc(CO)c(O)c(CO)c1
C1(C2=NC(C3=NC=CC=C3)=CC=C2)=CC=CC=N1
OC(=O)c1ccccc1O
OC(=O)C#CC(O)=O
Nc1cccnc1
NCCCC(O)=O
Oc1ccccc1O
OCC(CO)(CO)NCC(O)=O
NC1=CC(C2=CC=C(N)C(N)=C2)=CC=C1N
Nc1ncnc2nc[nH]c12
OC(=O)/C=C/C(O)=O
OC(=O)c1cccc(c1)C(O)=O
O=C1C(O)=C(O)[C@@]([H])([C@@H](O)CO)O1
CC(C)(CO)C(O)=O
Nc1cc(ccc1C(O)=O)C(O)=O
c1cc(ccn1)c2ccncc2
CC(N)=S
OC1C(O)C(O)C(O)C(O)C1O
CC(=O)N[C@H]1C(O)O[C@H](CO)[C@@H](O)[C@@H]1O
O[C@H]1CN[C@H](C1)C(O)=O
NC1(CCCCC1)C(O)=O
O[P](O)(=O)c1ccncc1
OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O
OCC(CO)(CO)CO
OCC(O)=O
CC(O)C(O)=O
Oc1ccncc1
OCc1ccccc1
O=S(C1=CC(P(C2=CC=CC(S(=O)([O-])=O)=C2)C3=CC=CC(S(=O)([O-])=O)=C3)=CC=C1)([O-])=O
ON1C(=O)c2ccccc2C1=O
O[P](=O)(c1ccccc1)c2ccccc2
COC(=O)c1cc(OC)ccn1
COc1ccnc(c1)c2cc(OC)ccn2
Oc1cccc[n+]1[O-]
C#CCCCCO
OCc1occc1
COC(=O)c1ccc(nc1)C(O)=O
CC(N)CO
O[C@@H]([C@H](O)C(O)=O)C(O)=O
NCCCC[C@H](N)C(O)=O
N[C@@H](CCCNC(N)=N)C(O)=O
Oc1ccc(O)cc1
Nc1ccc

In [12]:
# Count matrix: one row per molecule, one column per entry of ``title``
# (aromatic-system columns first, functional-group columns after them).
data = np.zeros((len(smiles_rd), len(title)))
title = np.array(title)
n_aro_cols = len(loffi_title_aro)
for i in range(len(smiles_rd)):
    # Aromatic systems: write each count into the column(s) carrying its SMILES.
    for frag_smiles, count in loffi_m_aro[i].items():
        data[i, np.where(title == frag_smiles)[0]] = count
    # Functional groups: the remaining columns follow the order of loffi_title.
    data[i, n_aro_cols:] = loffi_m_fun[i]

In [13]:
# Feature correlation based on RDKit fingerprints.
# realtitle lists the pattern of every feature column: the aromatic-system
# fragment SMILES followed by the functional-group SMARTS.
realtitle=loffi_title_aro.copy()
realtitle.extend(loffi_smarts)
if SHOWRDKITRELATION==True:
    print(loffi_title_aro)
    # Fingerprint every pattern once; the pairwise loop below only compares.
    title_fps = [Chem.RDKFingerprint(Chem.MolFromSmarts(patt)) for patt in realtitle]
    n_title = len(realtitle)
    # Dice similarity between every pair of patterns; the diagonal stays 0.
    title_mat = np.zeros((n_title, n_title))
    for i in range(n_title):
        for j in range(i + 1, n_title):
            sm = DataStructs.FingerprintSimilarity(title_fps[i], title_fps[j], metric=DataStructs.DiceSimilarity)
            # The similarity is symmetric, so fill both triangles at once.
            title_mat[i][j] = sm
            title_mat[j][i] = sm
    # Heat map of the similarity matrix.
    plt.figure(figsize=(5,5),dpi=100)
    plt.imshow(title_mat)
    # Histogram of all matrix entries.
    plt.figure(figsize=(5,5),dpi=100)
    title_matfla=title_mat.flatten()
    plt.hist(title_matfla)

In [14]:
# Collect the loffi data set: feature matrix X, labels y and column titles.
X, y = data, values_rd
print(X.shape)
print(len(title))
# Titles become a column vector with one row per feature column of X.
title = np.array(title).reshape(X.shape[1], 1)
X_out.append(X)
title_out.append(title)

(249, 140)
140


In [15]:
# RDKit descriptor featurizer module
if RDKIT_DESC_SWITCH:
    # Keep the (name, function) pairs of Descriptors.descList whose name is allowed.
    descList = [
        (desc_name, desc_fn)
        for desc_name, desc_fn in Descriptors.descList
        if desc_name in allowedDescriptors
    ]
    descriptors = [desc_name for desc_name, desc_fn in descList]
    title_rd = np.array(descriptors).reshape(len(descriptors), 1)

    def _featurize(mol):
        """Return the values of all descriptors in ``descList`` for ``mol``."""
        return [function(mol) for desc_name, function in descList]

    # One row of descriptor values per input SMILES.
    Mol = np.array([_featurize(Chem.MolFromSmiles(smi_str)) for smi_str in smiles_rd])
    print(Mol.shape)
    X_out.append(Mol)
    title_out.append(title_rd)



(249, 110)


In [16]:
# Initialise the atomic-radius and bond-length tables used for the VSA calculation:
if CONJU_DESC_SWITCH:
    # Radii, indexed by atomic number; elements that are not listed stay 0.
    ri_m = np.zeros((54, 1))
    for atomic_num, radius in (
        (6, 1.950),    # C
        (7, 1.950),    # N
        (8, 1.779),    # O
        (9, 1.496),    # F
        (15, 2.287),   # P
        (16, 2.185),   # S
        (17, 2.044),   # Cl
        (35, 2.166),   # Br
        (53, 2.358),   # I
    ):
        ri_m[atomic_num, 0] = radius

    # Bond lengths, indexed by the two atomic numbers and stored symmetrically;
    # pairs that are not listed stay 0.
    dij_m = np.zeros((54, 54))
    for z_a, z_b, bond_length in (
        (6, 35, 1.970),   # C-Br
        (7, 35, 1.840),   # N-Br
        (6, 6, 1.540),    # C-C
        (7, 7, 1.450),    # N-N
        (8, 8, 1.470),    # O-O
        (6, 17, 1.800),   # C-Cl
        (6, 9, 1.350),    # C-F
        (6, 53, 2.120),   # C-I
        (6, 7, 1.470),    # C-N
        (6, 8, 1.430),    # C-O
        (6, 15, 1.850),   # C-P
        (6, 16, 1.810),   # C-S
        (7, 8, 1.460),    # N-O
        (7, 15, 1.600),   # N-P
        (7, 16, 1.760),   # N-S
        (8, 15, 1.570),   # O-P
        (8, 16, 1.570),   # O-S
    ):
        dij_m[z_a, z_b] = bond_length
        dij_m[z_b, z_a] = bond_length

In [17]:
def find_conju(mol, a_m, d_m):
    """Group the atoms of ``mol`` into conjugated structures.

    The search runs in five stages:

    1. Starting from every aromatic carbon (SMARTS ``c``), walk along bonds
       to ring atoms that are not SP3-hybridised and collect them.
    2. For every match of a pattern in ``patt_list_d`` that does not touch an
       already collected atom, add its first two atoms to the neighbouring
       structure(s); several neighbouring structures are merged, and a match
       without a neighbouring structure starts a new one.
    3. For every not yet collected single atom matching ``patt_list_m``, add
       it to the structure(s) it is bonded to (nothing happens if it is not
       bonded to any structure).
    4. Do the same for every remaining SP2 carbon that is bonded to an
       already collected atom.
    5. Merge structures that are connected by at least one bond.

    Parameters:
        mol: RDKit molecule.
        a_m: adjacency matrix of ``mol`` (entry 1 for bonded atom pairs).
        d_m: distance matrix of ``mol``; accepted but not used here.

    Returns:
        ``(ring_list, f_a)`` where ``ring_list`` holds one sorted list of
        unique atom indices per conjugated structure and ``f_a`` lists the
        indices of all atoms that were assigned to a structure.
    """
    global patt_list_d, patt_list_m
    ring_list = []
    f_a = []
    patt = Chem.MolFromSmarts('c')
    atomids = mol.GetSubstructMatches(patt)
    atoms = mol.GetAtoms()
    temp_list = []
    
    def find_ring(atom_id, found_atoms):
        # Depth-first walk from atom_id over bonded, non-SP3 ring atoms.  Every
        # dead end stores the shared ``found_atoms`` list in ``temp_list``.
        nonlocal a_m, ring_list, f_a, atoms, temp_list
        flag = False
        c_list = np.argwhere(a_m[atom_id] == 1).flatten().tolist()
        for atom in c_list:
            if atom not in f_a:
                a = atoms[atom]
                if a.IsInRing() and str(a.GetHybridization())!='SP3':
                    found_atoms.append(atom)
                    f_a.append(atom)
                    find_ring(atom, found_atoms)
                    flag = True
        if not flag:
            temp_list.append(found_atoms)
    
    # Stage 1: ring systems reachable from the aromatic carbons.
    for atom in atomids:
        a = atom[0]
        if a not in f_a:
            find_ring(a, [])
        if len(temp_list)>0:
            # Keep the longest atom list recorded during this walk.
            max_ring = temp_list[0]
            for l in temp_list:
                if len(l)>len(max_ring):
                    max_ring = l
            ring_list.append(max_ring)
        temp_list = []
    # Stage 2: double/triple-bond groups.
    for patt in patt_list_d:
        f = Chem.MolFromSmarts(patt)
        atomids = mol.GetSubstructMatches(f)
        if len(atomids)>0:
            for pair in atomids:
                n_l = []
                flag_f = False
                for a in pair:
                    if a in f_a:
                        # The match touches an already collected atom: skip it.
                        flag_f = True
                        break
                    neighbors = atoms[a].GetNeighbors()
                    for na in neighbors:
                        n_l.append(na.GetIdx())
                if flag_f:
                    continue
                temp = []
                temp_r_id = []
                # At this point n_l holds the neighbours of the double/triple-bond atoms.
                for n in n_l:
                    if atoms[n].GetAtomicNum() in [6, 7, 8]:
                        for i in range(len(ring_list)):
                            ring = ring_list[i]
                            if n in ring:
                                temp.append(ring)
                                temp_r_id.append(i)
                if len(temp)==1:
                    ring_list[temp_r_id[0]].append(pair[0])
                    ring_list[temp_r_id[0]].append(pair[1])
                    f_a.append(pair[0])
                    f_a.append(pair[1])
                else:
                    # Merge the several lists.
                    t_r = []
                    for r in temp:
                        t_r += r
                    # Add both ends of the double/triple bond.
                    t_r.append(pair[0])
                    t_r.append(pair[1])
                    # Remove the original structures (highest index first so the
                    # remaining indices stay valid).
                    temp_r_id.sort()
                    temp_r_id = np.unique(temp_r_id)
                    for i in reversed(temp_r_id):
                        del ring_list[i]
                    # Add the new structure.
                    ring_list.append(t_r)
                    f_a.append(pair[0])
                    f_a.append(pair[1])
    # Stage 3: single heteroatoms.
    for patt in patt_list_m:
        f = Chem.MolFromSmarts(patt)
        atomids = mol.GetSubstructMatches(f)
        if len(atomids)>0:
            for atom in atomids:
                a = atom[0]
                if a not in f_a:
                    neighbors = atoms[a].GetNeighbors()
                    n_l = []
                    for na in neighbors:
                        n_l.append(na.GetIdx())
                    temp = []
                    temp_r_id = []
                    # At this point n_l holds the neighbours of the heteroatom.
                    for n in n_l:
                        for i in range(len(ring_list)):
                            ring = ring_list[i]
                            if (n in ring) and (i not in temp_r_id):
                                temp.append(ring)
                                temp_r_id.append(i)
                    if len(temp)==1:
                        ring_list[temp_r_id[0]].append(a)
                        f_a.append(a)
                    else:
                        # Merge the several lists.
                        t_r = []
                        for r in temp:
                            t_r += r
                        # Add the heteroatom.
                        t_r.append(a)
                        # Remove the original structures.
                        temp_r_id.sort()
                        for i in reversed(temp_r_id):
                            del ring_list[i]
                        # Add the new structure (skipped when the atom has no
                        # neighbouring structure, i.e. t_r holds only the atom).
                        if len(t_r)>1:
                            ring_list.append(t_r)
                            f_a.append(a)
    # Stage 4: remaining SP2 carbons bonded to an already collected atom.
    for i in range(len(atoms)):
        if i not in f_a:
            aa = atoms[i]
            if aa.GetSymbol()!='C' or str(aa.GetHybridization())!='SP2':
                continue
            aa_n = aa.GetNeighbors()
            flag = False
            for aaa in aa_n:
                if aaa.GetIdx() in f_a:
                    flag = True
                    break
            if flag:
                a = i
                neighbors = atoms[a].GetNeighbors()
                n_l = []
                for na in neighbors:
                    n_l.append(na.GetIdx())
                temp = []
                temp_r_id = []
                # At this point n_l holds the neighbours of the carbon atom.
                for n in n_l:
                    for i in range(len(ring_list)):
                        ring = ring_list[i]
                        if (n in ring) and (i not in temp_r_id):
                            temp.append(ring)
                            temp_r_id.append(i)
                if len(temp)==1:
                    ring_list[temp_r_id[0]].append(a)
                    f_a.append(a)
                else:
                    # Merge the several lists.
                    t_r = []
                    for r in temp:
                        t_r += r
                    # Add the carbon atom.
                    t_r.append(a)
                    # Remove the original structures.
                    temp_r_id.sort()
                    for i in reversed(temp_r_id):
                        del ring_list[i]
                    # Add the new structure.
                    if len(t_r)>1:
                        ring_list.append(t_r)
                        f_a.append(a)
    # Stage 5: finally check whether conjugated structures are connected to each other.
    if len(ring_list)>1:
        flag = True
        while flag:
            # Number of structure pairs; the scan is finished once every pair
            # has been found to be unconnected.
            t_temp = int(len(ring_list)*(len(ring_list)-1)/2)
            temp = 0
            break_flag = False
            for i in range(len(ring_list)-1):
                for j in range(len(ring_list)-i-1):
                    r_1 = ring_list[i]
                    r_2 = ring_list[i+j+1]
                    if np.sum(a_m[r_1, :][:, r_2]) == 0:
                        temp += 1
                    else:
                        # The two structures share a bond and must be merged.
                        for k in r_2:
                            ring_list[i].append(k)
                        # Keep the merged entry a plain list: an ndarray here
                        # would make the next merge into the same entry fail
                        # on ``.append``.
                        ring_list[i] = np.unique(ring_list[i]).tolist()
                        del ring_list[i+j+1]
                        # Indices changed: restart the scan from the beginning.
                        break_flag = True
                        break
                if break_flag:
                    break
            if temp == t_temp:
                flag = False
    # Return every structure as a sorted list of unique atom indices.
    for i in range(len(ring_list)):
        ring_list[i] = np.unique(ring_list[i]).flatten().tolist()
    return (ring_list, f_a)

In [18]:
def find_elec_num(kind, hyb):
    """Return the electron count (1 or 2) assigned to an atom.

    ``kind`` is the element symbol and ``hyb`` the hybridization name.
    Symbols in ``one_list`` give 1.  Symbols in ``two_list`` give 1 when the
    atom is SP-hybridised N/P/O or SP2-hybridised N/O/S, and 2 otherwise.
    Symbols in neither list give ``None``.
    """
    global one_list, two_list
    if kind in one_list:
        return 1
    if kind not in two_list:
        return None
    sp_single = hyb == 'SP' and kind in ['N', 'P', 'O']
    sp2_single = hyb == 'SP2' and kind in ['N', 'O', 'S']
    return 1 if (sp_single or sp2_single) else 2

In [19]:
def find_acc_mean(l_in, conju_size_list):
    """Return the mean of ``l_in`` weighted by ``conju_size_list``.

    Every value ``l_in[i]`` is weighted with ``conju_size_list[i]`` and the
    weighted sum is divided by the sum of all weights.
    """
    weighted_total = 0
    for position, value in enumerate(l_in):
        weighted_total += value * conju_size_list[position]
    return weighted_total / sum(conju_size_list)

In [20]:
# Conjugation descriptor featurizer module.
# For every molecule the conjugated structures are located with find_conju();
# per-structure properties are computed and reduced to one value per feature
# according to KEEP_TYPE.  The result is data_conju (n_samples x CONJU_NUM)
# with the column names in CONJU_TITLE.
if CONJU_DESC_SWITCH:
    if KEEP_TYPE not in ('max', 'mean', 'acc-mean'):
        # An unknown mode would otherwise leave the per-structure columns at 0 silently.
        raise ValueError("KEEP_TYPE must be 'max', 'mean' or 'acc-mean', got {!r}".format(KEEP_TYPE))
    CONJU_TITLE = []
    CONJU_PRE = [ 'PEOE-Charge', 'Atomic-LogP', 'Atomic-MR']
    # 5 whole-molecule features + 7 per-structure features + 8 per atomic property.
    CONJU_NUM = 5 + 7 + 8 * len(CONJU_PRE)
    data_conju = np.zeros((len(smiles_rd), CONJU_NUM))
    descList = []
    # NOTE: this rebinds the module-level allowedDescriptors/descList that the
    # RDKit descriptor module also uses; re-running that module afterwards
    # will only see 'MolWt'.
    allowedDescriptors = ['MolWt']
    for descriptor, function in Descriptors.descList:
        if descriptor in allowedDescriptors:
            descList.append((descriptor, function))
    mff_title = realtitle
    # Compile the fragment patterns once; they are the same for every molecule.
    mff_patts = [Chem.MolFromSmarts(patt) for patt in mff_title]
    for _ in range(len(smiles_rd)):
        smi = smiles_rd[_]
        print(smi)
        # Round trip through canonical SMILES, then add explicit hydrogens.
        mol = Chem.MolFromSmiles(Chem.MolToSmiles(Chem.MolFromSmiles(smi)))
        mol=Chem.AddHs(mol)
        atoms = mol.GetAtoms()
        a_m = Chem.rdmolops.GetAdjacencyMatrix(mol)
        d_m = Chem.rdmolops.GetDistanceMatrix(mol)
        res = find_conju(mol, a_m, d_m)
        ring_list = res[0]
        # lstmax: the conjugated structure with the most atoms ([] if there is none).
        if len(ring_list)==0:
            lstmax=[]
        else:
            lstmax = sorted(ring_list, key = len)[-1]
        f_a = res[1]
        conju_size_list = [len(r) for r in ring_list]
        # Fragment matches depend only on the molecule, so they are computed
        # once here instead of once per property and per structure.  They are
        # only needed when the molecule has conjugated structures.
        mol_matches = [mol.GetSubstructMatches(f) for f in mff_patts] if ring_list else []
        ## Atom-level descriptor lists:
        # Apparent conjugation electron count (computed but not used by any feature below).
        app_elec = []
        for a in range(len(atoms)):
            if a in f_a:
                a_kind = atoms[a].GetSymbol()
                hyb = str(atoms[a].GetHybridization())
                app_elec.append(find_elec_num(a_kind, hyb))
            else:
                app_elec.append(0)
        # PEOE (Gasteiger) charges.
        AllChem.ComputeGasteigerCharges(mol, nIter=25)
        peoe_charge = [mol.GetAtomWithIdx(i).GetDoubleProp('_GasteigerCharge') for i in range(mol.GetNumAtoms())]
        # Atomic contributions to LogP and MR.
        contribs = rdMolDescriptors._CalcCrippenContribs(mol)
        logp = [contribs[i][0] for i in range(len(contribs))]
        mr = [contribs[i][1] for i in range(len(contribs))]
        # Combine the atom-level descriptors (same order as CONJU_PRE).
        atom_props = [ peoe_charge,logp, mr]
        
        ## Whole-molecule conjugation features:
        if _ == 0:
            CONJU_TITLE.append('Num of Conju-Stru (MFF-Conju)')  # 1.
            CONJU_TITLE.append('Num of Conju-All-Atoms (MFF-Conju)')  # 2.
            CONJU_TITLE.append('Atom Num Conju-All Ratio (MFF-Conju)')  # 3.
            CONJU_TITLE.append('AtomWt Conju-All Ratio (MFF-Conju)')  # 4.
            CONJU_TITLE.append('Full-Mol Wiener Index (MFF-Conju)')  # 5.
            CONJU_TITLE.append('Individual Conju-Atom Number (MFF-Conju)')  # 6.
            CONJU_TITLE.append('Conju-Part-Wt (MFF-Conju)')  # 7.
            CONJU_TITLE.append('Conju-AtomicWt (MFF-Conju)')  # 8.
            CONJU_TITLE.append('Max Conju-Distance (MFF-Conju)')  # 9.
            CONJU_TITLE.append('Conju-Branch Index (MFF-Conju)')  # 10.
            CONJU_TITLE.append('Conju-Stru Wiener Index (MFF-Conju)')  # 11.
            CONJU_TITLE.append('Conju-Stru-VSA (MFF-Conju)')  # 12.
        # 1. Number of conjugated structures.
        data_conju[_, 0] = len(ring_list)
        # 2. Total number of conjugated atoms.
        data_conju[_, 1] = len(f_a)
        # 3. Fraction of the atoms that are conjugated.
        data_conju[_, 2] = len(f_a)/len(atoms)
        # 4. Mass fraction of the conjugated structures (relative to MolWt).
        rval = []
        for desc_name, function in descList:
            rval.append(function(mol))
        wt_list = []   # mass of every structure
        mwt_list = []  # mean atomic mass of every structure
        for r in ring_list:
            tt = 0
            for a in r:
                tt += atoms[a].GetMass()
            wt_list.append(tt)
            mwt_list.append(tt/len(r))
        data_conju[_, 3] = sum(wt_list)/rval[0]
        # 5. Whole-molecule Wiener-type index: sum of the distance matrix
        #    divided by 2*n*(n-1).  For a multi-component SMILES only the
        #    component with the longest SMILES text is used (without added Hs).
        if '.' in smi:
            smi22 = max(smi.split('.'), key=len)
            mol22 = Chem.MolFromSmiles(smi22)
            dm22 = Chem.rdmolops.GetDistanceMatrix(mol22)
            data_conju[_, 4] = np.sum(dm22)/(2*dm22.shape[0]*(dm22.shape[0]-1))
        else:
            data_conju[_, 4] = np.sum(d_m)/(2*d_m.shape[0]*(d_m.shape[0]-1))
        
        ## Per-structure features (one value per conjugated structure)
        conju_props = []
        # 6. Number of atoms of every structure.
        size_l = []
        for r in ring_list:
            size_l.append(len(r))
        conju_props.append(size_l)
        
        # 7. Mass of every structure.
        # 8. Mean atomic mass of every structure, shifted by -12.
        mwt_list=np.array(mwt_list)
        mwt_list=mwt_list-12
        conju_props.append(wt_list)
        conju_props.append(mwt_list)
        # 9. Length of every structure: largest topological distance inside it.
        conju_max_dis = []
        for r in ring_list:
            conju_max_dis.append(np.max(d_m[r, :][:, r]))
        conju_props.append(conju_max_dis)
        
        # 10. Branch index of the structures.
        if len(lstmax)==0:
            conjuratio=0
            conju_props.append(conjuratio)
        else:
            # NOTE(review): branch_l is computed but never used; the stored
            # value is conjuratio below.  Confirm which one is intended.
            branch_l = []
            for i in range(len(wt_list)):
                branch_l.append(np.sum(a_m[ring_list[i], :][:, ring_list[i]])/(2*size_l[i]))
            # Distance sub-matrix of the largest structure.
            d_mring=d_m[lstmax, :][:,lstmax]
            # Row positions at which the maximum distance occurs; the first
            # half of them is used and its index spread is divided by the
            # length of every structure.
            dlist=np.where(d_mring==np.max(d_mring))
            l_dlist=len(dlist[0])/2
            ltemp=dlist[0][0:round(l_dlist)]
            distemp=np.max(ltemp)-np.min(ltemp)
            conjuratio=distemp/conju_max_dis
            conju_props.append(conjuratio)
        # 11. Wiener-type index of every structure.
        wi_l = []
        for r in ring_list:
            d_m_temp = d_m[r, :][:, r]
            wi_l.append(np.sum(d_m_temp)/(2*d_m_temp.shape[0]*(d_m_temp.shape[0]-1)))
        conju_props.append(wi_l)        
        # 12. VSA of every structure, from the radius table ri_m and the
        #     bond-length table dij_m.
        conju_vsa_l = []
        for r in ring_list:
            vsa_t = 0
            for i in range(len(r)):
                vsa_tt = 0
                atom = atoms[r[i]]
                n_l = atom.GetNeighbors()
                aid_1 = atom.GetAtomicNum()
                ar_1 = ri_m[aid_1, 0]
                for j in range(len(n_l)):
                    aid_2 = n_l[j].GetAtomicNum()
                    ar_2 = ri_m[aid_2, 0]
                    dij_i = dij_m[aid_1, aid_2]
                    # Clamp the tabulated bond length to [|r1-r2|, r1+r2].
                    dij = min(max(abs(ar_1-ar_2), dij_i), ar_1+ar_2)
                    vsa_tt += (ar_2**2-(ar_1-dij)**2)/dij
                vsa_t += 4*np.pi*ar_1**2 - np.pi*ar_1*vsa_tt
            conju_vsa_l.append(vsa_t)
        conju_props.append(conju_vsa_l)
        outprop=[]
        # Next: features derived from the atom-level descriptors.
        for __ in range(len(atom_props)):
            PRE = 'Conju-'+CONJU_PRE[__]+'-'
            END_P = ' (MFF-Conju)'
            atom_props_list = atom_props[__]
            if _ == 0:
                CONJU_TITLE.append(PRE+'Sum'+END_P)  # 13.1.
                CONJU_TITLE.append(PRE+'AtomicMean'+END_P)  # 13.2.
                CONJU_TITLE.append(PRE+'Maximum'+END_P)  # 13.3.
                CONJU_TITLE.append(PRE+'Minimum'+END_P)  # 13.4.
                CONJU_TITLE.append(PRE+'Delta'+END_P)  # 13.5. 
                CONJU_TITLE.append(PRE+'STD'+END_P)  # 13.6.
                CONJU_TITLE.append(PRE+'MaxMinDisRatio'+END_P)  # 13.7.
                CONJU_TITLE.append(PRE+'CONJUMAX'+END_P)  # 13.8.
            # 13.1. Sum over the atoms of the structure (fragments not involved).
            x_count_l = []
            # 13.2. Per-atom mean over the structure (fragments not involved).
            x_atom_mean_l = []
            # 13.3. Maximum over fragments and single atoms.
            x_max_l = []
            # 13.4. Minimum over fragments and single atoms.
            x_min_l = []
            # 13.5. Difference between that maximum and minimum.
            x_delta_l = []
            # 13.6. Standard deviation over the atoms of the structure.
            x_std_l = []
            # 13.7. Distance between the extreme fragments relative to the structure length.
            x_dis_ratio_l = []
            # 13.8. Property summed over the whole structure.
            conjfrag_x=[]

            # Compute the properties of every structure.
            for r in ring_list:
                a_p_l = [atom_props_list[a] for a in r]
                x_count_l.append(sum(a_p_l))  # 13.1.
                x_atom_mean_l.append(sum(a_p_l)/len(r))  # 13.2.
                x_std_l.append(np.std(a_p_l))
                # Build the fragment property list: every pattern match that
                # lies completely inside the structure, extended by the
                # hydrogens attached to it.
                frag_x = []
                frag_atom_id = []
                for matches in mol_matches:
                    for match in matches:
                        if any(k not in r for k in match):
                            continue
                        frag_atoms = list(match)
                        for kk in match:
                            for nb in mol.GetAtomWithIdx(kk).GetNeighbors():
                                nb_idx = nb.GetIdx()
                                # Compare atom INDICES: testing the Atom object
                                # against a list of ints is always "not in".
                                if nb.GetSymbol() == 'H' and nb_idx not in frag_atoms:
                                    frag_atoms.append(nb_idx)
                        frag_atom_id.append(frag_atoms)
                        x_temp = 0
                        for k in frag_atoms:
                            x_temp += atom_props_list[k]
                        frag_x.append(x_temp)
                # Every atom of the structure also counts as a one-atom fragment.
                for atom_id in r:
                    frag_x.append(atom_props_list[atom_id])
                    frag_atom_id.append([atom_id])
                x_max_l.append(max(frag_x))  # 13.3.
                x_min_l.append(min(frag_x))  # 13.4.
                x_delta_l.append(max(frag_x) - min(frag_x))  # 13.5.
                # Largest distance between the minimum and the maximum
                # fragment, relative to the length of the structure.
                f_1 = frag_atom_id[frag_x.index(min(frag_x))]
                f_2 = frag_atom_id[frag_x.index(max(frag_x))]
                s = np.max(d_m[f_1, :][:, f_2])/np.max(d_m[r, :][:, r])
                x_dis_ratio_l.append(s)  # 13.7.

            conju_props.append(x_count_l)
            conju_props.append(x_atom_mean_l)
            conju_props.append(x_max_l)
            conju_props.append(x_min_l)
            conju_props.append(x_delta_l)
            conju_props.append(x_std_l)
            conju_props.append(x_dis_ratio_l)

            # 13.8. Property summed over all atoms of every structure.
            for r in ring_list:
                x_temp = 0
                for k in r:
                    x_temp += atom_props_list[k]
                conjfrag_x.append(x_temp)
            conju_props.append(conjfrag_x)

        # Fill in the features: columns 5.. hold the per-structure properties.
        for i in range(len(conju_props)):
            index = 5 + i
            if len(conju_size_list)==0:
                # The molecule has no conjugated part at all: all these descriptors are 0.
                data_conju[_, index]=0
            else:
                if KEEP_TYPE == 'max':
                    # Value of the structure with the most atoms.
                    data_conju[_, index] = conju_props[i][conju_size_list.index(max(conju_size_list))]
                elif KEEP_TYPE == 'mean':
                    # Plain mean over the structures (np.mean: a bare ``mean``
                    # is not defined anywhere).
                    data_conju[_, index] = np.mean(conju_props[i])
                elif KEEP_TYPE == 'acc-mean':
                    # Mean over the structures weighted by their atom counts.
                    temp = 0
                    for j in range(len(conju_size_list)):
                        temp += conju_size_list[j] * conju_props[i][j]
                    data_conju[_, index] = temp / sum(conju_size_list)
    X_out.append(data_conju)
    title_out.append(np.array(CONJU_TITLE).reshape(len(CONJU_TITLE), 1))

O=C(C1=C(C(O)=O)C=C(C(O)=O)C(C(O)=O)=C1)O
O=C(C1=C(C(O)=O)C=C(C(O)=O)C(C(O)=O)=C1)O
O=C(C1=CC=C(C(O)=O)C(C(O)=O)=C1)O
O=C(C1=CC(C(O)=O)=CC(C(O)=O)=C1)O
O=C(C1=CC=CC(C(O)=O)=C1C(O)=O)O
O=C(C1=C(C(O)=O)C=CC=N1)O
O=C(C1=CC=C(C(O)=O)C=C1)O
CS
Cl[Fe]123<-N4=C5C=CC4=C(c4ccccc4)c4ccc(n41)C(c1ccccc1)=C1C=CC(=N->21)C(c1ccccc1)=c1ccc(n13)=C5c1ccccc1
C1=CC2=N3->[Ni]45<-N6=C(C=CC6=C(c6ccccc6)c6ccc(n64)C(c4ccccc4)=C13)C(c1ccccc1)=c1ccc(n15)=C2c1ccccc1
C1(/C(C2=CC=CC=C2)=C(C=C/3)\NC3=C4\C5=CC=CC=C5)=N/C(C=C1)=C(C6=CC=CC=C6)\C7=CC=C(/C(C8=CC=CC=C8)=C9C=CC4=N/9)N7
Fc1c(F)c(F)c(C2=C3C=CC4=N3->[Fe]35(Cl)<-N6=C(C=CC6=C(c6c(F)c(F)c(F)c(F)c6F)c6ccc2n63)C(c2c(F)c(F)c(F)c(F)c2F)=c2ccc(n25)=C4c2c(F)c(F)c(F)c(F)c2F)c(F)c1F
COc1ccc(C2=C3C=CC4=N3->[Fe]35(Cl)<-N6=C(C=CC6=C(c6ccc(OC)cc6)c6ccc2n63)C(c2ccc(OC)cc2)=c2ccc(n25)=C4c2ccc(OC)cc2)cc1
C1=CC2=N3->[Co]45<-N6=C(C=CC6=C(c6ccccc6)c6ccc(n64)C(c4ccccc4)=C13)C(c1ccccc1)=c1ccc(n15)=C2c1ccccc1
CC1=CC=C(N2C=CC(C3C=CN(C4=CC=C(C)C=C4)C=C3)C=C2)C=C1.CC1=CC=C(N2C(C3C=CC=C



OCC(O)=O
OC(=O)c1ccc(nc1)c2ccc(C(O)=O)c(O)n2
OC(=O)c1ccncc1
O=C(C1=CC=NC=C1)O
N[C@@H](CS)C(O)=O
C1(C2=NC=CC=C2)=CC=CC=N1
c1ccc(cc1)c2ccccn2
Cc1cc(CO)c(O)c(CO)c1
C1(C2=NC(C3=NC=CC=C3)=CC=C2)=CC=CC=N1
OC(=O)c1ccccc1O
OC(=O)C#CC(O)=O
Nc1cccnc1
NCCCC(O)=O
Oc1ccccc1O
OCC(CO)(CO)NCC(O)=O
NC1=CC(C2=CC=C(N)C(N)=C2)=CC=C1N
Nc1ncnc2nc[nH]c12
OC(=O)/C=C/C(O)=O
OC(=O)c1cccc(c1)C(O)=O
O=C1C(O)=C(O)[C@@]([H])([C@@H](O)CO)O1
CC(C)(CO)C(O)=O
Nc1cc(ccc1C(O)=O)C(O)=O
c1cc(ccn1)c2ccncc2
CC(N)=S
OC1C(O)C(O)C(O)C(O)C1O
CC(=O)N[C@H]1C(O)O[C@H](CO)[C@@H](O)[C@@H]1O
O[C@H]1CN[C@H](C1)C(O)=O
NC1(CCCCC1)C(O)=O
O[P](O)(=O)c1ccncc1
OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O
OCC(CO)(CO)CO
OCC(O)=O
CC(O)C(O)=O
Oc1ccncc1
OCc1ccccc1
O=S(C1=CC(P(C2=CC=CC(S(=O)([O-])=O)=C2)C3=CC=CC(S(=O)([O-])=O)=C3)=CC=C1)([O-])=O
ON1C(=O)c2ccccc2C1=O
O[P](=O)(c1ccccc1)c2ccccc2
COC(=O)c1cc(OC)ccn1
COc1ccnc(c1)c2cc(OC)ccn2
Oc1cccc[n+]1[O-]
C#CCCCCO
OCc1occc1
COC(=O)c1ccc(nc1)C(O)=O
CC(N)CO
O[C@@H]([C@H](O)C(O)=O)C(O)=O
NCCCC[C@H](N)C(O)=O


In [21]:
# Non-conjugated descriptor featurizer module.
# Every molecule (largest '.'-separated SMILES component, with explicit hydrogens
# added) is treated as ONE structure made of all its atoms. For each molecule one
# row of nondata_conju is filled with 3 structural features followed by 6
# statistics for each atom-level property listed in NONCONJU_PRE.
# Reads notebook globals defined in other cells: smiles_rd, realtitle, ri_m,
# dij_m, KEEP_TYPE, DIR, ALLPROPOUTPUT, X_out, title_out.
if nonconjmolcount!=False:
    NONCONJU_TITLE = []
    NONCONJU_PRE = [ 'PEOE-Charge', 'Atomic-LogP', 'Atomic-MR']
    # 3 structural features + 6 statistics per atom-level property.
    NONCONJU_NUM = 3 + 6 * len(NONCONJU_PRE)
    nondata_conju = np.zeros((len(smiles_rd), NONCONJU_NUM))
    # FIX: the original rebound the global `allowedDescriptors` to ['MolWt'] here,
    # which clobbered the RDKit descriptor list that the log cell later prints.
    # The filter is now inlined so the global list is left untouched.
    # NOTE(review): descList is not read anywhere else in this cell; it is kept
    # only because other cells might rely on it - confirm and remove if unused.
    descList = [(descriptor, function)
                for descriptor, function in Descriptors.descList
                if descriptor == 'MolWt']
    mff_title = realtitle
    for _ in range(len(smiles_rd)):
        smi = smiles_rd[_]
        # For multi-component SMILES keep only the longest component string.
        if '.' in smi:
            smi = max(smi.split('.'), key=len)
        # Round-trip through MolToSmiles before re-parsing, then add explicit Hs.
        mol = Chem.MolFromSmiles(Chem.MolToSmiles(Chem.MolFromSmiles(smi)))
        mol=Chem.AddHs(mol)
        atoms = mol.GetAtoms()
        a_m = Chem.rdmolops.GetAdjacencyMatrix(mol)
        d_m = Chem.rdmolops.GetDistanceMatrix(mol)
        # A single "structure" containing every atom index of the molecule.
        nonconjlist=[list(range(len(d_m)))]
        nonconju_size_list = [len(r) for r in nonconjlist]
        ## Atom-level descriptor lists:
        # PEOE (Gasteiger) partial charges.
        AllChem.ComputeGasteigerCharges(mol, nIter=25)
        peoe_charge = [mol.GetAtomWithIdx(i).GetDoubleProp('_GasteigerCharge') for i in range(mol.GetNumAtoms())]
        # Per-atom Crippen contributions: element 0 is used as LogP, element 1 as MR.
        contribs = rdMolDescriptors._CalcCrippenContribs(mol)
        logp = [contribs[i][0] for i in range(len(contribs))]
        mr = [contribs[i][1] for i in range(len(contribs))]
        # Order must match NONCONJU_PRE.
        atom_props = [peoe_charge,  logp, mr]
        ## Titles of the structural features (registered once, on the first molecule):
        if _ == 0:
            NONCONJU_TITLE.append('Max Distance')  # 9.
            NONCONJU_TITLE.append('Branch Index')  # 10.
            NONCONJU_TITLE.append('Stru-VSA')  # 12.
        ## Per-structure features; each entry is indexable by structure position.
        nonconju_props = []

        # 9. Largest topological distance between any two atoms of the structure.
        nonconju_max_dis = []

        for r in nonconjlist:
            nonconju_max_dis.append(np.max(d_m[r, :][:, r]))
        nonconju_props.append(nonconju_max_dis)
        # 10. "Branch Index": index spread among the first half of the
        # (row, col) hits that realise the maximum distance, divided by that
        # maximum distance.
        dlist=np.where(d_m==np.max(d_m))
        l_dlist=len(dlist[0])/2
        if len(d_m)==1:
            # FIX: a single-atom molecule used to store a bare int 0 here, which
            # raised "TypeError: 'int' object is not subscriptable" when the row
            # was filled below. Store a one-element list like the other features.
            nonconjuratio=[0]
            nonconju_props.append(nonconjuratio)
        else:
            ltemp=dlist[0][0:round(l_dlist)]
            distemp=np.max(ltemp)-np.min(ltemp)
            # Dividing by the one-element list yields a one-element ndarray.
            nonconjuratio=distemp/nonconju_max_dis
            nonconju_props.append(nonconjuratio)

        # 12. Structure VSA: per-atom sphere area reduced by an overlap term for
        # each bonded neighbour.
        # NOTE(review): ri_m[:, 0] is presumably an atomic-radius table and dij_m a
        # reference bond-length table, both indexed by atomic number - confirm.
        nonconju_vsa_l = []
        for r in nonconjlist:
            vsa_t = 0
            for i in range(len(r)):
                vsa_tt = 0
                atom = atoms[r[i]]
                n_l = atom.GetNeighbors()
                aid_1 = atom.GetAtomicNum()
                ar_1 = ri_m[aid_1, 0]
                for j in range(len(n_l)):
                    aid_2 = n_l[j].GetAtomicNum()
                    ar_2 = ri_m[aid_2, 0]
                    dij_i = dij_m[aid_1, aid_2]
                    # Clamp the distance into [|r1 - r2|, r1 + r2].
                    dij = min(max(abs(ar_1-ar_2), dij_i), ar_1+ar_2)
                    vsa_tt += (ar_2**2-(ar_1-dij)**2)/dij
                vsa_t += 4*np.pi*ar_1**2 - np.pi*ar_1*vsa_tt
            nonconju_vsa_l.append(vsa_t)
        nonconju_props.append(nonconju_vsa_l)
        print(nonconju_props)
        outprop=[]
        # Statistics of each atom-level property.
        for __ in range(len(atom_props)):
            PRE = ''+NONCONJU_PRE[__]+'-'
            END_P = ''
            atom_props_list = atom_props[__]
            if _ == 0:
                NONCONJU_TITLE.append(PRE+'Sum'+END_P)  # 13.1.
                NONCONJU_TITLE.append(PRE+'AtomicMean'+END_P)  # 13.2.
                NONCONJU_TITLE.append(PRE+'Maximum'+END_P)  # 13.3.
                NONCONJU_TITLE.append(PRE+'Minimum'+END_P)  # 13.4.
                NONCONJU_TITLE.append(PRE+'Delta'+END_P)  # 13.5.
                NONCONJU_TITLE.append(PRE+'STD'+END_P)  # 13.6.
            # 13.1. Sum over atoms (fragments not involved).
            x_count_l = []
            # 13.2. Per-atom mean (fragments not involved).
            x_atom_mean_l = []
            # 13.3. Maximum over fragments and single atoms.
            x_max_l = []
            # 13.4. Minimum over fragments and single atoms.
            x_min_l = []
            # 13.5. Maximum minus minimum over fragments and single atoms.
            x_delta_l = []
            # 13.6. Standard deviation over atoms.
            x_std_l = []
            for r in nonconjlist:
                # Atom values of this structure, with missing (None) entries dropped.
                a_p_l = [val for val in (atom_props_list[a] for a in r)
                         if val is not None]
                x_count_l.append(sum(a_p_l))
                # The mean divides by the full atom count, not the filtered count.
                x_atom_mean_l.append(sum(a_p_l)/len(r))
                x_std_l.append(np.std(a_p_l))
                # Build the fragment-level property list: one summed value per
                # SMARTS match that lies entirely inside the structure.
                frag_x = []
                frag_atom_id = []
                pattlist=[]
                for i in range(len(mff_title)):
                    patt = mff_title[i]
                    f = Chem.MolFromSmarts(patt)
                    atomids = mol.GetSubstructMatches(f)
                    atomids = [list(x) for x in list(atomids)]
                    if len(atomids) > 0:
                        for j in range(len(atomids)):
                            peoe_flag = True
                            for k in atomids[j]:
                                if k not in r:
                                    peoe_flag = False
                                    break
                            if peoe_flag:
                                # Extend the match with the hydrogens bonded to it.
                                # FIX: the original tested an Atom object against
                                # the list of int indices (always "not in"), so a
                                # hydrogen already in the match was added twice,
                                # and it appended to the list being iterated.
                                # Compare indices and iterate over a snapshot.
                                for kk in list(atomids[j]):
                                    neighbors = [x.GetIdx() for x in mol.GetAtomWithIdx(kk).GetNeighbors()]
                                    for neighbor in neighbors:
                                        if mol.GetAtomWithIdx(neighbor).GetSymbol() == 'H' and neighbor not in atomids[j]:
                                            atomids[j].append(neighbor)
                                frag_atom_id.append(atomids[j])
                                x_temp = 0
                                for k in atomids[j]:
                                    x_temp += atom_props_list[k]
                                frag_x.append(x_temp)
                                pattlist.append(patt)
                # Every single atom also counts as a one-atom fragment.
                for j in range(len(r)):
                    atom_id = r[j]
                    frag_x.append(atom_props_list[atom_id])
                    frag_atom_id.append([atom_id])
                x_max_l.append(max(frag_x))
                x_min_l.append(min(frag_x))
                x_delta_l.append(max(frag_x) - min(frag_x))

            # Append order must match the title order registered above.
            nonconju_props.append(x_count_l)
            nonconju_props.append(x_atom_mean_l)
            nonconju_props.append(x_max_l)
            nonconju_props.append(x_min_l)
            nonconju_props.append(x_delta_l)
            nonconju_props.append(x_std_l)

            if ALLPROPOUTPUT==True:
                # Dump the matched patterns, their atom ids and their values.
                # The file is rewritten after each property, so it ends up
                # holding the rows of all properties.
                outprop.append(pattlist)
                outprop.append(frag_atom_id)
                outprop.append(frag_x)
                save_name = 'AllProp'+str(_)+'_'+'.csv'
                save_name = Path('.', DIR, save_name)
                # FIX: outprop is a ragged nested list. Passing it directly relied
                # on implicit ragged-array creation, which is deprecated and an
                # error in NumPy >= 1.24. An explicit object array writes the same
                # one-list-per-line file.
                np.savetxt(save_name, np.array(outprop, dtype=object), fmt='%s', delimiter=',')
        # Fill the feature row of this molecule.
        for i in range(len(nonconju_props)):
            index = i
            if len(nonconju_size_list)==0:  # no structure at all: leave the descriptors at 0
                nondata_conju[_, index]=0
            else:
                # Only KEEP_TYPE == 'max' is handled: take the value belonging to
                # the largest structure. Any other KEEP_TYPE leaves the zeros.
                if KEEP_TYPE == 'max':
                    nondata_conju[_, index] = nonconju_props[i][nonconju_size_list.index(max(nonconju_size_list))]

    X_out.append(nondata_conju)
    title_out.append(np.array(NONCONJU_TITLE).reshape(len(NONCONJU_TITLE), 1))

[[9.0], array([0.22222222]), [292.5678161072967]]
[[9.0], array([0.22222222]), [292.5678161072967]]
[[9.0], array([0.]), [247.728470296663]]
[[8.0], array([0.25]), [247.728470296663]]
[[8.0], array([0.]), [247.728470296663]]
[[7.0], array([0.28571429]), [201.17381489716925]]
[[9.0], array([0.]), [202.88912448602926]]


  X = np.asarray(X)


[[3.0], array([0.66666667]), [77.7982848333813]]
[[16.0], array([0.4375]), [720.0019664726038]]
[[16.0], array([0.4375]), [667.5004742935305]]
[[18.0], array([0.44444444]), [667.5004742935305]]
[[16.0], array([1.6875]), [848.7609063461574]]
[[20.0], array([0.65]), [855.9379650836242]]
[[16.0], array([0.4375]), [667.5004742935305]]
[[19.0], array([0.10526316]), [398.6869572964661]]
[[21.0], array([0.0952381]), [428.9181456470557]]
[[17.0], array([0.]), [410.9622788433073]]
[[17.0], array([0.]), [360.9501463415456]]
[[19.0], array([0.05263158]), [396.9716477076061]]
[[21.0], array([0.23809524]), [469.01465043972723]]
[[19.0], array([0.10526316]), [436.4237682513867]]
[[19.0], array([0.10526316]), [475.0514502134527]]
[[19.0], array([0.10526316]), [549.6342011161485]]
[[19.0], array([0.10526316]), [466.6549566019763]]
[[19.0], array([0.10526316]), [506.7514613946478]]
[[25.0], array([0.]), [524.3946457372081]]
[[19.0], array([0.10526316]), [398.6869572964661]]
[[17.0], array([0.]), [410.9

[[10.0], array([1.8]), [498.18711085617275]]
[[7.0], array([0.14285714]), [178.48805910862333]]
[[10.0], array([0.]), [247.8745138940074]]
[[9.0], array([0.22222222]), [211.31699822545897]]
[[11.0], array([0.18181818]), [262.0430270736733]]
[[6.0], array([0.]), [137.81259129329618]]
[[8.0], array([0.]), [155.11112181498893]]
[[6.0], array([0.]), [122.443091729183]]
[[10.0], array([0.2]), [222.17234438333762]]
[[5.0], array([1.]), [116.51665606563833]]
[[7.0], array([0.]), [182.30166169300196]]
[[9.0], array([0.11111111]), [204.11809334763623]]
[[10.0], array([0.2]), [239.28193991926682]]
[[7.0], array([0.]), [139.18137319793522]]
[[7.0], array([0.]), [174.34521976956577]]
[[10.0], array([0.]), [226.7828104799331]]
[[9.0], array([0.]), [202.89287221273622]]
[[8.0], array([0.125]), [191.49831565816604]]
[[9.0], array([0.]), [202.8966199394432]]
[[8.0], array([0.25]), [197.9167136390242]]
[[7.0], array([0.28571429]), [195.78658963031612]]
[[8.0], array([0.625]), [273.69941062983645]]
[[5.

In [23]:
if SOLVENT_SWITCH:
    # The header row of the solvent CSV supplies the column titles ...
    df = pd.read_csv(SOLVENT_IN)
    SOLVENT_TITLE = df.columns.tolist()
    # ... while the body is loaded separately as a string matrix (header skipped).
    sol_d = np.loadtxt(SOLVENT_IN, dtype=str, delimiter=',', skiprows=1)
    # Register both with the global feature/title collections.
    X_out.append(sol_d)
    title_out.append(np.array(SOLVENT_TITLE)[:, np.newaxis])

In [24]:
# Merge every per-featurizer block into one big table:
# feature blocks are joined column-wise, title blocks row-wise.
X_init = X_out[0]
for extra_block in X_out[1:]:
    X_init = np.hstack((X_init, extra_block))
title_init = title_out[0]
for extra_title in title_out[1:]:
    title_init = np.vstack((title_init, extra_title))
# Titles were stacked as a column; flip them into a single header row.
title_init = title_init.T
print(X_init.shape, title_init.shape)

# Append the smiles and label columns, printing the width after each step.
title_extra = ['smiles', 'values']
print(X_init.shape[1])
data_out = np.hstack((X_init, smiles_out))
print(data_out.shape[1])
data_out = np.hstack((data_out, values_out))
print(data_out.shape[1])

# Optional derived label columns, in the order: dwt, ln, dwt_ln.
if VALUES_DIV_MOLWT:
    title_extra += ['values_dwt']
    data_out = np.hstack((data_out, values_dwt_out))
if VALUES_LN:
    title_extra += ['values_ln']
    data_out = np.hstack((data_out, np.log(values_out)))
if VALUES_LN and VALUES_DIV_MOLWT:
    title_extra += ['values_dwt_ln']
    data_out = np.hstack((data_out, np.log(values_dwt_out)))

# Header row on top, data rows underneath.
title_extra = np.array([title_extra])
title_out = np.hstack((title_init, title_extra))
full_data = np.vstack((title_out, data_out))
print(full_data.shape)

(249, 307) (1, 307)
307
308
309
(250, 309)


In [25]:
# Remove entries containing NaN or infinity, then remove constant feature columns.
# Layout of full_data: row 0 is the title row; the first X_init.shape[1] columns
# are features and the remaining columns are smiles / values (/ derived values).
#
# delmolifwrong == True : drop every sample (row) with a non-finite feature.
# delmolifwrong == False: drop every feature (column) with a non-finite value.
#
# Fixes relative to the previous version:
#   * With delmolifwrong=True the list of deleted ROW indices was reused to
#     delete COLUMNS with the same indices; columns are now only deleted in the
#     column branch.
#   * The row check used a hard-coded `:-5`, which is only right when both
#     optional label columns exist; it now uses the real feature count.
#   * The column check stopped two columns early (`X_init.shape[1]-2`), leaving
#     the last two features unchecked; every feature column is now checked.
#   * `np.inf in ...` missed -inf; np.isfinite covers NaN, +inf and -inf.
delmolifwrong=False
n_feat = X_init.shape[1]
del_list = []
if delmolifwrong==True:
    for j in range(1, full_data.shape[0]):
        if not np.isfinite(full_data[j, :n_feat].astype(float)).all():
            del_list.append(j)
    print(del_list)
    full_data = np.delete(full_data, del_list, axis=0)
    # No feature column was removed in this branch.
    delnum = 0
else:
    for i in range(n_feat):
        if not np.isfinite(full_data[1:, i].astype(float)).all():
            del_list.append(i)
    print(full_data[0, del_list])
    full_data = np.delete(full_data, del_list, axis=1)
    delnum = len(del_list)

# Drop the features whose value is identical for every remaining sample
# (this includes, but is not limited to, all-zero columns).
del_list = []
for i in range(n_feat-delnum):
    column = full_data[1:, i].astype(float)
    # The size guard avoids max()/min() on an empty column when no sample is left.
    if column.size and column.max()==column.min():
        del_list.append(i)
print(full_data[0, del_list])
full_data = np.delete(full_data, del_list, axis=1)
print(full_data.shape)


['MaxPartialCharge' 'MinPartialCharge' 'MaxAbsPartialCharge'
 'MinAbsPartialCharge' 'Conju-PEOE-Charge-Sum (MFF-Conju)'
 'Conju-PEOE-Charge-AtomicMean (MFF-Conju)'
 'Conju-PEOE-Charge-Maximum (MFF-Conju)'
 'Conju-PEOE-Charge-Minimum (MFF-Conju)'
 'Conju-PEOE-Charge-Delta (MFF-Conju)' 'Conju-PEOE-Charge-STD (MFF-Conju)'
 'Conju-PEOE-Charge-CONJUMAX (MFF-Conju)' 'PEOE-Charge-Sum'
 'PEOE-Charge-AtomicMean' 'PEOE-Charge-Maximum' 'PEOE-Charge-Minimum'
 'PEOE-Charge-Delta' 'PEOE-Charge-STD']
['Allene' 'Alkylchloride' 'Alkylbromide' 'Alkyliodide' 'Diarylthioether'
 'Oxonium' 'Secondary_arom_amine' 'Quaternary_arom_ammonium'
 'Dialkylthioether' 'Alkylarylthioether' 'Disulfide' '1comma1-Diol'
 'Hydroperoxide' 'Peroxo' 'Organometallic_compounds' 'Thioaldehyde'
 'Thioketone' 'Imine' 'Oxime' 'Oximether' 'Acetal' 'Hemiaminal'
 'Thioacetal' 'Thiohemiacetal' 'Chloroalkene' 'Bromoalkene' 'Iodoalkene'
 'Acylhalide' 'Carboxylic_anhydride' 'Thioacetate' 'Ethanethioic' 'Lactam'
 'Ketene' 'Nitrile' 'Isonit

In [26]:
# Split the combined table back into its parts and write each part as CSV in DIR.
# Columns are peeled off from the right, in the reverse of the order in which
# they were appended: values_dwt_ln, values_ln, values_dwt, values, smiles.
OUT_NAME_FULL = Path('.', DIR, f'Full_{full_data.shape[0]}_{full_data.shape[1]}.csv')
np.savetxt(OUT_NAME_FULL, full_data, fmt='%s', delimiter=',')

# data_t holds the data rows only; full_t keeps the title row as well.
data_t, full_t = full_data[1:, :], full_data

if VALUES_LN:
    if VALUES_DIV_MOLWT:
        # ln(values / MolWt)
        values_out_wt_ln = data_t[:, -1]
        data_t, full_t = data_t[:, :-1], full_t[:, :-1]
        OUT_NAME_VALUES_WT_LN = Path('.', DIR, f'Values_True_w_ln_{values_out_wt_ln.shape[0]}.csv')
        np.savetxt(OUT_NAME_VALUES_WT_LN, values_out_wt_ln, fmt='%s', delimiter=',')
    # ln(values)
    values_out_ln = data_t[:, -1]
    data_t, full_t = data_t[:, :-1], full_t[:, :-1]
    OUT_NAME_VALUES_LN = Path('.', DIR, f'Values_True_ln_{values_out_ln.shape[0]}.csv')
    np.savetxt(OUT_NAME_VALUES_LN, values_out_ln, fmt='%s', delimiter=',')

if VALUES_DIV_MOLWT:
    # values / MolWt
    values_out_wt = data_t[:, -1]
    data_t, full_t = data_t[:, :-1], full_t[:, :-1]
    OUT_NAME_VALUES_WT = Path('.', DIR, f'Values_True_w_{values_out_wt.shape[0]}.csv')
    np.savetxt(OUT_NAME_VALUES_WT, values_out_wt, fmt='%s', delimiter=',')

# Raw label values.
values_out = data_t[:, -1]
data_t, full_t = data_t[:, :-1], full_t[:, :-1]
OUT_NAME_VALUES = Path('.', DIR, f'Values_True_{values_out.shape[0]}.csv')
np.savetxt(OUT_NAME_VALUES, values_out, fmt='%s', delimiter=',')

# SMILES strings (the file name carries the sample count taken from values_out).
smiles_out = data_t[:, -1]
OUT_NAME_SMILES = Path('.', DIR, f'Smiles_{values_out.shape[0]}.csv')
np.savetxt(OUT_NAME_SMILES, smiles_out, fmt='%s', delimiter=',')

# What remains on the left is the feature matrix and its title row.
X_out = data_t[:, :-1]
full_t = full_t[:, :-1]
OUT_NAME_X = Path('.', DIR, f'Features_{X_out.shape[0]}_{X_out.shape[1]}.csv')
np.savetxt(OUT_NAME_X, X_out, fmt='%s', delimiter=',')
OUT_NAME_TITLE = Path('.', DIR, f'Title_{full_t.shape[1]}.csv')
np.savetxt(OUT_NAME_TITLE, full_t[0, :].T, fmt='%s', delimiter=',')

In [27]:
# Write the run log into DIR.
# Fixes relative to the previous version:
#   * the file is opened with a context manager, so it is closed even when a
#     write fails;
#   * VERSION already starts with 'V' ('V5.0'), so the extra literal 'V' that
#     produced "VV5.0" is removed;
#   * three messages were missing the separator before the concatenated value.
LOG_NAME = Path('.', DIR, LOG_NAME)
# UTF-8 is requested explicitly so the log does not depend on the platform's
# default encoding (titles and file names may contain non-ASCII characters).
with open(LOG_NAME, 'w', encoding='utf-8') as f1:
    f1.write('Log for CMF\n')
    f1.write('Version: '+VERSION+'\n\n')
    f1.write('Log generation time: '+time.strftime("%Y.%m.%d-%H:%M:%S", time.localtime())+'\n\n')
    f1.write('Input smiles file is: '+INPUT_NAME+'\n\n')
    f1.write('Dependent loffi file is '+loffitxt+'\n\n')
    f1.write('All title are '+str(title_out)+'\n\n')
    f1.write('Feature Matrix shape is '+str(full_data.shape)+'\n\n')
    # One line per enabled global switch.
    if SMILES_CHECK:
        f1.write('Smiles check is on.\n')
    if VALUES_DIV_MOLWT:
        f1.write('Data with values divided by molecular weight is generated.\n')
    if VALUES_LN:
        f1.write('Data with ln(values) is generated.\n')
    if PANDAS_DATASET_GENERATE:
        f1.write('Dataset(s) which can be used by pandas is(are) generated.\n')
    f1.write('\n\n')
    f1.write('Parameters:\n\n')
    if RDKIT_DESC_SWITCH:
        f1.write('RDKit Descriptor featurizer is on.\n')
        f1.write('Length of allowed descriptor set: '+str(len(allowedDescriptors))+'\n')
        f1.write('Allowed descriptors:\n')
        # Five descriptor names per line, each line indented by four spaces and
        # each name followed by three spaces; only full lines get a newline here.
        for start in range(0, len(allowedDescriptors), 5):
            row = allowedDescriptors[start:start+5]
            f1.write('    '+''.join(name+'   ' for name in row))
            if len(row) == 5:
                f1.write('\n')
        f1.write('\n')
    f1.write('Some samples may be deleted beacause of the presence of nan or infinite.\n')
    f1.write('Some columns may be deleted beacause of only 0 is contained.\n')