In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import pandas as pd
from rdkit.Chem import MolStandardize,rdmolops,MACCSkeys,ChemicalFeatures,AllChem,Descriptors,Lipinski
from rdkit.Chem.Descriptors import rdMolDescriptors
from rdkit import Chem
from matplotlib.font_manager import FontProperties
import matplotlib.ticker as ticker

In [74]:
# Load the molecule table and take its first column as the list of SMILES
# strings used by every cell below.
database = pd.read_csv('**.csv', encoding='gbk')
smiles = database.iloc[:, 0].tolist()

In [None]:
# the number of rings
def cal_rings(smi_list):
    """Count the rings of each molecule and print summary statistics.

    Every SMILES string is parsed with ``Chem.MolFromSmiles`` and its ring
    count is read from ``GetRingInfo().NumRings()``.

    Args:
        smi_list: Iterable of SMILES strings.

    Returns:
        list[int]: One ring count per input SMILES, in input order. An empty
        input returns an empty list and prints no statistics.

    Raises:
        ValueError: If a SMILES string cannot be parsed.
    """
    ring_list = []
    for smi in smi_list:
        m = Chem.MolFromSmiles(smi)
        if m is None:
            # MolFromSmiles reports a parse failure by returning None; fail
            # with the offending string instead of an AttributeError below.
            raise ValueError(f'Invalid SMILES: {smi!r}')
        ring_list.append(m.GetRingInfo().NumRings())
    if not ring_list:
        # max()/min() and the average are undefined for an empty list.
        return ring_list
    print('maximum',max(ring_list),'minimum',min(ring_list),'average',sum(ring_list)/len(ring_list))
    return ring_list
def thousands_formatter(x, pos):
    """Tick formatter: the tick value divided by 1000, truncated to an integer."""
    return str(int(x / 1000))


# Ring count of every molecule in the data set (cal_rings also prints summary statistics).
ring = cal_rings(smiles)

# draw distribution
plt.figure(figsize=(8, 6), dpi=500)
plt.hist(
    ring,
    bins=np.arange(2, 10) - 0.5,  # half-integer edges: one bar centred on each integer 2..8
    color='#A8BCBA',  #50697D
    edgecolor='black',
    align='mid',
)
plt.xticks(np.arange(2, 9, 2), fontsize=30, fontname='arial')
# The y axis (molecule counts) is labelled in units of one thousand.
plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(thousands_formatter))
plt.yticks(fontsize=30, fontname='arial')
plt.show()

In [None]:
# the number of atoms
def cal_atom_num(smi_list):
    """Count the atoms of each molecule and print summary statistics.

    Every SMILES string is parsed with ``Chem.MolFromSmiles`` and passed
    through ``Chem.AddHs`` before ``GetNumAtoms()`` is read, so the hydrogens
    added by ``AddHs`` are part of the count.

    Args:
        smi_list: Iterable of SMILES strings.

    Returns:
        list[int]: One atom count per input SMILES, in input order. An empty
        input returns an empty list and prints no statistics.

    Raises:
        ValueError: If a SMILES string cannot be parsed.
    """
    count_list = []
    for smi in smi_list:
        parsed = Chem.MolFromSmiles(smi)
        if parsed is None:
            # MolFromSmiles reports a parse failure by returning None; name the
            # offending string instead of failing inside AddHs.
            raise ValueError(f'Invalid SMILES: {smi!r}')
        m = Chem.AddHs(parsed)
        count_list.append(m.GetNumAtoms())
    if not count_list:
        # max()/min() and the average are undefined for an empty list.
        return count_list
    print('maximum',max(count_list),'minimum',min(count_list),'average',sum(count_list)/len(count_list))
    return count_list
def thousands_formatter(x, pos):
    """Tick formatter: the tick value divided by 1000, truncated to an integer."""
    return str(int(x / 1000))


# Atom count of every molecule in the data set (cal_atom_num also prints summary statistics).
count = cal_atom_num(smiles)

# draw distribution
plt.figure(figsize=(8, 6), dpi=500)
plt.hist(
    count,
    bins=np.arange(15, 54) - 0.5,  # half-integer edges: one bar centred on each integer 15..52
    color='#D9CFE7',  #50697D
    edgecolor='black',
    align='mid',
)
plt.xticks(np.arange(15, 53, 6), fontsize=30, fontname='arial')
# The y axis (molecule counts) is labelled in units of one thousand.
plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(thousands_formatter))
plt.yticks(fontsize=30, fontname='arial')
plt.show()

In [None]:
# the number of rotatable bonds
def get_rotatable_bonds(smi_list):
    """Count the rotatable bonds of each molecule and print summary statistics.

    The count comes from ``Lipinski.NumRotatableBonds`` applied to the
    molecule exactly as parsed from SMILES, i.e. without calling
    ``Chem.AddHs`` first.

    Args:
        smi_list: Iterable of SMILES strings.

    Returns:
        list[int]: One rotatable-bond count per input SMILES, in input order.
        An empty input returns an empty list and prints no statistics.

    Raises:
        ValueError: If a SMILES string cannot be parsed.
    """
    rotatable_bonds_num = []
    for smi in smi_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # MolFromSmiles reports a parse failure by returning None.
            raise ValueError(f'Invalid SMILES: {smi!r}')
        # Hydrogens are deliberately NOT added here: RDKit's rotatable-bond
        # pattern excludes terminal atoms through their explicit degree, so
        # explicit hydrogens make groups such as -CH3 or -OH look non-terminal
        # and inflate the count.
        rotatable_bonds_num.append(Lipinski.NumRotatableBonds(mol))
    if not rotatable_bonds_num:
        # max()/min() and the average are undefined for an empty list.
        return rotatable_bonds_num
    print('maximum',max(rotatable_bonds_num),'minimum',min(rotatable_bonds_num),'average',sum(rotatable_bonds_num)/len(rotatable_bonds_num))
    return rotatable_bonds_num
def thousands_formatter(x, pos):
    """Tick formatter: the tick value divided by 1000, truncated to an integer."""
    return str(int(x / 1000))


# Rotatable-bond count of every molecule (get_rotatable_bonds also prints summary statistics).
count_bonds = get_rotatable_bonds(smiles)

# draw distributions
plt.figure(figsize=(8, 6), dpi=500)
# Five automatically placed bins are used here; a previous, disabled variant
# of this call passed bins=np.arange(0, 2) - 0.5 instead.
plt.hist(
    count_bonds,
    bins=5,
    color='#ACC66D',  #50697D
    edgecolor='black',
    align='mid',
)
# np.arange(0, 1, 1) holds the single value 0, so only that tick is drawn on the x axis.
plt.xticks(np.arange(0, 1, 1), fontsize=30, fontname='arial')
# The y axis (molecule counts) is labelled in units of one thousand.
plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(thousands_formatter))
plt.yticks(fontsize=30, fontname='arial')
plt.show()

In [None]:
# the type of symbols
def cal_symbol(mol):
    """Return the distinct element symbols present in ``mol``.

    The molecule is first passed through ``Chem.RemoveHs`` and the symbols are
    collected from the resulting molecule, so hydrogens stripped by that call
    do not contribute an ``'H'`` entry.

    Args:
        mol: RDKit molecule (e.g. the result of ``Chem.MolFromSmiles``).

    Returns:
        list[str]: The unique atom symbols, sorted alphabetically so the
        output is deterministic.

    Raises:
        ValueError: If ``mol`` is None (e.g. a SMILES string failed to parse).
    """
    if mol is None:
        raise ValueError('mol is None (SMILES could not be parsed?)')
    m = Chem.RemoveHs(mol)
    # Iterate over the H-stripped copy; the original code built it but then
    # looped over the untouched input molecule.
    return sorted({atom.GetSymbol() for atom in m.GetAtoms()})
def get_data_symbol(smi_list):
    """Print the set of element symbols that occur anywhere in ``smi_list``.

    Each SMILES string is parsed with ``Chem.MolFromSmiles`` and its symbols
    are obtained from ``cal_symbol``; the union over all molecules is printed.

    Args:
        smi_list: Iterable of SMILES strings.

    Returns:
        int: Always 0; the result of interest is the printed set.

    Raises:
        ValueError: If a SMILES string cannot be parsed.
    """
    all_symbols = set()
    for smi in smi_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # MolFromSmiles reports a parse failure by returning None.
            raise ValueError(f'Invalid SMILES: {smi!r}')
        # Grow one set incrementally instead of concatenating every
        # per-molecule list with sum(..., []), which is quadratic.
        all_symbols.update(cal_symbol(mol))
    print(all_symbols)
    return 0
# get_data_symbol prints the set of element symbols found across all SMILES;
# its return value is bound to `_` only to keep it out of the cell output.
_ = get_data_symbol(smiles)