In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import pandas as pd
from rdkit.Chem import MolStandardize,rdmolops,MACCSkeys,ChemicalFeatures,AllChem,Descriptors,Lipinski
from rdkit.Chem.Descriptors import rdMolDescriptors
from rdkit import Chem
from matplotlib.font_manager import FontProperties

In [None]:
# Location of the input table -- replace the placeholder with your own data path.
DATA_PATH = '**.csv'
database = pd.read_csv(DATA_PATH)

# The first column of the table is used as the list of SMILES strings.
smiles = [entry for entry in database.iloc[:, 0]]

In [None]:
# the number of aromatic rings
def cal_rings(smi_list):
    """Count the fully aromatic rings of every molecule in ``smi_list``.

    A ring reported by the molecule's ring info is counted only when every
    atom in that ring is flagged as aromatic.

    Parameters
    ----------
    smi_list : iterable of str
        SMILES strings, one per molecule.

    Returns
    -------
    list of int
        Number of aromatic rings per molecule, in input order. An empty
        input yields an empty list.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed into a molecule.

    Side effects
    ------------
    Prints the maximum, minimum and average count, or a short notice when
    no molecule was supplied.
    """
    ring_list = []
    for smi in smi_list:
        m = Chem.MolFromSmiles(smi)
        if m is None:
            # A failed parse is reported as None; fail with the offending
            # input instead of an opaque AttributeError further down.
            raise ValueError('could not parse SMILES: {!r}'.format(smi))
        num_aromatic_ring = sum(
            1
            for ring in m.GetRingInfo().AtomRings()
            if all(m.GetAtomWithIdx(atom_id).GetIsAromatic() for atom_id in ring)
        )
        ring_list.append(num_aromatic_ring)
    if ring_list:
        print('maximum', max(ring_list), 'minimum', min(ring_list), 'average', sum(ring_list) / len(ring_list))
    else:
        # max()/min() and the average are undefined for an empty data set.
        print('no molecules supplied; nothing to summarise')
    return ring_list
# Aromatic-ring count for every molecule of the data set.
ring = cal_rings(smiles)

# Histogram of the aromatic-ring counts.
ring_hist_style = {'color': '#50697D', 'edgecolor': 'black'}
plt.figure(figsize=(8, 6))
plt.hist(ring, bins=5, **ring_hist_style)
plt.xticks(np.arange(5), fontsize=20)  # tick marks at 0, 1, 2, 3, 4
plt.yticks(fontsize=20)
plt.show()

In [None]:
# molecular weight 
def cal_molwt(smi_list):
    """Compute the molecular weight of every molecule in ``smi_list``.

    Each SMILES is parsed, its hydrogens are made explicit with
    ``Chem.AddHs`` and the result is passed to ``Descriptors.MolWt``.

    Parameters
    ----------
    smi_list : iterable of str
        SMILES strings, one per molecule.

    Returns
    -------
    list of float
        Molecular weight per molecule, in input order. An empty input
        yields an empty list.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed into a molecule.

    Side effects
    ------------
    Prints the maximum, minimum and average weight, or a short notice when
    no molecule was supplied.
    """
    weight_list = []
    for smi in smi_list:
        parsed = Chem.MolFromSmiles(smi)
        if parsed is None:
            # A failed parse is reported as None; name the offending input.
            raise ValueError('could not parse SMILES: {!r}'.format(smi))
        weight_list.append(Descriptors.MolWt(Chem.AddHs(parsed)))
    if weight_list:
        print('maximum', max(weight_list), 'minimum', min(weight_list), 'average', sum(weight_list) / len(weight_list))
    else:
        # max()/min() and the average are undefined for an empty data set.
        print('no molecules supplied; nothing to summarise')

    return weight_list
# Molecular weight for every molecule of the data set.
molwt = cal_molwt(smiles)

# Histogram of the molecular weights; tick positions are left to matplotlib.
molwt_hist_style = {'color': '#50697D', 'edgecolor': 'black'}
plt.figure(figsize=(8, 6))
plt.hist(molwt, bins=12, **molwt_hist_style)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()

In [None]:
# the number of atoms
def cal_atom_num(smi_list):
    """Count the atoms of every molecule in ``smi_list``.

    Hydrogens are made explicit with ``Chem.AddHs`` before counting, so
    the value returned by ``GetNumAtoms`` includes them.

    Parameters
    ----------
    smi_list : iterable of str
        SMILES strings, one per molecule.

    Returns
    -------
    list of int
        Atom count per molecule, in input order. An empty input yields an
        empty list.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed into a molecule.

    Side effects
    ------------
    Prints the maximum, minimum and average count, or a short notice when
    no molecule was supplied.
    """
    count_list = []
    for smi in smi_list:
        parsed = Chem.MolFromSmiles(smi)
        if parsed is None:
            # A failed parse is reported as None; name the offending input.
            raise ValueError('could not parse SMILES: {!r}'.format(smi))
        count_list.append(Chem.AddHs(parsed).GetNumAtoms())
    if count_list:
        print('maximum', max(count_list), 'minimum', min(count_list), 'average', sum(count_list) / len(count_list))
    else:
        # max()/min() and the average are undefined for an empty data set.
        print('no molecules supplied; nothing to summarise')
    return count_list
# Atom count for every molecule of the data set.
count = cal_atom_num(smiles)

# Histogram of the atom counts.
count_hist_style = {'color': '#50697D', 'edgecolor': 'black'}
count_tick_positions = np.arange(12, 36, 3)  # 12, 15, ..., 33
plt.figure(figsize=(8, 6))
plt.hist(count, bins=16, **count_hist_style)
plt.xticks(count_tick_positions, fontsize=20)
plt.yticks(fontsize=20)
plt.show()

In [None]:
# the number of rotatable bonds
def get_rotatable_bonds(smi_list):
    """Count the rotatable bonds of every molecule in ``smi_list``.

    Parameters
    ----------
    smi_list : iterable of str
        SMILES strings, one per molecule.

    Returns
    -------
    list of int
        Value of ``Lipinski.NumRotatableBonds`` per molecule, in input
        order. An empty input yields an empty list.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed into a molecule.

    Side effects
    ------------
    Prints the maximum, minimum and average count, or a short notice when
    no molecule was supplied.
    """
    rotatable_bonds_num = []
    for smi in smi_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # A failed parse is reported as None; name the offending input.
            raise ValueError('could not parse SMILES: {!r}'.format(smi))
        # Count on the hydrogen-suppressed molecule. RDKit's rotatable-bond
        # pattern excludes terminal atoms by their explicit degree, so once
        # hydrogens are added explicitly, bonds to groups such as -CH3 or
        # -OH would no longer be excluded and the count would be inflated.
        rotatable_bonds_num.append(Lipinski.NumRotatableBonds(mol))
    if rotatable_bonds_num:
        print('maximum', max(rotatable_bonds_num), 'minimum', min(rotatable_bonds_num), 'average', sum(rotatable_bonds_num) / len(rotatable_bonds_num))
    else:
        # max()/min() and the average are undefined for an empty data set.
        print('no molecules supplied; nothing to summarise')
    return rotatable_bonds_num
# Rotatable-bond count for every molecule of the data set.
count_bonds = get_rotatable_bonds(smiles)

# Histogram of the rotatable-bond counts.
bond_hist_style = {'color': '#50697D', 'edgecolor': 'black'}
plt.figure(figsize=(8, 6))
plt.hist(count_bonds, bins=2, **bond_hist_style)
plt.xticks(np.arange(2), fontsize=20)  # tick marks at 0 and 1
plt.yticks(fontsize=20)
plt.show()

In [None]:
# type of elements
def cal_symbol(mol):
    """Return the distinct element symbols present in ``mol``.

    Atoms are read from ``mol`` exactly as given, so explicit hydrogen
    atoms, if the molecule carries any, are reported as ``'H'``.

    Parameters
    ----------
    mol : molecule object
        Must provide ``GetAtoms()``, whose items provide ``GetSymbol()``.

    Returns
    -------
    list of str
        Unique element symbols, sorted alphabetically so that the result
        does not depend on set iteration order.

    Raises
    ------
    ValueError
        If ``mol`` is None (for example, the result of a failed parse).
    """
    if mol is None:
        raise ValueError('cal_symbol received None instead of a molecule')
    return sorted({atom.GetSymbol() for atom in mol.GetAtoms()})

def get_data_symbol(smi_list):
    """Print the set of element symbols that occur anywhere in ``smi_list``.

    Parameters
    ----------
    smi_list : iterable of str
        SMILES strings, one per molecule.

    Returns
    -------
    int
        Always 0; the result of interest is the printed set.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed into a molecule.
    """
    all_symbols = set()
    for smi in smi_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # A failed parse is reported as None; name the offending input.
            raise ValueError('could not parse SMILES: {!r}'.format(smi))
        # Merge into one running set rather than concatenating per-molecule
        # lists with sum(), which re-copies the accumulated list every time.
        all_symbols.update(cal_symbol(mol))
    print(all_symbols)

    return 0
# Print the element types found across the whole data set. The function's
# return value (always 0) is bound to `_`, presumably so that it is not
# echoed as the cell's output.
_ = get_data_symbol(smiles)