In [1]:
from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd

In [2]:
# Define a function to calculate the molecular weight of the molecule, the octanol-water partition coefficient (logP), the number of rotatable bonds,
# the topological polar surface area (TPSA), the number of rings, the number of aromatic rings, the number of hydrogen bond donors, and the number of hydrogen bond acceptors
def calc_mol_properties(smiles):
    """Compute eight RDKit descriptors for a molecule given as a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    tuple
        ``(mol_weight, logp, rotatable_bonds, tpsa, rings, aromatic_rings,
        h_donors, h_acceptors)``, in exactly that order:
        molecular weight, logP, number of rotatable bonds, topological polar
        surface area, ring count, aromatic ring count, hydrogen-bond donor
        count and hydrogen-bond acceptor count.

    Raises
    ------
    TypeError
        If ``smiles`` is not a string (for example ``None``, or the NaN that
        pandas produces for an empty CSV cell).
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    # Reject non-text input up front so the error names the offending value
    # instead of surfacing as an opaque failure inside RDKit.
    if not isinstance(smiles, (str, bytes)):
        raise TypeError(
            "SMILES must be a string, got %s: %r" % (type(smiles).__name__, smiles)
        )

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles reports a parse failure by returning None rather than
    # raising; passing that None on to the descriptor functions would fail
    # with a message that does not mention the SMILES at all.
    if mol is None:
        raise ValueError("RDKit could not parse SMILES string: %r" % (smiles,))

    return (
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Descriptors.NumRotatableBonds(mol),
        Descriptors.TPSA(mol),
        Descriptors.RingCount(mol),
        Descriptors.NumAromaticRings(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol),
    )

In [7]:
# Load only the 'smiles' column of the GABAA table; the file is decoded as GB18030.
data_smiles = pd.read_csv(
    'GABAA.csv',
    encoding='gb18030',
)['smiles']

In [13]:
# Load only the 'smiles' column of the plant table; the file is decoded as GB18030.
plant_smiles = pd.read_csv(
    'plant.csv',
    encoding='gb18030',
)['smiles']

# data: descriptors for the SMILES in GABAA.csv

In [8]:
# Column-wise accumulator for the GABAA.csv results: one independent empty
# list per output column. The order of the names is the column order of the
# DataFrame built from this dict.
data = {
    column: []
    for column in (
        'smiles', 'weight', 'logp', 'rotatable_bonds', 'tpsa',
        'rings', 'AroRings', 'h_donors', 'h_acceptors',
    )
}

In [9]:
# Empty every column first so that re-running this cell rebuilds the table
# instead of appending a second copy of every row.
for column_values in data.values():
    column_values.clear()

for smiles in data_smiles:
    # All descriptors are computed before anything is appended, so a SMILES
    # that raises cannot leave the columns with unequal lengths.
    (mol_weight, logp, rotatable_bonds, tpsa,
     rings, AroRings, h_donors, h_acceptors) = calc_mol_properties(smiles)
    data['smiles'].append(smiles)
    data['weight'].append(mol_weight)
    data['logp'].append(logp)
    data['rotatable_bonds'].append(rotatable_bonds)
    data['tpsa'].append(tpsa)
    data['rings'].append(rings)
    data['AroRings'].append(AroRings)
    data['h_donors'].append(h_donors)
    data['h_acceptors'].append(h_acceptors)

In [10]:
# Assemble the accumulated columns into a table; each key of `data` becomes a column.
df_data = pd.DataFrame(data=data)
# Bare expression as the last line of the cell so the notebook renders the frame.
df_data

Unnamed: 0,smiles,weight,logp,rotatable_bonds,tpsa,rings,AroRings,h_donors,h_acceptors
0,C(CC(=O)O)CN,103.121,-0.19010,3,63.32,0,0,2,2
1,CC1CCC(C(C1)O)C(C)C,156.269,2.43950,1,20.23,1,0,1,1
2,CCC1(C(=O)NC(=O)NC1=O)C2=CC=CC=C2,232.239,0.70040,2,75.27,2,1,2,3
3,C1CNCC2C1(O2)C(=O)O,143.142,-0.79810,1,61.86,2,0,2,3
4,CC1=C(CCCl)SC=N1,161.657,2.23282,2,12.89,1,1,0,2
...,...,...,...,...,...,...,...,...,...
399,CCOC(=O)C1=NC=C2C(=C1)C3=C(N2)C=CC(=C3)OCC4=CC...,346.386,4.47180,5,64.21,4,4,1,4
400,CCC1=CC2=C(C=C1)N3C=C(N=C3C=C2OC)C(=O)C4=CC=CC=N4,331.375,3.68450,4,56.49,4,4,0,5
401,CSC1=NC2=NC(=CN2C3=C1CCCC3)C(=O)C4=CC=CC=C4,323.421,3.56100,3,47.26,4,3,0,5
402,CC1C(=O)CC2(C(C)C)CC12,152.237,2.25760,1,17.07,2,0,0,1


In [11]:
# Write the GABAA descriptor table to disk. No `index` argument is passed, so
# pandas' default applies and the row index is written as an unnamed first column.
df_data.to_csv('data_MolecularProperties.csv')

# plant: descriptors for the SMILES in plant.csv

In [14]:
# Column-wise accumulator for the plant.csv results: one independent empty
# list per output column. The order of the names is the column order of the
# DataFrame built from this dict.
plant = {
    column: []
    for column in (
        'smiles', 'weight', 'logp', 'rotatable_bonds', 'tpsa',
        'rings', 'AroRings', 'h_donors', 'h_acceptors',
    )
}

In [15]:
# Empty every column first so that re-running this cell rebuilds the table
# instead of appending a second copy of every row.
for column_values in plant.values():
    column_values.clear()

for smiles in plant_smiles:
    # All descriptors are computed before anything is appended, so a SMILES
    # that raises cannot leave the columns with unequal lengths.
    (mol_weight, logp, rotatable_bonds, tpsa,
     rings, AroRings, h_donors, h_acceptors) = calc_mol_properties(smiles)
    plant['smiles'].append(smiles)
    plant['weight'].append(mol_weight)
    plant['logp'].append(logp)
    plant['rotatable_bonds'].append(rotatable_bonds)
    plant['tpsa'].append(tpsa)
    plant['rings'].append(rings)
    plant['AroRings'].append(AroRings)
    plant['h_donors'].append(h_donors)
    plant['h_acceptors'].append(h_acceptors)

In [16]:
# Assemble the accumulated columns into a table; each key of `plant` becomes a column.
df_plant = pd.DataFrame(data=plant)
# Bare expression as the last line of the cell so the notebook renders the frame.
df_plant

Unnamed: 0,smiles,weight,logp,rotatable_bonds,tpsa,rings,AroRings,h_donors,h_acceptors
0,CC(=O)CCCC(C)CCCC(C)CCCC(C)C,268.485,6.0145,12,17.07,0,0,0,1
1,CC(=O)CC(C)(C)O,116.160,0.7364,2,37.30,0,0,1,2
2,CCCCCO,88.150,1.1689,3,20.23,0,0,1,1
3,CCCCCCCCCCCCCCCCCCCCCCCC,338.664,9.6084,21,0.00,0,0,0,0
4,CCCCCCCCCCCCCCCCCCCCCCC,324.637,9.2183,20,0.00,0,0,0,0
...,...,...,...,...,...,...,...,...,...
2448,C=CC1(C)CC2OC(=O)C(C)=C2CC1C(=C)C(=O)OC,276.332,2.5598,3,52.60,2,0,0,4
2449,CC1(C)C2CC=C(CCO)C1C2,166.264,2.3612,2,20.23,3,0,1,1
2450,CC1C(=O)C2CC1C(C)(C)C2,152.237,2.2576,0,17.07,2,0,0,1
2451,CC(=O)C1CC(CC(=O)O)C1(C)C,184.235,1.7124,3,54.37,1,0,1,2


In [18]:
# Write the plant descriptor table to disk. No `index` argument is passed, so
# pandas' default applies and the row index is written as an unnamed first column.
df_plant.to_csv('plant_MolecularProperties.csv')