In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

In [2]:
# загружаем исходные данные
df = pd.read_csv('6_good_smiles.csv')
df.head(3)

Unnamed: 0,DOI,Date,Journal,Title,Name,measurement_error,measurement_wavelength,measurement_method,normalised_name,raw_value,specifier
0,10.1016/S0022-3093(99)00330-0,7/19/1999,Journal of Non-Crystalline Solids,PHOTOINDUCEDCHANGESINLINEARNONLINEAROPTICALPRO...,As20S60Ge20,0.0,,el_cde_tables,"[['As', 20.0], ['Ge', 20.0], ['S', 60.0]]",2.054,n
1,10.1016/j.fct.2006.05.017,6/7/2006,Food and Chemical Toxicology,GENERATIONFORMALDEHYDEINCIGARETTESOVERVIEWRECE...,Propionaldehyde,0.0,,el_cde_tables,CCC=O,3.74,n
2,10.1016/j.mee.2011.01.031,1/18/2011,Microelectronic Engineering,SINUSOIDALPLASMONICCRYSTALSFORBIODETECTIONSENSORS,PEO,0.0,,el_cde_text,OO,1.47,refractive index


In [3]:
# добавляем в датафрем колонку mol файлов, для тех соединений, для которых это возможно сделать
mols = [Chem.MolFromSmiles(smi) for smi in df['normalised_name'].tolist()]
df['mol_file'] = mols

[00:57:16] SMILES Parse Error: syntax error while parsing: [['As',
[00:57:16] SMILES Parse Error: Failed parsing SMILES '[['As',' for input: '[['As','
[00:57:16] SMILES Parse Error: syntax error while parsing: [['C',
[00:57:16] SMILES Parse Error: Failed parsing SMILES '[['C',' for input: '[['C','
[00:57:16] SMILES Parse Error: syntax error while parsing: [['Er',
[00:57:16] SMILES Parse Error: Failed parsing SMILES '[['Er',' for input: '[['Er','
[00:57:16] SMILES Parse Error: syntax error while parsing: [['Li',
[00:57:16] SMILES Parse Error: Failed parsing SMILES '[['Li',' for input: '[['Li','
[00:57:16] SMILES Parse Error: syntax error while parsing: [['As',
[00:57:16] SMILES Parse Error: Failed parsing SMILES '[['As',' for input: '[['As','
[00:57:16] SMILES Parse Error: syntax error while parsing: [['Cu',
[00:57:16] SMILES Parse Error: Failed parsing SMILES '[['Cu',' for input: '[['Cu','
[00:57:16] SMILES Parse Error: syntax error while parsing: [['He',
[00:57:16] SMILES Parse Error:

In [4]:
# разбиваем датафрейм на части, которые содержат mol file и None
df_nans = df.loc[df.mol_file.isna()]
df_yes = df.loc[df.mol_file.notna()]

In [7]:
# количество дескрипторов 
print(len(Descriptors._descList))

208


In [8]:
# зададим вычислитель дескрипторов
calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])

In [9]:
# имена дескрипторов
desc_names = calc.GetDescriptorNames()

In [10]:
# выгрузим дескрипторы и создадим датафрейм
mol_descriptors = []

for mol in df_yes['mol_file']:
        descriptors = calc.CalcDescriptors(mol)
        mol_descriptors.append(descriptors)

df_molecular_desc = pd.DataFrame(mol_descriptors, columns=desc_names)
df_molecular_desc.shape



(3908, 208)

In [18]:
# объединяем датафреймы
df_with_desc = pd.concat([df_yes.reset_index(drop=True), df_molecular_desc], axis=1)

In [19]:
df_with_desc.shape

(3908, 220)

In [20]:
df_nans.reset_index(drop=True)

Unnamed: 0,DOI,Date,Journal,Title,Name,measurement_error,measurement_wavelength,measurement_method,normalised_name,raw_value,specifier,mol_file
0,10.1016/S0022-3093(99)00330-0,7/19/1999,Journal of Non-Crystalline Solids,PHOTOINDUCEDCHANGESINLINEARNONLINEAROPTICALPRO...,As20S60Ge20,0.000,,el_cde_tables,"[['As', 20.0], ['Ge', 20.0], ['S', 60.0]]",2.054,n,
1,10.1039/C5PY01315A,12/22/2015,Polymer Chemistry,Ionic hydrogen bond donor organocatalyst for f...,OCH2CH2 –,0.000,,rsc_cde_text,"[['C', 2.0], ['H', 4.0], ['O', 1.0]]",4.24,n,
2,10.1016/j.ijleo.2014.07.129,9/12/2014,Optik,EVALUATIONOPTICALPARAMETERSTELLURITEGLASSES,WO3-xEr2O3,0.000,,el_mylogic,"[['Er', 2.0], ['O', 6.0], ['W', 1.0]]",1.988,n,
3,10.1016/j.jnoncrysol.2016.11.005,11/27/2016,Journal of Non-Crystalline Solids,CALCULATIONPHYSICALPROPERTIESGLASSVIAPHASEDIAG...,Li2O-SiO2,0.000,,el_mylogic,"[['Li', 2.0], ['O', 2.0], ['O-', 1.0], ['Si', ...",2.3443,Refractive index (nD),
4,10.1016/S1359-0286(02)00009-8,3/4/2002,Current Opinion in Solid State and Materials S...,INORGANICGLASSESKERRLIKEMEDIA,As2S3–GeS2–Sb2S3,0.000,,el_cde_tables,"[['As', 2.0], ['Ge', 1.0], ['S', 8.0], ['Sb', ...",∼2.4,Refractive index,
...,...,...,...,...,...,...,...,...,...,...,...,...
750,10.1016/S0167-2738(98)00091-5,12/10/2002,Solid State Ionics,STRUCTURALREFRACTOMETRYAL2O3TYPEALUMINATESGALL...,GaO1.5,0.015,,el_mylogic,"[['Ga', 1.0], ['O', 1.5]]","1.92, 1.95",n,
751,10.1016/j.solmat.2011.09.052,10/13/2011,Solar Energy Materials and Solar Cells,EFFECTPECVDSILICONOXYNITRIDEFILMCOMPOSITIONSUR...,SiON,0.000,,el_mylogic,"[['N', 1.0], ['O', 1.0], ['Si', 1.0]]",2.01,n,
752,10.1016/j.tsf.2015.08.062,9/11/2015,Thin Solid Films,EFFECTO2ARGASFLOWRATIOOPTICALPROPERTIESMECHANI...,HfO2,0.000,550 nm,el_mylogic,"[['Hf', 1.0], ['O', 2.0]]",1.96,Refractive index (at 550 nm),
753,10.1007/s00340-016-6537-2,,,,GaAs,0.000,,snowball,"[['As', 1.0], ['Ga', 1.0]]",3.52,n,


In [21]:
df_result = pd.concat([df_with_desc, df_nans.reset_index(drop=True)])

In [22]:
df_result.shape

(4663, 220)

In [23]:
df_result.to_csv('db_descriptors_RDKit.csv', index=False)