In [1]:
import numpy as np
import pandas as pd
import pubchempy as pcp

In [2]:
# Load the source data set and preview its first three rows.
source_csv = '6_good_smiles.csv'
df = pd.read_csv(source_csv)
df.head(3)

Unnamed: 0,DOI,Date,Journal,Title,Name,measurement_error,measurement_wavelength,measurement_method,normalised_name,raw_value,specifier
0,10.1016/S0022-3093(99)00330-0,7/19/1999,Journal of Non-Crystalline Solids,PHOTOINDUCEDCHANGESINLINEARNONLINEAROPTICALPRO...,As20S60Ge20,0.0,,el_cde_tables,"[['As', 20.0], ['Ge', 20.0], ['S', 60.0]]",2.054,n
1,10.1016/j.fct.2006.05.017,6/7/2006,Food and Chemical Toxicology,GENERATIONFORMALDEHYDEINCIGARETTESOVERVIEWRECE...,Propionaldehyde,0.0,,el_cde_tables,CCC=O,3.74,n
2,10.1016/j.mee.2011.01.031,1/18/2011,Microelectronic Engineering,SINUSOIDALPLASMONICCRYSTALSFORBIODETECTIONSENSORS,PEO,0.0,,el_cde_text,OO,1.47,refractive index


In [3]:
# Names of the PubChem properties to download for every compound.
properties = [
    # formula, mass and identifiers
    'MolecularFormula',
    'MolecularWeight',
    'InChI',
    'InChIKey',
    'IUPACName',
    # scalar descriptors
    'XLogP',
    'ExactMass',
    'MonoisotopicMass',
    'TPSA',
    'Complexity',
    'Charge',
    # atom / bond counts
    'HBondDonorCount',
    'HBondAcceptorCount',
    'RotatableBondCount',
    'HeavyAtomCount',
    'IsotopeAtomCount',
    # stereo-centre counts
    'AtomStereoCount',
    'DefinedAtomStereoCount',
    'UndefinedAtomStereoCount',
    'BondStereoCount',
    'DefinedBondStereoCount',
    'UndefinedBondStereoCount',
    'CovalentUnitCount',
    # properties with the "3D" suffix
    'Volume3D',
    'XStericQuadrupole3D',
    'YStericQuadrupole3D',
    'ZStericQuadrupole3D',
    'FeatureCount3D',
    'FeatureAcceptorCount3D',
    'FeatureDonorCount3D',
    'FeatureAnionCount3D',
    'FeatureCationCount3D',
    'FeatureRingCount3D',
    'FeatureHydrophobeCount3D',
    'ConformerModelRMSD3D',
    'EffectiveRotorCount3D',
    'ConformerCount3D',
]

In [9]:
# Number of descriptors requested from PubChem.
len(properties)

37

In [4]:
# Placeholder record used for SMILES strings that could not be looked up:
# every requested property name maps to NaN.
nans_dict = dict.fromkeys(properties, np.nan)

In [5]:
# Download the descriptors from PubChem, one request per input string.
# `data` gets exactly one entry per row of `df`, in the same order; each entry
# is a non-empty list whose first element is either the property dict returned
# by PubChem or the all-NaN placeholder `nans_dict`.
data = []
failed_smiles = []  # inputs for which no properties could be retrieved

for smiles in df['normalised_name']:
    try:
        props = pcp.get_properties(properties, smiles, 'smiles')
    except Exception:
        # Best-effort download: a failed lookup must not abort the whole run.
        # `Exception` (rather than a bare `except:`) keeps KeyboardInterrupt
        # and SystemExit working, so the long loop can still be stopped.
        props = []

    if not props:
        # Covers both a raised error and an empty result list, which would
        # otherwise break the later `data[i][0]` access.
        failed_smiles.append(smiles)
        props = [nans_dict]

    data.append(props)

In [6]:
# Convert the downloaded records into a DataFrame.
# Each entry of `data` is a list whose first element is a dict of properties;
# an empty entry is treated as "nothing found" and becomes an all-NaN row.
records = [entry[0] if entry else {} for entry in data]

# Build the frame from dicts so values are aligned by KEY, not by position:
# the NaN placeholder has no 'CID' key and a record may lack a property, so
# positional `.values()` rows would put values into the wrong columns.
props_df = pd.DataFrame(records)

# Fixed, reproducible column order that does not depend on whether any
# particular row's lookup succeeded: 'CID' first, then the requested
# properties, then any unexpected extra keys so that nothing is dropped.
expected_columns = ['CID', *properties]
extra_columns = [col for col in props_df.columns if col not in expected_columns]
props_df = props_df.reindex(columns=expected_columns + extra_columns)

# One descriptor row per input row is required for the column-wise concat.
assert len(props_df) == len(df), "descriptor rows do not match input rows"
props_df.shape

(4663, 38)

In [7]:
# Put the source columns and the downloaded descriptors side by side.
frames = (df, props_df)
df_result = pd.concat(frames, axis='columns')
df_result.shape

(4663, 49)

In [8]:
# Save the combined table without writing the row index.
output_csv = 'db_descriptors_PubChem.csv'
df_result.to_csv(output_csv, index=False)