In [1]:
import numpy as np
import pandas as pd
import pubchempy as pcp

In [2]:
# загружаем исходные данные
df = pd.read_csv('6_good_smiles.csv')
df.head(3)

Unnamed: 0,DOI,Date,Journal,Title,Name,measurement_error,measurement_wavelength,measurement_method,normalised_name,raw_value,specifier
0,10.1016/S0022-3093(99)00330-0,7/19/1999,Journal of Non-Crystalline Solids,PHOTOINDUCEDCHANGESINLINEARNONLINEAROPTICALPRO...,As20S60Ge20,0.0,,el_cde_tables,"[['As', 20.0], ['Ge', 20.0], ['S', 60.0]]",2.054,n
1,10.1016/j.fct.2006.05.017,6/7/2006,Food and Chemical Toxicology,GENERATIONFORMALDEHYDEINCIGARETTESOVERVIEWRECE...,Propionaldehyde,0.0,,el_cde_tables,CCC=O,3.74,n
2,10.1016/j.mee.2011.01.031,1/18/2011,Microelectronic Engineering,SINUSOIDALPLASMONICCRYSTALSFORBIODETECTIONSENSORS,PEO,0.0,,el_cde_text,OO,1.47,refractive index


In [3]:
# задаем свойства для выгрузки
properties = ['MolecularFormula', 'MolecularWeight','InChI', 'InChIKey', 'IUPACName', 
                                'XLogP', 'ExactMass', 'MonoisotopicMass', 'TPSA', 'Complexity', 'Charge', 
                                'HBondDonorCount', 'HBondAcceptorCount', 'RotatableBondCount', 
                                'HeavyAtomCount', 'IsotopeAtomCount', 'AtomStereoCount', 
                                'DefinedAtomStereoCount', 'UndefinedAtomStereoCount', 'BondStereoCount', 
                                'DefinedBondStereoCount', 'UndefinedBondStereoCount', 'CovalentUnitCount', 
                                'Volume3D', 'XStericQuadrupole3D', 'YStericQuadrupole3D', 
                                'ZStericQuadrupole3D', 'FeatureCount3D', 'FeatureAcceptorCount3D', 
                                'FeatureDonorCount3D', 'FeatureAnionCount3D', 'FeatureCationCount3D', 
                                'FeatureRingCount3D', 'FeatureHydrophobeCount3D', 'ConformerModelRMSD3D', 
                                'EffectiveRotorCount3D', 'ConformerCount3D']

In [None]:
# количество дескрипторов
len(properties)

In [4]:
# задаем пустой словарь для ненайденных смайлс
nans_dict = {}
for p in properties:
    nans_dict[p] = np.nan

In [5]:
# выгружаем дескрипторы
data = []

for i in df['normalised_name']:
    try:
        props = pcp.get_properties(properties, i, 'smiles')
        data.append(props)
    except:
        
        data.append([nans_dict])

In [None]:
# переводим данные в датафрейм
rows = []
columns = data[1][0].keys()
for i in range(len(df['normalised_name'])):
    rows.append(data[i][0].values())
props_df = pd.DataFrame(data=rows, columns=columns) 
props_df.shape

Unnamed: 0,CID,MolecularFormula,MolecularWeight,InChI,InChIKey,IUPACName,XLogP,ExactMass,MonoisotopicMass,TPSA,...,FeatureCount3D,FeatureAcceptorCount3D,FeatureDonorCount3D,FeatureAnionCount3D,FeatureCationCount3D,FeatureRingCount3D,FeatureHydrophobeCount3D,ConformerModelRMSD3D,EffectiveRotorCount3D,ConformerCount3D
0,,,,,,,,,,,...,,,,,,,,,,
1,527.0,C3H6O,58.08,"InChI=1S/C3H6O/c1-2-3-4/h3H,2H2,1H3",NBBJYMSMWIIQGU-UHFFFAOYSA-N,propanal,0.6,58.041864811,58.041864811,17.1,...,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.4,1.0,3.0
2,784.0,H2O2,34.015,InChI=1S/H2O2/c1-2/h1-2H,MHAJPDPJQMAIIY-UHFFFAOYSA-N,hydrogen peroxide,-0.9,34.005479302,34.005479302,40.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,1.0
3,5463166.0,AlH2,28.997,InChI=1S/Al.2H,AYENPGKIIRJZBT-UHFFFAOYSA-N,28.9971885,28.9971885,0.0,0.0,0.0,...,,,,,,,,,,
4,14769.0,Al2O3,101.961,InChI=1S/2Al.3O,TWNQGVIAIRXVLR-UHFFFAOYSA-N,oxo(oxoalumanyloxy)alumane,101.947821,101.947821,43.4,34.0,...,,,,,,,,,,
5,18330814.0,H4N3P3,138.974,"InChI=1S/H4N3P3/c1-4-2-6-3-5-1/h1,4-5H,(H,2,3)",NXDBMYTZSNQBEW-UHFFFAOYSA-N,"1,2,3,4-tetrahydro-1,3,5,2,4,6-triazatriphosph...",0.0,138.96180813,138.96180813,36.4,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.4,0.0,2.0
6,11039.0,C5H10O2,102.13,"InChI=1S/C5H10O2/c1-4(2)5(6)7-3/h4H,1-3H3",BHIWKHZACMWKOJ-UHFFFAOYSA-N,methyl 2-methylpropanoate,1.2,102.068079557,102.068079557,26.3,...,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.4,2.0,3.0
7,14769.0,Al2O3,101.961,InChI=1S/2Al.3O,TWNQGVIAIRXVLR-UHFFFAOYSA-N,oxo(oxoalumanyloxy)alumane,101.947821,101.947821,43.4,34.0,...,,,,,,,,,,
8,4694097.0,CH3O4S-,111.1,"InChI=1S/CH4O4S/c1-5-6(2,3)4/h1H3,(H,2,3,4)/p-1",JZMJDSHXVKJFKW-UHFFFAOYSA-M,methyl sulfate,-0.9,110.97520475,110.97520475,74.8,...,4.0,3.0,0.0,1.0,0.0,0.0,0.0,0.4,1.0,1.0
9,23953.0,H4Si,32.117,InChI=1S/H4Si/h1H4,BLRPTPMANUNPDV-UHFFFAOYSA-N,silane,32.008226662,32.008226662,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,1.0,


In [None]:
# объединяем датафреймы
df_result = pd.concat([df, props_df], axis=1)
df_result.shape

Unnamed: 0,DOI,Date,Journal,Title,Name,measurement_error,measurement_wavelength,measurement_method,normalised_name,raw_value,...,FeatureCount3D,FeatureAcceptorCount3D,FeatureDonorCount3D,FeatureAnionCount3D,FeatureCationCount3D,FeatureRingCount3D,FeatureHydrophobeCount3D,ConformerModelRMSD3D,EffectiveRotorCount3D,ConformerCount3D
0,10.1016/S0022-3093(99)00330-0,7/19/1999,Journal of Non-Crystalline Solids,PHOTOINDUCEDCHANGESINLINEARNONLINEAROPTICALPRO...,As20S60Ge20,0.0,,el_cde_tables,"[['As', 20.0], ['Ge', 20.0], ['S', 60.0]]",2.054,...,,,,,,,,,,
1,10.1016/j.fct.2006.05.017,6/7/2006,Food and Chemical Toxicology,GENERATIONFORMALDEHYDEINCIGARETTESOVERVIEWRECE...,Propionaldehyde,0.0,,el_cde_tables,CCC=O,3.74,...,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.4,1.0,3.0
2,10.1016/j.mee.2011.01.031,1/18/2011,Microelectronic Engineering,SINUSOIDALPLASMONICCRYSTALSFORBIODETECTIONSENSORS,PEO,0.0,,el_cde_text,OO,1.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,1.0
3,10.1016/j.apradiso.2012.04.026,5/2/2012,Applied Radiation and Isotopes,BENCHMARKINGGEANT4FULLSYSTEMSIMULATIONASSOCIAT...,Aluminum,0.0,,el_cde_tables,[AlH2],4.4,...,,,,,,,,,,
4,10.1016/S0924-4247(02)00264-9,9/16/2002,Sensors and Actuators A: Physical,INFLUENCESMATERIALPROPERTIESCERAMICMICROSTEREO...,Alumina,0.0,,el_mylogic,O=[Al]O[Al]=O,1.7,...,,,,,,,,,,
5,10.1039/B926069B,5/25/2010,Dalton Transactions,Synthesis and optical properties of sulfur -co...,cyclotriphosphazene,0.0,,rsc_cde_text,N1PNP=NP1,1.9,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.4,0.0,2.0
6,10.1016/j.snb.2010.10.036,10/30/2010,Sensors and Actuators B: Chemical,EMISSIONBASEDSUBNANOMOLARSILVERSENSINGELECTROS...,PMMA,0.0,1 μm,el_mylogic,COC(=O)[C](C)C,1.4893,...,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.4,2.0,3.0
7,10.1016/j.optmat.2016.01.060,2/4/2016,Optical Materials,STUDIESALUMINUMOXIDETHINFILMSDEPOSITEDBYLASERA...,Al2O3,0.0,,el_mylogic,O=[Al]O[Al]=O,1.7685,...,,,,,,,,,,
8,10.1016/j.jct.2010.10.024,11/5/2010,The Journal of Chemical Thermodynamics,THERMOPHYSICALPROPERTIESTWOIONICLIQUIDSBASEDBE...,CH3SO4,0.0,,el_cde_tables,COS(=O)(=O)[O-],1.52744,...,4.0,3.0,0.0,1.0,0.0,0.0,0.0,0.4,1.0,1.0
9,10.1016/S0379-6779(99)00318-5,6/6/2000,Synthetic Metals,ELLIPSOMETRICATOMICFORCEMICROSCOPICINVESTIGATI...,Silicon,0.0,,el_cde_tables,[SiH4],1.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,1.0,


In [None]:
df_result.to_csv('db_descriptors_PubChem.csv', index=False)