План:
1. Для набора данных получить дескрипторов из источников: RDKit, Pubchem, Mordred и т. д. 
2. Выбрать признаки из загруженных.


# Подгружаем дескрипторы

In [1]:
!pip install mordred

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray
from rdkit.Chem import Descriptors
from mordred import Calculator, descriptors
import numpy as np

In [3]:
# Прочитать датасет
dataset = pd.read_csv('sampled_dataset.csv')



In [4]:
dataset

Unnamed: 0,mol_id,smiles,A,B,C,mu,alpha,homo,lumo,gap,r2,zpve,u0,u298,h298,g298,cv
0,gdb_129252,N=C1OCC2=C1ON=C2,3.42254,1.67692,1.13352,4.5643,67.77,-0.2620,-0.0572,0.2048,972.8206,0.091971,-452.735916,-452.729598,-452.728654,-452.766782,24.110
1,gdb_46207,O=C1CCC11C2CCN12,3.00351,1.58602,1.24538,1.9055,77.52,-0.2402,-0.0312,0.2091,1039.1092,0.147846,-401.909060,-401.901322,-401.900377,-401.941536,29.799
2,gdb_50593,O=CC1CCC2OCC12,3.11416,1.26756,1.03765,1.6498,75.15,-0.2483,-0.0267,0.2215,1166.1361,0.160480,-423.030316,-423.022409,-423.021465,-423.063428,29.829
3,gdb_38546,C1C2C3N=C4OC1C4C23,3.12126,2.32028,1.94917,3.6237,69.65,-0.2277,-0.0211,0.2066,763.5215,0.127690,-400.695008,-400.689538,-400.688594,-400.724434,24.179
4,gdb_62820,CC1(O)C2CC1(O2)C#C,2.37047,1.54170,1.35971,1.7895,76.02,-0.2495,0.0074,0.2569,1039.3059,0.133262,-421.725719,-421.717304,-421.716360,-421.758310,33.112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,gdb_25229,c1coc2c1C(=O)CC2,3.17605,1.63047,1.09187,3.0955,73.12,-0.2383,-0.0256,0.2127,1033.0148,0.115102,-420.687271,-420.680504,-420.679560,-420.718717,26.087
19996,gdb_13572,CC1CN1C(=N)C#N,3.12217,1.77668,1.34512,3.1791,69.13,-0.2612,-0.0537,0.2075,927.9641,0.118744,-358.830448,-358.822497,-358.821553,-358.863014,28.421
19997,gdb_112381,COC1CC(=N)OC1=O,3.40512,1.21702,0.92748,2.1467,67.72,-0.2631,-0.0214,0.2416,1215.9934,0.123752,-475.063357,-475.054945,-475.054001,-475.097078,30.166
19998,gdb_14107,OC1CC11OCC1O,3.22232,1.74690,1.53428,1.4181,60.90,-0.2393,0.0544,0.2937,882.3828,0.129128,-420.823950,-420.816206,-420.815262,-420.855963,29.183


# юзаем mordred и rdkit

In [5]:
# Создание DataFrame из столбца "smiles"
df = dataset[['smiles']]

# Добавление дескрипторов с помощью RDKit
descriptors_rdkit = []
for smile in df['smiles']:
    mol = Chem.MolFromSmiles(smile)
    if mol is not None:
        descriptor_values = [d(mol) for _, d in Descriptors._descList]
        descriptors_rdkit.append(descriptor_values)
    else:
        descriptors_rdkit.append([None] * len(Descriptors._descList))

descriptor_names = [x[0] for x in Descriptors._descList]
df_rdkit = pd.DataFrame(descriptors_rdkit, columns=descriptor_names)

# Добавление данных Mordred
calc = Calculator(descriptors, ignore_3D=True)
mols = [Chem.MolFromSmiles(smi) for smi in df.smiles]
df_mordred = calc.pandas(mols)

# Объединение исходного DataFrame с новыми данными
df = pd.concat([df, df_rdkit, df_mordred], axis=1)

# Объединение с исходным датасетом по столбцу "smiles"
df = pd.merge(dataset, df, on='smiles', how='inner')

# В итоговом DataFrame df сохранятся все столбцы из исходного датасета dataset

# 1840 столбцов звучит хайпово

In [7]:
df

Unnamed: 0,mol_id,smiles,A,B,C,mu,alpha,homo,lumo,gap,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,gdb_129252,N=C1OCC2=C1ON=C2,3.42254,1.67692,1.13352,4.5643,67.77,-0.2620,-0.0572,0.2048,...,9.030137,54.421098,124.027277,9.540560,77,9,48.0,57.0,2.583333,1.972222
1,gdb_46207,O=C1CCC11C2CCN12,3.00351,1.58602,1.24538,1.9055,77.52,-0.2402,-0.0312,0.2091,...,10.358473,64.099976,123.068414,6.837134,78,9,60.0,82.0,2.395833,1.819444
2,gdb_50593,O=CC1CCC2OCC12,3.11416,1.26756,1.03765,1.6498,75.15,-0.2483,-0.0267,0.2215,...,9.399389,53.862370,126.068080,6.635162,82,9,48.0,58.0,2.583333,2.055556
3,gdb_38546,C1C2C3N=C4OC1C4C23,3.12126,2.32028,1.94917,3.6237,69.65,-0.2277,-0.0211,0.2066,...,10.316094,64.041419,121.052764,7.565798,68,8,66.0,90.0,1.416667,1.666667
4,gdb_62820,CC1(O)C2CC1(O2)C#C,2.37047,1.54170,1.35971,1.7895,76.02,-0.2495,0.0074,0.2569,...,10.682377,42.876740,124.052429,7.297202,78,10,56.0,74.0,3.986111,1.854167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19999,gdb_25229,c1coc2c1C(=O)CC2,3.17605,1.63047,1.09187,3.0955,73.12,-0.2383,-0.0256,0.2127,...,9.030137,54.421098,122.036779,8.135785,77,9,48.0,57.0,2.583333,1.972222
20000,gdb_13572,CC1CN1C(=N)C#N,3.12217,1.77668,1.34512,3.1791,69.13,-0.2612,-0.0537,0.2075,...,8.885579,56.024874,109.063997,7.270933,66,7,38.0,44.0,3.833333,1.888889
20001,gdb_112381,COC1CC(=N)OC1=O,3.40512,1.21702,0.92748,2.1467,67.72,-0.2631,-0.0214,0.2416,...,8.720787,51.566550,129.042593,8.065162,85,9,42.0,47.0,4.083333,2.111111
20002,gdb_14107,OC1CC11OCC1O,3.22232,1.74690,1.53428,1.4181,60.90,-0.2393,0.0544,0.2937,...,9.889845,60.114861,116.047344,7.252959,58,7,48.0,62.0,3.034722,1.666667


In [8]:
# Сохранение в CSV-файл
df.to_csv('dataset_with_descriptors1.csv', index=False)


## Вроде всё гуд