In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.5


In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem, Draw, Descriptors
from rdkit.Chem.Draw import IPythonConsole
from sklearn.preprocessing import FunctionTransformer

In [None]:
# Load the source spreadsheet into `data` and preview its first rows.
data = pd.read_excel('/content/All-1614.xlsx')
data.head()

Unnamed: 0,Title,"IC50, mmg/ml","CC50-MDCK, mmg/ml",SI,Molecular weight,Hydrogen bond acceptors,Hydrogen bond donors,Polar SA,SMILES,Pictures
0,1007-Ya-213,2.7,500.0,185.185185,195.307,2,1,32.59,OCC\N=C(\[C@]12C)C[C@@H](C1(C)C)CC2,50.0
1,1007-Ya-213,0.7,447.0,638.571429,195.307,2,1,32.59,OCC\N=C(\[C@]12C)C[C@@H](C1(C)C)CC2,51.0
2,1008-Ya-187,9.9,144.0,14.545455,250.431,1,0,15.6,CCN(CC)CC\N=C(\[C@@]12C)C[C@H](C1(C)C)CC2,52.0
3,1009-As-106,8.3,500.0,60.240964,222.377,1,0,15.6,CN(C)CC\N=C(\[C@@]12C)C[C@H](C1(C)C)CC2,53.0
4,1010-Ya-208,39.4,143.0,3.629442,239.361,2,0,29.54,CN(C)CC(=O)O[C@H]1C[C@H](CC2)C(C)(C)[C@@]12C,54.0


In [None]:
# Show the column labels of the loaded frame.
data.columns

Index(['Title', 'IC50, mmg/ml', 'CC50-MDCK, mmg/ml', 'SI', 'Molecular weight',
       'Hydrogen bond acceptors', 'Hydrogen bond donors', 'Polar SA', 'SMILES',
       'Pictures'],
      dtype='object')

In [None]:
# Keep only the Title, SMILES, IC50, CC50-MDCK and SI columns.
data = data[['Title','SMILES', 'IC50, mmg/ml', 'CC50-MDCK, mmg/ml', 'SI']]


In [None]:
# Load the second data set under its own name. Rebinding `data` here would
# replace the frame read from All-1614.xlsx, and this CSV only has the columns
# 'id' and 'smiles', so later cells that read data['SMILES'] would fail when
# the notebook is run top to bottom.
test_data = pd.read_csv('/content/test_data.csv')
test_data.head()

Unnamed: 0,id,smiles
0,OV-80,CC1(C)[C@@H]2CC[C@@]1(C)\C(C2)=N/NC(=O)CN1CCCCC1
1,OV-81,CC1(C)[C@@H]2CC[C@@]1(C)\C(C2)=N/NC(=O)CN1CCC(...
2,OV-82,CC1(C)[C@@H]2CC[C@@]1(C)\C(C2)=N/NC(=O)CN1CCOCC1
3,OV-84,CC1CCN(CC(=O)N\N=C2\C[C@H]3CC[C@]2(C)C3(C)C)CC1
4,OV-85,CC1CCCN(CC(=O)N\N=C2\C[C@H]3CC[C@]2(C)C3(C)C)C1


In [None]:
def mol_dsc_calc(mols):
    """Compute the descriptors listed in ``descriptors`` for each SMILES string.

    Parameters
    ----------
    mols : iterable of str
        SMILES strings, for example a ``pandas.Series``.

    Returns
    -------
    pandas.DataFrame
        One row per input SMILES and one column per key of the module-level
        ``descriptors`` mapping, in that mapping's order. When ``mols`` is a
        ``pandas.Series`` its index is reused, so ``DataFrame.join`` lines the
        rows up with the frame the Series came from; any other iterable gets
        the default integer index.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed (``Chem.MolFromSmiles`` returned
        ``None``).
    """
    rows = []
    for smiles in mols:
        # Parse once per molecule instead of once per descriptor.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"Cannot parse SMILES: {smiles!r}")
        rows.append({name: calc(mol) for name, calc in descriptors.items()})

    # Only a Series carries row labels; list.index is a method, not labels.
    index = mols.index if isinstance(mols, pd.Series) else None
    # Passing the column list keeps the columns present for empty input too.
    return pd.DataFrame(rows, columns=list(descriptors), index=index)

# Constitutional and physico-chemical descriptors taken from the RDKit library.
# Keys become the column names produced by mol_dsc_calc; values are the
# RDKit functions called on each molecule.
descriptors = {"HeavyAtomCount": Descriptors.HeavyAtomCount,
               "NHOHCount": Descriptors.NHOHCount,
               "NOCount": Descriptors.NOCount,
               "NumHAcceptors": Descriptors.NumHAcceptors,
               "NumHDonors": Descriptors.NumHDonors,
               "NumHeteroatoms": Descriptors.NumHeteroatoms,
               "NumRotatableBonds": Descriptors.NumRotatableBonds,
               "NumValenceElectrons": Descriptors.NumValenceElectrons,
               "NumAromaticRings": Descriptors.NumAromaticRings,
               "NumAliphaticHeterocycles": Descriptors.NumAliphaticHeterocycles,
               "RingCount": Descriptors.RingCount,
               "MW": Descriptors.MolWt,
               "LogP": Descriptors.MolLogP,
               "MR": Descriptors.MolMR,
               "TPSA": Descriptors.TPSA}

# sklearn transformer wrapping mol_dsc_calc, for use in pipeline-based modelling
descriptors_transformer = FunctionTransformer(mol_dsc_calc)
# Compute the descriptor table for the SMILES column and preview it.
X = descriptors_transformer.transform(data['SMILES'])
X.head()

Unnamed: 0,HeavyAtomCount,NHOHCount,NOCount,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumAliphaticHeterocycles,RingCount,MW,LogP,MR,TPSA
0,14,1,2,2,1,2,2,80,0,0,2,195.306,2.2659,58.6168,32.59
1,14,1,2,2,1,2,2,80,0,0,2,195.306,2.2659,58.6168,32.59
2,18,0,2,2,0,2,5,104,0,0,2,250.43,3.6154,79.319,15.6
3,16,0,2,2,0,2,3,92,0,0,2,222.376,2.8352,70.085,15.6
4,17,0,3,3,0,3,3,98,0,0,2,239.359,2.306,67.663,29.54


In [None]:
# Append the descriptor columns to the source frame; join matches rows by index.
data_dsc = data.join(X)

In [None]:
# Preview the frame with the descriptor columns attached.
data_dsc.head()

Unnamed: 0,Title,SMILES,"IC50, mmg/ml","CC50-MDCK, mmg/ml",SI,HeavyAtomCount,NHOHCount,NOCount,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumAliphaticHeterocycles,RingCount,MW,LogP,MR,TPSA
0,1007-Ya-213,OCC\N=C(\[C@]12C)C[C@@H](C1(C)C)CC2,2.7,500.0,185.185185,14,1,2,2,1,2,2,80,0,0,2,195.306,2.2659,58.6168,32.59
1,1007-Ya-213,OCC\N=C(\[C@]12C)C[C@@H](C1(C)C)CC2,0.7,447.0,638.571429,14,1,2,2,1,2,2,80,0,0,2,195.306,2.2659,58.6168,32.59
2,1008-Ya-187,CCN(CC)CC\N=C(\[C@@]12C)C[C@H](C1(C)C)CC2,9.9,144.0,14.545455,18,0,2,2,0,2,5,104,0,0,2,250.43,3.6154,79.319,15.6
3,1009-As-106,CN(C)CC\N=C(\[C@@]12C)C[C@H](C1(C)C)CC2,8.3,500.0,60.240964,16,0,2,2,0,2,3,92,0,0,2,222.376,2.8352,70.085,15.6
4,1010-Ya-208,CN(C)CC(=O)O[C@H]1C[C@H](CC2)C(C)(C)[C@@]12C,39.4,143.0,3.629442,17,0,3,3,0,3,3,98,0,0,2,239.359,2.306,67.663,29.54


In [None]:
# Write the descriptor table to data_dsc.xlsx (path relative to the working directory).
data_dsc.to_excel('data_dsc.xlsx')

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors, AllChem


def rdkit_fp(smiles_column: pd.Series, radius=3, nBits=2048, useChirality=False):
    """Build Morgan fingerprint bit columns for a Series of SMILES strings.

    Parameters
    ----------
    smiles_column : pandas.Series
        SMILES strings, one molecule per row.
    radius : int, default 3
        Passed to ``AllChem.GetMorganFingerprintAsBitVect`` as ``radius``.
    nBits : int, default 2048
        Fingerprint length; also the number of output columns.
    useChirality : bool, default False
        Passed to ``AllChem.GetMorganFingerprintAsBitVect`` as ``useChirality``.

    Returns
    -------
    pandas.DataFrame
        One row per SMILES and ``nBits`` columns named ``bit_id_0`` ...
        ``bit_id_{nBits-1}`` holding the fingerprint bits. The frame carries
        the index of ``smiles_column`` so it can be joined back onto the
        source frame row by row.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed (``Chem.MolFromSmiles`` returned
        ``None``).
    """
    def desc_gen(smiles):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"Cannot parse SMILES: {smiles!r}")
        # ConvertToNumpyArray fills this array in place, resizing it to the
        # fingerprint length, so the initial size is only a placeholder.
        bit_vec = np.zeros((1,), np.int16)
        DataStructs.ConvertToNumpyArray(
            AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits, useChirality=useChirality),
            bit_vec)
        return bit_vec

    columns = [f'bit_id_{i}' for i in range(nBits)]
    # Iterate over the values rather than handing the Series to from_records,
    # which looks up element 0 by label and so depends on the Series index.
    fingerprints = [desc_gen(smiles) for smiles in smiles_column]
    if fingerprints:
        bits = np.vstack(fingerprints)
    else:
        # np.vstack rejects an empty list; keep the column layout for empty input.
        bits = np.zeros((0, nBits), np.int16)
    return pd.DataFrame(bits, columns=columns, index=smiles_column.index)


def rdkit_2d(smiles_column: pd.Series):
    """Compute every descriptor in ``Descriptors._descList`` for each SMILES.

    Parameters
    ----------
    smiles_column : pandas.Series
        SMILES strings, one molecule per row. Any iterable of strings is
        accepted.

    Returns
    -------
    pandas.DataFrame
        One row per SMILES and one column per entry of
        ``Descriptors._descList``, named after the entry's first element.
        When the input is a ``pandas.Series`` its index is reused so the
        result can be joined back onto the source frame row by row.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed (``Chem.MolFromSmiles`` returned
        ``None``).
    """
    # Each entry is indexed as (name, function).
    calculators = {entry[0]: entry[1] for entry in Descriptors._descList}

    rows = []
    for smiles in smiles_column:
        # Parse once per molecule instead of once per descriptor.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"Cannot parse SMILES: {smiles!r}")
        rows.append({name: calc(mol) for name, calc in calculators.items()})

    # Only a Series carries row labels; list.index is a method, not labels.
    index = smiles_column.index if isinstance(smiles_column, pd.Series) else None
    # Passing the column list keeps the columns present for empty input too.
    return pd.DataFrame(rows, columns=list(calculators), index=index)

In [None]:
# Compute Morgan fingerprint columns for the SMILES column with rdkit_fp's
# defaults (radius=3, nBits=2048, useChirality=False) and preview them.
Y = rdkit_fp(data['SMILES'])
Y.head()

Unnamed: 0,bit_id_0,bit_id_1,bit_id_2,bit_id_3,bit_id_4,bit_id_5,bit_id_6,bit_id_7,bit_id_8,bit_id_9,...,bit_id_2038,bit_id_2039,bit_id_2040,bit_id_2041,bit_id_2042,bit_id_2043,bit_id_2044,bit_id_2045,bit_id_2046,bit_id_2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Append the fingerprint columns to the source frame (join matches rows by index) and preview.
data_fp = data.join(Y)
data_fp.head()

Unnamed: 0,Title,SMILES,"IC50, mmg/ml","CC50-MDCK, mmg/ml",SI,bit_id_0,bit_id_1,bit_id_2,bit_id_3,bit_id_4,...,bit_id_2038,bit_id_2039,bit_id_2040,bit_id_2041,bit_id_2042,bit_id_2043,bit_id_2044,bit_id_2045,bit_id_2046,bit_id_2047
0,1007-Ya-213,OCC\N=C(\[C@]12C)C[C@@H](C1(C)C)CC2,2.7,500.0,185.185185,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1007-Ya-213,OCC\N=C(\[C@]12C)C[C@@H](C1(C)C)CC2,0.7,447.0,638.571429,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1008-Ya-187,CCN(CC)CC\N=C(\[C@@]12C)C[C@H](C1(C)C)CC2,9.9,144.0,14.545455,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1009-As-106,CN(C)CC\N=C(\[C@@]12C)C[C@H](C1(C)C)CC2,8.3,500.0,60.240964,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1010-Ya-208,CN(C)CC(=O)O[C@H]1C[C@H](CC2)C(C)(C)[C@@]12C,39.4,143.0,3.629442,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Write the fingerprint table to data_fp.xlsx (path relative to the working directory).
data_fp.to_excel('data_fp.xlsx')

In [None]:
# Compute every descriptor listed in RDKit's Descriptors._descList for the SMILES column and preview.
Z = rdkit_2d(data['SMILES'])
Z.head()

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,8.803737,8.803737,0.185194,0.185194,0.71998,42.285714,195.306,174.138,195.162314,80,...,0,0,0,0,0,0,0,0,0,0
1,8.803737,8.803737,0.185194,0.185194,0.71998,42.285714,195.306,174.138,195.162314,80,...,0,0,0,0,0,0,0,0,0,0
2,4.983958,4.983958,0.389807,0.389807,0.727487,35.888889,250.43,220.19,250.240899,104,...,0,0,0,0,0,0,0,0,0,0
3,4.888403,4.888403,0.392837,0.392837,0.716719,38.875,222.376,196.168,222.209599,92,...,0,0,0,0,0,0,0,0,0,0
4,11.775123,11.775123,0.079097,-0.079097,0.707806,38.705882,239.359,214.159,239.188529,98,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Append the full descriptor set to the source frame (join matches rows by index) and preview.
data_2d = data.join(Z)
data_2d.head()

Unnamed: 0,Title,SMILES,"IC50, mmg/ml","CC50-MDCK, mmg/ml",SI,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,1007-Ya-213,OCC\N=C(\[C@]12C)C[C@@H](C1(C)C)CC2,2.7,500.0,185.185185,8.803737,8.803737,0.185194,0.185194,0.71998,...,0,0,0,0,0,0,0,0,0,0
1,1007-Ya-213,OCC\N=C(\[C@]12C)C[C@@H](C1(C)C)CC2,0.7,447.0,638.571429,8.803737,8.803737,0.185194,0.185194,0.71998,...,0,0,0,0,0,0,0,0,0,0
2,1008-Ya-187,CCN(CC)CC\N=C(\[C@@]12C)C[C@H](C1(C)C)CC2,9.9,144.0,14.545455,4.983958,4.983958,0.389807,0.389807,0.727487,...,0,0,0,0,0,0,0,0,0,0
3,1009-As-106,CN(C)CC\N=C(\[C@@]12C)C[C@H](C1(C)C)CC2,8.3,500.0,60.240964,4.888403,4.888403,0.392837,0.392837,0.716719,...,0,0,0,0,0,0,0,0,0,0
4,1010-Ya-208,CN(C)CC(=O)O[C@H]1C[C@H](CC2)C(C)(C)[C@@]12C,39.4,143.0,3.629442,11.775123,11.775123,0.079097,-0.079097,0.707806,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Write the full descriptor table to data_2d.xlsx (path relative to the working directory).
data_2d.to_excel('data_2d.xlsx')