In [8]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem
from tqdm.auto import tqdm
import numpy as np

# Register `progress_apply` on pandas objects so long row-wise computations show a progress bar.
tqdm.pandas()

# Read prep.csv; the column saved as "Unnamed: 0" becomes the frame's index.
prep_csv_path = r"C:\proj\datacon\task1\prep.csv"
df = pd.read_csv(prep_csv_path, index_col="Unnamed: 0")

df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_relation,standard_value,standard_units,standard_value_nM
5184,CHEMBL272433,CCSc1cc(NCc2cccs2)nc(-c2ccc(S(C)(=O)=O)cc2)n1,=,8.333357e+08,nM,8.333357e+08
5186,CHEMBL270078,CS(=O)(=O)c1ccc(-c2nc(Cl)cc(NCc3cccs3)n2)cc1,=,8.333357e+08,nM,8.333357e+08
5229,CHEMBL271338,CS(=O)(=O)c1ccc(-c2nc(NCc3cccs3)cc(C(F)(F)F)n2...,=,4.762007e+08,nM,4.762007e+08
5319,CHEMBL346288,CS(=O)(=O)c1ccc(-c2sc3nncn3c2-c2ccccc2)cc1,=,4.466836e+08,nM,4.466836e+08
5173,CHEMBL271549,COCCOc1cc(NCc2cccs2)nc(-c2ccc(S(C)(=O)=O)cc2)n1,=,4.166679e+08,nM,4.166679e+08
...,...,...,...,...,...,...
3033,CHEMBL501676,Clc1ccc(-c2cn[se]c2-c2ccc(Cl)cc2)cc1,=,0.000000e+00,nM,0.000000e+00
3031,CHEMBL51085,O=c1c2ccccc2[se]n1-c1ccccc1,=,0.000000e+00,nM,0.000000e+00
3035,CHEMBL513405,Cc1ccc(-c2cn[se]c2-c2ccc(Cl)cc2)cc1,=,0.000000e+00,nM,0.000000e+00
3034,CHEMBL471171,Cc1ccc(-c2[se]ncc2-c2ccc(Cl)cc2)cc1,=,0.000000e+00,nM,0.000000e+00


In [9]:



# Compute the RDKit descriptors for a single molecule
def calculate_rdkit_descriptors(smiles):
    """Compute every descriptor listed in ``Descriptors._descList`` for one SMILES.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        dict mapping each descriptor name to its computed value, in the
        order in which ``Descriptors._descList`` lists them.

    Raises:
        ValueError: if RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; without this
        # check the first descriptor function fails with an opaque error.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")

    # Each _descList entry is a (name, function) pair, so a single pass is
    # enough to build the name -> value mapping.
    return {name: func(mol) for name, func in Descriptors._descList}

# Compute the descriptors for every SMILES in the dataset (one dict per molecule)
descriptor_records = df['canonical_smiles'].progress_apply(calculate_rdkit_descriptors)

# Build the frame with one constructor call instead of `.apply(pd.Series)`,
# which creates a separate Series object for every row and is much slower.
# NOTE(review): count-like descriptors may now keep an integer dtype instead of
# being upcast to float; the values themselves are unchanged.
descriptors_df = pd.DataFrame(descriptor_records.tolist(), index=descriptor_records.index)

# Join the original data with the new descriptor columns (aligned on the index)
df_with_descriptors = pd.concat([df, descriptors_df], axis=1)

print(df_with_descriptors.shape)

df_with_descriptors

100%|██████████| 5100/5100 [00:42<00:00, 119.15it/s]


(5100, 223)


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_relation,standard_value,standard_units,standard_value_nM,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
5184,CHEMBL272433,CCSc1cc(NCc2cccs2)nc(-c2ccc(S(C)(=O)=O)cc2)n1,=,8.333357e+08,nM,8.333357e+08,11.628174,11.628174,0.288316,-3.216562,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5186,CHEMBL270078,CS(=O)(=O)c1ccc(-c2nc(Cl)cc(NCc3cccs3)n2)cc1,=,8.333357e+08,nM,8.333357e+08,11.525219,11.525219,0.254002,-3.229896,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5229,CHEMBL271338,CS(=O)(=O)c1ccc(-c2nc(NCc3cccs3)cc(C(F)(F)F)n2...,=,4.762007e+08,nM,4.762007e+08,13.202316,13.202316,0.035106,-4.635168,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5319,CHEMBL346288,CS(=O)(=O)c1ccc(-c2sc3nncn3c2-c2ccccc2)cc1,=,4.466836e+08,nM,4.466836e+08,11.648876,11.648876,0.315288,-3.201999,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5173,CHEMBL271549,COCCOc1cc(NCc2cccs2)nc(-c2ccc(S(C)(=O)=O)cc2)n1,=,4.166679e+08,nM,4.166679e+08,11.661199,11.661199,0.249808,-3.257356,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3033,CHEMBL501676,Clc1ccc(-c2cn[se]c2-c2ccc(Cl)cc2)cc1,=,0.000000e+00,nM,0.000000e+00,5.932295,5.932295,0.127777,0.127777,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3031,CHEMBL51085,O=c1c2ccccc2[se]n1-c1ccccc1,=,0.000000e+00,nM,0.000000e+00,12.138731,12.138731,0.078403,0.078403,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3035,CHEMBL513405,Cc1ccc(-c2cn[se]c2-c2ccc(Cl)cc2)cc1,=,0.000000e+00,nM,0.000000e+00,5.946955,5.946955,0.160763,0.160763,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3034,CHEMBL471171,Cc1ccc(-c2[se]ncc2-c2ccc(Cl)cc2)cc1,=,0.000000e+00,nM,0.000000e+00,5.943368,5.943368,0.170861,0.170861,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:

from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator


# Shared Morgan fingerprint generator (radius 2, fingerprint size 2048),
# created once and reused for every molecule.
fp_generator = GetMorganGenerator(radius=2, fpSize=2048)


def calculate_morgan_fp_new(smiles):
    """Compute the Morgan fingerprint of one SMILES using ``fp_generator``.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        list with one entry per fingerprint position, obtained by iterating
        over the fingerprint returned by ``fp_generator.GetFingerprint``.

    Raises:
        ValueError: if RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail with a
        # message that names the offending SMILES instead of an opaque error.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")

    fp = fp_generator.GetFingerprint(mol)

    return list(fp)

# Compute one fingerprint (a list of bit values) per SMILES
fingerprints = df['canonical_smiles'].progress_apply(calculate_morgan_fp_new)

# Build the frame with one constructor call instead of `.apply(pd.Series)`,
# which creates a separate Series object for every row and is much slower.
fp_df = pd.DataFrame(fingerprints.tolist(), index=fingerprints.index)

# Name the columns after their position in the fingerprint
fp_df.columns = [f'morgan_fp_{i}' for i in range(len(fp_df.columns))]

# looks questionable
fp_df

100%|██████████| 5100/5100 [00:03<00:00, 1494.22it/s]


Unnamed: 0,morgan_fp_0,morgan_fp_1,morgan_fp_2,morgan_fp_3,morgan_fp_4,morgan_fp_5,morgan_fp_6,morgan_fp_7,morgan_fp_8,morgan_fp_9,...,morgan_fp_2038,morgan_fp_2039,morgan_fp_2040,morgan_fp_2041,morgan_fp_2042,morgan_fp_2043,morgan_fp_2044,morgan_fp_2045,morgan_fp_2046,morgan_fp_2047
5184,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5186,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5229,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5319,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3033,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3031,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3035,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3034,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Column-wise sum of the fingerprint frame: one total per `morgan_fp_*` column.
fp_df.sum(numeric_only=True)

morgan_fp_0        16
morgan_fp_1       671
morgan_fp_2       185
morgan_fp_3        11
morgan_fp_4        27
                 ... 
morgan_fp_2043     42
morgan_fp_2044    101
morgan_fp_2045     80
morgan_fp_2046     11
morgan_fp_2047      5
Length: 2048, dtype: int64

In [20]:
# Show the names of the first six columns of the combined frame.
df_with_descriptors.columns[:6]

Index(['molecule_chembl_id', 'canonical_smiles', 'standard_relation',
       'standard_value', 'standard_units', 'standard_value_nM'],
      dtype='object')

In [None]:
# Keep only the descriptor columns for cleaning: drop every column that came
# from the original dataset by name, rather than assuming it has exactly six.
c_to_drop = df.columns
descriptors_to_clean = df_with_descriptors.drop(columns=c_to_drop)

print(f"кол-во дескрипторов до: {descriptors_to_clean.shape[1]}")

# Remove every descriptor column that contains at least one NaN
initial_cols = descriptors_to_clean.shape[1]
descriptors_no_nan = descriptors_to_clean.dropna(axis=1)
print(f"Дескрипторов после удаления NaN: {descriptors_no_nan.shape[1]} (удалено {initial_cols - descriptors_no_nan.shape[1]})")


кол-во дескрипторов да: 217
Дескрипторов после удаления NaN: 205 (удалено 12)


In [27]:

# 4.2 Remove constant features
initial_cols = descriptors_no_nan.shape[1]
# Count distinct values instead of testing `variance == 0`: rounding in the
# mean can leave a constant non-zero column with a tiny non-zero variance.
distinct_counts = descriptors_no_nan.nunique()
constant_cols = distinct_counts[distinct_counts <= 1].index
descriptors_no_const = descriptors_no_nan.drop(columns=constant_cols)
print(f"удалено {initial_cols - descriptors_no_const.shape[1]}")

# Remove highly correlated features
# Absolute pairwise correlation above which the later column of a pair is dropped
CORR_THRESHOLD = 0.7

initial_cols = descriptors_no_const.shape[1]
corr_matrix = descriptors_no_const.corr().abs()

# Keep only the strict upper triangle so each pair is considered once
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Columns whose correlation with any earlier column exceeds the threshold
# (NaN entries compare as False, so masked cells never trigger a drop)
to_drop = upper.columns[(upper > CORR_THRESHOLD).any()].tolist()
descriptors_final = descriptors_no_const.drop(columns=to_drop)
print(f"Дескрипторов после удаления коррелирующих: {descriptors_final.shape[1]} (удалено {initial_cols - descriptors_final.shape[1]})")


удалено 13
Дескрипторов после удаления коррелирующих: 106 (удалено 86)


In [28]:


# Columns taken from the source table: ChEMBL id, SMILES and standard_value_nM.
meta_columns = ['molecule_chembl_id', 'canonical_smiles', 'standard_value_nM']

# Place those columns next to the selected descriptors (aligned on the index).
df_final_features = pd.concat(objs=[df.loc[:, meta_columns], descriptors_final], axis=1)

# Write the table to CSV without the index column.
df_final_features.to_csv('cock2.csv', index=False)

df_final_features

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value_nM,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,FpDensityMorgan1,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
5184,CHEMBL272433,CCSc1cc(NCc2cccs2)nc(-c2ccc(S(C)(=O)=O)cc2)n1,8.333357e+08,11.628174,0.288316,-3.216562,0.465823,11.461538,405.570,1.153846,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5186,CHEMBL270078,CS(=O)(=O)c1ccc(-c2nc(Cl)cc(NCc3cccs3)n2)cc1,8.333357e+08,11.525219,0.254002,-3.229896,0.680998,11.416667,379.894,1.166667,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5229,CHEMBL271338,CS(=O)(=O)c1ccc(-c2nc(NCc3cccs3)cc(C(F)(F)F)n2...,4.762007e+08,13.202316,0.035106,-4.635168,0.677475,12.148148,413.446,1.111111,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5319,CHEMBL346288,CS(=O)(=O)c1ccc(-c2sc3nncn3c2-c2ccccc2)cc1,4.466836e+08,11.648876,0.315288,-3.201999,0.563873,11.875000,355.444,0.958333,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5173,CHEMBL271549,COCCOc1cc(NCc2cccs2)nc(-c2ccc(S(C)(=O)=O)cc2)n1,4.166679e+08,11.661199,0.249808,-3.257356,0.532513,11.357143,419.528,1.107143,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3033,CHEMBL501676,Clc1ccc(-c2cn[se]c2-c2ccc(Cl)cc2)cc1,0.000000e+00,5.932295,0.127777,0.127777,0.604593,10.631579,353.110,0.736842,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3031,CHEMBL51085,O=c1c2ccccc2[se]n1-c1ccccc1,0.000000e+00,12.138731,0.078403,0.078403,0.622431,10.750000,274.181,0.875000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3035,CHEMBL513405,Cc1ccc(-c2cn[se]c2-c2ccc(Cl)cc2)cc1,0.000000e+00,5.946955,0.160763,0.160763,0.632453,10.631579,332.692,0.894737,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3034,CHEMBL471171,Cc1ccc(-c2[se]ncc2-c2ccc(Cl)cc2)cc1,0.000000e+00,5.943368,0.170861,0.170861,0.632453,10.631579,332.692,0.894737,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
