In [None]:
!pip install rdkit catboost
!python -m pip install git+https://github.com/EBjerrum/molvecgen

In [None]:
!pip install git+https://github.com/samoturk/mol2vec

In [130]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem, Draw, Descriptors
from rdkit.Chem.Draw import IPythonConsole
from sklearn.preprocessing import FunctionTransformer
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor
from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import cross_val_score
from molvecgen.vectorizers import SmilesVectorizer

In [131]:
import numpy as np
import pandas as pd
import sys, os

sys.path.append(os.path.dirname(os.path.dirname(os.getcwd()))+'/mol2vec')

from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole


In [132]:
df = pd.read_excel('1400.xlsx')
df = df.drop(columns = 'Pictures')
df['SI'] = df['CC50-MDCK, mmg/ml']/df['IC50, mmg/ml']
df['S_leng'] = df['SMILES'].str.len()

In [133]:
df['finger'] = df['SMILES'].apply(lambda x: AllChem.RDKFingerprint(Chem.MolFromSmiles(x)))
duplicates = df[df.duplicated(subset='Title', keep='first')]

for i in range(len(duplicates)):
  distances = []
  fps_1 = duplicates.iloc[i]['finger']
  for j in range(len(df)):
    fps_2 = df.iloc[j]['finger']
    if fps_1 != fps_2:
      dist = DataStructs.FingerprintSimilarity(fps_1, fps_2)
      distances.append(dist)
  duplicates.loc[duplicates.index[i], 'IC50, mmg/ml'] = df.iloc[np.argmax(distances)]['IC50, mmg/ml']
  duplicates.loc[duplicates.index[i], 'CC50-MDCK, mmg/ml'] = df.iloc[np.argmax(distances)]['CC50-MDCK, mmg/ml']
  duplicates.loc[duplicates.index[i], 'SI'] = df.iloc[np.argmax(distances)]['SI']
df.drop_duplicates(subset='Title', keep=False, inplace=True)
df = pd.concat([df, duplicates]).drop('finger', axis=1)

[09:00:16] Conflicting single bond directions around double bond at index 55.
[09:00:16]   BondStereo set to STEREONONE and single bond directions set to NONE.


In [134]:
# Фильтрация DataFrame
df = df[(df['SI'] < 100) & (df['Polar SA'] < 300) & (df['Molecular weight'] < 1000) & (df['S_leng'] < 200)]

In [None]:
def RDKfingerPrint(mol_smi, **kwargs):
    mol = Chem.MolFromSmiles(mol_smi)
    desc_vec = np.zeros((1,), dtype=int)
    DataStructs.ConvertToNumpyArray(AllChem.RDKFingerprint(mol, **kwargs), desc_vec)
    return desc_vec

# Применить функцию RDKfingerPrint ко всем строкам в столбце 'SMILES' и создать новые столбцы
df[['RDKFP_' + str(i) for i in range(2048)]] = df['SMILES'].apply(lambda x: pd.Series(RDKfingerPrint(x, maxPath=5)))


In [170]:
y_CC50 = np.array(df['CC50-MDCK, mmg/ml'])
scaler = StandardScaler()
y_CC50_scaled = scaler.fit_transform(y_CC50.reshape(-1,1))

y_IC50 = np.array(df['IC50, mmg/ml'])
scaler = StandardScaler()
y_IC50_scaled = scaler.fit_transform(y_IC50.reshape(-1,1))

y_SI = np.array(df['SI'])
scaler = StandardScaler()
y_SI_scaled = scaler.fit_transform(y_SI.reshape(-1,1))

In [172]:
def solve(df, y1, y2):
  def RDKfingerPrint(mol_smi, **kwargs):
    mol = Chem.MolFromSmiles(mol_smi)
    desc_vec = np.zeros((1,), dtype=int)
    DataStructs.ConvertToNumpyArray(AllChem.RDKFingerprint(mol, **kwargs), desc_vec)
    return desc_vec

  # Применить функцию RDKfingerPrint ко всем строкам в столбце 'SMILES' и создать новые столбцы
  df[['RDKFP_' + str(i) for i in range(2048)]] = df['SMILES'].apply(lambda x: pd.Series(RDKfingerPrint(x, maxPath=5)))
  result_df = df.drop(columns = ['IC50, mmg/ml', 'CC50-MDCK, mmg/ml', 'SI', 'Title', 'SMILES'])

  result_df = result_df.reset_index(drop = True)
  def mol_dsc_calc(mols):
    return pd.DataFrame({k: f(Chem.MolFromSmiles(m)) for k, f in descriptors.items()} for m in mols)

# список конституционных и физико-химических дескрипторов из библиотеки RDKit
  descriptors = {"HeavyAtomCount": Descriptors.HeavyAtomCount,
                "NHOHCount": Descriptors.NHOHCount,
                "NOCount": Descriptors.NOCount,
                "NumHAcceptors": Descriptors.NumHAcceptors,
                "NumHDonors": Descriptors.NumHDonors,
                "NumHeteroatoms": Descriptors.NumHeteroatoms,
                "NumRotatableBonds": Descriptors.NumRotatableBonds,
                "NumValenceElectrons": Descriptors.NumValenceElectrons,
                "NumAromaticRings": Descriptors.NumAromaticRings,
                "NumAliphaticHeterocycles": Descriptors.NumAliphaticHeterocycles,
                "RingCount": Descriptors.RingCount,
                "MW": Descriptors.MolWt,
                "LogP": Descriptors.MolLogP,
                "MR": Descriptors.MolMR,
                "TPSA": Descriptors.TPSA}

  # sklearn трансформер для использования в конвейерном моделировании
  descriptors_transformer = FunctionTransformer(mol_dsc_calc)
  X = descriptors_transformer.transform(df['SMILES'])
  result_df = df.join(X)
  result_df = result_df.drop(columns = ['Title', 'SMILES', 'IC50, mmg/ml','CC50-MDCK, mmg/ml', 'SI'])
  result_df = result_df.reset_index(drop = True)
  cat_model = CatBoostRegressor(verbose = 500,  random_state = 42)
  cat_model.fit(result_df,y)
  feature_importance_df = pd.DataFrame({'Feature': result_df.columns, 'Importance': cat_model.feature_importances_})

  feature_importance_df = feature_importance_df[feature_importance_df['Importance'] > 0]

  selected_features = feature_importance_df['Feature']

  result_df = result_df[selected_features]
  def remove_highly_correlated_features(df, threshold=0.85):
    correlation_matrix = df.corr()
    highly_correlated_features = set()

    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > threshold:
                feature1 = correlation_matrix.columns[i]
                feature2 = correlation_matrix.columns[j]
                highly_correlated_features.add(feature1)  # Добавляем один из признаков в множество

    # Удалить выбранные признаки из DataFrame
    df = df.drop(columns=highly_correlated_features)

    return df

# Применить функцию к разным DataFrame
  result_df = remove_highly_correlated_features(result_df)
  from sklearn.model_selection import ShuffleSplit
  cv = ShuffleSplit(n_splits = 5, test_size = 0.2, random_state = 228)
  scores1 = cross_val_score(cat_model, result_df, y1, cv=cv, scoring='neg_root_mean_squared_error')
  scores2 = cross_val_score(cat_model, result_df, y2, cv=cv, scoring='neg_root_mean_squared_error')
  return scores1, scores2

In [169]:
scores_CC50, scoresCC50_scaled = solve(df,y_CC50, y_CC50_scaled)
scores_CC50.mean(), scoresCC50_scaled.mean()

Learning rate set to 0.042466
0:	learn: 0.9919692	total: 119ms	remaining: 1m 59s
500:	learn: 0.3803430	total: 1m 10s	remaining: 1m 10s
999:	learn: 0.2322575	total: 2m 19s	remaining: 0us
Learning rate set to 0.040995
0:	learn: 144.0037740	total: 91.9ms	remaining: 1m 31s
500:	learn: 51.1651879	total: 29.2s	remaining: 29.1s
999:	learn: 30.4072009	total: 57.7s	remaining: 0us
Learning rate set to 0.040995
0:	learn: 142.5781008	total: 51.3ms	remaining: 51.3s
500:	learn: 50.7458185	total: 28.5s	remaining: 28.4s
999:	learn: 30.3235084	total: 57.1s	remaining: 0us
Learning rate set to 0.040995
0:	learn: 142.7235971	total: 54.3ms	remaining: 54.2s
500:	learn: 53.1909397	total: 28.6s	remaining: 28.5s
999:	learn: 32.4530917	total: 57.9s	remaining: 0us
Learning rate set to 0.040995
0:	learn: 145.1394632	total: 55.5ms	remaining: 55.4s
500:	learn: 52.3517391	total: 28.8s	remaining: 28.6s
999:	learn: 31.1263439	total: 57.4s	remaining: 0us
Learning rate set to 0.040995
0:	learn: 141.6227813	total: 54.8ms

(-103.11553121384898, -0.7201191539878306)

In [173]:
scores_IC50, scoresIC50_scaled = solve(df,y_IC50, y_IC50_scaled)
scores_IC50.mean(), scoresIC50_scaled.mean()

Learning rate set to 0.042466
0:	learn: 0.9919692	total: 132ms	remaining: 2m 11s
500:	learn: 0.3803430	total: 1m 9s	remaining: 1m 9s
999:	learn: 0.2322575	total: 2m 19s	remaining: 0us
Learning rate set to 0.040995
0:	learn: 97.8970409	total: 50.9ms	remaining: 50.9s
500:	learn: 37.8306716	total: 28.8s	remaining: 28.7s
999:	learn: 23.2015508	total: 57.5s	remaining: 0us
Learning rate set to 0.040995
0:	learn: 99.3829562	total: 58.1ms	remaining: 58s
500:	learn: 39.8265364	total: 29.4s	remaining: 29.3s
999:	learn: 24.6236772	total: 58.8s	remaining: 0us
Learning rate set to 0.040995
0:	learn: 98.0433862	total: 57.5ms	remaining: 57.4s
500:	learn: 37.5828097	total: 28.6s	remaining: 28.5s
999:	learn: 22.4710308	total: 57.1s	remaining: 0us
Learning rate set to 0.040995
0:	learn: 1.0145343	total: 54ms	remaining: 54s
500:	learn: 0.3920498	total: 28.8s	remaining: 28.7s
999:	learn: 0.2404441	total: 57.9s	remaining: 0us
Learning rate set to 0.040995
0:	learn: 1.0299333	total: 85.4ms	remaining: 1m 25s

(-84.5745036195923, -0.8764691462568369)

In [168]:
scores_SI, scoresSI_scaled = solve(df,y_SI, y_SI_scaled)
scores_SI.mean(), scoresSI_scaled.mean()

Learning rate set to 0.042466
0:	learn: 0.9919692	total: 112ms	remaining: 1m 52s
500:	learn: 0.3803430	total: 1m 10s	remaining: 1m 9s
999:	learn: 0.2322575	total: 2m 20s	remaining: 0us
Learning rate set to 0.040995
0:	learn: 18.0767124	total: 50.8ms	remaining: 50.8s
500:	learn: 8.8898635	total: 28.6s	remaining: 28.5s
999:	learn: 5.6747540	total: 57.2s	remaining: 0us
Learning rate set to 0.040995
0:	learn: 17.7625937	total: 68.7ms	remaining: 1m 8s
500:	learn: 8.3388766	total: 29.8s	remaining: 29.6s
999:	learn: 5.4303195	total: 58.2s	remaining: 0us
Learning rate set to 0.040995
0:	learn: 17.8401025	total: 57ms	remaining: 57s
500:	learn: 8.0897656	total: 28.5s	remaining: 28.4s
999:	learn: 5.0789637	total: 58.3s	remaining: 0us
Learning rate set to 0.040995
0:	learn: 17.9454290	total: 54.7ms	remaining: 54.6s
500:	learn: 8.1305754	total: 30.4s	remaining: 30.3s
999:	learn: 5.0739962	total: 1m 1s	remaining: 0us
Learning rate set to 0.040995
0:	learn: 17.7497803	total: 56.9ms	remaining: 56.8s
5

(-16.823314014191244, -0.9339519091118176)

FINAL SCORES(RMSE):
CC50 - 103.11, 0.72 (стандартизованный)
IC50 - 84.574, 0.876(стандартизированный)
SI - 16.82, 0.93(стандартизованный)
