**Хемоинформатика**

Предсказание противовирусной активности соединений - очевидная актуальная задача, которая позволит ускорить создание лекарств, используя современные цифровые инструменты. В рамках задачи необходимо собрать информацию о различных химических соединениях, для которых активность простив одного из вирусов (A/H1N1, SARS-CoV-2, HIV-1) известна, а затем обучить модель для предсказания противовирусной активности. Для сбора подходит, например, база.


Dataset: CHEMBL4303835

Dataset: CHEMBL4523582



In [None]:
!pip install rdkit
!pip install grakel
!pip install tensorflow scikit-learn xgboost dgl torch torchvision
!pip install dgl
!pip install catboost

# импорты библиотек и загрузка файлов


In [56]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
import tensorflow as tf
from tensorflow.keras import layers, models
import torch
import torch.nn as nn
import torch.optim as optim
from xgboost import XGBRegressor
from catboost import CatBoostRegressor, Pool
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors, Draw
from rdkit.Chem.Draw import IPythonConsole
from sklearn.utils import shuffle


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data_act1_virus = pd.read_csv('/content/drive/MyDrive/hac/data_act1.csv',sep=';')
data_act2_virus = pd.read_csv('/content/drive/MyDrive/hac/data_act2.csv',sep=';')

  data_act1_virus = pd.read_csv('/content/drive/MyDrive/hac/data_act1.csv',sep=';')


# Обработка данных

In [None]:
def train_procesing_for_cells(data_act):
  data_act.rename(columns={'Molecule ChEMBL ID':'ChEMBL ID'},inplace=True)
  data_act= data_act[data_act["Standard Type"] == "IC50"]
  data_act = data_act.dropna(subset=['Standard Value'])
  result = data_act.loc[data_act.groupby('ChEMBL ID')['Standard Value'].idxmax()]
  result = result[["Standard Value", "ChEMBL ID", "Smiles"]]
  return result

In [None]:
def join_tables_without_duplicates(table1, table2, column_name):
  merged_df = pd.concat([table1, table2])
  merged_df = merged_df.drop_duplicates(subset=[column_name], keep='first')
  return merged_df


In [None]:
result1 = train_procesing_for_cells(data_act1_virus)
result2 = train_procesing_for_cells(data_act2_virus)

result_for_target_virus = join_tables_without_duplicates(result1,result2,"ChEMBL ID")
result_for_target_virus.reset_index(inplace=True, drop=True)

In [None]:
data = result_for_target_virus
data=data.dropna()
data

In [None]:
def rdkit_fp(smiles_column: pd.Series, radius=3, nBits=2048, useChirality=False):
    # morganFP_rdkit
    def desc_gen(mol):
        mol = Chem.MolFromSmiles(mol)
        bit_vec = np.zeros((1,), np.int16)
        DataStructs.ConvertToNumpyArray(
            AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits, useChirality=useChirality), bit_vec)
        return bit_vec

    return pd.DataFrame.from_records(smiles_column.apply(func=desc_gen), columns=[f'bit_id_{i}' for i in range(nBits)])

def rdkit_2d(smiles_column: pd.Series):
    # 2d_rdkit
    descriptors = {i[0]: i[1] for i in Descriptors._descList}
    return pd.DataFrame({k: f(Chem.MolFromSmiles(m)) for k, f in descriptors.items()} for m in smiles_column)

In [None]:
Y = rdkit_fp(data['Smiles'])
Z = rdkit_2d(data['Smiles'])

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m


In [None]:
data = data.join(Y)
data = data.join(Z)

In [None]:
data = shuffle(data)
data.reset_index(inplace=True, drop=True)
data=data.dropna()
y= data['Standard Value']
X = data.drop(['Smiles', 'Standard Value', "ChEMBL ID"], axis=1)


In [None]:
#стандартизация X признаков
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # X — это матрица признаков
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
joblib.dump(scaler, 'scaler_virus.pkl')
X_scaled_df.columns

Index(['bit_id_0', 'bit_id_1', 'bit_id_2', 'bit_id_3', 'bit_id_4', 'bit_id_5',
       'bit_id_6', 'bit_id_7', 'bit_id_8', 'bit_id_9',
       ...
       'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene',
       'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan', 'fr_thiophene',
       'fr_unbrch_alkane', 'fr_urea'],
      dtype='object', length=2258)

In [None]:
# Удаление признаков с низкой дисперсией менее 1%
selector = VarianceThreshold(threshold=0.01)
X_var_thresh = selector.fit_transform(X_scaled)
joblib.dump(selector, 'selector_virus.pkl')

['selector_virus.pkl']

In [None]:
# Вычислим корреляционную матрицу
column_names = [f'feature_{i}' for i in range(X_var_thresh.shape[1])]
corr_matrix = np.corrcoef(X_var_thresh, rowvar=False)
upper_triangle = np.triu(np.abs(corr_matrix), k=1)
to_drop = [i for i in range(upper_triangle.shape[1]) if any(upper_triangle[:, i] > 0.95)]
removed_columns = [column_names[i] for i in to_drop]
X_uncorr = np.delete(X_var_thresh, to_drop, axis=1)


In [None]:
removed_columns

In [None]:
# Сохраним 95% дисперсии данных
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_uncorr)
joblib.dump(pca, 'pca_model_virus.pkl')

['pca_model_virus.pkl']

In [None]:
# Если распределение целевой переменной сильно скошено, можно использовать логарифм
y_log = np.log(y + 1e-6)
y_log

Unnamed: 0,Standard Value
0,10.579489
1,11.512925
2,8.101678
3,1.386295
4,9.903488
...,...
5742,3.637586
5743,9.903488
5744,10.126631
5745,10.762149


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_log, test_size=0.15, random_state=42)

# Обучение

In [None]:
from xgboost import XGBRegressor
xgb = XGBRegressor(
    learning_rate=0.05,
    max_depth=8,
    n_estimators=850,
    random_state=42
)

# 4. Обучение модели
xgb.fit(X_train, y_train)

# 5. Прогнозирование на тестовых данных
y_pred_xgb = xgb.predict(X_test)

# 6. Оценка модели
print("XGBoost Metrics:")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred_xgb):.4f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred_xgb):.4f}")

XGBoost Metrics:
Mean Squared Error: 2.2350
Mean Absolute Error: 0.7603


Пробуемые модели

In [None]:
param_grid = {
    'iterations': [1000],  # Два значения для количества итераций
    'depth': [8],  # Два значения для глубины деревьев
    'learning_rate': [0.05],  # Два значения для скорости обучения
}

grid_search = GridSearchCV(estimator=CatBoostRegressor(verbose=0), param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Лучшая модель
best_model = grid_search.best_estimator_
print("Лучшие параметры модели CatBoost:", grid_search.best_params_)

# 8. Прогнозирование с лучшей моделью
y_pred_best = best_model.predict(X_test)

# Оценка лучшей модели
mse_best = mean_squared_error(y_test, y_pred_best)
mae_best = mean_absolute_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print(f'\nЛучшие метрики модели:')
print(f'MSE: {mse_best:.4f}')
print(f'MAE: {mae_best:.4f}')



Лучшие метрики модели:
MSE: 2.6836
MAE: 0.8418
