In [11]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, MACCSkeys
from rdkit.Chem import rdFingerprintGenerator
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import joblib

In [None]:
# Root folder of the processed high-inhibitory-compound tables (one sub-folder per source).
inhibitors_filepath = "../data/SARS-COV-2/high_inhibitory_compounds/"

# Load each source and immediately discard the columns that are specific to that
# source (compound name and raw potency/cytotoxicity readouts).
inhibitors_nature = pd.read_csv(
    inhibitors_filepath + 'nature/processed_data/sars-cov-2_inhibitors_nature.csv'
).drop(columns=['inhibitor_name', 'CC50_µM', 'EC50_µM'])

inhibitors_pubs = pd.read_csv(
    inhibitors_filepath + 'pubs/processed_data/sars-cov-2_inhibitors_pubs.csv'
).drop(columns=['compound_name', 'IC50_μM', 'CC50_μM'])

inhibitors_sciencedirect = pd.read_csv(
    inhibitors_filepath + 'sciencedirect/processed_data/sars-cov-2_inhibitors_sciencedirect.csv'
).drop(columns=['inhibitor_name', 'IC50_μmol/L', 'CC50_μmol/L'])

# Stack the three sources row-wise and keep only the first occurrence of each SMILES.
inhibitors_df = pd.concat(
    [inhibitors_nature, inhibitors_pubs, inhibitors_sciencedirect],
    axis=0,
).drop_duplicates(subset='smiles')

print(inhibitors_df.shape)

inhibitors_df.head()

In [None]:
# Load the low-inhibitory compounds and discard the name / parsed IC50 columns.
non_inhibitors_df = pd.read_csv(
    '../data/SARS-COV-2/low_inhibitory_compounds/processed_data/non_inhibitors_db.csv'
).drop(columns=['ligand_name', 'IC50_nM_parsed'])

# Stack inhibitors and non-inhibitors row-wise into a single table.
complete_data = pd.concat([inhibitors_df, non_inhibitors_df], axis=0)

complete_data.shape

In [None]:
def remove_outliers(df, columns, factor=1.5):
    """Drop rows that are IQR outliers in any of the given columns.

    A value is treated as an outlier when it lies below ``Q1 - factor * IQR``
    or above ``Q3 + factor * IQR`` of its column, where Q1/Q3 are the 25th and
    75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    columns : label or list of labels
        Column(s) to check. A single label (e.g. ``'SI'``) is accepted and
        treated as a one-element list.
    factor : float, default 1.5
        Multiplier applied to the IQR to obtain the fence distance.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that have no outlier in any of ``columns``.
        Missing values never compare as outliers, so such rows are kept.
    """
    # A bare label would make df[columns] a Series, on which .any(axis=1)
    # fails; normalize to a list so the selection is always a DataFrame.
    if not pd.api.types.is_list_like(columns):
        columns = [columns]
    columns = list(columns)

    Q1 = df[columns].quantile(0.25)
    Q3 = df[columns].quantile(0.75)
    IQR = Q3 - Q1

    # Mask of rows to keep: no checked column falls outside its fences.
    mask = ~((df[columns] < (Q1 - factor * IQR)) | (df[columns] > (Q3 + factor * IQR))).any(axis=1)

    # Return the DataFrame with outlier rows removed for the requested columns.
    return df[mask]



# Filter IQR outliers on 'SI' from the inhibitors only; the non-inhibitors are
# appended unchanged.
inhibitors_without_outliers = remove_outliers(inhibitors_df, ['SI'])
complete_data = pd.concat([inhibitors_without_outliers, non_inhibitors_df], axis=0)

# Row counts before and after outlier removal.
print(len(inhibitors_df) + len(non_inhibitors_df))
print(len(complete_data))


complete_data

In [15]:
def extract_morgan_fingerprint(smiles, radius=2, nBits=2048):
    """Compute a Morgan fingerprint for a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    radius : int, default 2
        Radius passed to the Morgan fingerprint generator.
    nBits : int, default 2048
        Fingerprint size passed to the generator as ``fpSize``.

    Returns
    -------
    numpy.ndarray or None
        The fingerprint converted with ``np.array``, or ``None`` when RDKit
        cannot build a molecule from ``smiles``.
    """
    molecule = Chem.MolFromSmiles(smiles)

    if molecule is not None:
        # Build the generator, take the bit-vector fingerprint and hand it
        # back as a numpy array.
        generator = AllChem.GetMorganGenerator(radius=radius, fpSize=nBits)
        return np.array(generator.GetFingerprint(molecule))

    return None

def extract_features(smiles):
    """Compute basic descriptors and an RDKit fingerprint for a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    dict or None
        ``None`` when RDKit cannot build a molecule from ``smiles``; otherwise
        a dict with the keys ``'MolecularWeight'``, ``'LogP'``, ``'HBD'``
        (H-bond donors), ``'HBA'`` (H-bond acceptors), ``'TPSA'`` (topological
        polar surface area) and ``'fingerprint'`` (the fingerprint object
        returned by the RDKit generator with ``maxPath=5``).
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None

    # Output key -> RDKit descriptor function, in the order the keys appear
    # in the returned dict.
    descriptor_functions = {
        'MolecularWeight': Descriptors.MolWt,
        'LogP': Descriptors.MolLogP,
        'HBD': Descriptors.NumHDonors,
        'HBA': Descriptors.NumHAcceptors,
        'TPSA': Descriptors.TPSA,
    }
    features = {key: compute(molecule) for key, compute in descriptor_functions.items()}

    # Path-based RDKit fingerprint, kept as the RDKit object (not converted).
    fp_generator = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=5)
    features['fingerprint'] = fp_generator.GetFingerprint(molecule)

    return features

In [None]:
# One entry per row: a dict of descriptors, or None when extract_features
# could not parse the SMILES.
features_df = complete_data['smiles'].apply(extract_features)

# Positional mask of rows whose SMILES parsed. It is positional (numpy) on
# purpose: complete_data was built with pd.concat, so its index labels may repeat.
parsed_ok = features_df.notna().to_numpy()
if not parsed_ok.all():
    print(f"Skipping {int((~parsed_ok).sum())} row(s) with unparsable SMILES")

# Expand the dicts into columns. None entries are excluded first because
# pd.DataFrame cannot build rows from a list that mixes dicts with None.
features_expanded_df = pd.DataFrame(features_df[parsed_ok].tolist())

# Give complete_data a clean 0..n-1 index so it can be aligned with the features.
complete_data.reset_index(drop=True, inplace=True)

# Column-wise join of the parsed rows with their features; both sides carry a
# fresh RangeIndex, so rows line up one-to-one.
complete_data_with_features = pd.concat(
    [complete_data.loc[parsed_ok].reset_index(drop=True), features_expanded_df],
    axis=1,
    ignore_index=False,
)

print(complete_data_with_features.dtypes)

complete_data_with_features.head()

In [18]:
# Flat numpy array holding the values of the 'SI' column.
selective_index = np.asarray(complete_data_with_features['SI']).flatten()

In [None]:
# Pair every molecule (parsed from its SMILES) with its five numeric descriptors.
descriptor_rows = complete_data_with_features[
    ['smiles', 'MolecularWeight', 'LogP', 'HBD', 'HBA', 'TPSA']
].itertuples(index=False)

compounds = []
for smile, mw, logp, hbd, hba, tpsa in descriptor_rows:
    compounds.append((Chem.MolFromSmiles(smile), mw, logp, hbd, hba, tpsa))

# RDKit path-based fingerprints: one row per compound after numpy conversion.
rdkit_gen = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=5)
fingerprints = np.array([rdkit_gen.GetFingerprint(entry[0]) for entry in compounds])

# Descriptor matrix: the five values that follow the molecule in each tuple.
properties = np.array([list(entry[1:]) for entry in compounds])

# Final feature matrix: fingerprint columns followed by the descriptor columns.
combined_data = np.hstack((fingerprints, properties))

combined_data

In [20]:
# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and therefore the tuned model and the reported test MAE, are reproducible
# across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(combined_data, selective_index, test_size=0.2, random_state=42)

In [21]:
# Base regressor whose hyper-parameters are tuned below.
xgb_model = XGBRegressor(random_state=1)

# Candidate values for each hyper-parameter sampled by the search.
param_dist = dict(
    n_estimators=[100, 200, 300, 500],
    max_depth=[3, 5, 7, 10, 20],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    min_child_weight=[1, 2, 3, 4],
    subsample=[0.5, 0.7, 1.0],
    colsample_bytree=[0.5, 0.7, 1.0],
    gamma=[0, 0.1, 0.2, 0.3],
)

# Randomized search configuration: 500 sampled parameter settings, each
# evaluated with 5-fold cross-validation; any failing fit raises immediately.
randomized_search_xgb = RandomizedSearchCV(
    xgb_model,
    param_dist,
    n_iter=500,
    cv=5,
    random_state=42,
    error_score='raise',
)

In [None]:
# Show the training matrix dimensions, then run the hyper-parameter search on it.
print(np.shape(X_train))
randomized_search_xgb.fit(X=X_train, y=y_train)

In [None]:
# Take the best estimator found by the search and persist it to disk.
tuned_xgb_model = randomized_search_xgb.best_estimator_
joblib.dump(value=tuned_xgb_model, filename='sars-cov-2_SI_predictive_model.joblib')

In [None]:
# Mean absolute error of the tuned model on the held-out test split.
test_predictions = tuned_xgb_model.predict(X_test)
mae_xgb = mean_absolute_error(y_test, test_predictions)
print(f"MAE of tuned XGB model: {mae_xgb}")