In [1]:
import os
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, precision_recall_curve, auc
from imblearn.over_sampling import SMOTE
from scipy.stats import uniform, randint

In [2]:
# Загрузка данных с рассчитанными фингерпринтами
# Считались в calc_features.py
features_path = '../data/processed/'
features_file = 'ecfp:4_features.parquet'
features_df = pd.read_parquet(os.path.join(features_path, features_file))

# Загрузка данных с исходными SMILES и целевыми метками
data_path = '/home/nikolenko/work/Project/ChemRar/data/raw/data.csv'
data_df = pd.read_csv(data_path)


In [3]:
# Загрузка сплита
split_path = "/home/nikolenko/work/Project/ChemRar/data/processed/random_split.csv"
split_df = pd.read_csv(split_path)

# Функция для получения фичей по сплиту
def get_features_for_split(smiles_split, features_df):
    return features_df[features_df['SMILES'].isin(smiles_split)].reset_index(drop=True)

# Получение тренировочных, валидационных и тестовых данных на основе сплита
train_smiles = split_df['Train_SMILES'].dropna().values
val_smiles = split_df['Val_SMILES'].dropna().values
test_smiles = split_df['Test_SMILES'].dropna().values

X_train_df = get_features_for_split(train_smiles, features_df)
X_val_df = get_features_for_split(val_smiles, features_df)
X_test_df = get_features_for_split(test_smiles, features_df)


In [4]:
# Объединение с целевыми метками
train_data = pd.merge(X_train_df, data_df[['SMILES', 'Activity']], on='SMILES')
val_data = pd.merge(X_val_df, data_df[['SMILES', 'Activity']], on='SMILES')
test_data = pd.merge(X_test_df, data_df[['SMILES', 'Activity']], on='SMILES')

# Разделение данных на признаки и целевую переменную
X_train = np.vstack(train_data['ecfp:4_features'].values)
y_train = train_data['Activity'].apply(lambda x: 1 if x == 'Active' else 0)

X_val = np.vstack(val_data['ecfp:4_features'].values)
y_val = val_data['Activity'].apply(lambda x: 1 if x == 'Active' else 0)

X_test = np.vstack(test_data['ecfp:4_features'].values)
y_test = test_data['Activity'].apply(lambda x: 1 if x == 'Active' else 0)


In [5]:
# Проверка на пропущенные значения
data_df.isnull().sum()

SMILES      0
Activity    0
dtype: int64

In [6]:
# Распределение целевого признака в процентах
activity_counts = data_df['Activity'].value_counts(normalize=True) * 100

for index, value in activity_counts.items():
    print(f'{index}: {value:.2f}%')

Inactive: 99.49%
Active: 0.51%


In [7]:
# Использование SMOTE для балансировки тренировочных данных
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


Если в нашем наборе данных 0,51% — это класс "Active", а 99,49% — "Inactive", тогда scale_pos_weight можно установить как примерно 99.49 / 0.51 ≈ 195

In [8]:
model = xgb.XGBClassifier(
    n_estimators=1500,
    max_depth=3,
    min_child_weight=10,
    learning_rate=0.1,
    reg_lambda=0.2,
    reg_alpha=5,
    colsample_bytree=0.5,
    subsample=0.7,
    scale_pos_weight=195,
    eval_metric='logloss',
    tree_method='hist',
    device='cuda'
)

In [9]:
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

[0]	validation_0-logloss:0.60136
[1]	validation_0-logloss:0.53050
[2]	validation_0-logloss:0.47089
[3]	validation_0-logloss:0.41996
[4]	validation_0-logloss:0.37397
[5]	validation_0-logloss:0.33644
[6]	validation_0-logloss:0.30326
[7]	validation_0-logloss:0.27524
[8]	validation_0-logloss:0.24938
[9]	validation_0-logloss:0.22742
[10]	validation_0-logloss:0.20722
[11]	validation_0-logloss:0.18799
[12]	validation_0-logloss:0.17229
[13]	validation_0-logloss:0.15704
[14]	validation_0-logloss:0.14427
[15]	validation_0-logloss:0.13238
[16]	validation_0-logloss:0.12178
[17]	validation_0-logloss:0.11262
[18]	validation_0-logloss:0.10379
[19]	validation_0-logloss:0.09583
[20]	validation_0-logloss:0.08855
[21]	validation_0-logloss:0.08141
[22]	validation_0-logloss:0.07608
[23]	validation_0-logloss:0.07088
[24]	validation_0-logloss:0.06614
[25]	validation_0-logloss:0.06141
[26]	validation_0-logloss:0.05787
[27]	validation_0-logloss:0.05410
[28]	validation_0-logloss:0.05044
[29]	validation_0-loglos

In [10]:
# Предсказания на тестовом наборе
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [11]:
# Оценка модели
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_pred_proba)
pr_auc = auc(recall_vals, precision_vals)

conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("="*30)
print("Model Performance Metrics")
print("="*30)
print(f"Accuracy:         {accuracy:.6f}")
print(f"Precision:        {precision:.6f}")
print(f"Recall:           {recall:.6f}")
print(f"F1 Score:         {f1:.6f}")
print(f"ROC AUC:          {roc_auc:.6f}")
print(f"PR AUC:           {pr_auc:.6f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)
print("="*30)

Model Performance Metrics
Accuracy:         0.999891
Precision:        0.984962
Recall:           0.994937
F1 Score:         0.989924
ROC AUC:          0.998653
PR AUC:           0.995960

Confusion Matrix:
[[73092     6]
 [    2   393]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     73098
           1       0.98      0.99      0.99       395

    accuracy                           1.00     73493
   macro avg       0.99      1.00      0.99     73493
weighted avg       1.00      1.00      1.00     73493



### Гиперпараметры

Подбирались в 
```
scripts/train_xgboost.py
```