In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (RandomForestClassifier,
                             GradientBoostingClassifier,
                             HistGradientBoostingClassifier,
                             AdaBoostClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, roc_auc_score,
                           f1_score, precision_score,
                           recall_score, confusion_matrix)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the course workbook, then discard the 'Unnamed: 0' column that comes
# with it, and show the resulting frame as the cell output.
DATA_PATH = "Данные_для_курсовои_Классическое_МО.xlsx"
df = pd.read_excel(DATA_PATH)
df = df.drop(columns=['Unnamed: 0'])
df

Unnamed: 0,"IC50, mM","CC50, mM",SI,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,6.239374,175.482382,28.125000,5.094096,5.094096,0.387225,0.387225,0.417362,42.928571,384.652,...,0,0,0,0,0,0,0,0,3,0
1,0.771831,5.402819,7.000000,3.961417,3.961417,0.533868,0.533868,0.462473,45.214286,388.684,...,0,0,0,0,0,0,0,0,3,0
2,223.808778,161.142320,0.720000,2.627117,2.627117,0.543231,0.543231,0.260923,42.187500,446.808,...,0,0,0,0,0,0,0,0,3,0
3,1.705624,107.855654,63.235294,5.097360,5.097360,0.390603,0.390603,0.377846,41.862069,398.679,...,0,0,0,0,0,0,0,0,4,0
4,107.131532,139.270991,1.300000,5.150510,5.150510,0.270476,0.270476,0.429038,36.514286,466.713,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,31.000104,34.999650,1.129017,12.934891,12.934891,0.048029,-0.476142,0.382752,49.133333,414.542,...,0,0,0,0,0,0,0,0,0,0
997,31.999934,33.999415,1.062484,13.635345,13.635345,0.030329,-0.699355,0.369425,44.542857,485.621,...,0,0,0,0,0,0,0,0,0,0
998,30.999883,33.999458,1.096761,13.991690,13.991690,0.026535,-0.650790,0.284923,41.973684,545.742,...,1,0,0,0,0,0,0,0,0,0
999,31.998959,32.999644,1.031272,13.830180,13.830180,0.146522,-1.408652,0.381559,39.000000,522.635,...,0,0,0,0,0,0,0,0,0,0


In [3]:
print(f"Исходное количество записей: {len(df)}")

# Outlier removal: keep rows whose IC50 lies within mean ± 2·std (both bounds
# inclusive). The statistics are taken from the unfiltered column.
# NOTE: `df` is overwritten, so re-running this cell filters the already
# filtered frame again with freshly computed bounds.
ic50 = df['IC50, mM']
ic50_mean, ic50_std = ic50.mean(), ic50.std()
within_two_sigma = ic50.between(ic50_mean - 2 * ic50_std, ic50_mean + 2 * ic50_std)
df = df[within_two_sigma].copy()
print(f"Количество записей после удаления выбросов: {len(df)}")

# Target variable: 1 when IC50 is strictly above the median of the filtered
# data, 0 otherwise.
ic50_median = df['IC50, mM'].median()
df['IC50_more_then_median'] = (df['IC50, mM'] > ic50_median).astype(int)

Исходное количество записей: 1001
Количество записей после удаления выбросов: 945


In [4]:
# Feature matrix: every column except the three activity measurements and the
# target derived from IC50. `y` is the binary target itself.
X = df.drop(columns=['IC50, mM', 'CC50, mM', 'SI', 'IC50_more_then_median'])
y = df['IC50_more_then_median']

# Hand-made interaction term: product of MolLogP and MolWt.
if 'MolLogP' in X.columns and 'MolWt' in X.columns:
    X['MolLogP_x_MolWt'] = X['MolLogP'] * X['MolWt']

# Degree-2 polynomial expansion of the same two descriptors (when present).
polynomial_features_cols = ['MolLogP', 'MolWt']
existing_poly_cols = [col for col in polynomial_features_cols if col in X.columns]

if existing_poly_cols:
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly_features = poly.fit_transform(X[existing_poly_cols])
    poly_feature_names = poly.get_feature_names_out(existing_poly_cols)

    # Reuse X's index so the new columns align row-by-row on assignment.
    new_poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=X.index)
    # Only add columns that X does not already have (the degree-1 terms are
    # the original descriptors and are therefore skipped).
    # NOTE(review): the expansion should also produce the cross term
    # 'MolLogP MolWt', which carries the same values as 'MolLogP_x_MolWt'
    # above; it is kept so the feature set stays exactly as before.
    for col in new_poly_df.columns:
        if col not in X.columns:
            X[col] = new_poly_df[col]

# Binary indicator: 1 when MolLogP is strictly greater than 3.
if 'MolLogP' in X.columns:
    X['MolLogP_gt_3'] = (X['MolLogP'] > 3).astype(int)

# Median-impute missing values, if there are any.
# The rebuilt frame must keep X's original index: after outlier filtering the
# index has gaps, and `y` still carries it. Without `index=X.index` the frame
# would get a fresh 0..n-1 index and any label-aligned operation between X
# and y (concat, boolean masks, joins) would pair up the wrong rows.
# NOTE(review): the imputer is fitted on all rows before the train/test split
# performed later, so held-out rows contribute to the medians.
if X.isnull().values.any():
    imputer = SimpleImputer(strategy='median')
    X = pd.DataFrame(imputer.fit_transform(X),
                     columns=X.columns,
                     index=X.index)
     

In [5]:
# Модели
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'HistGradientBoosting': HistGradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, eval_metric='logloss'),
    'CatBoost': CatBoostClassifier(random_state=42, verbose=0),
    'Stacking': StackingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(random_state=42)),
            ('xgb', XGBClassifier(random_state=42)),
            ('gb', GradientBoostingClassifier(random_state=42))
        ],
        final_estimator=LogisticRegression()
    )
}

def evaluate_classifier(model, X, y, test_size=0.2, random_state=42):
    """Fit ``model`` on a stratified train split and score it on the held-out part.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``. It is fitted in place.
        For ROC AUC it should additionally expose ``predict_proba`` or
        ``decision_function``.
    X : feature matrix accepted by ``train_test_split`` and by the model.
    y : binary target with one entry per row of ``X``; also used for
        stratification.
    test_size : float, default 0.2
        Share of the rows held out for evaluation.
    random_state : int, default 42
        Seed for the split, so repeated calls compare models on the same rows.

    Returns
    -------
    dict
        Keys ``'Model'`` (the estimator's class name), ``'Accuracy'``,
        ``'ROC AUC'``, ``'F1'``, ``'Precision'`` and ``'Recall'``.
        ``'ROC AUC'`` is ``np.nan`` when the model offers neither
        ``predict_proba`` nor ``decision_function``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC AUC needs a continuous score. Prefer the probability of the second
    # class; fall back to the decision function for models that have no
    # predict_proba, instead of silently reporting NaN for them.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    metrics = {
        'Model': model.__class__.__name__,
        'Accuracy': accuracy_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_score) if y_score is not None else np.nan,
        'F1': f1_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred)
    }
    return metrics

In [6]:
# Evaluate every candidate, collect its metrics and remember the model with
# the highest ROC AUC seen so far.
results = []
best_model, best_roc_auc = None, -np.inf

for name, model in models.items():
    metrics = evaluate_classifier(model, X, y)
    results.append(metrics)

    # Only a strictly higher ROC AUC replaces the current best, so ties keep
    # the earlier model and a NaN score never wins the comparison.
    if not (metrics['ROC AUC'] > best_roc_auc):
        continue
    best_roc_auc, best_model = metrics['ROC AUC'], model

[WinError 2] Не удается найти указанный файл
  File "e:\Anaconda\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "e:\Anaconda\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "e:\Anaconda\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "e:\Anaconda\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


In [7]:
# Leaderboard: one row per evaluated model, highest ROC AUC first,
# all metrics rounded to three decimals.
result = pd.DataFrame(results)
result = result.sort_values(by='ROC AUC', ascending=False)
result = result.round(3)
result

Unnamed: 0,Model,Accuracy,ROC AUC,F1,Precision,Recall
3,GradientBoostingClassifier,0.767,0.844,0.758,0.784,0.734
6,CatBoostClassifier,0.746,0.838,0.742,0.75,0.734
4,HistGradientBoostingClassifier,0.757,0.83,0.753,0.761,0.745
5,XGBClassifier,0.767,0.828,0.758,0.784,0.734
7,StackingClassifier,0.73,0.828,0.724,0.736,0.713
2,RandomForestClassifier,0.725,0.816,0.72,0.728,0.713
0,LogisticRegression,0.529,0.581,0.136,0.778,0.074
1,KNeighborsClassifier,0.54,0.57,0.514,0.541,0.489


**Вывод**  

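GradientBoostingClassifier и XGBClassifier показывают наивысшую долю верных ответов (Accuracy = 0.767) и хороший баланс между Precision (0.784) и Recall (0.734), что может говорить о стабильности предсказаний. По ROC AUC лидируют GradientBoostingClassifier (0.844) и CatBoostClassifier (0.838). Все оценки получены на одном разбиении train/test, поэтому вывод о стабильности стоит подтвердить кросс-валидацией.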

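KNN и Logistic Regression явно не подходят для данной задачи в текущей постановке: их ROC AUC (0.570 и 0.581) близок к случайному угадыванию, а Recall у Logistic Regression составляет всего 0.074. Признаки перед обучением не масштабировались, что могло дополнительно ухудшить результаты этих двух моделей.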