In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, HistGradientBoostingRegressor, RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [6]:
# Read the coursework spreadsheet, then discard the 'Unnamed: 0' column
# (it appears to be a row index that was saved into the sheet).
df = pd.read_excel("Данные_для_курсовои_Классическое_МО.xlsx")
df = df.drop(columns=['Unnamed: 0'])
df

Unnamed: 0,"IC50, mM","CC50, mM",SI,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,6.239374,175.482382,28.125000,5.094096,5.094096,0.387225,0.387225,0.417362,42.928571,384.652,...,0,0,0,0,0,0,0,0,3,0
1,0.771831,5.402819,7.000000,3.961417,3.961417,0.533868,0.533868,0.462473,45.214286,388.684,...,0,0,0,0,0,0,0,0,3,0
2,223.808778,161.142320,0.720000,2.627117,2.627117,0.543231,0.543231,0.260923,42.187500,446.808,...,0,0,0,0,0,0,0,0,3,0
3,1.705624,107.855654,63.235294,5.097360,5.097360,0.390603,0.390603,0.377846,41.862069,398.679,...,0,0,0,0,0,0,0,0,4,0
4,107.131532,139.270991,1.300000,5.150510,5.150510,0.270476,0.270476,0.429038,36.514286,466.713,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,31.000104,34.999650,1.129017,12.934891,12.934891,0.048029,-0.476142,0.382752,49.133333,414.542,...,0,0,0,0,0,0,0,0,0,0
997,31.999934,33.999415,1.062484,13.635345,13.635345,0.030329,-0.699355,0.369425,44.542857,485.621,...,0,0,0,0,0,0,0,0,0,0
998,30.999883,33.999458,1.096761,13.991690,13.991690,0.026535,-0.650790,0.284923,41.973684,545.742,...,1,0,0,0,0,0,0,0,0,0
999,31.998959,32.999644,1.031272,13.830180,13.830180,0.146522,-1.408652,0.381559,39.000000,522.635,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Feature matrix and regression target. The two other activity columns
# ('CC50, mM' and 'SI') are excluded from the features together with the target.
X = df.drop(columns=['IC50, mM', 'CC50, mM', 'SI'])
y = df['IC50, mM']

# Impute missing values BEFORE deriving any features:
#   * PolynomialFeatures does not accept NaN and would raise on them;
#   * a comparison such as `NaN > 3` is False, so the indicator feature below
#     would silently encode "missing" as 0 instead of using the imputed value.
# The original index is kept so that X stays aligned with y.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split, so test rows influence the imputation. Consider moving the
# imputer into a sklearn Pipeline that is fitted on the training fold only.
if X.isnull().values.any():
    imputer = SimpleImputer(strategy='median')
    X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

# Hand-crafted interaction feature.
# NOTE(review): this duplicates the 'MolLogP MolWt' term produced by
# PolynomialFeatures below; it is kept so the feature set stays unchanged.
if 'MolLogP' in X.columns and 'MolWt' in X.columns:
    X['MolLogP_x_MolWt'] = X['MolLogP'] * X['MolWt']

# Degree-2 polynomial features for the selected descriptors.
polynomial_features_cols = ['MolLogP', 'MolWt']
existing_poly_cols = [col for col in polynomial_features_cols if col in X.columns]

if existing_poly_cols:
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly_features = poly.fit_transform(X[existing_poly_cols])
    poly_feature_names = poly.get_feature_names_out(existing_poly_cols)

    new_poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=X.index)
    # The degree-1 terms carry the original column names and are already in X;
    # append only the columns that are actually new, in a single concat.
    new_poly_cols = [col for col in new_poly_df.columns if col not in X.columns]
    X = pd.concat([X, new_poly_df[new_poly_cols]], axis=1)

# Binary indicator: MolLogP above 3.
if 'MolLogP' in X.columns:
    X['MolLogP_gt_3'] = (X['MolLogP'] > 3).astype(int)


In [None]:
# Candidate regressors, keyed by the label shown in the results table.
# Every estimator that takes a seed is pinned to 42.

# Base learners combined by the stacking ensemble.
_stacking_base = [
    ('rf', RandomForestRegressor(random_state=42)),
    ('gb', GradientBoostingRegressor(random_state=42)),
    ('xgb', XGBRegressor(random_state=42)),
]

models = {}
models['KNN'] = KNeighborsRegressor()
models['Random Forest'] = RandomForestRegressor(random_state=42)
models['Gradient Boosting'] = GradientBoostingRegressor(random_state=42)
models['HistGradientBoosting'] = HistGradientBoostingRegressor(random_state=42)
models['AdaBoost'] = AdaBoostRegressor(random_state=42)
models['XGBoost'] = XGBRegressor(random_state=42)
models['CatBoost'] = CatBoostRegressor(random_state=42, verbose=0)
models['Stacking'] = StackingRegressor(
    estimators=_stacking_base,
    final_estimator=LinearRegression(),
)

def evaluate_model(name, model, X, y, test_size=0.2, random_state=42):
    """Fit ``model`` on a train split and score it on the held-out split.

    Parameters
    ----------
    name : str
        Label stored under the ``'Model'`` key of the returned metrics.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so the instance passed in is the one returned.
    X, y :
        Features and target, forwarded to ``train_test_split``.
    test_size : float, default 0.2
        Share of the data held out for evaluation.
    random_state : int, default 42
        Seed for the split, so repeated calls compare models on the same rows.

    Returns
    -------
    tuple
        ``(metrics, model)`` where ``metrics`` is a dict with the keys
        ``'Model'``, ``'MSE'``, ``'RMSE'`` and ``'R2'`` computed on the
        held-out split, and ``model`` is the fitted estimator.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    return {'Model': name, 'MSE': mse, 'RMSE': rmse, 'R2': r2}, model


In [None]:
# Evaluate every candidate and remember the one with the highest R2 on the
# held-out split. `results` collects one metrics dict per model.
# NOTE(review): the winner is picked using the same split that produces the
# reported scores, so its R2 is an optimistic estimate.
results, best_model, best_r2 = [], None, -np.inf

for name, model in models.items():
    metrics, trained_model = evaluate_model(name, model, X, y)
    results.append(metrics)

    # Strict '>' keeps the earliest model when scores tie.
    if metrics['R2'] > best_r2:
        best_r2, best_model = metrics['R2'], trained_model

In [9]:
# Summary table: one row per model, best R2 first, values rounded to 3 decimals.
result = (
    pd.DataFrame(results)
    .sort_values('R2', ascending=False)
    .round(decimals=3)
)
result

Unnamed: 0,Model,MSE,RMSE,R2
1,Random Forest,194487.812,441.008,0.417
6,CatBoost,194739.62,441.293,0.416
5,XGBoost,197471.17,444.377,0.408
3,HistGradientBoosting,198022.637,444.997,0.406
2,Gradient Boosting,198705.973,445.764,0.404
7,Stacking,208355.587,456.46,0.375
4,AdaBoost,214720.287,463.379,0.356
0,KNN,274384.737,523.817,0.177


**Вывод**

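Все модели объясняют менее 50% дисперсии целевой переменной: лучший результат — R² ≈ 0,42 у Random Forest. Ансамбли на деревьях решений дают близкие результаты (R² ≈ 0,36–0,42), а KNN заметно отстаёт (R² ≈ 0,18).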

Возможные причины:

- Недостаточно информативные признаки

- Слишком сложные закономерности в данных

- Наличие шума/выбросов
