# Классификация: CC50 > медианы

In [4]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


# Загрузка данных
df = pd.read_csv('/Users/rem/МИФИ/курсовая/dataset.csv')

# Подготовка данных: исключаем столбцы с целевыми переменными
features = [col for col in df.columns if col not in ["IC50, mM", "CC50, mM", "SI"]]
X = df[features]
y = (df["CC50, mM"] > df['CC50, mM'].median()).astype(int)


X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Для моделей на основе градиентного бустинга (LightGBM) необходимо очистить имена признаков от специальных символов
def clean_column(name):
    return re.sub(r'[^\w]', '_', name)

X_train_clean = X_train.rename(columns=lambda col: clean_column(col))
X_test_clean = X_test.rename(columns=lambda col: clean_column(col))

# Обучение моделей
logit = LogisticRegression(max_iter=1000)
logit.fit(X_train_scaled, y_train)
y_pred_logit = logit.predict(X_test_scaled)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

lgb = LGBMClassifier()
lgb.fit(X_train_clean, y_train)
y_pred_lgb = lgb.predict(X_test_clean)

for name, preds, clf, xtest in [("Logistic Regression", y_pred_logit, logit, X_test_scaled),
                                ("Random Forest", y_pred_rf, rf, X_test),
                                ("XGBoost", y_pred_xgb, xgb, X_test),
                                ("LightGBM", y_pred_lgb, lgb, X_test)]:
    print(f"{name}: Accuracy={accuracy_score(y_test, preds):.3f}, Precision={precision_score(y_test, preds):.3f}, Recall={recall_score(y_test, preds):.3f}, F1={f1_score(y_test, preds):.3f}, AUC={roc_auc_score(y_test, clf.predict_proba(xtest)[:, 1]):.3f}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 385, number of negative: 387
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003699 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18285
[LightGBM] [Info] Number of data points in the train set: 772, number of used features: 169
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498705 -> initscore=-0.005181
[LightGBM] [Info] Start training from score -0.005181
Logistic Regression: Accuracy=0.675, Precision=0.663, Recall=0.711, F1=0.687, AUC=0.777
Random Forest: Accuracy=0.727, Precision=0.750, Recall=0.680, F1=0.714, AUC=0.805
XGBoost: Accuracy=0.747, Precision=0.755, Recall=0.732, F1=0.743, AUC=0.817
LightGBM: Accuracy=0.732, Precision=0.742, Recall=0.711, F1=0.726, AUC=0.811
