In [3]:
pip install --upgrade scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Confirm which scikit-learn release the kernel actually picked up.
import sklearn

installed_version = sklearn.__version__
print(installed_version)

1.7.1


In [6]:
# === Regression: IC50 ===
# Trains several regressors on the IC50 target, grid-searches the ones that
# have a hyper-parameter grid, scores every model on a held-out split, and
# reports the best one by test-set R2.

# Imports
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pandas as pd
import numpy as np
import joblib

# Data loading
from utils import load_data

# Tunable constants, gathered in one place instead of repeated inline.
RANDOM_STATE = 42
TEST_SIZE = 0.2
CV_FOLDS = 5

# NOTE(review): log_transform=True presumably means y (and therefore the R2 /
# RMSE values reported below) is on a log scale — confirm in utils.load_data.
X, y = load_data(target_col='IC50, mM', log_transform=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

print(f"‚úÖ –î–∞–Ω–Ω—ã–µ –∑–∞–≥—Ä—É–∂–µ–Ω—ã. –ü—Ä–∏–∑–Ω–∞–∫–æ–≤: {X.shape[1]}, –û–±—ä–µ–∫—Ç–æ–≤: {X.shape[0]}")

# Models and their hyper-parameter grids.
# An empty grid means "fit with default settings, no grid search".
# NOTE(review): SVR with an RBF kernel is sensitive to feature scale; unless
# load_data already standardizes X, consider wrapping it in a scaling pipeline.
models = {
    "Linear Regression": (LinearRegression(), {}),
    "Random Forest": (
        RandomForestRegressor(random_state=RANDOM_STATE),
        {'n_estimators': [50, 100], 'max_depth': [8, 10], 'min_samples_split': [5, 10]}
    ),
    "Gradient Boosting": (
        GradientBoostingRegressor(random_state=RANDOM_STATE),
        {'n_estimators': [100], 'learning_rate': [0.1, 0.05], 'max_depth': [5]}
    ),
    "SVR": (
        SVR(),
        {'C': [1, 10], 'kernel': ['rbf'], 'gamma': ['scale', 'auto']}
    )
}

results = []

for name, (model, params) in models.items():
    print(f"\nüî¨ –û–±—É—á–µ–Ω–∏–µ: {name}")

    # Step 1: obtain a fitted estimator (grid-searched when a grid is given).
    if params:
        grid = GridSearchCV(model, params, cv=CV_FOLDS, scoring='r2', n_jobs=-1)
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        best_params = grid.best_params_
        print(f"  –õ—É—á—à–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã: {grid.best_params_}")

        # Persist the tuned model (only grid-searched models are saved).
        joblib.dump(best_model, f'best_ic50_model_{name.replace(" ", "_").lower()}.pkl')
    else:
        model.fit(X_train, y_train)
        best_model = model
        best_params = 'default'

    # Step 2: evaluate on the held-out split — one code path for every model,
    # so tuned and untuned models are guaranteed to be scored the same way.
    y_pred = best_model.predict(X_test)
    score = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    results.append({
        'Model': name,
        'R2': score,
        'RMSE': rmse,
        'Params': best_params
    })

    print(f"  R¬≤: {score:.4f}, RMSE: {rmse:.4f}")

# Results table, best model first.
results_df = pd.DataFrame(results).sort_values('R2', ascending=False)
print("\nüìä –†–µ–∑—É–ª—å—Ç–∞—Ç—ã:")
print(results_df[['Model', 'R2', 'RMSE']])

# Conclusion: the recommendation names whichever model actually ranked first,
# rather than a hard-coded model name that could contradict the table above.
best = results_df.iloc[0]
print(f"\n‚úÖ –õ—É—á—à–∞—è –º–æ–¥–µ–ª—å: {best['Model']} (R¬≤ = {best['R2']:.4f})")
print(f"üí° –†–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏—è: –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å {best['Model']} —Å –ø–æ–¥–æ–±—Ä–∞–Ω–Ω—ã–º–∏ –ø–∞—Ä–∞–º–µ—Ç—Ä–∞–º–∏.")

‚úÖ –î–∞–Ω–Ω—ã–µ –∑–∞–≥—Ä—É–∂–µ–Ω—ã. –ü—Ä–∏–∑–Ω–∞–∫–æ–≤: 211, –û–±—ä–µ–∫—Ç–æ–≤: 998

üî¨ –û–±—É—á–µ–Ω–∏–µ: Linear Regression
  R¬≤: -0.4106, RMSE: 1.2046

üî¨ –û–±—É—á–µ–Ω–∏–µ: Random Forest
  –õ—É—á—à–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
  R¬≤: 0.7613, RMSE: 0.4956

üî¨ –û–±—É—á–µ–Ω–∏–µ: Gradient Boosting
  –õ—É—á—à–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
  R¬≤: 0.7955, RMSE: 0.4586

üî¨ –û–±—É—á–µ–Ω–∏–µ: SVR
  –õ—É—á—à–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
  R¬≤: 0.1483, RMSE: 0.9360

üìä –†–µ–∑—É–ª—å—Ç–∞—Ç—ã:
               Model        R2      RMSE
2  Gradient Boosting  0.795540  0.458620
1      Random Forest  0.761259  0.495579
3                SVR  0.148304  0.936033
0  Linear Regression -0.410608  1.204625

‚úÖ –õ—É—á—à–∞—è –º–æ–¥–µ–ª—å: Gradient Boosting (R¬≤ = 0.7955)
üí° –†–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏—è: –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å Gradient Boosting —Å –ø–æ–¥–

Вывод: лучшая модель — Gradient Boosting (R² ≈ 0.80 на тестовой выборке). Логарифмирование улучшило результаты.