# Прогнозирование биологического ответа

Необходимо обучить две модели: логистическую регрессию и случайный лес. Далее нужно сделать подбор гиперпараметров с помощью базовых и продвинутых методов оптимизации. Важно использовать все четыре метода (GridSeachCV, RandomizedSearchCV, Hyperopt, Optuna) хотя бы по разу, максимальное количество итераций не должно превышать 50.

В качестве метрики будем использовать F1-score.

Данные представлены в формате CSV.  Каждая строка представляет молекулу.

  Первый столбец Activity содержит экспериментальные данные, описывающие фактический биологический ответ [0, 1];
  Остальные столбцы D1-D1776 представляют собой молекулярные дескрипторы — это вычисляемые свойства, которые могут фиксировать некоторые характеристики молекулы, например размер, форму или состав элементов.



## Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import hyperopt
from hyperopt import fmin, tpe, hp, Trials
import optuna

## Load Data

In [2]:
# # connecting google drive
# from google.colab import drive
# drive.mount('/content/drive')
# # path to the file
# data = pd.read_csv('./drive/MyDrive/skillfactory_data/ML7/train_sem09.csv')
# data.head(3)

data = pd.read_csv('dataML7/train_sem09.csv')
data.head(3)

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0


## Data preparation

In [3]:
# split data into features and target variable
X = data.drop('Activity', axis=1)
y = data['Activity']

In [4]:
# check if our data is balanced
y.value_counts(normalize=True)

Activity
1    0.542255
0    0.457745
Name: proportion, dtype: float64

In [5]:
# create Train and Test data sets # Do not stratify until the threshold is 90/10
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Logistic Regression

### GridSearchCV

In [14]:
# initializing LogisticRegression model
model = LogisticRegression(random_state=42, max_iter=3000)
# defining hyperparameters
param_grid = [ {'C': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
              'penalty': ['l1', 'l2'],
              'solver': ['liblinear', 'saga']
              },
               {'C': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
                'penalty': ['l2'],
                'solver': ['lbfgs', 'sag', 'newton-cg'],
                }
]

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
%time grid_search.fit(X_train, y_train)

print(f'The F1 score on X_train using GridSearchCV is {grid_search.score(X_train, y_train)}')
print(f'The F1 score on X_test using GridSearchCV is {grid_search.score(X_test, y_test)}')
print(f'The best parameters are {grid_search.best_params_}')

CPU times: user 2.54 s, sys: 1.13 s, total: 3.67 s
Wall time: 4min 4s
The F1 score on X_train using GridSearchCV is 0.8515337423312883
The F1 score on X_test using GridSearchCV is 0.7945516458569808
The best parameters are {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}


### RandomizedSearchCV

In [16]:
model = LogisticRegression(random_state=42, max_iter=3000)

param_grid = [ {'C': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
              'penalty': ['l1', 'l2'],
              'solver': ['liblinear', 'saga']},
              
               {'C': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
                'penalty': ['l2'],
                'solver': ['lbfgs', 'sag', 'newton-cg']}
             ]

randomized_search = RandomizedSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1, n_iter=50)
%time randomized_search.fit(X_train, y_train)

print(f'The F1 score on X_train using RandomizedSearchCV is {randomized_search.score(X_train, y_train)}')
print(f'The F1 score on X_test using RandomizedSearchCV is {randomized_search.score(X_test, y_test)}')
print(f'The best parameters are {randomized_search.best_params_}')

CPU times: user 3.12 s, sys: 1.74 s, total: 4.86 s
Wall time: 2min 57s
The F1 score on X_train using RandomizedSearchCV is 0.8515337423312883
The F1 score on X_test using RandomizedSearchCV is 0.7945516458569808
The best parameters are {'solver': 'newton-cg', 'penalty': 'l2', 'C': 0.1}


### HyperOpt

In [107]:
# setup the space for parameters
space_lr = hp.choice('a', [ 
       {'C': hp.quniform('C1', 0.1, 1, 0.1),
        'penalty': hp.choice('penalty1', ['l1', 'l2']),
        'solver': hp.choice('solver1', ['liblinear', 'saga'])},
       
       {'C': hp.quniform('C2', 0.1, 1, 0.1),
        'penalty': hp.choice('penalty2', ['l2']),
        'solver': hp.choice('solver2', ['lbfgs', 'sag', 'newton-cg'])}
      ])


In [108]:
random_state = 42

def hyperopt_lr(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    model_params = {
        'penalty': params['penalty'],
        'solver': params['solver'],
        'C': params['C']
      }
    model = LogisticRegression(**model_params, random_state=random_state, max_iter=3000)
    score = model_selection.cross_val_score(model, X, y, cv=cv, scoring='f1', n_jobs=-1).mean()
    return -score

In [109]:
%%time
trials_lr = Trials()
best=fmin(hyperopt_lr, # our function
          space=space_lr, # space of hyperparameters
          algo=tpe.suggest, # optimization algorithm
          max_evals=50, # max number of iterations
          trials=trials_lr, # logging results
          rstate=np.random.default_rng(random_state)# setting random state to replicate results
         )


100%|████████| 50/50 [04:06<00:00,  4.92s/trial, best loss: -0.7847204530866303]
CPU times: user 1.05 s, sys: 310 ms, total: 1.35 s
Wall time: 4min 6s


In [114]:
# Best hyperparameters
print(hyperopt.space_eval(space_lr, best))

{'C': 0.30000000000000004, 'penalty': 'l1', 'solver': 'liblinear'}


In [115]:
model = LogisticRegression(C=0.3, penalty='l1', solver='liblinear', random_state=random_state, max_iter=3000)

model.fit(X_train, y_train)

print(f'The F1 score on X_train using HyperOpt is {metrics.f1_score(y_train, model.predict(X_train))}')
print(f'The F1 score on X_test using HyperOpt is {metrics.f1_score(y_test, model.predict(X_test))}')

The F1 score on X_train using HyperOpt is 0.8352654057352045
The F1 score on X_test using HyperOpt is 0.8


### Optuna

In [26]:
random_state = 42

def optuna_lr(trial, X=X_train, y=y_train, cv=5):
    C = trial.suggest_float('C', 0.1, 1, step=0.1)
    penalty = trial.suggest_categorical('penalty', ['l2'])
    solver = trial.suggest_categorical('solver', ['lbfgs', 'sag', 'newton-cg', 'liblinear', 'saga'])
    
    model = LogisticRegression(C=C, penalty=penalty, solver=solver, random_state=random_state, max_iter=3000)
    
    model.fit(X, y)
    
    score = model_selection.cross_val_score(model, X, y, cv=cv, scoring='f1', n_jobs=-1).mean()
    
    return score

In [27]:
%%time

study = optuna.create_study(study_name='LogisticRegression', direction='maximize')

study.optimize(optuna_lr, n_trials=50)

[I 2023-12-12 05:51:22,319] A new study created in memory with name: LogisticRegression
[I 2023-12-12 05:51:50,206] Trial 0 finished with value: 0.7711740623707684 and parameters: {'C': 1.0, 'penalty': 'l2', 'solver': 'saga'}. Best is trial 0 with value: 0.7711740623707684.
[I 2023-12-12 05:51:52,679] Trial 1 finished with value: 0.7772292673632061 and parameters: {'C': 0.4, 'penalty': 'l2', 'solver': 'newton-cg'}. Best is trial 1 with value: 0.7772292673632061.
[I 2023-12-12 05:51:54,878] Trial 2 finished with value: 0.7712549531969394 and parameters: {'C': 0.6, 'penalty': 'l2', 'solver': 'lbfgs'}. Best is trial 1 with value: 0.7772292673632061.
[I 2023-12-12 05:51:55,975] Trial 3 finished with value: 0.7802722359939726 and parameters: {'C': 0.2, 'penalty': 'l2', 'solver': 'liblinear'}. Best is trial 3 with value: 0.7802722359939726.
[I 2023-12-12 05:51:57,674] Trial 4 finished with value: 0.7809846733373472 and parameters: {'C': 0.30000000000000004, 'penalty': 'l2', 'solver': 'lbfgs'

CPU times: user 3min 38s, sys: 1min 44s, total: 5min 22s
Wall time: 4min 49s


In [28]:
print('Best hyper parameters settings: ', study.best_params)

Best hyper parameters settings:  {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}


In [29]:
# initialize model with best hyper params
model = LogisticRegression(C=0.1, penalty='l2', solver='newton-cg', random_state=random_state, n_jobs=-1)

# fit model
model.fit(X_train, y_train)

# make predictions for target variable
y_train_predict = model.predict(X_train)
y_test_predict = model.predict(X_test)

# print F1 score for Train and Test
print('The F1 score for Train data set: ', metrics.f1_score(y_train, y_train_predict))
print('The F1 score for Test data set: ', metrics.f1_score(y_test, y_test_predict))

The F1 score for Train data set:  0.8515337423312883
The F1 score for Test data set:  0.7945516458569808


## Random Forest

### GridSearchCV

In [162]:
random_state = 42

model = ensemble.RandomForestClassifier(random_state=random_state)

params_grid_rf = {'n_estimators': np.linspace(10, 1000, 4, dtype=int),
               'max_depth': np.linspace(5, 100, 3, dtype=int),
               'criterion': ["gini", "entropy"],
               'min_samples_leaf': np.linspace(1, 20, 3, dtype=int)} 

In [163]:
grid_search_rf = GridSearchCV(model, param_grid=params_grid_rf, scoring='f1', cv=5, n_jobs=-1)

%time grid_search_rf.fit(X_train, y_train)

CPU times: user 6.98 s, sys: 211 ms, total: 7.19 s
Wall time: 2min 1s


In [164]:
print('The F1 score on X_train using GridSearch and RandomForestClassifier: ', grid_search_rf.score(X_train, y_train))
print('The F1 score on X_train using GridSearch and RandomForestClassifier: ', grid_search_rf.score(X_test, y_test))
print('The best hyper params are: ', grid_search_rf.best_params_)

The F1 score on X_train using GridSearch and RandomForestClassifier:  1.0
The F1 score on X_train using GridSearch and RandomForestClassifier:  0.8292682926829269
The best hyper params are:  {'criterion': 'gini', 'max_depth': 52, 'min_samples_leaf': 1, 'n_estimators': 670}


### RandomizedSearch

In [157]:
random_state = 42

model = ensemble.RandomForestClassifier(random_state=random_state)

params_grid_rf = {'n_estimators': np.linspace(10, 1000, 5, dtype=int),
               'max_depth': np.linspace(5, 100, 5, dtype=int),
               'criterion': ["gini", "entropy"],
               'min_samples_leaf': np.linspace(1, 20, 5, dtype=int)} 

In [160]:
randomize_search_rf = RandomizedSearchCV(model, param_distributions=params_grid_rf, cv=5, scoring='f1', n_jobs=-1, n_iter=50)

%time randomize_search_rf.fit(X_train, y_train)

CPU times: user 8.93 s, sys: 292 ms, total: 9.22 s
Wall time: 1min 38s


In [161]:
print('The F1 score on X_train using RandomizedSearch and RandomForestClassifier: ', randomize_search_rf.score(X_train, y_train))
print('The F1 score on X_train using RandomizedSearch and RandomForestClassifier: ', randomize_search_rf.score(X_test, y_test))
print('The best hyper params are: ', randomize_search_rf.best_params_)

The F1 score on X_train using RandomizedSearch and RandomForestClassifier:  1.0
The F1 score on X_train using RandomizedSearch and RandomForestClassifier:  0.8310185185185186
The best hyper params are:  {'n_estimators': 752, 'min_samples_leaf': 1, 'max_depth': 28, 'criterion': 'entropy'}


### Hyperopt

In [185]:
# create space
space_rf = {'n_estimators': hp.quniform('n_estimators', 10, 1000, 50),
        'max_depth': hp.quniform('max_depth', 5, 100, 5),
        'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1),
        'criterion': hp.choice('criterion', ["gini", "entropy", "log_loss"]),
        'max_features': hp.choice('max_features', ["sqrt", "log2", None])}

In [8]:
random_state = 42

# def objective function
def hyperopt_rf(params, X=X_train, y=y_train, cv=5, random_state=random_state):
    
    model_params = {'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'min_samples_leaf': int(params['min_samples_leaf']),
        'criterion': params['criterion'],
        'max_features': params['max_features']}
    # initialize model
    model = ensemble.RandomForestClassifier(**model_params, random_state=random_state)
    
    # cross validation
    score = model_selection.cross_val_score(model, X, y, scoring='f1', cv=cv, n_jobs=-1).mean()
    
    return -score

In [187]:
# set to log trials attempt
trials_rf = Trials()

# initialize fmin minimization function
best = fmin(fn=hyperopt_rf, space=space_rf, trials=trials_rf, algo=tpe.suggest, max_evals=50, rstate=np.random.default_rng(random_state))

100%|██████| 50/50 [1:04:16<00:00, 77.13s/trial, best loss: -0.8167261696392407]


In [189]:
# Best hyperparameters
print(hyperopt.space_eval(space_rf, best))

{'criterion': 'entropy', 'max_depth': 65.0, 'max_features': None, 'min_samples_leaf': 1.0, 'n_estimators': 800.0}


In [6]:
# initialize Random Forest model with the best hyper params we received from HyperOpt
model = ensemble.RandomForestClassifier(
    criterion='entropy', 
    max_depth=65, 
    max_features=None, 
    min_samples_leaf=1, 
    n_estimators=800, 
    random_state=42,
    n_jobs=-1
)

In [7]:
# fit the model on Train data set
model.fit(X_train, y_train)

In [8]:
# make predictions for Train and Test data set
y_train_predict = model.predict(X_train)
y_test_predict = model.predict(X_test)

In [9]:
# print F1 score for Train and Test
print('The F1 score for Train data set: ', metrics.f1_score(y_train, y_train_predict))
print('The F1 score for Test data set: ', metrics.f1_score(y_test, y_test_predict))

The F1 score for Train data set:  1.0
The F1 score for Test data set:  0.8192219679633868


### Optuna

In [17]:
random_state = 42

def optuna_rf(trial, X=X_train, y=y_train, cv=5):
    n_estimators = trial.suggest_int('n_estimators', 10, 960, step=50)
    max_depth = trial.suggest_int('max_depth', 5, 100, step=5)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10, step=1)
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    
    model = ensemble.RandomForestClassifier(
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        min_samples_leaf=min_samples_leaf, 
        criterion=criterion, 
        max_features=max_features, 
        random_state=random_state
    )
    
    score = model_selection.cross_val_score(model, X, y, cv=cv, scoring='f1', n_jobs=-1).mean()
    
    return score


In [18]:
%%time

study_rf = optuna.create_study(study_name='RandomForestClassifier', direction='maximize')

study_rf.optimize(optuna_rf, n_trials=50)

[I 2023-12-11 19:27:33,573] A new study created in memory with name: RandomForestClassifier
[I 2023-12-11 19:27:34,724] Trial 0 finished with value: 0.7615474393656083 and parameters: {'n_estimators': 60, 'max_depth': 15, 'min_samples_leaf': 10, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 0 with value: 0.7615474393656083.
[I 2023-12-11 19:30:38,142] Trial 1 finished with value: 0.8078550752848107 and parameters: {'n_estimators': 960, 'max_depth': 75, 'min_samples_leaf': 8, 'criterion': 'gini', 'max_features': None}. Best is trial 1 with value: 0.8078550752848107.
[I 2023-12-11 19:30:41,376] Trial 2 finished with value: 0.8001491474504924 and parameters: {'n_estimators': 510, 'max_depth': 10, 'min_samples_leaf': 2, 'criterion': 'gini', 'max_features': 'sqrt'}. Best is trial 1 with value: 0.8078550752848107.
[I 2023-12-11 19:30:43,011] Trial 3 finished with value: 0.767584033633215 and parameters: {'n_estimators': 410, 'max_depth': 85, 'min_samples_leaf': 10, 'criterion':

CPU times: user 5.02 s, sys: 2.39 s, total: 7.41 s
Wall time: 1h 13min 27s


In [20]:
# best params
print('The best params using Optuna optimization: ', study_rf.best_params)

The best params using Optuna optimization:  {'n_estimators': 560, 'max_depth': 25, 'min_samples_leaf': 3, 'criterion': 'entropy', 'max_features': None}


In [22]:
# create RandomForest model using the best params
model = ensemble.RandomForestClassifier(
        n_estimators=560, 
        max_depth=25, 
        min_samples_leaf=3, 
        criterion='entropy', 
        max_features=None, 
        random_state=random_state, 
        n_jobs=-1
    )

# fit the model
model.fit(X_train, y_train)

In [23]:
# make predictions
y_train_predict = model.predict(X_train)
y_test_predict = model.predict(X_test)

# print F1 score for Train and Test
print('The F1 score for Train data set: ', metrics.f1_score(y_train, y_train_predict))
print('The F1 score for Test data set: ', metrics.f1_score(y_test, y_test_predict))

The F1 score for Train data set:  0.9978145488604434
The F1 score for Test data set:  0.8164196123147092
