In [27]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import linear_model, metrics, ensemble
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from hyperopt import hp, fmin, tpe, Trials
import optuna

##### Прочитаем данные.

In [2]:
data = pd.read_csv('data/_train_sem09 (1).csv')
data.head()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


##### Выделим факторы и целевой признак в отдельные переменные.

In [3]:
X = data.drop(columns='Activity')
y = data['Activity']

##### Разделим данные на тренировочный, валидационный и тестовый наборы.

In [4]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_valid, X_test, y_valid, y_test = train_test_split(X_valid, y_valid, test_size=0.5, random_state=42, stratify=y_valid)

### Логистическая регрессия

##### Рассчет метрики F1-score без подбора гиперпараметров.

In [5]:
log_reg = linear_model.LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
print('f1_score на тестовом наборе без подбора гиперпараметров: {:.2f}'.format(metrics.f1_score(y_test, y_pred)))

f1_score на тестовом наборе без подбора гиперпараметров: 0.76


##### Рассчет метрики F1-score с подбором гиперпараметров методом GridSearchCV.

In [6]:
param_grid = [
    {
        'penalty': ['l1', 'l2', 'elaticnet', None],
        'solver': ['saga'],
        'C': np.linspace(0.1, 1, 7)
    },
    {
        'penalty': ['l2'],
        'solver': ['sag', 'liblinear'],
        'C': np.linspace(0.1, 1, 7)
    }
]

grid_search_log = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1
)
%time grid_search_log.fit(X_valid, y_valid)
y_pred = grid_search_log.predict(X_test)
print('f1_score на тестовом наборе с подбором гиперпараметров методом GridSearchCV: {:.2f}'.format(metrics.f1_score(y_test, y_pred)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search_log.best_params_))

35 fits failed out of a total of 210.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/familyafanasyevy/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/familyafanasyevy/Library/Python/3.9/lib/python/site-packages/sklearn/linear_model/_logistic.py", line 1160, in fit
    self._validate_params()
  File "/Users/familyafanasyevy/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 581, in _validate_params
    validate_parameter_constraints(
  File "/Users/familyafanasyevy/Library/Python/3.9/lib/python/site-packa

CPU times: user 1.78 s, sys: 176 ms, total: 1.95 s
Wall time: 44.9 s
f1_score на тестовом наборе с подбором гиперпараметров методом GridSearchCV: 0.73
Наилучшие значения гиперпараметров: {'C': 0.25, 'penalty': 'l1', 'solver': 'saga'}


##### Рассчет метрики F1-score с подбором гиперпараметров методом RandomizedSearchCV.

In [7]:
param_distributions = [
    {
        'penalty': ['l1', 'l2', 'elaticnet', None],
        'solver': ['saga'],
        'C': np.linspace(0.1, 1, 101)
    },
    {
        'penalty': ['l2'],
        'solver': ['sag', 'liblinear'],
        'C': np.linspace(0.1, 1, 101)
    }
]

rand_search_log = RandomizedSearchCV(
    estimator=log_reg,
    param_distributions=param_distributions,
    n_iter=50,
    cv=5,
    n_jobs=-1
)
%time rand_search_log.fit(X_valid, y_valid)
y_pred = rand_search_log.predict(X_test)
print('f1_score на тестовом наборе с подбором гиперпараметров методом RandomizedSearchCV: {:.2f}'.format(metrics.f1_score(y_test, y_pred)))
print("Наилучшие значения гиперпараметров: {}".format(rand_search_log.best_params_))

30 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/familyafanasyevy/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/familyafanasyevy/Library/Python/3.9/lib/python/site-packages/sklearn/linear_model/_logistic.py", line 1160, in fit
    self._validate_params()
  File "/Users/familyafanasyevy/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 581, in _validate_params
    validate_parameter_constraints(
  File "/Users/familyafanasyevy/Library/Python/3.9/lib/python/site-packa

CPU times: user 1.43 s, sys: 122 ms, total: 1.55 s
Wall time: 57.1 s
f1_score на тестовом наборе с подбором гиперпараметров методом RandomizedSearchCV: 0.69
Наилучшие значения гиперпараметров: {'solver': 'sag', 'penalty': 'l2', 'C': 0.30700000000000005}


### Случайный лес

##### Обучим модель случайного леса без подбора гиперпараметров.

In [16]:
random_forest = ensemble.RandomForestClassifier(class_weight='balanced', random_state=42)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
print('f1_score на тестовом наборе без подбора гиперпараметров: {:.2f}'.format(metrics.f1_score(y_test, y_pred)))

f1_score на тестовом наборе без подбора гиперпараметров: 0.81


##### Зададим пространство поиска гиперпараметров.

In [22]:
space = {
    'n_estimators': hp.quniform('n_estimators', 100, 501, 100),
    'max_depth': hp.quniform('max_depth', 10, 31, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 2, 11, 1)
}

In [23]:
random_state = 42
def hyperopt_rf(params, cv=5, X=X_valid, y=y_valid, random_state=random_state):
    params = {'n_estimators': int(params['n_estimators']), 
              'max_depth': int(params['max_depth']), 
              'min_samples_leaf': int(params['min_samples_leaf'])
              }
  
    model = ensemble.RandomForestClassifier(**params, random_state=random_state)
    
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    return -score

##### Начинаем подбор гиперпараметров.

In [25]:
%time

trials = Trials()

best = fmin(hyperopt_rf,
          space=space,
          algo=tpe.suggest,
          max_evals=50,
          trials=trials,
          rstate=np.random.default_rng(random_state)
         )
print("Наилучшие значения гиперпараметров {}".format(best))

random_forest = ensemble.RandomForestClassifier(random_state=42,
                                                class_weight='balanced',
                                                n_estimators=int(best['n_estimators']),
                                                max_depth=int(best['max_depth']),
                                                min_samples_leaf=int(best['min_samples_leaf'])
                                                )
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
print('f1_score на тестовом наборе с подбором гиперпараметров с помощью Hyperopt: {:.2f}'.format(metrics.f1_score(y_test, y_pred)))

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 2.15 µs
100%|██████████| 50/50 [00:21<00:00,  2.33trial/s, best loss: -0.7267567693157784]
Наилучшие значения гиперпараметров {'max_depth': 23.0, 'min_samples_leaf': 3.0, 'n_estimators': 200.0}
f1_score на тестовом наборе с подбором гиперпараметров: 0.80


##### Настроим оптимизацию гиперпараметров с помощью Optuna для алгоритма случайного леса.

In [32]:
def optuna_rf(trial):
  # задаем пространства поиска гиперпараметров
  n_estimators = trial.suggest_int('n_estimators', 100, 500, 100)
  max_depth = trial.suggest_int('max_depth', 10, 30, 1)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 10, 1)

  # создаем модель
  model = ensemble.RandomForestClassifier(n_estimators=n_estimators,
                                          max_depth=max_depth,
                                          min_samples_leaf=min_samples_leaf,
                                          random_state=random_state)
  # обучаем модель
  model.fit(X_valid, y_valid)
  score = metrics.f1_score(y_train, model.predict(X_train))

  return score

In [33]:
%time
study = optuna.create_study(study_name="RandomForestClassifier", direction="maximize")
study.optimize(optuna_rf, n_trials=30)

[32m[I 2023-02-14 00:44:15,868][0m A new study created in memory with name: RandomForestClassifier[0m


CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.1 µs


[32m[I 2023-02-14 00:44:16,118][0m Trial 0 finished with value: 0.7610671357373204 and parameters: {'n_estimators': 200, 'max_depth': 24, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.7610671357373204.[0m
[32m[I 2023-02-14 00:44:16,647][0m Trial 1 finished with value: 0.765634132086499 and parameters: {'n_estimators': 500, 'max_depth': 18, 'min_samples_leaf': 9}. Best is trial 1 with value: 0.765634132086499.[0m
[32m[I 2023-02-14 00:44:17,067][0m Trial 2 finished with value: 0.7607361963190185 and parameters: {'n_estimators': 400, 'max_depth': 27, 'min_samples_leaf': 10}. Best is trial 1 with value: 0.765634132086499.[0m
[32m[I 2023-02-14 00:44:17,583][0m Trial 3 finished with value: 0.7675707203266258 and parameters: {'n_estimators': 500, 'max_depth': 20, 'min_samples_leaf': 10}. Best is trial 3 with value: 0.7675707203266258.[0m
[32m[I 2023-02-14 00:44:18,000][0m Trial 4 finished with value: 0.7607361963190185 and parameters: {'n_estimators': 400, 'max_depth': 2

In [36]:
print("Наилучшие значения гиперпараметров {}".format(study.best_params))

Наилучшие значения гиперпараметров {'n_estimators': 400, 'max_depth': 12, 'min_samples_leaf': 2}


In [37]:
random_forest = ensemble.RandomForestClassifier(**study.best_params,random_state=random_state)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
print('f1_score на тестовом наборе с подбором гиперпараметров с помощью Optuna: {:.2f}'.format(metrics.f1_score(y_test, y_pred)))

f1_score на тестовом наборе с подбором гиперпараметров с помощью Optuna: 0.81
