In [55]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

import optuna
import hyperopt
from hyperopt import hp, fmin, tpe, Trials

In [56]:
# загружаю и смотрю данные

df = pd.read_csv('_train_sem09.csv')

print(df.shape)
df.head()

(3751, 1777)


Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [57]:
# выделяю таргет
y = df['Activity']
X = df.drop('Activity', axis=1)

print(X.shape, y.shape)

(3751, 1776) (3751,)


In [58]:
# проверяю сбалансированность выборки
y.value_counts(normalize=True)

1    0.542255
0    0.457745
Name: Activity, dtype: float64

In [59]:
# выделяю тестовую выборку

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [60]:
# проверяю метрики со значениями по умолчанию

base_logreg = LogisticRegression(max_iter=1000)

base_logreg.fit(X_train, y_train)

base_logreg_pred = base_logreg.predict(X_test)

print(metrics.f1_score(y_test, base_logreg_pred))



0.7831900668576887


In [61]:
base_rf = RandomForestClassifier(random_state=42)

base_rf.fit(X_train, y_train)

base_rf_pred = base_rf.predict(X_test)

print(metrics.f1_score(y_test, base_rf_pred))

0.818007662835249


## **GridSearchCV**

### *Logistic Regression*

In [31]:
# задаю несколько параметров и их значений для перебора
param_grid = {'penalty': ['l2', 'none'],
              'solver': ['lbfgs', 'saga'],
              'C': list(np.linspace(0.01, 1, 4, dtype=float))
              }

# создаю объект GridSearchCV с логистической регрессией
grid_search_lr = GridSearchCV(
    estimator=LogisticRegression(
        random_state=42,
        max_iter=1000
    ), 
    param_grid=param_grid, 
    cv=5, 
    n_jobs = -1
)

# обучаю, получаю предсказания и считаю метрику
grid_search_lr.fit(X_train, y_train)
grid_search_pred = grid_search_lr.predict(X_test)
print(metrics.f1_score(y_test, grid_search_pred))

0.7707129094412332


In [33]:
# смотрю модель с лучшими параметрами
print(grid_search_lr.best_estimator_)

LogisticRegression(C=0.34, max_iter=1000, random_state=42)


### *Random Forest*

In [35]:
# сетка параметров
param_grid = {
    'n_estimators' : [100, 150, 200],
    'max_depth' : [10, 20, 30],
    'min_samples_leaf' : [5, 10]
}

# объект GridSearchCV со случайным лесом
grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(
        random_state=42
    ),
    param_grid=param_grid, 
    cv=5, 
    n_jobs = -1
)

# обучаю, получаю предсказания и считаю метрику
grid_search_rf.fit(X_train, y_train)
grid_search_pred = grid_search_rf.predict(X_test)
print(metrics.f1_score(y_test, grid_search_pred))

0.8049951969260327


## **RandomizedSearchCV**

### *Logistic Regression*

In [37]:
param_distributions = {'penalty': ['l2', 'none'] ,
              'solver': ['lbfgs', 'sag'],
               'C': list(np.linspace(0.01, 1, 10, dtype=float))
               }
            
random_search_lr = RandomizedSearchCV(
    estimator=LogisticRegression(random_state=42, max_iter=1000), 
    param_distributions=param_distributions,
    cv=5,
    n_iter = 10,
    n_jobs = -1
)  
random_search_lr.fit(X_train, y_train)
random_search_lr_pred = random_search_lr.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, random_search_lr_pred)))
print("Наилучшие значения гиперпараметров: {}".format(random_search_lr.best_params_))

f1_score на тестовом наборе: 0.77
Наилучшие значения гиперпараметров: {'solver': 'sag', 'penalty': 'l2', 'C': 0.23}


### *Random Forest*

In [47]:
# область параметров
param_distributions = {
    'n_estimators': list(range(100, 200, 25)),
    'min_samples_leaf': [3, 5, 7],
    'max_depth': list(np.linspace(10, 30, 10, dtype=int))
    }

# объект Randomized Search
random_search_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42), 
    param_distributions=param_distributions, 
    cv=5,
    n_iter = 10, 
    n_jobs = -1,
    random_state=42
)

# обучаю, получаю предсказания, смотрю метрики
random_search_rf.fit(X_train, y_train)
random_search_rf_pred = random_search_rf.predict(X_test)

print(metrics.f1_score(y_test, random_search_rf_pred))
print("Наилучшие значения гиперпараметров: {}".format(random_search_rf.best_params_))

KeyboardInterrupt: 

## **Hyperopt**

### *Logistic Regression*

In [66]:
# задаю пространство поиска
pen = ['l1', 'l2']
sol = ['liblinear', 'saga']
space={'penalty': hp.choice('penalty', ['l1', 'l2']), # тип регуляризации
       'solver': hp.choice('solver', ['liblinear', 'saga']), # алгоритм оптимизации
       'C': hp.uniform('C', low=0.01, high=1) # уровень силы регурялизации
      }

In [69]:
random_state = 42

# функция для обучения модели по выбранным гиперпараметрам с кросс-валидацией
def hyperopt_lr(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    
    params = {'penalty': str(params['penalty']),
              'solver': str(params['solver']),
             'C': float(params['C'])
              }

    model = LogisticRegression(**params, random_state=random_state)
    model.fit(X, y)
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    return -score

In [70]:
trials = Trials() # используется для логирования результатов

best=fmin(hyperopt_lr, # наша функция 
          space=space, # пространство гиперпараметров
          algo=tpe.suggest, # алгоритм оптимизации, установлен по умолчанию, задавать необязательно
          max_evals=20, # максимальное количество итераций
          trials=trials, # логирование результатов
          rstate=np.random.default_rng(random_state)# фиксируем для повторяемости результата
         )
print("Наилучшие значения гиперпараметров {}".format(best))

  5%|▌         | 1/20 [00:03<01:08,  3.63s/trial, best loss: -0.7770840655968769]




 15%|█▌        | 3/20 [00:19<01:45,  6.21s/trial, best loss: -0.7806799620627576]




 25%|██▌       | 5/20 [00:33<01:29,  5.99s/trial, best loss: -0.7842505969112389]




 30%|███       | 6/20 [00:42<01:38,  7.05s/trial, best loss: -0.7842505969112389]




 45%|████▌     | 9/20 [00:54<00:48,  4.40s/trial, best loss: -0.7842505969112389]




 55%|█████▌    | 11/20 [01:09<00:46,  5.22s/trial, best loss: -0.7842505969112389]




 65%|██████▌   | 13/20 [01:20<00:34,  5.00s/trial, best loss: -0.7842505969112389]




 70%|███████   | 14/20 [01:32<00:43,  7.30s/trial, best loss: -0.7842505969112389]




 80%|████████  | 16/20 [01:47<00:27,  6.77s/trial, best loss: -0.7842505969112389]




 85%|████████▌ | 17/20 [01:57<00:23,  7.80s/trial, best loss: -0.7842505969112389]




 90%|█████████ | 18/20 [02:09<00:17,  8.93s/trial, best loss: -0.7842505969112389]




100%|██████████| 20/20 [02:22<00:00,  7.15s/trial, best loss: -0.7842505969112389]
Наилучшие значения гиперпараметров {'C': 0.10567819922023905, 'penalty': 1, 'solver': 0}


In [73]:
# обучаю модель с лучшими параметрами и смотрю метрику
best_lr_model = LogisticRegression(
    random_state=random_state, 
    penalty=pen[best['penalty']],
    solver=sol[best['solver']],
    C=float(best['C'])
)

best_lr_model.fit(X_train, y_train)

best_lr_pred = best_lr_model.predict(X_test)
print(metrics.f1_score(y_test, best_lr_pred))

0.7981042654028436


### *Random Forest*

In [80]:
# задаю пространство поиска
space={'n_estimators': hp.quniform('n_estimators', 100, 200, 1),
       'max_depth' : hp.quniform('max_depth', 10, 25, 1),
       'min_samples_leaf': hp.quniform('min_samples_leaf', 3, 10, 1)
      }

In [81]:
random_state = 42

# функция для обучения модели по выбранным гиперпараметрам с кросс-валидацией
def hyperopt_rf(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    
    params = {'n_estimators': int(params['n_estimators']),
              'max_depth': int(params['max_depth']),
             'min_samples_leaf': int(params['min_samples_leaf'])
              }

    model = RandomForestClassifier(**params, random_state=random_state)
    model.fit(X, y)
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    return -score

In [82]:
trials = Trials()

best=fmin(hyperopt_rf, 
          space=space, 
          algo=tpe.suggest,
          max_evals=20,
          trials=trials,
          rstate=np.random.default_rng(random_state)
         )
print("Наилучшие значения гиперпараметров {}".format(best))

100%|██████████| 20/20 [01:15<00:00,  3.77s/trial, best loss: -0.8016372929476931]
Наилучшие значения гиперпараметров {'max_depth': 14.0, 'min_samples_leaf': 3.0, 'n_estimators': 103.0}


In [83]:
best_rf_model = RandomForestClassifier(
    random_state=random_state, 
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    min_samples_leaf=int(best['min_samples_leaf'])
)
best_rf_model.fit(X_train, y_train)
best_rf_pred = best_rf_model.predict(X_test)
print(metrics.f1_score(y_test, best_rf_pred))


0.8122020972354624


## **Optuna**

### *Logistic Regression*

In [101]:
def optuna_rf(trial):
  # задаем пространства поиска гиперпараметров
  penalty = trial.suggest_categorical('penalty', ['l2', 'none'])
  solver = trial.suggest_categorical('solver', ['lbfgs', 'sag'])
  C = trial.suggest_float('C', 0.01, 1, step=0.1)

  # создаем модель
  model = LogisticRegression(penalty=penalty,
                             solver=solver,
                             C=C,
                            random_state=random_state)
  # обучаем модель
  model.fit(X_train, y_train)
  score = metrics.f1_score(y_train, model.predict(X_train))

  return score

In [106]:
# создаю объект исследования и ищу лучшие параметры
study = optuna.create_study(study_name="LogisticRegression", direction="maximize")

study.optimize(optuna_rf, n_trials=20)

[32m[I 2023-05-03 15:54:49,636][0m A new study created in memory with name: LogisticRegression[0m
[32m[I 2023-05-03 15:54:55,448][0m Trial 0 finished with value: 0.8825806451612903 and parameters: {'penalty': 'l2', 'solver': 'sag', 'C': 0.6100000000000001}. Best is trial 0 with value: 0.8825806451612903.[0m
[32m[I 2023-05-03 15:55:00,658][0m Trial 1 finished with value: 0.8959481361426256 and parameters: {'penalty': 'none', 'solver': 'sag', 'C': 0.01}. Best is trial 1 with value: 0.8959481361426256.[0m
[32m[I 2023-05-03 15:55:04,834][0m Trial 2 finished with value: 0.8959481361426256 and parameters: {'penalty': 'none', 'solver': 'sag', 'C': 0.41000000000000003}. Best is trial 1 with value: 0.8959481361426256.[0m
[32m[I 2023-05-03 15:55:09,083][0m Trial 3 finished with value: 0.8863049095607234 and parameters: {'penalty': 'l2', 'solver': 'sag', 'C': 0.91}. Best is trial 1 with value: 0.8959481361426256.[0m
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number 

In [105]:
model = LogisticRegression(**study.best_params,random_state=random_state)
model.fit(X_train, y_train)

print("Наилучшие значения гиперпараметров {}".format(study.best_params))
y_test_pred = model.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))



Наилучшие значения гиперпараметров {'penalty': 'none', 'solver': 'lbfgs', 'C': 0.11}
f1_score на тестовом наборе: 0.76


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### *Random Forest*

In [87]:
def optuna_rf(trial):
  # задаем пространства поиска гиперпараметров
  n_estimators = trial.suggest_int('n_estimators', 100, 200, 1)
  max_depth = trial.suggest_int('max_depth', 10, 30, 1)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 10, 1)

  # создаем модель
  model = RandomForestClassifier(n_estimators=n_estimators,
                                          max_depth=max_depth,
                                          min_samples_leaf=min_samples_leaf,
                                          random_state=random_state)
  # обучаем модель
  model.fit(X_train, y_train)
  score = metrics.f1_score(y_train, model.predict(X_train))

  return score

In [88]:
# создаю объект исследования и ищу лучшие параметры
study = optuna.create_study(study_name="RandomForestClassifier", direction="maximize")

study.optimize(optuna_rf, n_trials=20)

[32m[I 2023-05-03 14:45:51,931][0m A new study created in memory with name: RandomForestClassifier[0m
[32m[I 2023-05-03 14:45:53,875][0m Trial 0 finished with value: 0.8952134540750323 and parameters: {'n_estimators': 112, 'max_depth': 29, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.8952134540750323.[0m
[32m[I 2023-05-03 14:45:55,760][0m Trial 1 finished with value: 0.9542782495101241 and parameters: {'n_estimators': 109, 'max_depth': 17, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.9542782495101241.[0m
[32m[I 2023-05-03 14:45:58,493][0m Trial 2 finished with value: 0.9905135754007196 and parameters: {'n_estimators': 154, 'max_depth': 19, 'min_samples_leaf': 2}. Best is trial 2 with value: 0.9905135754007196.[0m
[32m[I 2023-05-03 14:46:00,254][0m Trial 3 finished with value: 0.9606247966156851 and parameters: {'n_estimators': 132, 'max_depth': 24, 'min_samples_leaf': 4}. Best is trial 2 with value: 0.9905135754007196.[0m
[32m[I 2023-05-03 14:46:01,58

In [90]:
print("Наилучшие значения гиперпараметров {}".format(study.best_params))
y_test_pred = model.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

Наилучшие значения гиперпараметров {'n_estimators': 157, 'max_depth': 27, 'min_samples_leaf': 2}
f1_score на тестовом наборе: 0.80
