In [124]:
import numpy as np #для матричных вычислений
import pandas as pd #для анализа и предобработки данных
import matplotlib.pyplot as plt #для визуализации
import seaborn as sns #для визуализации

from sklearn import linear_model #линейные моделиё
from sklearn import tree #деревья решений
from sklearn import ensemble #ансамбли
from sklearn import metrics #метрики
from sklearn import preprocessing #предобработка
from sklearn.model_selection import train_test_split #сплитование выборки

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [125]:
data= pd.read_csv('data/train_sem09.csv')
data.head(2)

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0


In [126]:
X = data.drop(['Activity'], axis=1)
y = data['Activity']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 1, test_size = 0.2)

### 1. Обучаем базовые модели

In [127]:
#Создаем объект класса логистическая регрессия
log_reg = linear_model.LogisticRegression(max_iter = 1000)
#Обучаем модель, минимизируя logloss
log_reg.fit(X_train, y_train)
y_train_pred = log_reg.predict(X_train)
y_test_pred = log_reg.predict(X_test)
print('f1_score_Train: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
print('f1_score_Test: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score_Train: 0.88
f1_score_Test: 0.78


In [128]:
#Создаем объект класса случайный лес
rf = ensemble.RandomForestClassifier(random_state=42)

#Обучаем модель
rf.fit(X_train, y_train)
#Выводим значения метрики 
y_train_pred = rf.predict(X_train)
print('F1_Train: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
y_test_pred = rf.predict(X_test)
print('F1_Test: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

F1_Train: 1.00
F1_Test: 0.81


### 2. Подбор Гиперпараметров
#### 2.1 GridSearch

In [129]:
random_state = 42

In [130]:
param_grid_lg = [
              {'penalty': ['l2', 'none'] , # тип регуляризации
              'solver': ['lbfgs', 'sag'], # алгоритм оптимизации
               'C': [0.01, 0.05, 0.1, 0.15, 0.2, 0.4, 0.6, 0.8, 1]}, # уровень силы регурялизации
              
              {'penalty': ['l1', 'l2'] ,
              'solver': ['liblinear', 'saga'],
               'C': [0.01, 0.05, 0.1, 0.15, 0.2, 0.4, 0.6, 0.8, 1]}
]
grid_search = GridSearchCV(
    estimator=linear_model.LogisticRegression(random_state=random_state, max_iter=1000), 
    param_grid=param_grid_lg, 
    cv=5, 
    n_jobs = -1
)  
%time grid_search.fit(X_train, y_train) 
y_test_pred = grid_search.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search.best_params_))

CPU times: total: 21.2 s
Wall time: 20min 50s
f1_score на тестовом наборе: 0.78
Наилучшие значения гиперпараметров: {'C': 0.15, 'penalty': 'l2', 'solver': 'sag'}


In [131]:
param_grid_rf = {'n_estimators': list(range(50, 301, 10)),
              'min_samples_leaf': [3, 5, 7],
              'max_depth': list(np.linspace(5, 30, 5, dtype=int)),
              'criterion': ['gini', 'entropy']
              }
            
grid_search_forest = GridSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=random_state), 
    param_grid=param_grid_rf, 
    cv=5, 
    n_jobs = -1
)  
%time grid_search_forest.fit(X_train, y_train) 
y_train_pred = grid_search_forest.predict(X_train)
print('f1_score на обучающем наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
y_test_pred = grid_search_forest.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search_forest.best_params_))

CPU times: total: 20.9 s
Wall time: 27min 48s
f1_score на обучающем наборе: 0.98
f1_score на тестовом наборе: 0.83
Наилучшие значения гиперпараметров: {'criterion': 'entropy', 'max_depth': 30, 'min_samples_leaf': 3, 'n_estimators': 240}


#### 2.2 RandomSearch

In [132]:
random_search = RandomizedSearchCV(
    estimator=linear_model.LogisticRegression(random_state=random_state, max_iter=1000), 
    param_distributions=param_grid_lg, 
    cv=5, 
    n_iter = 40, 
    n_jobs = -1
)  
%time random_search.fit(X_train, y_train) 
y_test_pred = random_search.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(random_search.best_params_))

CPU times: total: 19.4 s
Wall time: 13min 27s
f1_score на тестовом наборе: 0.78
Наилучшие значения гиперпараметров: {'solver': 'sag', 'penalty': 'l2', 'C': 0.15}


In [133]:
random_search_forest = RandomizedSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=random_state), 
    param_distributions=param_grid_rf, 
    cv=5,
    n_iter = 40, 
    n_jobs = -1
)  
%time random_search_forest.fit(X_train, y_train) 
y_train_pred = random_search_forest.predict(X_train)
y_test_pred = random_search_forest.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(random_search_forest.best_params_))

CPU times: total: 5.09 s
Wall time: 1min 21s
f1_score на тестовом наборе: 0.83
Наилучшие значения гиперпараметров: {'n_estimators': 250, 'min_samples_leaf': 3, 'max_depth': 30, 'criterion': 'entropy'}


### 3. Hyperopt

In [134]:
import hyperopt
from hyperopt import hp, fmin, tpe, Trials
from sklearn.model_selection import cross_val_score

In [135]:
# зададим пространство поиска гиперпараметров
space_rf={'n_estimators': hp.quniform('n_estimators', 50, 300, 10),
       'max_depth' : hp.quniform('max_depth', 5, 40, 1),
       'min_samples_leaf': hp.quniform('min_samples_leaf', 3, 7, 1),
       'criterion': hp.choice('criterion',['gini', 'entropy'])
      }


In [136]:
def hyperopt_rf(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    # функция получает комбинацию гиперпараметров в "params"
    params = {'n_estimators': int(params['n_estimators']), 
              'max_depth': int(params['max_depth']), 
             'min_samples_leaf': int(params['min_samples_leaf']),
             'criterion': str(params['criterion'])
              }
  
    # используем эту комбинацию для построения модели
    model = ensemble.RandomForestClassifier(**params, random_state=random_state)

    # обучаем модель
    model.fit(X, y)
    #score = metrics.f1_score(y, model.predict(X))
    # обучать модель можно также с помощью кросс-валидации
    # применим  cross validation с тем же количеством фолдов
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()
    # метрику необходимо минимизировать, поэтому ставим знак минус
    return -score

In [137]:
trials = Trials() # используется для логирования результатов

best_rf=fmin(hyperopt_rf, # наша функция 
          space=space_rf, # пространство гиперпараметров
          algo=tpe.suggest, # алгоритм оптимизации, установлен по умолчанию, задавать необязательно
          max_evals=40, # максимальное количество итераций
          trials=trials, # логирование результатов
          rstate=np.random.default_rng(random_state)# фиксируем для повторяемости результата
         )
print("Наилучшие значения гиперпараметров {}".format(best_rf))

100%|██████████| 40/40 [04:35<00:00,  6.89s/trial, best loss: -0.81390244709293]  
Наилучшие значения гиперпараметров {'criterion': 1, 'max_depth': 36.0, 'min_samples_leaf': 3.0, 'n_estimators': 250.0}


In [138]:
# рассчитаем точность для тестовой выборки
model = ensemble.RandomForestClassifier(
    random_state=random_state, 
    n_estimators=int(best_rf['n_estimators']),
    max_depth=int(best_rf['max_depth']),
    min_samples_leaf=int(best_rf['min_samples_leaf'])
)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
print('f1_score на обучающем наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
y_test_pred = model.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на обучающем наборе: 0.98
f1_score на тестовом наборе: 0.83


### 4.  OPTUNA

#### 4.1 Подбор параметров для случайного леса

In [139]:
import optuna

In [165]:
def optuna_rf(trial, cv = 5):
  # задаем пространства поиска гиперпараметров
  n_estimators = trial.suggest_int('n_estimators', 50, 300, 10)
  max_depth = trial.suggest_int('max_depth', 5, 40, 1)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 7, 1)
  criterion=trial.suggest_categorical('criterion', ['gini', 'entropy'])

  # создаем модель
  model = ensemble.RandomForestClassifier(n_estimators=n_estimators,
                                          max_depth=max_depth,
                                          min_samples_leaf=min_samples_leaf,
                                          criterion=criterion,
                                          random_state=random_state)
  # обучаем модель
  model.fit(X_train, y_train)
  #score = metrics.f1_score(y_train, model.predict(X_train))
  score = cross_val_score(model, X_train, y_train, cv=cv, scoring="f1", n_jobs=-1).mean()

  return score

In [166]:
study = optuna.create_study(study_name="RandomForestClassifier", direction="maximize")
optuna.logging.set_verbosity(optuna.logging.WARNING)
# ищем лучшую комбинацию гиперпараметров n_trials раз
study.optimize(optuna_rf, n_trials=40)

[32m[I 2023-04-09 23:33:32,607][0m A new study created in memory with name: RandomForestClassifier[0m
[32m[I 2023-04-09 23:33:39,146][0m Trial 0 finished with value: 0.7774768319124269 and parameters: {'n_estimators': 120, 'max_depth': 6, 'min_samples_leaf': 2, 'criterion': 'gini'}. Best is trial 0 with value: 0.7774768319124269.[0m
[32m[I 2023-04-09 23:33:46,353][0m Trial 1 finished with value: 0.8063117595812116 and parameters: {'n_estimators': 180, 'max_depth': 28, 'min_samples_leaf': 3, 'criterion': 'gini'}. Best is trial 1 with value: 0.8063117595812116.[0m
[32m[I 2023-04-09 23:33:50,457][0m Trial 2 finished with value: 0.797435293697329 and parameters: {'n_estimators': 130, 'max_depth': 16, 'min_samples_leaf': 6, 'criterion': 'gini'}. Best is trial 1 with value: 0.8063117595812116.[0m
[32m[I 2023-04-09 23:33:55,871][0m Trial 3 finished with value: 0.7982751045577687 and parameters: {'n_estimators': 160, 'max_depth': 14, 'min_samples_leaf': 6, 'criterion': 'entropy'}

In [167]:
# выводим результаты на обучающей выборке
print("Наилучшие значения гиперпараметров {}".format(study.best_params))
print("f1_score на обучающем наборе: {:.2f}".format(study.best_value))

Наилучшие значения гиперпараметров {'n_estimators': 300, 'max_depth': 28, 'min_samples_leaf': 2, 'criterion': 'entropy'}
f1_score на обучающем наборе: 0.82


In [169]:
# рассчитаем точность для тестовой выборки
model = ensemble.RandomForestClassifier(**study.best_params,random_state=random_state, )
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
print('f1_score на тренировочном наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на тренировочном наборе: 0.99
f1_score на тестовом наборе: 0.82


In [170]:
optuna.visualization.plot_optimization_history(study, target_name="f1_score")

In [171]:
optuna.visualization.plot_param_importances(study, target_name="f1_score")

In [172]:
optuna.visualization.plot_contour(study, params=["max_depth", "n_estimators"],
                                  target_name="f1_score")

#### 4.2 Подбор параметров для логистрической регрессии

In [147]:
def optuna_lg(trial, cv = 5):
  # задаем пространства поиска гиперпараметров
  c = trial.suggest_float('C', 0.001, 1)
  penalty = trial.suggest_categorical('penalty', ['l2', None])
  solver=trial.suggest_categorical('solver', ['lbfgs', 'sag', 'saga'])

  # создаем модель
  model_lg =linear_model.LogisticRegression(random_state=random_state,
                                            penalty=penalty,
                                            solver=solver,
                                            C=c,
                                            max_iter=1000)
  
  # обучаем модель
  model_lg.fit(X_train, y_train)
  #score = metrics.f1_score(y_train, model.predict(X_train))
  score = cross_val_score(model_lg, X_train, y_train, cv=cv, scoring="f1", n_jobs=-1).mean()

  return score

In [148]:
study = optuna.create_study(study_name='LogisticRegression', direction="maximize")
# ищем лучшую комбинацию гиперпараметров n_trials раз
study.optimize(optuna_lg, n_trials=30)

[32m[I 2023-04-09 20:38:34,696][0m A new study created in memory with name: LogisticRegression[0m

Setting penalty=None will ignore the C and l1_ratio parameters


The max_iter was reached which means the coef_ did not converge

[32m[I 2023-04-09 20:40:38,946][0m Trial 0 finished with value: 0.7546660089494158 and parameters: {'C': 0.5439302984856175, 'penalty': None, 'solver': 'saga'}. Best is trial 0 with value: 0.7546660089494158.[0m
[32m[I 2023-04-09 20:41:36,216][0m Trial 1 finished with value: 0.7738862369975371 and parameters: {'C': 0.532352524279976, 'penalty': 'l2', 'solver': 'saga'}. Best is trial 1 with value: 0.7738862369975371.[0m
[32m[I 2023-04-09 20:42:59,573][0m Trial 2 finished with value: 0.7693500736164901 and parameters: {'C': 0.8893675722256887, 'penalty': 'l2', 'solver': 'saga'}. Best is trial 1 with value: 0.7738862369975371.[0m

Setting penalty=None will ignore the C and l1_ratio parameters


The max_iter was reached which means the coef_ did not con

In [149]:
# выводим результаты на обучающей выборке
print("Наилучшие значения гиперпараметров {}".format(study.best_params))
print("f1_score на обучающем наборе: {:.2f}".format(study.best_value))

Наилучшие значения гиперпараметров {'C': 0.07748891910021997, 'penalty': 'l2', 'solver': 'sag'}
f1_score на обучающем наборе: 0.78


In [150]:
#bp = {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
# рассчитаем точность для тестовой выборки
model_lg = linear_model.LogisticRegression(**study.best_params,random_state=random_state, max_iter=1000 )
#model_lg = linear_model.LogisticRegression(**bp,random_state=random_state, max_iter=1000)
model_lg.fit(X_train, y_train)
y_train_pred = model_lg.predict(X_train)
y_test_pred = model_lg.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на тестовом наборе: 0.80


In [151]:
optuna.visualization.plot_optimization_history(study, target_name="f1_score")

In [152]:
optuna.visualization.plot_param_importances(study, target_name="f1_score")

In [153]:
optuna.visualization.plot_contour(study, params=["solver", 'C'],
                                  target_name="f1_score")