In [137]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import hyperopt
from hyperopt import hp, fmin, tpe, Trials
import optuna

%matplotlib inline

In [103]:
data = pd.read_csv('_train_sem09.csv')
data.head()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [104]:
y = data['Activity']
X = data.drop(['Activity'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)

## 1. LogisticRegression

In [139]:
log_reg = linear_model.LogisticRegression(random_state = 42, max_iter = 1000)
log_reg.fit(X_train, y_train)

y_test_pred = log_reg.predict(X_test)
print('f1_score на тестовом наборе: {:.3f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на тестовом наборе: 0.789


### 1.1 GridSearchCV

In [115]:
param_grid = {'penalty': ['l2', 'None'], 
              'solver': ['lbfgs', 'sag'],
              'C': [0.01, 0.5, 0.9]}

grid_search = GridSearchCV(estimator = linear_model.LogisticRegression(random_state = 42, max_iter = 1000),
                           param_grid = param_grid, 
                           cv = 5)

grid_search.fit(X_train, y_train)

In [117]:
y_test_pred = grid_search.predict(X_test)
print('f1_score на тестовом наборе: {:.3f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search.best_params_))

f1_score на тестовом наборе: 0.780
Наилучшие значения гиперпараметров: {'C': 0.5, 'penalty': 'l2', 'solver': 'sag'}


### 1.2 RandomizedSearchCV

In [118]:
param_distributions = {'penalty': ['l2', 'None'], 
                       'solver': ['lbfgs', 'sag'], 
                       'C': list(np.linspace(0.01, 1, 10, dtype=float))}

random_search = RandomizedSearchCV(estimator=linear_model.LogisticRegression(random_state=42, max_iter=1000),
                                   param_distributions = param_distributions, 
                                   cv = 5, 
                                   n_iter = 10)

random_search.fit(X_train, y_train)

In [120]:
y_test_pred = random_search.predict(X_test)
print('f1_score на тестовом наборе: {:.3f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(random_search.best_params_))

f1_score на тестовом наборе: 0.794
Наилучшие значения гиперпараметров: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.23}


### 1.3 Hyperopt

In [140]:
penalty_list = ['l2', 'none']
solver_list = ['lbfgs', 'sag']
space = {'penalty': hp.choice(label='penalty', options=penalty_list),
         'solver': hp.choice(label='solver', options=solver_list),
         'C': hp.uniform('C', 0.01, 1)}
random_state = 42

def hyperopt (params, cv=5, X=X_train, y=y_train, random_state=random_state):
    params = {'penalty': params['penalty'], 
              'solver': params['solver'], 
              'C': float(params['C'])}
    model = linear_model.LogisticRegression(**params, random_state=random_state, max_iter=1000)
    score = cross_val_score(model, X, y, cv=cv, scoring="f1").mean()
    return -score

In [None]:
trials = Trials()
best = fmin(hyperopt, space=space, max_evals=10, trials=trials, rstate=np.random.default_rng(random_state))

In [142]:
print(f'Наилучшие значения гиперпараметров {best}')

Наилучшие значения гиперпараметров {'C': 0.08292441519601887, 'penalty': 0, 'solver': 0}


In [143]:
model = linear_model.LogisticRegression(
    max_iter = 1000,
    random_state = random_state, 
    penalty=penalty_list[best['penalty']],
    solver=solver_list[best['solver']],
    C=float(best['C']))

model.fit(X_train, y_train)

y_test_pred = model.predict(X_test)
print('f1_score на тестовом наборе: {:.3f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на тестовом наборе: 0.794


### 1.4 Optuna

In [154]:
random_state = 42

def optuna_lg(trial):
    penalty = trial.suggest_categorical('penalty', ['l2', 'none'])
    solver = trial.suggest_categorical('solver', ['lbfgs', 'sag'])
    C = trial.suggest_float('C', 0.01, 1)

    model = linear_model.LogisticRegression(penalty = penalty,
                                            solver = solver,
                                            C = C,
                                            random_state = random_state)
    model.fit(X_train, y_train)
    score = metrics.f1_score(y_train, model.predict(X_train))

    return score

In [None]:
study = optuna.create_study(study_name = 'LogisticRegression', direction = 'maximize')
study.optimize(optuna_lg, n_trials = 20)

In [156]:
print("Наилучшие значения гиперпараметров {}".format(study.best_params))

Наилучшие значения гиперпараметров {'penalty': 'none', 'solver': 'lbfgs', 'C': 0.2599248200510233}


In [None]:
model = linear_model.LogisticRegression(**study.best_params, random_state=random_state, max_iter = 1000)
model.fit(X_train, y_train)

In [158]:
y_test_pred = model.predict(X_test)
print('f1_score на тестовом наборе: {:.3f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на тестовом наборе: 0.726


## 2. RandomForest

In [159]:
rf = ensemble.RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

y_test_pred = rf.predict(X_test)
print('f1_score на тестовом наборе: {:.3f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на тестовом наборе: 0.828


### 2.1 GridSearchCV

In [162]:
param_grid = {'n_estimators': list(range(80, 200, 30)),
              'min_samples_leaf': [5],
              'max_depth': list(np.linspace(20, 40, 5, dtype=int))}
            
grid_search_forest = GridSearchCV(estimator=ensemble.RandomForestClassifier(random_state=42),
                                  param_grid = param_grid,
                                  cv = 5)

In [163]:
grid_search_forest.fit(X_train, y_train) 

In [164]:
y_test_pred = grid_search_forest.predict(X_test)
print('f1_score на тестовом наборе: {:.3f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search_forest.best_params_))

f1_score на тестовом наборе: 0.835
Наилучшие значения гиперпараметров: {'max_depth': 30, 'min_samples_leaf': 5, 'n_estimators': 140}


### 2.2 RandomizedSearchCV

In [165]:
param_distributions = {'n_estimators': list(range(80, 200, 30)),
                       'min_samples_leaf': [5],
                       'max_depth': list(np.linspace(20, 40, 10, dtype=int))}
            
random_search_forest = RandomizedSearchCV(estimator=ensemble.RandomForestClassifier(random_state=42),
                                          param_distributions = param_distributions,
                                          cv = 5,
                                          n_iter = 10)

In [166]:
random_search_forest.fit(X_train, y_train)

In [167]:
y_test_pred = random_search_forest.predict(X_test)
print('f1_score на тестовом наборе: {:.3f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(random_search_forest.best_params_))

f1_score на тестовом наборе: 0.839
Наилучшие значения гиперпараметров: {'n_estimators': 170, 'min_samples_leaf': 5, 'max_depth': 22}


### 2.3 Hyperopt

In [176]:
space={'n_estimators': hp.quniform('n_estimators', 100, 200, 1),
       'max_depth' : hp.quniform('max_depth', 10, 30, 1),
       'min_samples_leaf': hp.quniform('min_samples_leaf', 2, 10, 1)}
random_state = 42

def hyperopt_rf(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    # функция получает комбинацию гиперпараметров в "params"
    params = {'n_estimators': int(params['n_estimators']), 
              'max_depth': int(params['max_depth']), 
             'min_samples_leaf': int(params['min_samples_leaf'])}

    model = ensemble.RandomForestClassifier(**params, random_state=random_state)
    model.fit(X, y)
    score = cross_val_score(model, X, y, cv=cv, scoring="f1").mean()
    return -score

In [None]:
trials = Trials()
best = fmin(hyperopt_rf, space=space, max_evals=10, trials=trials, rstate=np.random.default_rng(random_state))

In [178]:
print(f'Наилучшие значения гиперпараметров {best}')

Наилучшие значения гиперпараметров {'max_depth': 16.0, 'min_samples_leaf': 3.0, 'n_estimators': 150.0}


In [179]:
model = ensemble.RandomForestClassifier(
    random_state=random_state, 
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    min_samples_leaf=int(best['min_samples_leaf']))

model.fit(X_train, y_train)

y_test_pred = model.predict(X_test)
print('f1_score на тестовом наборе: {:.3f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на тестовом наборе: 0.831


### 2.4 Optuna

In [168]:
random_state = 42

def optuna_rf(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 200, 1)
    max_depth = trial.suggest_int('max_depth', 10, 30, 1)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 10, 1)

    model = ensemble.RandomForestClassifier(n_estimators = n_estimators,
                                            max_depth = max_depth,
                                            min_samples_leaf = min_samples_leaf,
                                            random_state = random_state)
    
    model.fit(X_train, y_train)
    score = metrics.f1_score(y_train, model.predict(X_train))
    
    return score

In [None]:
study = optuna.create_study(study_name="RandomForestClassifier", direction="maximize")
study.optimize(optuna_rf, n_trials=20)

In [180]:
print("Наилучшие значения гиперпараметров {}".format(study.best_params))

Наилучшие значения гиперпараметров {'n_estimators': 102, 'max_depth': 29, 'min_samples_leaf': 2}


In [171]:
model = ensemble.RandomForestClassifier(**study.best_params, random_state=random_state)
model.fit(X_train, y_train)

In [172]:
y_test_pred = model.predict(X_test)
print('f1_score на тестовом наборе: {:.3f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на тестовом наборе: 0.825


## Выводы

1. Для модели LogisticRegression лучшие показатели f1=0.794 у RandomizedSearchCV и Hyperopt. Стоит отметить, что гиперпараметры penalty и solver у них совпадают, то C отличается (0.08 и 0.23, соответственно)

2. Для модели RandomForest лучший показатель у RandomizedSearchCV - f1=0.839.