In [1]:
import boto3
import pandas as pd
import mlflow
import mlflow.sklearn
import os
import mlflow.sklearn

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from hyperopt import fmin, tpe, hp
import mlflow

In [2]:
# Инициализация клиента
print('Инициализация клиента...')
s3 = boto3.client('s3',
                  endpoint_url='http://localhost:9000',
                  aws_access_key_id='minio',
                  aws_secret_access_key='minio123')

Инициализация клиента...


In [3]:
# Считывание данных
print('Считывание данных...')
obj = s3.get_object(Bucket='datasets', Key='kinopoisk_train.csv')
data = obj['Body'].read().decode('utf-8')
df = pd.read_csv(StringIO(data))

Считывание данных...


In [4]:
# Установка переменных окружения в Windows
os.environ['MLFLOW_TRACKING_URI'] = 'http://localhost:5000'
os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://localhost:9000'
os.environ['AWS_ACCESS_KEY_ID'] = 'minio'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2)

# Определение пространства поиска
space = {
    'preprocessing': hp.choice('preprocessing', [None, 'remove_stopwords']),
    'vectorizer': hp.choice('vectorizer', ['CountVectorizer', 'TfidfVectorizer']),
    'model': hp.choice('model', ['LogisticRegression', 'RandomForest']),
    'C': hp.uniform('C', 0.1, 10.0)
}

In [6]:
def objective(params):
    # Извлечение параметров из пространства поиска
    preprocessing = params['preprocessing']
    vectorizer_type = params['vectorizer']
    model_type = params['model']
    C = params['C']

   # Применение выбранных методов предобработки данных, векторизации и модели классификации
    if preprocessing == 'remove_stopwords':
        pass

    if vectorizer_type == 'CountVectorizer':
        vectorizer = CountVectorizer()
    elif vectorizer_type == 'TfidfVectorizer':
        vectorizer = TfidfVectorizer()

    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    if model_type == 'LogisticRegression':
        clf = LogisticRegression(C=C)
    elif model_type == 'RandomForest':
        clf = RandomForestClassifier(n_estimators=100, random_state=42)

    # Обучение модели и вычисление метрик
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    print('Точность модели:', accuracy)

    # Логирование в MLflow
    with mlflow.start_run() as run:
        mlflow.log_params(params)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(clf, "model", registered_model_name="OptimizedModel")

    # Возвращение целевой метрики, которую Hyperopt будет оптимизировать
    return -accuracy

# Оптимизация гиперпараметров с использованием Hyperopt
best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=25)

Точность модели:                                                                                                       
0.7099468488990129                                                                                                     
  0%|                                                                           | 0/25 [01:11<?, ?trial/s, best loss=?]


Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:12:35 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 44



  4%|█▉                                              | 1/25 [01:20<32:03, 80.17s/trial, best loss: -0.7099468488990129]

Created version '44' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7757529739306505                                                                                                     
  4%|█▉                                              | 1/25 [01:45<32:03, 80.17s/trial, best loss: -0.7099468488990129]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:13:04 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 45



  8%|███▊                                            | 2/25 [01:49<19:15, 50.24s/trial, best loss: -0.7757529739306505]

Created version '45' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7099468488990129                                                                                                     
  8%|███▊                                            | 2/25 [03:01<19:15, 50.24s/trial, best loss: -0.7757529739306505]

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:14:23 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 46



 12%|█████▊                                          | 3/25 [03:08<23:13, 63.35s/trial, best loss: -0.7757529739306505]

Created version '46' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7557580359402682                                                                                                     
 12%|█████▊                                          | 3/25 [03:33<23:13, 63.35s/trial, best loss: -0.7757529739306505]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:14:54 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 47



 16%|███████▋                                        | 4/25 [03:39<17:39, 50.44s/trial, best loss: -0.7757529739306505]

Created version '47' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7552518349784865                                                                                                     
 16%|███████▋                                        | 4/25 [04:03<17:39, 50.44s/trial, best loss: -0.7757529739306505]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:15:23 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 48



 20%|█████████▌                                      | 5/25 [04:07<14:13, 42.69s/trial, best loss: -0.7757529739306505]

Created version '48' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7099468488990129                                                                                                     
 20%|█████████▌                                      | 5/25 [05:15<14:13, 42.69s/trial, best loss: -0.7757529739306505]

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:16:36 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 49



 24%|███████████▌                                    | 6/25 [05:21<16:48, 53.07s/trial, best loss: -0.7757529739306505]

Created version '49' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7096937484181219                                                                                                     
 24%|███████████▌                                    | 6/25 [06:35<16:48, 53.07s/trial, best loss: -0.7757529739306505]

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:17:56 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 50



 28%|█████████████▍                                  | 7/25 [06:41<18:33, 61.86s/trial, best loss: -0.7757529739306505]

Created version '50' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7570235383447228                                                                                                     
 28%|█████████████▍                                  | 7/25 [07:05<18:33, 61.86s/trial, best loss: -0.7757529739306505]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:18:25 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 51



 32%|███████████████▎                                | 8/25 [07:10<14:34, 51.45s/trial, best loss: -0.7757529739306505]

Created version '51' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7410782080485953                                                                                                     
 32%|███████████████▎                                | 8/25 [07:32<14:34, 51.45s/trial, best loss: -0.7757529739306505]

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:18:52 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 52



 36%|█████████████████▎                              | 9/25 [07:36<11:38, 43.67s/trial, best loss: -0.7757529739306505]

Created version '52' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7223487724626677                                                                                                     
 36%|█████████████████▎                              | 9/25 [07:56<11:38, 43.67s/trial, best loss: -0.7757529739306505]

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:19:15 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 53



 40%|██████████████████▊                            | 10/25 [08:00<09:22, 37.50s/trial, best loss: -0.7757529739306505]

Created version '53' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7096937484181219                                                                                                     
 40%|██████████████████▊                            | 10/25 [09:13<09:22, 37.50s/trial, best loss: -0.7757529739306505]

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:20:36 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 54



 44%|████████████████████▋                          | 11/25 [09:21<11:50, 50.73s/trial, best loss: -0.7757529739306505]

Created version '54' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7544925335358137                                                                                                     
 44%|████████████████████▋                          | 11/25 [09:46<11:50, 50.73s/trial, best loss: -0.7757529739306505]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:21:06 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 55



 48%|██████████████████████▌                        | 12/25 [09:51<09:37, 44.45s/trial, best loss: -0.7757529739306505]

Created version '55' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7099468488990129                                                                                                     
 48%|██████████████████████▌                        | 12/25 [10:56<09:37, 44.45s/trial, best loss: -0.7757529739306505]

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:22:17 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 56



 52%|████████████████████████▍                      | 13/25 [11:01<10:27, 52.31s/trial, best loss: -0.7757529739306505]

Created version '56' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7099468488990129                                                                                                     
 52%|████████████████████████▍                      | 13/25 [12:07<10:27, 52.31s/trial, best loss: -0.7757529739306505]

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:23:28 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 57



 56%|██████████████████████████▎                    | 14/25 [12:13<10:39, 58.13s/trial, best loss: -0.7757529739306505]

Created version '57' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7096937484181219                                                                                                     
 56%|██████████████████████████▎                    | 14/25 [13:23<10:39, 58.13s/trial, best loss: -0.7757529739306505]

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:24:44 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 58



 60%|████████████████████████████▏                  | 15/25 [13:28<10:34, 63.40s/trial, best loss: -0.7757529739306505]

Created version '58' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7567704378638319                                                                                                     
 60%|████████████████████████████▏                  | 15/25 [13:53<10:34, 63.40s/trial, best loss: -0.7757529739306505]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:25:12 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 59



 64%|██████████████████████████████                 | 16/25 [13:57<07:55, 52.79s/trial, best loss: -0.7757529739306505]

Created version '59' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7760060744115413                                                                                                     
 64%|██████████████████████████████                 | 16/25 [14:19<07:55, 52.79s/trial, best loss: -0.7757529739306505]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:25:39 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 60



 68%|███████████████████████████████▉               | 17/25 [14:23<05:59, 44.89s/trial, best loss: -0.7760060744115413]

Created version '60' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7096937484181219                                                                                                     
 68%|███████████████████████████████▉               | 17/25 [15:35<05:59, 44.89s/trial, best loss: -0.7760060744115413]

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:26:56 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 61



 72%|█████████████████████████████████▊             | 18/25 [15:41<06:22, 54.66s/trial, best loss: -0.7760060744115413]

Created version '61' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7777777777777778                                                                                                     
 72%|█████████████████████████████████▊             | 18/25 [16:05<06:22, 54.66s/trial, best loss: -0.7760060744115413]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:27:24 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 62



 76%|███████████████████████████████████▋           | 19/25 [16:08<04:39, 46.62s/trial, best loss: -0.7777777777777778]

Created version '62' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7096937484181219                                                                                                     
 76%|███████████████████████████████████▋           | 19/25 [17:24<04:39, 46.62s/trial, best loss: -0.7777777777777778]

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:28:46 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 63



 80%|█████████████████████████████████████▌         | 20/25 [17:31<04:46, 57.35s/trial, best loss: -0.7777777777777778]

Created version '63' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7765122753733232                                                                                                     
 80%|█████████████████████████████████████▌         | 20/25 [17:53<04:46, 57.35s/trial, best loss: -0.7777777777777778]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:29:13 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 64



 84%|███████████████████████████████████████▍       | 21/25 [17:58<03:12, 48.22s/trial, best loss: -0.7777777777777778]

Created version '64' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7762591748924323                                                                                                     
 84%|███████████████████████████████████████▍       | 21/25 [18:20<03:12, 48.22s/trial, best loss: -0.7777777777777778]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:29:40 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 65



 88%|█████████████████████████████████████████▎     | 22/25 [18:24<02:05, 41.75s/trial, best loss: -0.7777777777777778]

Created version '65' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.777018476335105                                                                                                      
 88%|█████████████████████████████████████████▎     | 22/25 [18:48<02:05, 41.75s/trial, best loss: -0.7777777777777778]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:30:08 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 66



 92%|███████████████████████████████████████████▏   | 23/25 [18:52<01:15, 37.56s/trial, best loss: -0.7777777777777778]

Created version '66' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.777018476335105                                                                                                      
 92%|███████████████████████████████████████████▏   | 23/25 [19:15<01:15, 37.56s/trial, best loss: -0.7777777777777778]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:30:35 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 67



 96%|█████████████████████████████████████████████  | 24/25 [19:20<00:34, 34.53s/trial, best loss: -0.7777777777777778]

Created version '67' of model 'OptimizedModel'.


Точность модели:                                                                                                       
0.7772715768159959                                                                                                     
 96%|█████████████████████████████████████████████  | 24/25 [19:43<00:34, 34.53s/trial, best loss: -0.7777777777777778]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Registered model 'OptimizedModel' already exists. Creating a new version of this model...
2024/01/21 14:31:03 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: OptimizedModel, version 68



100%|███████████████████████████████████████████████| 25/25 [19:48<00:00, 47.53s/trial, best loss: -0.7777777777777778]


Created version '68' of model 'OptimizedModel'.
