### Ноутбук для отладки пайплайна

In [34]:
import os
import numpy as np
import pandas as pd

# ML / обучение
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# MLOps
import mlflow
import mlflow.sklearn
import joblib

# проверка данных
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import data_integrity

# анализ дрейфа
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

# API
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn

# Shared seed: used below for NumPy's global RNG, the train/test split and
# the RandomForest hyperparameters.
RANDOM_STATE = 42

In [35]:
# Load the original Iris dataset as a DataFrame and expose the target
# column under the name 'label'.

iris = load_iris(as_frame=True)
df = iris.frame.rename(columns={"target": "label"})

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [36]:
# Data-quality check of the original Iris dataset with Deepchecks.
# The full df is used; features and label are passed separately.

# Prepare the report location first.
os.makedirs("reports", exist_ok=True)
dc_report_path = os.path.join("reports", "deepchecks_report.html")

train_ds = Dataset(
    df.drop(columns=["label"]),
    label=df["label"],
    cat_features=[],  # every Iris feature is numeric
)

# Run the data-integrity suite and persist its result as HTML.
suite = data_integrity()
result = suite.run(train_ds)
result.save_as_html(dc_report_path)

print(
    "Data Integrity Suite:",
    f"Deepchecks report saved to {dc_report_path}",
    sep="\n",
)

Data Integrity Suite:
Deepchecks report saved to reports/deepchecks_report.html


In [37]:
def evidently_analysis(df, reference_frac=0.7, random_state=None, output_dir="reports"):
    """Build and save an Evidently data-drift report for ``df``.

    The frame is split at random into a "reference" sample and the remaining
    "current" rows. Both parts are passed to a ``Report`` configured with
    ``DataDriftPreset`` and the result is written to
    ``<output_dir>/evidently_report.html``.

    Args:
        df: pandas DataFrame to analyse. The "current" part is obtained by
            dropping the sampled index labels, so duplicate index labels
            would remove more rows than were sampled.
        reference_frac: fraction of rows that goes into the reference part
            (default 0.7, i.e. a 70/30 split).
        random_state: seed for the split. ``None`` (default) means the
            notebook-wide ``RANDOM_STATE`` constant.
        output_dir: directory the HTML report is written to; created if it
            does not exist.

    Returns:
        The executed ``Report`` object.

    Raises:
        ValueError: if the split leaves the reference or the current part
            without any rows.
    """
    # Use the shared notebook seed instead of a separate hard-coded literal.
    seed = RANDOM_STATE if random_state is None else random_state

    # Split the data into reference and current parts.
    reference_data = df.sample(frac=reference_frac, random_state=seed)
    current_data = df.drop(reference_data.index)

    if reference_data.empty or current_data.empty:
        raise ValueError(
            "Drift analysis needs non-empty reference and current data: "
            f"got {len(reference_data)} reference and {len(current_data)} current rows."
        )

    # Build the report with the data-drift preset.
    report = Report(metrics=[DataDriftPreset()])
    report.run(
        reference_data=reference_data,
        current_data=current_data,
    )

    # Save the HTML report.
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "evidently_report.html")
    report.save_html(output_path)

    print(f"EvidentlyAI отчёт сохранён в {output_path}")
    return report


# Run the drift analysis on the full dataset and keep the resulting report.
evidently_report = evidently_analysis(df)

EvidentlyAI отчёт сохранён в reports/evidently_report.html


In [38]:
def _configure_mlflow_tracking():
    """Point MLflow at ``MLFLOW_TRACKING_URI`` or at a local ``./mlruns`` store.

    Raises:
        RuntimeError: if ``./mlruns`` exists but is not a directory.
    """
    tracking_uri_env = os.getenv("MLFLOW_TRACKING_URI")
    if tracking_uri_env:
        mlflow.set_tracking_uri(tracking_uri_env)
        return

    # Function-scope import: pathlib is only needed to build the file URI.
    from pathlib import Path

    mlruns_root = os.path.abspath("./mlruns")

    # A non-directory at this path is a configuration error.
    if os.path.exists(mlruns_root) and not os.path.isdir(mlruns_root):
        raise RuntimeError(
            f"Путь {mlruns_root} существует и не является директорией. "
            f"Удалите/переименуйте этот файл."
        )

    os.makedirs(mlruns_root, exist_ok=True)
    # NOTE(review): ".trash" is pre-created, presumably because the local
    # file store expects it to exist - confirm against the MLflow version in use.
    os.makedirs(os.path.join(mlruns_root, ".trash"), exist_ok=True)

    # Path.as_uri() yields a well-formed file:// URI on every platform,
    # unlike manual "file:" + path concatenation (breaks on Windows drive paths).
    mlflow.set_tracking_uri(Path(mlruns_root).as_uri())


def mlflow_experiment(df):
    """Train a RandomForest baseline on ``df`` and record the run in MLflow.

    The frame is split 80/20 (stratified by ``label``), a
    ``RandomForestClassifier`` is fitted on the training part and evaluated
    on the test part. Hyperparameters, accuracy and macro-F1 are logged to
    the ``iris_hw5`` experiment together with the model; the model is also
    dumped to ``artifacts/model.pkl`` and logged as an artifact.

    Args:
        df: pandas DataFrame with feature columns and a ``label`` column.

    Returns:
        Tuple ``(model, acc, f1)``: the fitted classifier, test accuracy and
        test macro-averaged F1 score.

    Raises:
        RuntimeError: if no ``MLFLOW_TRACKING_URI`` is set and ``./mlruns``
            exists but is not a directory.
    """
    np.random.seed(RANDOM_STATE)

    # Prepare features / target.
    X = df.drop(columns=["label"])
    y = df["label"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=y,
    )

    # Configure the tracking store (validates the local directory if used).
    _configure_mlflow_tracking()
    mlflow.set_experiment("iris_hw5")

    with mlflow.start_run(run_name="rf_iris_baseline") as run:
        params = {
            "n_estimators": 100,
            "max_depth": 10,
            "random_state": RANDOM_STATE,
        }

        model = RandomForestClassifier(**params)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="macro")

        # Log hyperparameters and metrics.
        mlflow.log_params(params)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_macro", f1)

        # Input example shown in the MLflow UI.
        input_example = X_test.iloc[:5]

        # Log the model itself to MLflow.
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="model",
            input_example=input_example,
        )

        # Local copy of the model, also attached to the run as an artifact.
        os.makedirs("artifacts", exist_ok=True)
        model_path = "artifacts/model.pkl"
        joblib.dump(model, model_path)
        mlflow.log_artifact(model_path)

        print(f"Accuracy: {acc:.4f}, F1_macro: {f1:.4f}")
        # Use the run object from the context manager rather than global state.
        print("Run id:", run.info.run_id)

    return model, acc, f1


# Run the experiment; keep the fitted model and its test metrics.
model, acc, f1 = mlflow_experiment(df)


Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Accuracy: 0.9000, F1_macro: 0.8997
Run id: 1f0e55a924114711aa0b2b2b34cdff42


Всё работает!