In [1]:
import mlflow
import os
import pandas as pd

from mlflow.tracking import MlflowClient
from mlflow.models import infer_signature
from mlflow.store.artifact.artifact_repository_registry import get_artifact_repository

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Display the MLFLOW_TRACKING_URI environment variable, or "No env" when it is not set.
os.environ.get("MLFLOW_TRACKING_URI", "No env")

'http://mlflow-service:5000'

In [3]:
# Display the model-registry URI that MLflow currently resolves.
mlflow.get_registry_uri()

'http://mlflow-service:5000'

# Настройки эксперимента

In [4]:
experiment_name = "denis_chuzhmarov"

# Reuse the experiment with this name when it already exists;
# otherwise create it and keep the id returned by create_experiment.
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = (
    mlflow.create_experiment(experiment_name)
    if experiment is None
    else experiment.experiment_id
)

# Make it the active experiment; the returned Experiment object is the cell output.
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='s3://mlops-hw/mlflow/516513222953534560', creation_time=1737633585696, experiment_id='516513222953534560', last_update_time=1737633585696, lifecycle_stage='active', name='denis_chuzhmarov', tags={}>

In [5]:
# List the experiments MLflow returns with the default search arguments.
mlflow.search_experiments()

[<Experiment: artifact_location='s3://mlops-hw/mlflow/959716346601234858', creation_time=1737674264999, experiment_id='959716346601234858', last_update_time=1737674264999, lifecycle_stage='active', name='DenisChuzhmarov', tags={}>,
 <Experiment: artifact_location='s3://mlops-hw/mlflow/516513222953534560', creation_time=1737633585696, experiment_id='516513222953534560', last_update_time=1737633585696, lifecycle_stage='active', name='denis_chuzhmarov', tags={}>,
 <Experiment: artifact_location='s3://mlops-hw/mlflow/0', creation_time=1737633496051, experiment_id='0', last_update_time=1737633496051, lifecycle_stage='active', name='Default', tags={}>]

# Подготовка данных

In [6]:
# Load the California housing dataset; as_frame=True requests pandas objects
# (the next cell uses housing.data and housing.target as a DataFrame / Series).
housing = fetch_california_housing(as_frame=True)

In [7]:
# Combine features and target into one frame so both can be selected by column name.
data = pd.concat([housing.data, housing.target.rename("MedHouseVal")], axis=1)

FEATURES = [
    "MedInc", "HouseAge", "AveRooms", "AveBedrms",
    "Population", "AveOccup", "Latitude", "Longitude"
]

TARGET = "MedHouseVal"

# Single seed shared by both splits so Restart & Run All reproduces the same partitions.
RANDOM_STATE = 42

X = data[FEATURES]
y = data[TARGET]

# First split: 80% train, 20% hold-out.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Second split: halve the hold-out into validation and test parts.
# Previously this call had no random_state, so the validation/test membership
# (and every metric computed on them) changed on each execution.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=RANDOM_STATE
)

# No feature scaling is applied in this cell; the splits hold raw feature values.

In [8]:
# Preview the first rows of the test feature frame.
X_test.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
8875,13.8556,52.0,8.948148,1.077778,810.0,3.0,34.05,-118.49
9975,3.0938,34.0,5.424569,1.040948,1324.0,2.853448,38.58,-122.45
17493,9.7821,28.0,8.212871,1.091584,1604.0,3.970297,34.43,-119.77
4602,1.2012,12.0,1.465753,0.89863,1194.0,3.271233,34.05,-118.27
18010,5.9658,17.0,5.873077,1.026923,775.0,2.980769,37.27,-121.99


# Запуск эксперимента

In [9]:
# Candidate regressors keyed by the name used for the child run and the registered model.
# The tree-based estimators are seeded so repeated executions yield the same fitted
# models and the same logged metrics (they were previously unseeded).
models = {
    "RandomForest": RandomForestRegressor(random_state=42),
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
}

# One parent run groups a nested child run per model.
with mlflow.start_run(run_name="@DenisChuzhmarov") as parent_run:
    parent_run_id = parent_run.info.run_id
    for model_name, model in models.items():
        with mlflow.start_run(run_name=model_name, nested=True) as child_run:
            model.fit(X_train, y_train)
            prediction = model.predict(X_val)

            # Evaluation frame: validation features plus the true target and the prediction.
            # y_val shares X_val's index, so the column assignment aligns row by row.
            eval_df = X_val.copy()
            eval_df["target"] = y_val
            eval_df["prediction"] = prediction

            signature = infer_signature(X_train, prediction)
            try:
                model_info = mlflow.sklearn.log_model(
                    model,
                    model_name,
                    signature=signature,
                    registered_model_name=f"sk-learn-{model_name}-reg-model",
                )
            except Exception as e:
                # Deliberate best-effort: a failed artifact upload must not abort the run,
                # so the error is reported and evaluation still proceeds.
                print(f"Я пытался заставить mlflow сохранять артефакты на s3 в течение долго времени, но у меня не получилось :(. Ошибка: {e}")

            # Compute and log the default regression metrics from the static
            # target/prediction columns of eval_df.
            mlflow.evaluate(
                data=eval_df,
                targets="target",
                predictions="prediction",
                model_type="regressor",
                evaluators=["default"]
            )

2025/01/24 13:17:56 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2025/01/24 13:17:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForest at: http://mlflow-service:5000/#/experiments/516513222953534560/runs/3752f7fed4484679ab9ab653d0fd0171.
2025/01/24 13:17:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow-service:5000/#/experiments/516513222953534560.


Я пытался заставить mlflow сохранять артефакты на s3 в течение долго времени, но у меня не получилось :(. Ошибка: Failed to upload /tmp/tmp3vfxtcvc/model/conda.yaml to mlops-hw/mlflow/516513222953534560/3752f7fed4484679ab9ab653d0fd0171/artifacts/RandomForest/conda.yaml: An error occurred (SignatureDoesNotMatch) when calling the PutObject operation: The request signature we calculated does not match the signature you provided. Check your key and signing method.


2025/01/24 13:17:57 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2025/01/24 13:17:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run LinearRegression at: http://mlflow-service:5000/#/experiments/516513222953534560/runs/e98468d6d2f841279826d37821ce3887.
2025/01/24 13:17:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow-service:5000/#/experiments/516513222953534560.


Я пытался заставить mlflow сохранять артефакты на s3 в течение долго времени, но у меня не получилось :(. Ошибка: Failed to upload /tmp/tmpve149pb2/model/conda.yaml to mlops-hw/mlflow/516513222953534560/e98468d6d2f841279826d37821ce3887/artifacts/LinearRegression/conda.yaml: An error occurred (SignatureDoesNotMatch) when calling the PutObject operation: The request signature we calculated does not match the signature you provided. Check your key and signing method.


2025/01/24 13:17:58 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2025/01/24 13:17:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run DecisionTree at: http://mlflow-service:5000/#/experiments/516513222953534560/runs/aa323b08e51e43f39cc32549ebedb83f.
2025/01/24 13:17:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow-service:5000/#/experiments/516513222953534560.
2025/01/24 13:17:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run @DenisChuzhmarov at: http://mlflow-service:5000/#/experiments/516513222953534560/runs/686b92520d044fc3b22f4d2513c76ca4.
2025/01/24 13:17:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow-service:5000/#/experiments/516513222953534560.


Я пытался заставить mlflow сохранять артефакты на s3 в течение долго времени, но у меня не получилось :(. Ошибка: Failed to upload /tmp/tmp66msnhe1/model/conda.yaml to mlops-hw/mlflow/516513222953534560/aa323b08e51e43f39cc32549ebedb83f/artifacts/DecisionTree/conda.yaml: An error occurred (SignatureDoesNotMatch) when calling the PutObject operation: The request signature we calculated does not match the signature you provided. Check your key and signing method.
