# Mimick running mlflow server 
* experiemnt data in PostGres
* artifacts on S3 bucket (mimicked by Minio)

1. install necessary librairies

In [None]:
%%bash
pip install mlflow psycopg2 boto3

2. setup env variable for mlflow to use S3 mimick build with docker

* Key Differences: FakeS3 / AWS S3

| Component       | MinIO (Local)                                | Real S3 (AWS)                  |
|-----------------|----------------------------------------------|--------------------------------|
| **Endpoint**    | `MLFLOW_S3_ENDPOINT_URL=http://localhost:9000` | Not needed (uses AWS default)  |
| **Access Key**  | `AWS_ACCESS_KEY_ID=minio_user`               | `AWS_ACCESS_KEY_ID=your_aws_key` |
| **Secret Key**  | `AWS_SECRET_ACCESS_KEY=minio_password`       | `AWS_SECRET_ACCESS_KEY=your_aws_secret` |
| **Region**      | Any (MinIO ignores it)                       | Must be your actual AWS region |
| **Buckets**     | `s3://s3mimick` (created by `mc` command)    | `s3://your-existing-bucket`    |
| **Docker Services** | PostgreSQL + MinIO + `mc`                | PostgreSQL only                |


In [3]:
%%bash
# necessary because bash doesnt necessarely knows where is .env for mlflow to connect to AWS
export AWS_ACCESS_KEY_ID=minio_user
export AWS_SECRET_ACCESS_KEY=minio_password
export AWS_DEFAULT_REGION=us-east-1
# only necessary if using minio , AWS set it up automatically
export AWS_ENDPOINT_URL=http://localhost:9000


In [1]:
%%bash 

echo ${AWS_ACCESS_KEY_ID}
echo ${AWS_SECRET_ACCESS_KEY}
echo ${AWS_DEFAULT_REGION}
echo ${AWS_ENDPOINT_URL}

minio_user
minio_password
us-east-1
http://localhost:9000


3. set-up remote data stores (conf. in .yaml ; postgres for metatdata & minio for artifacts)



In [1]:
%%bash 
# dockerfile i sable to use the .env file
docker compose -f compose_mimick_S3.yaml --env-file .env up -d

 Network end-to-end-ml-pipeline_default  Creating
 Network end-to-end-ml-pipeline_default  Created
 Container end-to-end-ml-pipeline-minio-1  Creating
 Container end-to-end-ml-pipeline-postgres-1  Creating
 Container end-to-end-ml-pipeline-minio-1  Created
 Container end-to-end-ml-pipeline-minio-create-s3_mimick-1  Creating
 Container end-to-end-ml-pipeline-postgres-1  Created
 Container end-to-end-ml-pipeline-minio-create-s3_mimick-1  Created
 Container end-to-end-ml-pipeline-postgres-1  Starting
 Container end-to-end-ml-pipeline-minio-1  Starting
 Container end-to-end-ml-pipeline-minio-1  Started
 Container end-to-end-ml-pipeline-minio-1  Waiting
 Container end-to-end-ml-pipeline-postgres-1  Started
 Container end-to-end-ml-pipeline-minio-1  Healthy
 Container end-to-end-ml-pipeline-minio-create-s3_mimick-1  Starting
 Container end-to-end-ml-pipeline-minio-create-s3_mimick-1  Started


4. start the mlflow server BUCKET location as artifact destination


In [3]:
%%bash
echo ${MIMICK_S3_BUCKET}
echo ${MIMICK_POSTGRES_USER}
echo ${MIMICK_POSTGRES_PASSWORD}
echo ${MIMICK_POSTGRES_DATABASE}

s3://s3mimick
postgres_user
postgres_password
mlflowdb


In [2]:
%%bash 

# release the port for mlflow (error of port used not seen from notebook)
lsof -ti:5005 | xargs kill -9

# start mlflow server
mlflow server \
    --backend-store-uri postgresql://${MIMICK_POSTGRES_USER}:${MIMICK_POSTGRES_PASSWORD}@localhost:5432/${MIMICK_POSTGRES_DATABASE} \
        --artifacts-destination ${MIMICK_S3_BUCKET} \
            --host 0.0.0.0 --port 5005  \
                --gunicorn-opts "--daemon"  
# gunicorn needed to run following cells in a notebook              

2025/09/28 15:13:29 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/09/28 15:13:29 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl PostgresqlImpl.
INFO  [alembic.runtime.migration] Will assume transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

In [3]:
# check mlflow is up and running
import requests
requests.get("http://localhost:5005/health")

<Response [200]>

5. log to mlflow server

In [4]:

# import os
# os.environ['AWS_ACCESS_KEY_ID'] = "minio_user"
# os.environ['AWS_SECRET_ACCESS_KEY'] = "minio_password" 
# os.environ['AWS_DEFAULT_REGION'] = "us-east-1"
# os.environ['AWS_ENDPOINT_URL'] = "http://localhost:9000"


import mlflow
mlflow.set_tracking_uri("http://localhost:5005")
# real use case would point to the actual mlflow running server


In [5]:
%%bash

echo ${MLFLOW_TRACKING_URI}

http://localhost:5005


6. send logs to postgres / artifacts to s3://minio

In [6]:

mlflow.set_experiment("test loads on postgres & s3 bucket")

with mlflow.start_run():
     mlflow.log_params({
            "search_space_C": f"loguniform({1e-5}, {100})",
            "search_space_l1_ratio": f"uniform(0, 1)",
            "search_space_max_iter": f"arange(100, 1000, 100)"
        })
     mlflow.log_artifact("./requirements.txt")

2025/09/28 15:13:51 INFO mlflow.tracking.fluent: Experiment with name 'test loads on postgres & s3 bucket' does not exist. Creating a new experiment.


🏃 View run omniscient-carp-417 at: http://localhost:5005/#/experiments/1/runs/ad8270a6e77142a7a2f5d22eb463bf0e
🧪 View experiment at: http://localhost:5005/#/experiments/1


In [4]:
import numpy as np
import mlflow
from scipy.stats import loguniform, uniform
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# Load files
X_train = np.loadtxt("src/data/X_train.csv", delimiter=",")
X_test = np.loadtxt("src/data/X_test.csv", delimiter=",")
y_train = np.loadtxt("src/data/y_train.csv", delimiter=",")
y_test = np.loadtxt("src/data/y_test.csv", delimiter=",")


mlflow.set_experiment("load model on cloud #2")

with mlflow.start_run():
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(solver='saga', penalty='elasticnet'))
            ])
    
    param_distributions = {
            'classifier__C': loguniform(1e-5, 100),
            'classifier__l1_ratio': uniform(0, 1),
            'classifier__max_iter': np.arange(100, 1000, 100)
        }
    mlflow.log_params({
            "search_space_C": f"loguniform({1e-5}, {100})",
            "search_space_l1_ratio": f"uniform(0, 1)",
            "search_space_max_iter": f"arange(100, 1000, 100)"
        })
    
    n_iter = 2
    print(f"Running RandomizedSearchCV with n_iter={n_iter}...")
    random_search = RandomizedSearchCV( estimator=pipeline,
                                        param_distributions=param_distributions,
                                        n_iter=n_iter,
                                        cv=8,  # 8-fold cross-validation
                                        scoring='roc_auc',  # Use ROC AUC score for evaluation
                                        random_state=42,
                                        n_jobs=-1,  # Use all available CPU cores
                                        )

    random_search.fit(X_train, y_train)

    # --- 4. Log Best Results to MLflow ---
    # Get the best parameters and score from the search
    best_params = random_search.best_params_
    best_score = random_search.best_score_
    best_estimator = random_search.best_estimator_

    # MLflow will log these as a single set of parameters for this run.
    print("Logging best parameters and cross-validation score...")
    mlflow.log_params(best_params)
    mlflow.log_metric("best_cv_roc_auc", best_score)

    # Log the best estimator's details
    print("Logging best estimator model...")
    mlflow.sklearn.log_model(
        sk_model=best_estimator,
        name="best_model",
        input_example=X_train,
        registered_model_name="Best-logreg-from-RandomSearch", # mandatory if wants to save model as .pkl
    )
        
    

    # --- 5. Evaluate the Best Model on the Test Set ---
    print("Evaluating the best model on the test set...")
    y_pred = best_estimator.predict(X_test)
    y_pred_proba = best_estimator.predict_proba(X_test)[:, 1]

    test_accuracy = accuracy_score(y_test, y_pred)
    test_roc_auc = roc_auc_score(y_test, y_pred_proba)

    # Log final metrics on the test set
    print("Logging final test metrics...")
    mlflow.log_metric("test_accuracy", test_accuracy)
    mlflow.log_metric("test_roc_auc", test_roc_auc)

2025/09/28 11:40:00 INFO mlflow.tracking.fluent: Experiment with name 'load model on cloud #2' does not exist. Creating a new experiment.


Running RandomizedSearchCV with n_iter=2...
Logging best parameters and cross-validation score...
Logging best estimator model...
🏃 View run funny-lynx-644 at: http://localhost:5008/#/experiments/1/runs/8a7809fccfae4e86a0f2ef1df58418ee
🧪 View experiment at: http://localhost:5008/#/experiments/1


KeyboardInterrupt: 

In [11]:
import os 

os.getenv("MLFLOW_REMOTE_TRACKING_URI","sqlite:///src/mlruns.db")

'sqlite:///src/mlflow.db'

In [7]:
import os

# Set the environment variables
os.environ['AWS_ACCESS_KEY_ID'] = os.getenv('MIMICK_S3_USER')
os.environ['AWS_SECRET_ACCESS_KEY'] = os.getenv('MIMICK_S3_PASSWORD')
os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'  # MinIO default
os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://172.19.0.2:9000"  # Typical MinIO endpoint


In [8]:
import boto3
try:
    s3 = boto3.client('s3')
    s3.head_bucket(Bucket='s3mimick')
    print("S3 connection successful")
except Exception as e:
    print(f"S3 connection failed: {e}")

S3 connection failed: An error occurred (403) when calling the HeadBucket operation: Forbidden


In [10]:
import os
print("Current environment variables:")
print(f"AWS_ACCESS_KEY_ID: {os.environ.get('AWS_ACCESS_KEY_ID', 'NOT SET')}")
print(f"AWS_SECRET_ACCESS_KEY: {os.environ.get('AWS_SECRET_ACCESS_KEY', 'NOT SET')}")
print(f"MLFLOW_S3_ENDPOINT_URL: {os.environ.get('MLFLOW_S3_ENDPOINT_URL', 'NOT SET')}")

# Check what your Docker environment variables are
print(f"MIMICK_S3_USER: {os.environ.get('MIMICK_S3_USER', 'NOT SET')}")
print(f"MIMICK_S3_PASSWORD: {os.environ.get('MIMICK_S3_PASSWORD', 'NOT SET')}")

Current environment variables:
AWS_ACCESS_KEY_ID: minio_user
AWS_SECRET_ACCESS_KEY: minio_password
MLFLOW_S3_ENDPOINT_URL: http://localhost:9000
MIMICK_S3_USER: minio_user
MIMICK_S3_PASSWORD: minio_password


# DEBUG

In [6]:
%%bash
# verify the container is up and running
echo "Checking Docker containers:"
docker compose -f compose_mimick_S3.yaml ps

echo -e "\nChecking container logs for bucket creation:"
docker compose -f compose_mimick_S3.yaml logs minio-create-s3_mimick

Checking Docker containers:
NAME                                IMAGE             COMMAND                  SERVICE    CREATED         STATUS                   PORTS
end-to-end-ml-pipeline-minio-1      minio/minio       "/usr/bin/docker-ent…"   minio      6 minutes ago   Up 6 minutes (healthy)   0.0.0.0:9000-9001->9000-9001/tcp, [::]:9000-9001->9000-9001/tcp
end-to-end-ml-pipeline-postgres-1   postgres:latest   "docker-entrypoint.s…"   postgres   6 minutes ago   Up 6 minutes             0.0.0.0:5432->5432/tcp, [::]:5432->5432/tcp

Checking container logs for bucket creation:
minio-create-s3_mimick-1  | Added `minio` successfully.
minio-create-s3_mimick-1  | mc: <ERROR> Unable to list folder. Bucket `s3mimick` does not exist.
minio-create-s3_mimick-1  | Bucket created successfully `minio/s3mimick`.
minio-create-s3_mimick-1  | Added `minio` successfully.
minio-create-s3_mimick-1  | s3mimick already exists


verify connection can be made to bucket

In [11]:
import os
import boto3
from botocore.client import Config



try:
    # Create S3 client with MinIO endpoint
    s3 = boto3.client(
        's3',
        endpoint_url=os.environ.get('MLFLOW_S3_ENDPOINT_URL'),
        aws_access_key_id=os.environ.get('MIMICK_S3_USER'),
        aws_secret_access_key=os.environ.get('MIMICK_S3_PASSWORD'),
        config=Config(signature_version='s3v4'),
        region_name='us-east-1'
    )
    
    # Test connection
    response = s3.list_buckets()
    print("S3 connection successful!")
    print("Available buckets:", [bucket['Name'] for bucket in response['Buckets']])
    
    # Test specific bucket
    s3.head_bucket(Bucket='s3mimick')
    print("Bucket 's3mimick' is accessible!")
    
except Exception as e:
    print(f"S3 connection failed: {e}")
    import traceback
    traceback.print_exc()

S3 connection successful!
Available buckets: ['s3mimick']
Bucket 's3mimick' is accessible!


✗ conda activate ML_Flow
✗ source .env
✗ docker compose -f compose_mimick_S3.yaml down
✗ mlflow server --backend-store-uri postgresql://${MIMICK_POSTGRES_USER}:${MIMICK_POSTGRES_PASSWORD}@localhost:5432/${MIMICK_POSTGRES_DATABASE} --artifacts-destination ${MIMICK_S3_BUCKET} --host 0.0.0.0 --port 5003
==> unable to locate credentials



✗ conda activate ML_Flow
✗ export AWS_ACCESS_KEY_ID=minio_user
✗ export AWS_SECRET_ACCESS_KEY=minio_password
✗ source .env
✗ docker compose -f compose_mimick_S3.yaml down
✗ mlflow server --backend-store-uri postgresql://${MIMICK_POSTGRES_USER}:${MIMICK_POSTGRES_PASSWORD}@localhost:5432/${MIMICK_POSTGRES_DATABASE} --artifacts-destination ${MIMICK_S3_BUCKET} --host 0.0.0.0 --port 5003
==> The AWS Access Key Id you provided does not exist in our records.


# IN VSCODE TERMINAL, the .env FILE CAN'T BE ACCSESSS BY MLFLOW
✗ conda activate ML_Flow
✗ docker compose -f compose_mimick_S3.yaml up -d
✗ source .env
✗ mlflow server --backend-store-uri postgresql://${MIMICK_POSTGRES_USER}:${MIMICK_POSTGRES_PASSWORD}@localhost:5432/${MIMICK_POSTGRES_DATABASE} --artifacts-destination ${MIMICK_S3_BUCKET} --host 0.0.0.0 --port 5003
==> no credential
# Necessecary to export manually
✗ export AWS_ENDPOINT_URL=http://localhost:9000
✗ export AWS_ACCESS_KEY_ID=minio_user
✗ export AWS_SECRET_ACCESS_KEY=minio_password
✗ mlflow server --backend-store-uri postgresql://${MIMICK_POSTGRES_USER}:${MIMICK_POSTGRES_PASSWORD}@localhost:5432/${MIMICK_POSTGRES_DATABASE} --artifacts-destination ${MIMICK_S3_BUCKET} --host 0.0.0.0 --port 5003
==> ok