In [None]:
%pip install --upgrade pip
%pip install tensorflow pandas scikit-learn mlflow
%pip install boto3

Collecting boto3
  Downloading boto3-1.38.0-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<1.39.0,>=1.38.0 (from boto3)
  Downloading botocore-1.38.0-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.13.0,>=0.12.0 (from boto3)
  Downloading s3transfer-0.12.0-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.38.0-py3-none-any.whl (139 kB)
Downloading botocore-1.38.0-py3-none-any.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Downloading s3transfer-0.12.0-py3-none-any.whl (84 kB)
Installing collected packages: jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.38.0 botocore-1.38.0 jmespath-1.0.1 s3transfer-0.12.0


In [2]:
# ──────────────── 1. Load MNIST Dataset ──────────────── #
def load_mnist_data() -> str:
    import tensorflow as tf
    import pandas as pd

    (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
    x_train = x_train.reshape((x_train.shape[0], -1))  # flatten 28x28 to 784
    df = pd.DataFrame(x_train)
    df["label"] = y_train

    print("MNIST data sample:\n", df.head())
    return df.to_csv(index=False)

In [3]:
# ──────────────── 2. EDA + Cleaning ──────────────── #
def eda_and_clean(csv_data: str) -> str:
    import pandas as pd
    from io import StringIO

    df = pd.read_csv(StringIO(csv_data))
    print("EDA loaded shape:", df.shape)

    df = df.dropna()
    return df.to_csv(index=False)

In [4]:
# ──────────────── 3. Train TF Model ──────────────── #
def train_model(csv_data: str) -> str:
    import pandas as pd
    import tensorflow as tf
    import json
    from io import StringIO
    from sklearn.model_selection import train_test_split

    MODEL_SAVE_PATH = "mnist_model.keras"

    df = pd.read_csv(StringIO(csv_data))
    print("Training data shape:", df.shape)

    X = df.drop(columns=["label"]).values
    y = df["label"].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation="relu", input_shape=(784,)),
        tf.keras.layers.Dense(10, activation="softmax")
    ])
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    model.fit(X_train, y_train, epochs=2, validation_split=0.1)
    _, accuracy = model.evaluate(X_test, y_test)

    model.save(MODEL_SAVE_PATH)

    return json.dumps({"accuracy": accuracy})

In [5]:
# ──────────────── 4. Evaluate & Log to MLflow ──────────────── #
def log_to_mlflow(metrics_json: str):
    import mlflow
    import json

    MLFLOW_TRACKING_URI = "http://mlflow.kubeflow-user-example-com.svc:5000"
    MLFLOW_EXPERIMENT_NAME = "mnist-tf-pipeline"

    metrics = json.loads(metrics_json)

    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

    with mlflow.start_run():
        mlflow.log_metric("accuracy", metrics["accuracy"])

In [None]:
# ──────────────── 5. Upload Model to MLflow ──────────────── #
def upload_model():
    import mlflow
    import tensorflow as tf 
    import numpy as np
    import os
    import boto3


    MLFLOW_TRACKING_URI = "http://mlflow.kubeflow-user-example-com.svc:5000"
    MODEL_SAVE_PATH = "mnist_model.keras"
    S3_ENDPOINT = "http://minio.kubeflow-user-example-com.svc:9000"
    MLFLOW_BUCKET_NAME = "mlflow"
    
    # 1) Credentials
    os.environ["AWS_ACCESS_KEY_ID"]     = "minioDev"
    os.environ["AWS_SECRET_ACCESS_KEY"] = "minioDevPass123"
    # 2) Tell MLflow/boto3 to use MinIO, not AWS
    os.environ["MLFLOW_S3_ENDPOINT_URL"] = S3_ENDPOINT
    os.environ["AWS_S3_VERIFY"]          = "false"


    # Create mlflow
    s3 = boto3.resource(
        's3',
        endpoint_url=S3_ENDPOINT,
        region_name='us-east-1',
        aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
        aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
        config=boto3.session.Config(signature_version='s3v4'),
        verify=False
    )
    

    if not s3.Bucket(MLFLOW_BUCKET_NAME) in s3.buckets.all():
        s3.create_bucket(Bucket=MLFLOW_BUCKET_NAME)
        print(MLFLOW_BUCKET_NAME + " bucket created!")
    else:
        print(MLFLOW_BUCKET_NAME + " bucket already exists!")
    

    # Load the actual model
    model = tf.keras.models.load_model(MODEL_SAVE_PATH)

    
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.tensorflow.log_model(model=model, artifact_path=MODEL_SAVE_PATH)


In [13]:
# ──────────────── Run Everything ──────────────── #

print("▶️ Step 1: Loading data...")
raw_csv = load_mnist_data()

▶️ Step 1: Loading data...
MNIST data sample:
    0  1  2  3  4  5  6  7  8  9  ...  775  776  777  778  779  780  781  782  \
0  0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0    0   
1  0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0    0   
2  0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0    0   
3  0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0    0   
4  0  0  0  0  0  0  0  0  0  0  ...    0    0    0    0    0    0    0    0   

   783  label  
0    0      5  
1    0      0  
2    0      4  
3    0      1  
4    0      9  

[5 rows x 785 columns]


In [8]:
print("▶️ Step 2: Cleaning data...")
cleaned_csv = eda_and_clean(raw_csv)

▶️ Step 2: Cleaning data...
EDA loaded shape: (60000, 785)


In [9]:
print("▶️ Step 3: Training model...")
metrics_json = train_model(cleaned_csv)

▶️ Step 3: Training model...
Training data shape: (60000, 785)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/2


2025-04-23 09:26:16.619186: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 270950400 exceeds 10% of free system memory.


[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8022 - loss: 7.7975 - val_accuracy: 0.8813 - val_loss: 0.5387
Epoch 2/2
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8933 - loss: 0.4541 - val_accuracy: 0.8969 - val_loss: 0.4309
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8939 - loss: 0.4404  


In [10]:
print("▶️ Step 4: Logging to MLflow...")
log_to_mlflow(metrics_json)

▶️ Step 4: Logging to MLflow...
🏃 View run silent-ray-232 at: http://mlflow.kubeflow-user-example-com.svc:5000/#/experiments/3/runs/4f7e5d1725cd421daa9d93297a670a22
🧪 View experiment at: http://mlflow.kubeflow-user-example-com.svc:5000/#/experiments/3


In [23]:
print("▶️ Step 5: Uploading model to MLflow...")
upload_model()

print("✅ Pipeline finished.")



▶️ Step 5: Uploading model to MLflow...
mlflow bucket already exists!




✅ Pipeline finished.
