In [1]:
!pip install mlflow boto3 xgboost==2.1.1 pandas numpy scikit-learn kserve kubernetes

Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting boto3
  Downloading boto3-1.38.23-py3-none-any.whl.metadata (6.6 kB)
Collecting xgboost==2.1.1
  Downloading xgboost-2.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting kserve
  Downloading kserve-0.15.2-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-nccl-cu12 (from xgboost==2.1.1)
  Downloading nvidia_nccl_cu12-2.26.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting Flask<4 (from mlflow)
  Downloading flask-3.1.1-py3-none-any.whl.metadata (3.0 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import mlflow
import os
print(xgb.__version__)

2.1.1


In [3]:
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio-service.kubeflow.svc.cluster.local:9000"
os.environ["AWS_ACCESS_KEY_ID"] = "minio"
os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123"

In [4]:
def load_and_preprocess_data(file_path="example.csv"):
    data = pd.read_csv(file_path)
    
    for col in ["brand", "event_weekday", "category_code_level1", "category_code_level2"]:
        if col in data.columns:
            data[col] = data[col].fillna("unknown")
    
    data["price"] = data["price"].fillna(data["price"].median())
    
    label_encoders = {}
    for col in ["brand", "event_weekday", "category_code_level1", "category_code_level2"]:
        if col in data.columns:
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col])
            label_encoders[col] = le
            print(f"Categories in {col}: {le.classes_}")
    
    data = data[["brand", "price", "event_weekday", "category_code_level1", 
                 "category_code_level2", "activity_count", "is_purchased"]].copy()

        
    return data, label_encoders

In [5]:
def train_model(data):
    X = data.drop("is_purchased", axis=1)
    y = data["is_purchased"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    
    NUM_BOOST_ROUND = 100
    EARLY_STOPPING_ROUND = 5
    
    params = {
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error", "auc"]
    }
    evals = [(dtest, "eval"), (dtrain, "train")]
    evals_result = {}  
    model = xgb.train(params, dtrain, num_boost_round=NUM_BOOST_ROUND, evals=evals, 
                      early_stopping_rounds=EARLY_STOPPING_ROUND, verbose_eval=True, evals_result=evals_result)
    y_pred_proba = model.predict(dtest)
    y_pred = (y_pred_proba > 0.5).astype(int)
    accuracy = np.mean(y_pred == y_test)
    try:
        with mlflow.start_run(run_name="model_training"):
            mlflow.log_params({
                "objective": "binary:logistic",
                "eval_metric": ["logloss", "error", "rmse", "mae", "auc"],
                "num_boost_round": NUM_BOOST_ROUND,
                "early_stopping_rounds": EARLY_STOPPING_ROUND
            })

            # Log evaluation metrics for each boosting round
            for metric in params["eval_metric"]:
                for dataset in ["eval", "train"]:
                    metric_name = f"{dataset}-{metric}"
                    metric_values = evals_result[dataset][metric]
                    for step, value in enumerate(metric_values):
                        mlflow.log_metric(metric_name, value, step=step)
            mlflow.xgboost.log_model(model, artifact_path="xgb_model")
            
            model_path = mlflow.get_artifact_uri("xgb_model")
            print("Model logged to:", model_path)
            # Log accuracy
            mlflow.log_metric("accuracy", accuracy)
            print(f"Metric logged: accuracy = {accuracy}")
        return model
    except Exception as e:
        print(f"MLflow logging failed: {e}")
     
    

In [6]:
mlflow.set_tracking_uri("http://mlflow-service.mlflow.svc.cluster.local:5000")

In [7]:
try:
    mlflow.set_experiment("xgboost-training")
except Exception as e:
    print(f"Failed to connect to remote MLflow server: {e}")

data, label_encoders = load_and_preprocess_data()
print("Data loaded and preprocessed successfully.")

model = train_model(data)

print("training done")

Categories in brand: ['acer' 'apple' 'coballe' 'gigabyte' 'haier' 'lucente' 'respect' 'samsung'
 'stels' 'unknown']
Categories in event_weekday: [3]
Categories in category_code_level1: ['apparel' 'appliances' 'computers' 'construction' 'electronics' 'kids']
Categories in category_code_level2: ['audio' 'carriage' 'components' 'kitchen' 'notebook' 'shoes' 'smartphone'
 'tools']
Data loaded and preprocessed successfully.
[0]	eval-logloss:0.81737	eval-error:0.50000	eval-auc:0.50000	train-logloss:0.56296	train-error:0.25000	train-auc:0.50000
[1]	eval-logloss:0.82080	eval-error:0.50000	eval-auc:0.50000	train-logloss:0.56275	train-error:0.25000	train-auc:0.50000
[2]	eval-logloss:0.82364	eval-error:0.50000	eval-auc:0.50000	train-logloss:0.56262	train-error:0.25000	train-auc:0.50000
[3]	eval-logloss:0.82600	eval-error:0.50000	eval-auc:0.50000	train-logloss:0.56252	train-error:0.25000	train-auc:0.50000
[4]	eval-logloss:0.82794	eval-error:0.50000	eval-auc:0.50000	train-logloss:0.56246	train-error



Model logged to: s3://mlflow-bucket/1/696745c055bc4a028dad0df8bcde2999/artifacts/xgb_model
Metric logged: accuracy = 0.5
🏃 View run model_training at: http://mlflow-service.mlflow.svc.cluster.local:5000/#/experiments/1/runs/696745c055bc4a028dad0df8bcde2999
🧪 View experiment at: http://mlflow-service.mlflow.svc.cluster.local:5000/#/experiments/1
training done


In [9]:
def upload_model_to_minio(model, model_name, bucket_name):
    import boto3
    model_name = model_name + ".ubj"
    model.save_model(model_name)
    s3 = boto3.resource(
        's3',
        endpoint_url=os.getenv("MLFLOW_S3_ENDPOINT_URL"),
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY")
    )
    path_to_model = model_name
    s3_key = os.path.basename(model_name)
    bucket = s3.Bucket(bucket_name)
    if not bucket.creation_date:
        print(f"Bucket '{bucket_name}' doesn't exist. Creating it now...")
        bucket = s3.create_bucket(Bucket=bucket_name)
    
    print(f"Uploading model from to {bucket_name}/{s3_key}")
    bucket.upload_file(path_to_model, s3_key)
    print("Upload completed.")
    s3_path = f"s3://{bucket_name}/{model_name}"
    print(s3_path)
    return s3_path

In [10]:
s3_path = upload_model_to_minio(model, "xgbmodel", "mlpipeline")

Uploading model from to mlpipeline/xgbmodel.ubj
Upload completed.
s3://mlpipeline/xgbmodel.ubj


In [11]:
def model_serving(model_path):
    from kubernetes import client
    from kserve import KServeClient
    from kserve import constants
    from kserve import V1beta1InferenceService
    from kserve import V1beta1InferenceServiceSpec
    from kserve import V1beta1PredictorSpec
    from kserve import V1beta1ModelSpec

    namespace = "bigdata"
    name = f"bigdata-xgb"
    kserve_version = "v1beta1"
    api_version = constants.KSERVE_GROUP + "/" + kserve_version

    resources = client.V1ResourceRequirements(
        requests={"cpu": "100m", "memory": "512Mi"},
        limits={"cpu": "100m", "memory": "1Gi"}
    )

    isvc = V1beta1InferenceService(
        api_version=api_version,
        kind="InferenceService",
        metadata=client.V1ObjectMeta(
            name=name,
            namespace=namespace,
            annotations={'sidecar.istio.io/inject':'false'}
        ),
        spec=V1beta1InferenceServiceSpec(
            predictor=V1beta1PredictorSpec(
                service_account_name="sa-minio-kserve",
                model=V1beta1ModelSpec(
                    model_format={"name": "xgboost"},
                    storage_uri=model_path,
                    resources=resources
                )
            )
        )
    )

    kserve_client = KServeClient()
    kserve_client.create(isvc)
                                  
    

In [12]:
model_serving(s3_path)