# Kubeflow pipelines: From Training to Serving

## Prerequisites

Check whether kfp is installed.

In [None]:
! pip3 show kfp

## Configure Credentials
In order for KFServing to access MinIO, the credentials must be added to the default service account.

In [None]:
%%writefile minio_secret.yaml
apiVersion: v1
kind: Secret
metadata:
  name: minio-s3-secret
  annotations:
     serving.kubeflow.org/s3-endpoint: minio-service.kubeflow:9000
     serving.kubeflow.org/s3-usehttps: "0" # Default: 1. Must be 0 when testing with MinIO!
type: Opaque
data:
  AWS_ACCESS_KEY_ID: bWluaW8=
  AWS_SECRET_ACCESS_KEY: bWluaW8xMjM=
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: default
secrets:
  - name: minio-s3-secret

In [None]:
! kubectl apply -f minio_secret.yaml

## Configure access to MinIO
### Upload the Dataset to MinIO
First, we configure credentials for mc, the MinIO command line client. We then use it to create a bucket, upload the dataset to it, and set access policy so that the pipeline can download it from MinIO.

In [None]:
! wget https://dl.min.io/client/mc/release/linux-amd64/mc
! chmod +x mc
! ./mc --help

### a. Connect to the MinIO Server

In [None]:
! ./mc alias set minio http://minio-service.kubeflow:9000 minio minio123

### b. Create a bucket to store the data and export the model to MinIO
The bucket is cleared once we are done running the pipeline

In [None]:
! ./mc mb minio/customer

### c. Upload the dataset to the bucket in MinIO
The dataset files must be in a local `datasets` folder, which is archived into `datasets.tar.gz` before uploading.

In [None]:
! tar --dereference -czf datasets.tar.gz ./datasets
! ./mc cp datasets.tar.gz minio/customer/datasets.tar.gz
! ./mc policy set download minio/customer

If the dataset has been downloaded too many times while testing, the command below can be used to clear out the bucket.

In [None]:
# ! ./mc rm --recursive --force minio/customer

## MinIO Server URL and Credentials

In [None]:
# MinIO endpoint (host:port) and credentials. These are the same values that were
# passed to `mc alias set` earlier in this notebook.
# NOTE(review): the credentials are hard-coded in the notebook — presumably only
# acceptable for a test deployment; confirm before reusing elsewhere.
MINIO_SERVER='minio-service.kubeflow:9000'
MINIO_ACCESS_KEY='minio'
MINIO_SECRET_KEY='minio123'

## Implement Kubeflow Pipelines Components
In this pipeline, we have the following components:
*   Customer dataset download component
*   Train the Scikit-Learn model
*   Evaluate the trained model
*   Export the trained model


In [None]:
from typing import NamedTuple
import kfp
import kfp.components as components
import kfp.dsl as dsl
from kfp.components import InputPath, OutputPath #helps define the input & output between the components
import kubeflow.fairing.utils
NAMESPACE = kubeflow.fairing.utils.get_current_k8s_namespace()

## Component 1: Download the Customer Dataset

In [None]:
def download_dataset(minio_server: str, data_dir: OutputPath(str)):
    """Download the customer dataset archive from MinIO and unpack it into ``data_dir``.

    Args:
        minio_server: ``host:port`` of the MinIO server. The archive is fetched
            over plain HTTP from ``/customer/datasets.tar.gz`` on that server.
        data_dir: Output directory that receives the extracted archive contents.
            It is created if it does not exist yet.

    Any error raised while fetching or unpacking the archive propagates to the
    caller.
    """
    import os
    import subprocess
    import tarfile
    import urllib.request

    os.makedirs(data_dir, exist_ok=True)

    # This URL points at the archive stored in the "customer" bucket.
    url = f'http://{minio_server}/customer/datasets.tar.gz'

    # Context managers guarantee that the HTTP response and the tar reader are
    # closed even when extraction fails part-way through.
    with urllib.request.urlopen(url) as stream:
        with tarfile.open(fileobj=stream, mode="r|gz") as tar:
            tar.extractall(path=data_dir)

    # List the extracted files so they show up in the step's log.
    subprocess.call(["ls", "-lha", data_dir])

## Component 2: Train the Model

In [None]:
def train_model(data_dir: InputPath(str), model_dir: OutputPath(str)):
    """Train a logistic-regression model on the customer dataset.

    The two CSV files ``training_sample.csv`` and ``testing_sample.csv`` are
    looked up recursively under ``data_dir``, concatenated, split 70/30 into
    train and test sets, and the training set is randomly over-sampled before
    fitting.

    Args:
        data_dir: Directory containing the extracted dataset (the CSV files
            may sit in a sub-folder).
        model_dir: Output directory. After the call it contains
            ``model.pkl`` (the pickled fitted model) plus ``train_data`` and
            ``test_data`` (pickled ``(X, y)`` tuples).

    Raises:
        FileNotFoundError: If one of the CSV files is not found under
            ``data_dir``.
    """
    import glob
    import os
    import pickle

    import pandas as pd
    from imblearn.over_sampling import RandomOverSampler
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    def _read_csv(filename):
        # The archive may unpack into a sub-folder, so search recursively
        # ('**' also matches zero directories, i.e. data_dir itself).
        pattern = os.path.join(glob.escape(data_dir), '**', filename)
        matches = sorted(glob.glob(pattern, recursive=True))
        if not matches:
            raise FileNotFoundError(f'{filename} not found under {data_dir}')
        return pd.read_csv(matches[0])

    # load the dataset
    df = pd.concat([_read_csv('training_sample.csv'), _read_csv('testing_sample.csv')])

    # drop extraneous features; 'ordered' is the prediction target
    X = df.drop(['UserID', 'device_mobile', 'ordered', 'sign_in'], axis=1)
    y = df['ordered']

    # split to training and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

    # oversample the smallest class of the train set
    over_sampler = RandomOverSampler(random_state=42)
    X_train, y_train = over_sampler.fit_resample(X_train, y_train)

    # train the model
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # model_dir is this component's only output path, so the data splits are
    # stored next to the model.
    os.makedirs(model_dir, exist_ok=True)

    # save the training split
    with open(os.path.join(model_dir, 'train_data'), 'wb') as f:
        pickle.dump((X_train, y_train), f)

    # save the held-out test split so it can be used for evaluation
    with open(os.path.join(model_dir, 'test_data'), 'wb') as f:
        pickle.dump((X_test, y_test), f)

    # save the model (scikit-learn estimators have no .save(); pickle them)
    with open(os.path.join(model_dir, 'model.pkl'), 'wb') as f:
        pickle.dump(model, f)

## Component 3: Evaluate the model

In [None]:
def evaluate_model(data_dir: InputPath(str), model_dir: InputPath(str), metrics_path: OutputPath(str)
) -> NamedTuple("EvaluationOutput", [("mlpipeline_metrics", "Metrics")]):
    """Score the trained model on the held-out test split and emit accuracy.

    Args:
        data_dir: Dataset directory. Not read by this function; kept so the
            component signature stays unchanged.
        model_dir: Directory expected to contain ``model.pkl`` (a pickled
            fitted estimator exposing ``predict``) and ``test_data`` (a
            pickled ``(X_test, y_test)`` tuple).
        metrics_path: File path the metrics JSON is written to.

    Returns:
        A one-field ``EvaluationOutput`` named tuple whose
        ``mlpipeline_metrics`` field is the metrics dictionary serialized as
        a JSON string.
    """
    import json
    import os
    import pickle
    from collections import namedtuple

    from sklearn.metrics import accuracy_score

    # NOTE: pickle executes code on load; only use with artifacts produced by
    # a trusted training step.
    # load the test split and separate X_test from y_test
    with open(os.path.join(model_dir, 'test_data'), 'rb') as f:
        X_test, y_test = pickle.load(f)

    # load the model
    with open(os.path.join(model_dir, 'model.pkl'), 'rb') as f:
        model = pickle.load(f)

    # use the model to predict on the test set
    y_pred = model.predict(X_test)

    # evaluate the model
    accuracy = accuracy_score(y_test, y_pred)

    metrics = {
        "metrics": [{"name": "accuracy", "numberValue": str(accuracy), "format": "PERCENTAGE"}]
    }

    # save the metrics
    with open(metrics_path, "w") as f:
        json.dump(metrics, f)

    out_tuple = namedtuple("EvaluationOutput", ["mlpipeline_metrics"])

    return out_tuple(json.dumps(metrics))

## Component 4: Export the Model

In [None]:
def export_model(
    model_dir: InputPath(str),
    metrics: InputPath(str),
    export_bucket: str,
    model_name: str,
    model_version: int,
    minio_server: str,
    minio_access_key: str,
    minio_secret_key: str,
):
    """Upload every file under ``model_dir`` to a bucket on the MinIO server.

    Files are stored under the key prefix ``{model_name}/{model_version}/``,
    keeping their path relative to ``model_dir``. The bucket is created when
    it does not exist yet, and the bucket's objects are printed at the end.

    Args:
        model_dir: Local directory whose files are uploaded (walked recursively).
        metrics: Not read by this function; kept so the component signature
            stays unchanged.
        export_bucket: Name of the destination bucket.
        model_name: First component of the object key prefix.
        model_version: Second component of the object key prefix.
        minio_server: ``host:port`` of the MinIO server (plain HTTP).
        minio_access_key: Access key used to authenticate.
        minio_secret_key: Secret key used to authenticate.
    """
    import os

    import boto3
    from botocore.client import Config

    s3 = boto3.client(
        "s3",
        endpoint_url=f'http://{minio_server}',
        aws_access_key_id=minio_access_key,
        aws_secret_access_key=minio_secret_key,
        config=Config(signature_version="s3v4"),
    )

    # Create export bucket if it does not yet exist
    response = s3.list_buckets()
    print(response , export_bucket)
    export_bucket_exists = any(
        bucket["Name"] == export_bucket for bucket in response["Buckets"]
    )

    if not export_bucket_exists:
        s3.create_bucket(ACL="public-read-write", Bucket=export_bucket)

    # Save model files to S3
    for root, _dirs, files in os.walk(model_dir):
        for filename in files:
            local_path = os.path.join(root, filename)
            s3_path = os.path.relpath(local_path, model_dir)

            s3.upload_file(
                local_path,
                export_bucket,
                f"{model_name}/{model_version}/{s3_path}",
                ExtraArgs={"ACL": "public-read"},
            )

    response = s3.list_objects(Bucket=export_bucket)
    print(f"All objects in {export_bucket}:")
    # "Contents" is absent from the response when the bucket holds no objects.
    for obj in response.get("Contents", []):
        print("{}/{}".format(export_bucket, obj["Key"]))