# Train Models using Feast historical data

* Collect historical features from Feast
* Rebuild the MNIST dataset from the features
* Train different models using the dataset
* Register models using Model Registry

In [None]:
# Upgrade pip itself before installing anything else.
!pip install --upgrade pip
# Print the requirements file so the packages being installed are visible in the cell output.
!cat requirements.txt
!pip install -q -r requirements.txt
# The two packages below are installed with --no-deps and --ignore-requires-python,
# i.e. without resolving their dependencies and without enforcing the Python
# version declared by the package.
!pip install --no-deps --ignore-requires-python "https://github.com/opendatahub-io/ml-metadata/releases/download/v1.14.0%2Bremote.1/ml_metadata-1.14.0+remote.1-py3-none-any.whl" # need a Python 3.11 compatible version
!pip install --no-deps --ignore-requires-python "model-registry==0.1.2" # ignore dependencies because of the above override

## Imports and constants

In [6]:
import ast
import os
from datetime import datetime, timedelta

import boto3
import matplotlib.pyplot as plt
import numpy as np
import onnx
import onnxruntime as ort
import pprint
import pandas as pd
import tensorflow as tf
import tf2onnx
from feast import FeatureStore
from IPython.display import Markdown as md
from model_registry import ModelRegistry
from sklearn.metrics import accuracy_score
from sqlalchemy import create_engine, MetaData, Table, select, Column, Integer, DateTime
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten

2024-04-18 11:41:11.030648: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


**Note**: update `REPO_PATH` below so that it points to the Feast feature repository of your deployment.

In [7]:
%env REPO_PATH=mnist_demo/feature_repo/

env: REPO_PATH=mnist_demo/feature_repo/


In [8]:
# S3 connection settings read later by storeTrainedModel(): the endpoint and
# bucket are plain notebook variables, the credentials are exported as
# environment variables.
s3url = 'http://minio-service.feast.svc.cluster.local:9000'
bucket_name = 'feast'
os.environ.update({'accesskey': 'minio', 'secretkey': 'minio123'})

In [9]:
# PostgreSQL connection settings for the database queried below for the
# MNIST entity rows.
psqlHost = "postgresql.feast.svc.cluster.local"
psqlPort = 5432
psqlDb = "feast"
psqlSchema = "feast"
psqlUsername = "feast"
psqlPassword = "feast"

# Table that holds the MNIST source rows (image id and timestamp columns).
mnistTableName = "mnist_source"

## Reusable functions

In [10]:
def simpleNN():
    """Build and compile a small fully connected classifier.

    The network flattens a 28x28 input and feeds it through two ReLU dense
    layers (128 and 32 units) into a 10-unit softmax output. It is compiled
    with the Adam optimizer, sparse categorical cross-entropy loss and the
    accuracy metric. The layer summary is printed as a side effect.

    Returns:
        A ``(model, name)`` tuple, where ``name`` is ``'simple_NN'``.
    """
    stack = [
        Flatten(input_shape=(28, 28)),
        Dense(128, activation='relu'),
        Dense(32, activation='relu'),
        Dense(10, activation='softmax'),
    ]
    network = Sequential(stack)

    network.compile(
        optimizer='Adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    network.summary()
    return network, 'simple_NN'

In [11]:
def convolutedNN():
    """Build and compile a small convolutional classifier.

    Layer stack: a 3x3 convolution with 32 filters over a 28x28x1 input (no
    activation is passed to it), 2x2 max pooling with stride 2, flatten, a
    64-unit ReLU dense layer, dropout at rate 0.2 and a 10-unit softmax
    output. The model is compiled with the Adam optimizer, sparse categorical
    cross-entropy loss and the accuracy metric. The layer summary is printed
    as a side effect.

    Returns:
        A ``(model, name)`` tuple, where ``name`` is ``'convolutedNN'``.
    """
    layers = tf.keras.layers
    stack = [
        layers.Conv2D(filters=32, kernel_size=(3, 3), input_shape=(28, 28, 1)),
        layers.MaxPooling2D(pool_size=(2, 2), strides=2),
        layers.Flatten(),
        layers.Dense(units=64, activation=tf.nn.relu),
        layers.Dropout(rate=0.2),
        layers.Dense(10, activation=tf.nn.softmax),
    ]
    network = Sequential(stack)

    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    network.summary()
    return network, 'convolutedNN'

In [12]:
def evaluateModelAccuracy(model, model_name):
    """Print the accuracy of ``model`` on the notebook-level test set.

    Reads the global ``X_test`` and ``y_test`` arrays: the predicted label of
    each sample is the arg-max over axis 1 of ``model.predict(X_test)``.

    Args:
        model: Object exposing ``predict``.
        model_name: Name shown in the printed message.
    """
    predicted_labels = model.predict(X_test).argmax(axis=1)
    score = accuracy_score(y_test, predicted_labels)
    percentage = round(score * 100, 2)
    print(f'Prediction accuracy for model `{model_name}` is: {percentage}%')

In [13]:
def testModelForSample(model, sample_id):
    """Plot one test image and print the digit ``model`` predicts for it.

    Args:
        model: Object exposing ``predict``.
        sample_id: Index into the global ``X_test`` array.
    """
    sample = X_test[sample_id]

    # Show the image so the prediction can be checked visually.
    plt.imshow(sample, cmap="Greys")
    plt.title(f'X_test[{sample_id}]:')
    plt.show()

    # The sample is reshaped to a batch holding a single 28x28 image.
    scores = model.predict(sample.reshape(1, 28, 28))
    prediction = scores.argmax(axis=1)[0]
    print(f'prediction for sample {sample_id} is: ', prediction)
    print(f'**Note**: the calculated prediction {prediction} must match the number plotted above. If not, the test failed')

In [14]:
def saveModel(model, model_name):
    """Convert a Keras model to ONNX and save it as ``<model_name>.onnx``.

    The conversion uses opset 12 and a single input named ``x`` declared as a
    ``[1, 28, 28]`` tensor of ``tf.double``.

    Args:
        model: Keras model to convert.
        model_name: Base name of the output file.

    Returns:
        The name of the file that was written.
    """
    file_name = f"{model_name}.onnx"
    signature = [tf.TensorSpec([1, 28, 28], tf.double, name='x')]
    onnx_model, _ = tf2onnx.convert.from_keras(model, input_signature=signature, opset=12)
    onnx.save(onnx_model, file_name)
    print(f"Saved as {file_name}")
    return file_name

In [15]:
def testModelFromFileForSample(file_name, sample_id):
    """Plot one test image and print the digit predicted by a saved ONNX model.

    Args:
        file_name: Path of the ONNX file to load.
        sample_id: Index into the global ``X_test`` array.
    """
    sample = X_test[sample_id]

    # Show the image so the prediction can be checked visually.
    plt.imshow(sample, cmap="Greys")
    plt.title(f'X_test[{sample_id}]:')
    plt.show()

    # Read the graph's output names; the first one is requested below.
    output = []
    for node in onnx.load(file_name).graph.output:
        output.append(node.name)
    print(output)

    # Run the model on a batch holding the single 28x28 sample, fed as input 'x'.
    session = ort.InferenceSession(file_name)
    feeds = {'x': sample.reshape(1, 28, 28)}
    scores = session.run([output[0]], feeds)[0]
    prediction = scores.argmax(axis=1)[0]
    print(f'**Note**: the calculated prediction {prediction} must match the number plotted above. If not, the test failed')

In [16]:
def storeTrainedModel(model, model_name, file_name):
    """Upload a saved model file to the S3 bucket and return its coordinates.

    A version name of the form ``v.<model_name>.<YYYYmmddHHMMSS>`` is built
    from the current local time and used as the key prefix inside the global
    ``bucket_name``. After the upload, every object key in the bucket is
    printed.

    Args:
        model: Not used by this function.
        model_name: Name embedded in the generated version name.
        file_name: Local file to upload; also the last segment of the object key.

    Returns:
        The tuple ``(registeredmodel_name, version_name, odh_secret_name,
        in_bucket_path, in_bucket_target, full_bucket_target)``.
    """
    registeredmodel_name = "mnist"
    stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    version_name = ".".join(("v", model_name, stamp))
    print(f"Will be using: {registeredmodel_name}:{version_name} in the remainder of this task")

    # Credentials come from the environment, the endpoint from the global
    # `s3url`; certificate verification is switched off (verify=False).
    s3 = boto3.resource(
        's3',
        endpoint_url=s3url,
        aws_access_key_id=os.environ['accesskey'],
        aws_secret_access_key=os.environ['secretkey'],
        verify=False,
    )

    # Object layout: <version_name>/<file_name> inside the bucket.
    in_bucket_path = version_name
    in_bucket_target = f'{in_bucket_path}/{file_name}'
    full_bucket_target = f's3://{bucket_name}/{in_bucket_target}'
    odh_secret_name = f'aws-connection-{bucket_name}'

    my_bucket = s3.Bucket(bucket_name)
    my_bucket.upload_file(file_name, in_bucket_target)

    print(f"Objects in the {bucket_name} bucket:")
    for stored_object in my_bucket.objects.filter():
        print(stored_object.key)

    return (
        registeredmodel_name,
        version_name,
        odh_secret_name,
        in_bucket_path,
        in_bucket_target,
        full_bucket_target,
    )

In [17]:
def registerToModelRegistry(
    registeredmodel_name,
    version_name,
    odh_secret_name,
    in_bucket_path,
    in_bucket_target,
    full_bucket_target):
    """Register an uploaded ONNX model version and print what was stored.

    Args:
        registeredmodel_name: Name of the registered model.
        version_name: Name of the model version to create.
        odh_secret_name: Passed to the registry as ``storage_key``.
        in_bucket_path: Passed to the registry as ``storage_path``.
        in_bucket_target: Not used by this function.
        full_bucket_target: URI of the uploaded model file.

    Returns:
        The ``ModelRegistry`` client used for the registration.
    """
    registry = ModelRegistry(
        server_address="modelregistry-sample.feast.svc.cluster.local",
        port=9090,
        author="feast-dev@redhat.com",
    )

    registry.register_model(
        registeredmodel_name,
        full_bucket_target,
        model_format_name="onnx",
        model_format_version="1",
        storage_key=odh_secret_name,
        storage_path=in_bucket_path,
        version=version_name,
        description="demo20231121 e2e MNIST",
    )

    # Read the new entries back from the registry and print them.
    print("RegisteredModel:")
    print(registry.get_registered_model(registeredmodel_name))
    by_version = (
        ("ModelVersion:", registry.get_model_version),
        ("ModelArtifact:", registry.get_model_artifact),
    )
    for label, fetch in by_version:
        print(label)
        print(fetch(registeredmodel_name, version_name))
    return registry

## Collect historical data

Use a SQL query to collect all the (`image_id`, `ts`) tuples from the MNIST source table; they form the entity dataframe used for the historical feature lookup.

In [18]:
# Extract all (image_id, ts) tuples from the MNIST source table.
# NOTE: written against the SQLAlchemy 1.4+ API (positional select(), no
# legacy `autoload=True` flag), so it also runs on SQLAlchemy 2.0.

engine = create_engine(f'postgresql+psycopg2://{psqlUsername}:{psqlPassword}@{psqlHost}:{psqlPort}/{psqlDb}')
metadata = MetaData()
# `autoload_with` alone triggers reflection of the table definition from the database.
table = Table(mnistTableName, metadata, autoload_with=engine)

# The `ts` column is exposed as `event_timestamp`, the column name used in the
# entity dataframe built below.
columns = [table.c.image_id, table.c.ts.label('event_timestamp')]
stmt = select(*columns)

with engine.connect() as conn:
    rows = conn.execute(stmt).fetchall()

# Rows are read by position (same order as `columns`), which works with every
# row implementation; string-key access is not available on 2.0 rows.
image_ids = [row[0] for row in rows]
ts = [row[1] for row in rows]

entity_df = pd.DataFrame.from_dict(
    {
        "image_id": image_ids,
        "event_timestamp": ts,
    }
)
entity_df.head()

Unnamed: 0,image_id,event_timestamp
0,5,2024-04-10 07:02:16
1,0,2024-04-10 07:02:26
2,4,2024-04-10 07:02:36
3,1,2024-04-10 07:02:46
4,9,2024-04-10 07:02:56


The following step collects the historical features for every timestamp, so it may take some time to execute.

In [None]:
# Open the feature repository located at REPO_PATH.
store = FeatureStore(repo_path=os.environ['REPO_PATH'])

# Feature references: the 28 `feature_N` columns plus the `number` label,
# all prefixed with `mnist:`.
features = [f"mnist:feature_{n}" for n in range(1, 29)] + ["mnist:number"]

# Fetch the features for every (image_id, event_timestamp) pair as a dataframe.
historical_df = store.get_historical_features(entity_df=entity_df, features=features).to_df()

In [None]:
# Sanity check: the retrieval is expected to return exactly 70000 rows.
assert len(historical_df) == 70000, f"Found {len(historical_df)} instead of 70000"

In [None]:
historical_df.head()

In [None]:
historical_df.info()

## Prepare dataset

Remove rows with null features

In [None]:
# Drop, in place and in a single pass, every row that has a null in any of the
# 28 `feature_N` columns.
historical_df.dropna(subset=[f'feature_{n}' for n in range(1, 29)], inplace=True)

Rebuild the MNIST dataset.

Goal is to recreate the `numpy` arrays that can feed the trained models.

In [None]:
images = []

Load the features and convert the columns back to rows.

**Note**: the `feature_N` columns contain the string representation of a `list<float>`, so they must be converted back to the original data type.

The execution of these commands can take some minutes.

In [None]:
# Each `feature_N` cell holds the string form of a list of floats (one image
# row). Parse the 28 cells of every record, in column order, and append the
# resulting list of rows to `images`.
feature_columns = [f'feature_{n}' for n in range(1, 29)]
# itertuples() walks the frame once, instead of one `.iloc` lookup per cell.
for row_cells in historical_df[feature_columns].itertuples(index=False, name=None):
    images.append(
        [[float(value) for value in ast.literal_eval(cell)] for cell in row_cells]
    )

Extract the prediction column `number`

In [None]:
# Collect the `number` label of every row as a plain Python int, in row order.
numbers = [int(label) for label in historical_df['number']]
print(len(numbers))

Reserve a fixed 20% of the data for testing

In [None]:
# Positional split: the first 80% of the rows are used for training, the
# remaining 20% for testing (no shuffling is done here).
split = int(len(images) * 0.8)
train_images, test_images = images[:split], images[split:]
train_numbers, test_numbers = numbers[:split], numbers[split:]

# Convert the Python lists into numpy arrays for the models.
X_train, y_train = np.array(train_images), np.array(train_numbers)
X_test, y_test = np.array(test_images), np.array(test_numbers)

In [None]:
# Report the shape of the four arrays, one per line.
print(
    f'X_train: {X_train.shape}\n'
    f'y_train: {y_train.shape}\n'
    f'X_test: {X_test.shape}\n'
    f'y_test: {y_test.shape}'
)

Plot some data samples to validate the transformation

In [None]:
# Show the first nine training images in a 3x3 grid.
for i in range(9):
    plt.subplot(3, 3, i + 1)
    plt.imshow(X_train[i], cmap='gray')
plt.show()

## Train simple neural network

Let's train a simple neural network.

In [None]:
model, model_name = simpleNN()

In [None]:
# Train for 10 epochs, reserving 20% of the training data for validation.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    validation_split=0.2,
)

Evaluate model accuracy using test data

In [None]:
evaluateModelAccuracy(model, model_name)

### Evaluate the trained model

In [None]:
testModelForSample(model, 1)

### Save the model as ONNX file

In [None]:
file_name = saveModel(model, model_name)

### Test the saved model

In [None]:
testModelFromFileForSample(file_name, 15)

### Store the model to S3 compatible bucket

In [None]:
# Upload the saved model file and keep the returned coordinates for the
# registration step.
(
    registeredmodel_name,
    version_name,
    odh_secret_name,
    in_bucket_path,
    in_bucket_target,
    full_bucket_target,
) = storeTrainedModel(model, model_name, file_name)

In [None]:
registeredmodel_name, version_name

### Register with Model Registry

In [None]:
# Register the uploaded model version; the returned client is not stored here.
registerToModelRegistry(
    registeredmodel_name=registeredmodel_name,
    version_name=version_name,
    odh_secret_name=odh_secret_name,
    in_bucket_path=in_bucket_path,
    in_bucket_target=in_bucket_target,
    full_bucket_target=full_bucket_target,
)

## Train a convolutional neural network

Let's train an alternative model, a convolutional neural network:

In [None]:
model, model_name = convolutedNN()

In [None]:
# Train for 3 epochs on the full training set (no validation split).
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=3,
)

In [None]:
evaluateModelAccuracy(model, model_name)

### Evaluate the trained model

In [None]:
testModelForSample(model, 19)

### Save the model as ONNX file

In [None]:
file_name = saveModel(model, model_name)

In [None]:
testModelFromFileForSample(file_name, 17)

In [None]:
# Upload the saved model file and keep the returned coordinates for the
# registration step.
(
    registeredmodel_name,
    version_name,
    odh_secret_name,
    in_bucket_path,
    in_bucket_target,
    full_bucket_target,
) = storeTrainedModel(model, model_name, file_name)

In [None]:
# Register the uploaded model version and keep the client for the inspection
# cells below.
# NOTE(review): the version is registered as version_name + "1" while the
# bucket path still uses the unsuffixed version_name - confirm this is intended.
registry = registerToModelRegistry(
    registeredmodel_name=registeredmodel_name,
    version_name=version_name + "1",
    odh_secret_name=odh_secret_name,
    in_bucket_path=in_bucket_path,
    in_bucket_target=in_bucket_target,
    full_bucket_target=full_bucket_target,
)

## Inspect Model Registry

In [None]:
# List the registered models through the client's `_api` attribute (a
# non-public attribute of the registry client).
models = registry._api.get_registered_models()
# Both trained models were registered under the same name, so exactly one
# registered model is expected.
assert len(models)==1, f"Found {len(models)} instead of just 1"
registered_model = models[0]
print(registered_model)

In [None]:
# Utility snippet to archive old model versions (disabled by default).
# To use it: uncomment the loop, put the ids of the model versions to archive
# in the range() call, and pass the desired model id in the call to
# upsert_model_version().

# ContextState is also needed by the following cell, so the import stays active.
from model_registry.types import ContextState
# for id in range(3,10):
#     m = registry._api.get_model_version_by_id(id)
#     if m != None:
#         m.state=ContextState.ARCHIVED
#         try:
#             res = registry._api.upsert_model_version(m, 1)
#         except Exception as e:
#             print(f"AlreadyExistsError for {id}")

In [None]:
registered_model_id = registered_model.id

# Keep only the versions of the registered model that are not archived.
live_model_versions = [
    version
    for version in registry._api.get_model_versions(registered_model_id=registered_model_id)
    if version.state != ContextState.ARCHIVED
]
# One live version is expected per model trained above.
assert len(live_model_versions) == 2, f"Found {len(live_model_versions)} instead of just 2"

for live_model_version in live_model_versions:
    print(live_model_version)