In [74]:
!pip install --upgrade pip
!pip install -q -r requirements.txt



## Imports and constants

In [75]:
import os
from datetime import datetime, timedelta

import pandas as pd
import psycopg2
import random
from feast import FeatureStore
from sqlalchemy import create_engine

In [76]:
%env REPO_PATH=mnist_demo/feature_repo/

env: REPO_PATH=mnist_demo/feature_repo/


In [77]:
psqlHost = 'postgresql.feast.svc.cluster.local'
psqlPort = 5432
psqlUsername = 'feast'
psqlPassword = 'feast'
psqlDb = 'feast'
psqlSchema = 'feast'

mnistTableName = 'mnist_source'
historicalTableName = 'mnist_demo_mnist'
onlineTableName = 'mnist_demo_mnist_fresh'

## Create feature repository

Disable Feast usage reporting

In [78]:
os.environ['FEAST_USAGE']='False'

Tear down previous run

In [79]:
!feast --log-level=DEBUG -c $REPO_PATH teardown
!rm -rf $REPO_PATH

04/18/2024 09:37:11 AM feast.infra.registry.registry INFO: Registry cache expired, so refreshing
04/18/2024 09:37:11 AM feast.infra.registry.registry INFO: Registry cache expired, so refreshing
04/18/2024 09:37:11 AM feast.infra.registry.registry INFO: Registry cache expired, so refreshing


Init Feast repo `mnist_demo`

In [80]:
!feast init -m mnist_demo


Creating a new Feast repository in [1m[32m/opt/app-root/src/feast-workshop-team-share/feast_modelregistry/mnist_demo[0m.



Copy repo configuration from [repo](./repo) folder

In [81]:
!cp repo/* $REPO_PATH
!ls $REPO_PATH

feature_store.yaml  __init__.py  mnist_repo.py	__pycache__


Apply the repo configuration

In [83]:
!feast -c $REPO_PATH apply

Deploying infrastructure for [1m[32mmnist[0m
Deploying infrastructure for [1m[32mmnist_fresh[0m


Verify repo using `feast` CLI

In [84]:
!feast -c $REPO_PATH entities list
!feast -c $REPO_PATH feature-views list
!feast -c $REPO_PATH feature-services list
!feast -c $REPO_PATH data-sources list

NAME    DESCRIPTION    TYPE
image                  ValueType.UNKNOWN
NAME         ENTITIES    TYPE
mnist        {'image'}   FeatureView
mnist_fresh  {'image'}   FeatureView
NAME    FEATURES
mnist   mnist:feature_1, mnist:feature_2, mnist:feature_3, mnist:feature_4, mnist:feature_5, mnist:feature_6, mnist:feature_7, mnist:feature_8, mnist:feature_9, mnist:feature_10, mnist:feature_11, mnist:feature_12, mnist:feature_13, mnist:feature_14, mnist:feature_15, mnist:feature_16, mnist:feature_17, mnist:feature_18, mnist:feature_19, mnist:feature_20, mnist:feature_21, mnist:feature_22, mnist:feature_23, mnist:feature_24, mnist:feature_25, mnist:feature_26, mnist:feature_27, mnist:feature_28
NAME                CLASS
images_push_source  <class 'feast.data_source.PushSource'>
mnist_source        <class 'feast.infra.offline_stores.contrib.postgres_offline_store.postgres_source.PostgreSQLSource'>
mnist_push_source   <class 'feast.infra.offline_stores.contrib.postgres_offline_store.postgres_source.

### Content validation

Select some random historical data and verify it matches the initial MNIST dataset

In [106]:
image_ids = [random.randint(0, 9) for _ in range(10)]
ts = []

engine = create_engine(f'postgresql+psycopg2://{psqlUsername}:{psqlPassword}@{psqlHost}:{str(psqlPort)}/{psqlDb}')
for image_id in image_ids:
    first_ts = list(engine.execute(f"SELECT ts FROM {mnistTableName} WHERE image_id={image_id}").fetchall())[0]
    first_ts = first_ts.items()[0][1]
    ts.append(first_ts)

print(f'Testing for {image_ids} and {ts}')
entity_df = pd.DataFrame.from_dict(
    {
        "image_id": image_ids,
        "event_timestamp": ts,
    }
)

store = FeatureStore(repo_path=os.environ['REPO_PATH'])    
test_df = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "mnist:feature_1",
        "mnist:feature_19",
        "mnist:number",
    ],
).to_df()

  first_ts = first_ts.items()[0][1]


Testing for [9, 1, 2, 5, 5, 3, 3, 8, 4, 8] and [datetime.datetime(2024, 4, 10, 7, 2, 56), datetime.datetime(2024, 4, 10, 7, 2, 46), datetime.datetime(2024, 4, 10, 7, 3, 6), datetime.datetime(2024, 4, 10, 7, 2, 16), datetime.datetime(2024, 4, 10, 7, 2, 16), datetime.datetime(2024, 4, 10, 7, 3, 26), datetime.datetime(2024, 4, 10, 7, 3, 26), datetime.datetime(2024, 4, 10, 7, 5, 6), datetime.datetime(2024, 4, 10, 7, 2, 36), datetime.datetime(2024, 4, 10, 7, 5, 6)]


In [107]:
test_df.head()

Unnamed: 0,image_id,event_timestamp,feature_1,feature_19,number
0,9,2024-04-10 07:02:56,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...",9
1,1,2024-04-10 07:02:46,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.917...",1
2,2,2024-04-10 07:03:06,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.011764705882352941,0.53...",2
3,5,2024-04-10 07:02:16,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...",5
4,5,2024-04-10 07:02:16,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...",5


In [108]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   image_id         10 non-null     int64         
 1   event_timestamp  10 non-null     datetime64[ns]
 2   feature_1        10 non-null     object        
 3   feature_19       10 non-null     object        
 4   number           10 non-null     int64         
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 528.0+ bytes


## DB validation

In [109]:
def countDbRows():
    engine = create_engine(f'postgresql+psycopg2://{psqlUsername}:{psqlPassword}@{psqlHost}:{str(psqlPort)}/{psqlDb}')
    mnist_count = engine.execute(f"SELECT COUNT(*) FROM {mnistTableName}").scalar()
    historical_count = engine.execute(f"SELECT COUNT(*) FROM {historicalTableName}").scalar()
    online_count = engine.execute(f"SELECT COUNT(*) FROM {onlineTableName}").scalar()
    return (mnist_count, historical_count, online_count)

In [110]:
mnist_count, historical_count, online_count = countDbRows()

assert historical_count==0, f"Row count for {historicalTableName} is not 70000 but {historical_count}"
assert online_count==0, f"Row count for {onlineTableName} is not 70000 but {online_count}"
assert mnist_count==70000, f"Row count for {mnistTableName} is not 70000 but {mnist_count}"

## Features materialization

Materialization is ythe process to ingest batch features and streaming features (via a Push API) into the online store.

See [Quickstart](https://docs.feast.dev/getting-started/quickstart#step-3c-ingest-batch-features-into-your-online-store)

In [112]:
os.environ['CURRENT_TIME']=str(datetime.now().replace(microsecond=0))
!echo "Materializing up to $CURRENT_TIME"
!cd $REPO_PATH;feast materialize-incremental "$CURRENT_TIME"

Materializing up to 2024-04-18 10:19:49
Materializing [1m[32m2[0m feature views to [1m[32m2024-04-18 10:19:49+00:00[0m into the [1m[32mpostgres[0m online store.

[1m[32mmnist_fresh[0m from [1m[32m2024-04-08 10:19:54+00:00[0m to [1m[32m2024-04-18 10:19:49+00:00[0m:
0it [00:00, ?it/s]
[1m[32mmnist[0m from [1m[32m2024-04-18 10:07:59+00:00[0m to [1m[32m2024-04-18 10:19:49+00:00[0m:
0it [00:00, ?it/s]


Validate DB row count after materialization

In [116]:
mnist_count, historical_count, online_count = countDbRows()

# Cannot predict row count, so just validate they are > 0
assert historical_count>0, f"Row count for {historicalTableName} is {historical_count}"
assert online_count==0, f"Row count for {onlineTableName} is {online_count}"
assert mnist_count==70000, f"Row count for {mnistTableName} is not 70000 but {mnist_count}"
print(f"Row count for {historicalTableName} is {historical_count}")
print(f"Row count for {onlineTableName} is {online_count}")
print(f"Row count for {mnistTableName} is {mnist_count}")

Row count for mnist_demo_mnist is 300
Row count for mnist_demo_mnist_fresh is 0
Row count for mnist_source is 70000
