In [1]:
!pip install --upgrade pip
!pip install -q -r requirements.txt



## Imports and constants

In [23]:
import os
import random
from datetime import datetime, timedelta, timezone

import pandas as pd
import psycopg2
from feast import FeatureStore
from sqlalchemy import create_engine, text

In [12]:
# Point in time used as the event timestamp of the historical feature lookup.
ts = datetime(year=2024, month=4, day=2, hour=12, minute=30, second=0)

In [13]:
# Location of the Feast feature repository. It is exported as an environment
# variable so that both the `!feast` shell commands and the Python cells can read it.
%env REPO_PATH=mnist_demo/feature_repo/

env: REPO_PATH=mnist_demo/feature_repo/


In [25]:
# PostgreSQL connection settings. The defaults are the demo values; each one can be
# overridden with a FEAST_PSQL_* environment variable so that real credentials never
# have to be written into the notebook.
psqlHost = os.environ.get('FEAST_PSQL_HOST', 'postgresql.feast.svc.cluster.local')
psqlPort = int(os.environ.get('FEAST_PSQL_PORT', '5432'))
psqlUsername = os.environ.get('FEAST_PSQL_USERNAME', 'feast')
psqlPassword = os.environ.get('FEAST_PSQL_PASSWORD', 'feast')
psqlDb = os.environ.get('FEAST_PSQL_DB', 'feast')
psqlSchema = os.environ.get('FEAST_PSQL_SCHEMA', 'feast')

# Tables whose row counts are checked by the DB validation cells.
mnistTableName = 'mnist_source'
historicalTableName = 'mnist_demo_mnist'
onlineTableName = 'mnist_demo_mnist_fresh'

## Create feature repository

Disable Feast usage reporting

In [14]:
# Opt out of Feast usage reporting for every command started from this kernel.
os.environ.update(FEAST_USAGE="False")

Tear down previous run

In [15]:
# Tear down what a previous run left behind, then delete the local repo folder so
# the following cells start from a clean state.
!feast --log-level=DEBUG -c $REPO_PATH teardown
!rm -rf $REPO_PATH

04/12/2024 08:23:46 AM feast.infra.registry.registry INFO: Registry cache expired, so refreshing
04/12/2024 08:23:46 AM feast.infra.registry.registry INFO: Registry cache expired, so refreshing
04/12/2024 08:23:46 AM feast.infra.registry.registry INFO: Registry cache expired, so refreshing


Init Feast repo `mnist_demo`

In [16]:
# Scaffold a new Feast repository named `mnist_demo` under the current directory.
# NOTE(review): `-m` presumably selects the minimal template; confirm with `feast init --help`.
!feast init -m mnist_demo


Creating a new Feast repository in [1m[32m/opt/app-root/src/feast-workshop-team-share/feast_modelregistry/mnist_demo[0m.



Copy repo configuration from [repo](./repo) folder

In [17]:
# Copy the prepared repo files into the feature repository, then list the folder to
# confirm they are in place.
!cp repo/* $REPO_PATH
!ls $REPO_PATH

feature_store.yaml  __init__.py  mnist_repo.py	__pycache__


Apply the repo configuration

In [18]:
# Apply the repo configuration; the command reports the infrastructure it deploys
# for each feature view.
!feast -c $REPO_PATH apply

Deploying infrastructure for [1m[32mmnist_fresh[0m
Deploying infrastructure for [1m[32mmnist[0m


Verify repo using `feast` CLI

In [19]:
# List the entities, feature views and feature services known to the repo, to
# confirm that the apply step registered them.
!feast -c $REPO_PATH entities list
!feast -c $REPO_PATH feature-views list
!feast -c $REPO_PATH feature-services list

NAME    DESCRIPTION    TYPE
image                  ValueType.UNKNOWN
NAME         ENTITIES    TYPE
mnist_fresh  {'image'}   FeatureView
mnist        {'image'}   FeatureView
NAME    FEATURES
mnist   mnist:feature_1, mnist:feature_2, mnist:feature_3, mnist:feature_4, mnist:feature_5, mnist:feature_6, mnist:feature_7, mnist:feature_8, mnist:feature_9, mnist:feature_10, mnist:feature_11, mnist:feature_12, mnist:feature_13, mnist:feature_14, mnist:feature_15, mnist:feature_16, mnist:feature_17, mnist:feature_18, mnist:feature_19, mnist:feature_20, mnist:feature_21, mnist:feature_22, mnist:feature_23, mnist:feature_24, mnist:feature_25, mnist:feature_26, mnist:feature_27, mnist:feature_28


### Content validation

Select some random historical data and verify it matches the initial MNIST dataset

In [20]:
# Fetch historical features for a random sample of images as of `ts`.
SAMPLE_SIZE = 100      # number of entity rows requested
MAX_IMAGE_ID = 70000   # inclusive upper bound of the sampled image ids

print(f'Testing for {ts}')
entity_df = pd.DataFrame.from_dict(
    {
        # Ids are drawn with replacement, so the sample may contain duplicates.
        "image_id": [random.randint(1, MAX_IMAGE_ID) for _ in range(SAMPLE_SIZE)],
        "event_timestamp": [ts] * SAMPLE_SIZE,
    }
)

store = FeatureStore(repo_path=os.environ['REPO_PATH'])
test_df = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "mnist:feature_1",
        "mnist:feature_19",
        "mnist:number",
    ],
).to_df()

Testing for 2024-04-02 12:30:00


In [21]:
# Preview the first rows of the retrieved features.
test_df.head(n=5)

Unnamed: 0,image_id,event_timestamp,feature_1,feature_19,number
0,58388,2024-04-02 12:30:00,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...",1
1,36729,2024-04-02 12:30:00,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.992...",2
2,62833,2024-04-02 12:30:00,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...",5
3,26716,2024-04-02 12:30:00,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.290...",3
4,65908,2024-04-02 12:30:00,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.047058823529411764,...",2


In [22]:
# Show column dtypes and non-null counts of the retrieved features.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   image_id         100 non-null    int64         
 1   event_timestamp  100 non-null    datetime64[ns]
 2   feature_1        100 non-null    object        
 3   feature_19       100 non-null    object        
 4   number           100 non-null    int64         
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 4.0+ KB


## DB validation

In [26]:
def countDbRows():
    """Count the rows of the source, historical and online tables.

    Opens a short-lived connection to the PostgreSQL database described by the
    `psql*` constants and runs one `SELECT COUNT(*)` per table.

    Returns:
        tuple: `(mnist_count, historical_count, online_count)`, the row counts of
        `mnistTableName`, `historicalTableName` and `onlineTableName`.
    """
    engine = create_engine(
        f'postgresql+psycopg2://{psqlUsername}:{psqlPassword}@{psqlHost}:{psqlPort}/{psqlDb}'
    )
    try:
        # `Engine.execute()` no longer exists in SQLAlchemy 2.0; run the statements
        # on an explicit connection, wrapped in `text()`.
        with engine.connect() as conn:
            counts = tuple(
                conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar()
                for table in (mnistTableName, historicalTableName, onlineTableName)
            )
    finally:
        # Release the pooled connections; a new engine is created on every call.
        engine.dispose()
    return counts

In [31]:
# Before materialization the historical and online tables are expected to be empty,
# while the source table already holds the full dataset.
mnist_count, historical_count, online_count = countDbRows()

assert historical_count == 0, f"Row count for {historicalTableName} is not 0 but {historical_count}"
assert online_count == 0, f"Row count for {onlineTableName} is not 0 but {online_count}"
assert mnist_count == 70000, f"Row count for {mnistTableName} is not 70000 but {mnist_count}"

## Features materialization

Materialization is the process of ingesting batch features and streaming features (via a Push API) into the online store.

See [Quickstart](https://docs.feast.dev/getting-started/quickstart#step-3c-ingest-batch-features-into-your-online-store)

In [44]:
os.environ['CURRENT_TIME']=str(datetime.now().replace(microsecond=0))
!echo "Materializing up to $CURRENT_TIME"
!cd $REPO_PATH;feast materialize-incremental "$CURRENT_TIME"

Materializing up to 2024-04-12 08:40:31
Materializing [1m[32m2[0m feature views to [1m[32m2024-04-12 08:40:31+00:00[0m into the [1m[32mpostgres[0m online store.

[1m[32mmnist_fresh[0m from [1m[32m2024-04-02 08:40:35+00:00[0m to [1m[32m2024-04-12 08:40:31+00:00[0m:
2100000it [02:36, 13405.54it/s]                                                                     
[1m[32mmnist[0m from [1m[32m2024-04-02 08:43:18+00:00[0m to [1m[32m2024-04-12 08:40:31+00:00[0m:
2100000it [02:41, 12967.72it/s]                                                                     


Validate DB row count after materialization

In [48]:
mnist_count, historical_count, online_count = countDbRows()

# The exact row count cannot be predicted, so only check that both materialized
# tables are populated and that the source table is unchanged.
assert historical_count > 0, f"Row count for {historicalTableName} is not greater than 0 but {historical_count}"
assert online_count > 0, f"Row count for {onlineTableName} is not greater than 0 but {online_count}"
assert mnist_count == 70000, f"Row count for {mnistTableName} is not 70000 but {mnist_count}"
print(f"Row count for {historicalTableName} is {historical_count}")
print(f"Row count for {onlineTableName} is {online_count}")

Row count for mnist_demo_mnist is 2100000
Row count for mnist_demo_mnist_fresh is 2100000
