In [1]:
!pip install --upgrade pip
!pip install -q -r ../feast_modelregistry/requirements.txt
!pip install pyarrow



# Feast Remote Offline Store

Defines an `RemoteOfflineStore` implementation of the `OfflineStore` interface delegating the actual data manipulation to
the remote Offline Store realized by the Arrow Flight server.

## Constants and imports

**Note**: This step is needed only because of the nature of the MNIST dataset.

In [2]:
psqlHost = 'postgresql.feast.svc.cluster.local'
psqlPort = 5432
psqlUsername = 'feast'
psqlPassword = 'feast'
psqlDb = 'feast'
psqlSchema = 'feast'

mnistTableName = 'mnist_source'

In [3]:
import pandas as pd
import pyarrow as pa
import uuid

from feast.feature_view import FeatureView
from feast.infra.offline_stores.offline_store import (
    OfflineStore,
    RetrievalJob,
)
from feast.infra.registry.registry import Registry
from feast.on_demand_feature_view import OnDemandFeatureView
from feast.repo_config import RepoConfig
from feast.usage import log_exceptions_and_usage
import pyarrow.flight as fl
from sqlalchemy import create_engine, MetaData, Table, select
from typing import List, Union, Optional

## Prepare the input parameters

Connect the MNIST data source to read the list of (`image_id`, `event_timestamp`) pairs used to fetch the historical features.

In [4]:
engine = create_engine(f'postgresql+psycopg2://{psqlUsername}:{psqlPassword}@{psqlHost}:{str(psqlPort)}/{psqlDb}')
metadata = MetaData()
table = Table(mnistTableName, metadata, autoload=True, autoload_with=engine)

columns = [table.c.image_id, table.c.ts.label('event_timestamp')]
stmt = select(columns)

image_ids = []
ts = []
with engine.connect() as conn:
    result = conn.execute(stmt)
    for row in result:
        image_ids.append(row['image_id'])
        ts.append(row['event_timestamp'])

entity_df = pd.DataFrame.from_dict(
    {
        "image_id": image_ids,
        "event_timestamp": ts,
    }
)
entity_df.head()

  table = Table(mnistTableName, metadata, autoload=True, autoload_with=engine)


Unnamed: 0,image_id,event_timestamp
0,5,2016-05-11 16:14:44
1,0,2016-05-11 17:14:44
2,4,2016-05-11 18:14:44
3,1,2016-05-11 19:14:44
4,9,2016-05-11 20:14:44


## Define the RemoteOfflineStore

Basic implementation of the `RemoteOfflineStore` and `RemoteRetrievalJob` classes following the data exchange protocol
defined in the [README.md](./README.md)

In [5]:
class RemoteRetrievalJob(RetrievalJob):
    def __init__(
        self,
        arrow_host,
        arrow_port,
        feature_refs: List[str],
        entity_df: Union[pd.DataFrame, str],
        # TODO add missing parameters from the OfflineStore API
    ):
        # Generate unique command identifier
        self.command = str(uuid.uuid4())
        # Initialize the client connection
        self.client = pa.flight.connect(f"grpc://{arrow_host}:{arrow_port}")
        # Put API parameters
        self._put_parameters(feature_refs, entity_df)

    def _put_parameters(self, feature_refs, entity_df):
        record_batch_entity = pa.Table.from_pandas(entity_df)
        flight_info_entity = pa.flight.FlightDescriptor.for_command(self.command)
        writer, _ = self.client.do_put(flight_info_entity,
                                       record_batch_entity.schema.with_metadata({
                                           'command': self.command, 
                                           'api': 'get_historical_features', 
                                           'param': 'entity_df'}))
        writer.write_table(record_batch_entity)
        writer.close()

        features_array = pa.array(feature_refs)
        features_batch = pa.RecordBatch.from_arrays([features_array], ['features'])
        writer, _ = self.client.do_put(flight_info_entity,
                                       features_batch.schema.with_metadata({
                                           'command': self.command, 
                                           'api': 'get_historical_features', 
                                           'param': 'features'}))
        writer.write_batch(features_batch)
        writer.close()

    # Invoked to realize the Pandas DataFrame
    def _to_df_internal(self, timeout: Optional[int] = None) -> pd.DataFrame:
        # We use arrow format because it gives better control of the table schema
        return self._to_arrow_internal().to_pandas()

    # Invoked to synchronously execute the underlying query and return the result as an arrow table
    # This is where do_get service is invoked
    def _to_arrow_internal(self, timeout: Optional[int] = None) -> pa.Table:
        upload_descriptor = pa.flight.FlightDescriptor.for_command(self.command)
        flight = self.client.get_flight_info(upload_descriptor)
        ticket = flight.endpoints[0].ticket

        reader = self.client.do_get(ticket)
        return reader.read_all()

    @property
    def on_demand_feature_views(self) -> List[OnDemandFeatureView]:
        return []


class RemoteOfflineStore(OfflineStore):
    def __init__(
        self,
        arrow_host,
        arrow_port
    ):
        self.arrow_host = arrow_host
        self.arrow_port = arrow_port

    @log_exceptions_and_usage(offline_store="remote")
    def get_historical_features(
        self,
        config: RepoConfig,
        feature_views: List[FeatureView],
        feature_refs: List[str],
        entity_df: Union[pd.DataFrame, str],
        registry: Registry = None,
        project: str = '',
        full_feature_names: bool = False,
    ) -> RemoteRetrievalJob:
        return RemoteRetrievalJob(self.arrow_host, self.arrow_port, feature_refs, entity_df)

## Testing the get_historical_features API

Create the `RemoteOfflineStore` instance to interact with the Arrow Flight server

In [6]:
arrow_host="0.0.0.0"
arrow_port=8815
offlineStore = RemoteOfflineStore(arrow_host=arrow_host, arrow_port=arrow_port)

Verify there are no pending flights before we begin

In [7]:
test_client = pa.flight.connect(f"grpc://{arrow_host}:{arrow_port}")
flights = test_client.list_flights()
size = len(list(flights))
assert size == 0, f"Found {size} existing flights, instead of none"

Invoke the `get_historical_features` API on the `OfflineStore` implementation

In [12]:
%%time
import pandas as pd

# Create the features list
historical_df = pd.DataFrame()
features = [f"mnist:feature_{i+1}" for i in range(28)]
features.append("mnist:number")

# Fetch in chuncks of 5000 records
batch_size = 5000
offset = 0
while offset < len(entity_df):
    end_index = min(len(entity_df), offset + batch_size)
    print(f"Fetching rows from {offset} to {end_index}")
    batch_entity_df = pd.DataFrame.from_dict(
        {
            "image_id": entity_df['image_id'][offset: end_index],
            "event_timestamp": entity_df['event_timestamp'][offset: end_index],
        }
    )

    offset += batch_size
    # TODO adjust feature_refs and fetch feature_views from repo
    retrievalJob = offlineStore.get_historical_features(
        config=None,
        feature_views=[],
        entity_df=batch_entity_df,
        feature_refs=features,
    )
    
    training_df = retrievalJob.to_df()
    historical_df = pd.concat([historical_df, training_df], ignore_index=True)
    # break

historical_df.head()

Fetching rows from 0 to 5000
Fetching rows from 5000 to 10000
Fetching rows from 10000 to 15000
Fetching rows from 15000 to 20000
Fetching rows from 20000 to 25000
Fetching rows from 25000 to 30000
Fetching rows from 30000 to 35000
Fetching rows from 35000 to 40000
Fetching rows from 40000 to 45000
Fetching rows from 45000 to 50000
Fetching rows from 50000 to 55000
Fetching rows from 55000 to 60000
Fetching rows from 60000 to 65000
Fetching rows from 65000 to 70000
CPU times: user 1.4 s, sys: 362 ms, total: 1.77 s
Wall time: 18.3 s


Unnamed: 0,image_id,event_timestamp,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,number
0,2,2016-05-11 21:14:44,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...",...,"(0.0,0.0,0.0,0.0,0.0,0.4117647058823529,0.9882...","(0.0,0.0,0.0,0.0,0.0,0.9058823529411765,0.9882...","(0.0,0.0,0.0,0.0,0.0,0.8117647058823529,0.9882...","(0.0,0.0,0.0,0.0,0.0,0.050980392156862744,0.36...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...",2
1,1,2016-05-12 00:14:44,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...",...,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...",1
2,1,2016-05-12 06:14:44,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...",...,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...",1
3,7,2016-05-12 07:14:44,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...",...,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.160...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1568627...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6470588...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.16862745098...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.53725490196...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26274509803...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...",7
4,6,2016-05-12 10:14:44,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...",...,"(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2627450...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0980392...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...","(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...",6


In [9]:
assert len(historical_df) == 70000, f"Found {len(historical_df)} instead of 70000"

Verify there are no pending flights after the test

In [10]:
flights = test_client.list_flights()
size = len(list(flights))
assert size == 0, f"Found {size} existing flights, instead of none"

### Additional validations

In [11]:
actions = test_client.list_actions()
assert len(actions) == 0, f"Found {len(actions)} existing flights, instead of none"