In [19]:
!pip install --upgrade pip
!pip install -q -r requirements.txt



## Prerequisites
From previous notebook at [feast_showcase_notebook](../feast_showcase_notebook/):
* Install PSQL DB
    * Update the deployment configuration to extend the memory limit to `1Gi`
* Install Feature server

## Collect MNIST data

MNIST contains a collection of 70,000, 28 x 28 images of handwritten digits from 0 to 9.
1. Since Feast does not support storage of list of lists, we model each image as a list of 28 features, each one containing a list of 28 values.
1. Feast allows users to build a training dataset from time-series feature data, so we need to add an arbitrary ts to each row
1. We use the number associated to each image as the entity key, e.g. the `image_id` field, so that every new image can have multiple
values/images associated with it over time


Load MNIST dataset and normalize it as float from 0 to 1

In [36]:
from tensorflow import keras
(x_train,y_train),(x_test,y_test) = keras.datasets.mnist.load_data()

x_train = x_train/255
x_test = x_test/255

print(f"x_train: {x_train.shape}")
print(f"y_train: {y_train.shape}")
print(f"x_test: {x_test.shape}")
print(f"y_test: {y_test.shape}")
assert len(x_train)==60000
assert len(y_train)==60000
assert len(x_test)==10000
assert len(y_test)==10000

x_train: (60000, 28, 28)
y_train: (60000,)
x_test: (10000, 28, 28)
y_test: (10000,)


## Data preparation

Aggregate training and test data

In [21]:
import numpy as np

x_data = np.concatenate((x_train, x_test))
y_data = np.concatenate((y_train, y_test))
print(f"x_data: {x_data.shape}")
print(f"y_data: {y_data.shape}")
assert len(x_data)==70000
assert len(y_data)==70000

x_data: (70000, 28, 28)
y_data: (70000,)


Generate DataFrame with image_id from 0 to 9, equal to the prediction score, and the prediction label `number` (y_data)

In [22]:
import pandas as pd
#y_data must be converted to list
image_ids = pd.DataFrame({'image_id': y_data.tolist(), 'number': y_data.tolist()})

assert len(image_ids)==70000

Generate arbitrary timestamp for all the feature rows in the dataset.
All the features have the same `ts`.

In [38]:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
from datetime import datetime
# Create time series: one entry every 10s, up to now
timestamps = pd.date_range(
    end=pd.Timestamp.now().replace(microsecond=0), 
    periods=len(x_data), 
    freq='1H').to_frame(name="ts", index=False)

timestamps['created'] = timestamps['ts']
print(f"Generated time series from {timestamps['ts'].min()} to {timestamps['ts'].max()}")
assert len(timestamps)==70000
timestamps.head()

Generated time series from 2016-04-24 15:35:13 to 2024-04-19 06:35:13


Unnamed: 0,ts,created
0,2016-04-24 15:35:13,2016-04-24 15:35:13
1,2016-04-24 16:35:13,2016-04-24 16:35:13
2,2016-04-24 17:35:13,2016-04-24 17:35:13
3,2016-04-24 18:35:13,2016-04-24 18:35:13
4,2016-04-24 19:35:13,2016-04-24 19:35:13


Add timestamp column

In [39]:
images = pd.concat(objs=[image_ids, timestamps], axis=1)
images.head()

Unnamed: 0,image_id,number,ts,created
0,5,5,2016-04-24 15:35:13,2016-04-24 15:35:13
1,0,0,2016-04-24 16:35:13,2016-04-24 16:35:13
2,4,4,2016-04-24 17:35:13,2016-04-24 17:35:13
3,1,1,2016-04-24 18:35:13,2016-04-24 18:35:13
4,9,9,2016-04-24 19:35:13,2016-04-24 19:35:13


Collect features from original dataset by reverting rows and columns

In [40]:
for feature_id in range(28):
    feature = [x_data[image_id][feature_id] for image_id in range(len(x_data))]
    images[f"feature_{feature_id+1}"] = feature

In [41]:
images.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 32 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   image_id    70000 non-null  int64         
 1   number      70000 non-null  int64         
 2   ts          70000 non-null  datetime64[ns]
 3   created     70000 non-null  datetime64[ns]
 4   feature_1   70000 non-null  object        
 5   feature_2   70000 non-null  object        
 6   feature_3   70000 non-null  object        
 7   feature_4   70000 non-null  object        
 8   feature_5   70000 non-null  object        
 9   feature_6   70000 non-null  object        
 10  feature_7   70000 non-null  object        
 11  feature_8   70000 non-null  object        
 12  feature_9   70000 non-null  object        
 13  feature_10  70000 non-null  object        
 14  feature_11  70000 non-null  object        
 15  feature_12  70000 non-null  object        
 16  feature_13  70000 non-

In [42]:
images.head()

Unnamed: 0,image_id,number,ts,created,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,...,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28
0,5,5,2016-04-24 15:35:13,2016-04-24 15:35:13,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.090...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0705882352941...","[0.0, 0.0, 0.0, 0.0, 0.21568627450980393, 0.67...","[0.0, 0.0, 0.0, 0.0, 0.5333333333333333, 0.992...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,0,0,2016-04-24 16:35:13,2016-04-24 16:35:13,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3333333333333...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3372549019607...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3333333333333...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3333333333333...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1098039215686...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.09803921...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,4,4,2016-04-24 17:35:13,2016-04-24 17:35:13,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1,1,2016-04-24 18:35:13,2016-04-24 18:35:13,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.188...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.250...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.250...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.094...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,9,9,2016-04-24 19:35:13,2016-04-24 19:35:13,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## Persist data in PSQL DB

In [43]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

psqlHost = 'postgresql.feast.svc.cluster.local'
psqlPort = 5432
psqlUsername = 'feast'
psqlPassword = 'feast'
psqlDb = 'feast'
psqlSchema = 'feast'

mnistTableName = 'mnist_source'
mnistPushTableName = 'mnist_push_source'

In [44]:
from psycopg2.extensions import register_adapter, AsIs

def addapt_numpy_array(numpy_array):
    return AsIs(tuple(numpy_array))

register_adapter(np.ndarray, addapt_numpy_array)


**Set this variable to `true` to skip the DB storage in case the table is already populated**

In [45]:
%env SKIP_IMAGE_COPY=false

env: SKIP_IMAGE_COPY=false


In [46]:
import pandas as pd
import os
from sqlalchemy import create_engine
from sqlalchemy.exc import ProgrammingError

if os.environ['SKIP_IMAGE_COPY'] == 'true':
    print("Skipping storage as per configured variable SKIP_IMAGE_COPY")
else:
    def persist(from_idx, to_idx):
        print(f"Persist images from {from_idx} to {to_idx}")
        data = images[from_idx: min(to_idx, len(images))]
        size = len(data)
        
        batch_size = 5000
        for i in range(0, size, batch_size):
            engine = create_engine(f'postgresql+psycopg2://{psqlUsername}:{psqlPassword}@{psqlHost}:{str(psqlPort)}/{psqlDb}')
            chunk = data.iloc[i:i+batch_size]
            print(f"Persist chunk from {from_idx+i} to {from_idx+(i+batch_size)}")
            chunk.to_sql(mnistTableName, engine, if_exists='append', index=True, schema=psqlSchema)
    
    engine = create_engine(f'postgresql+psycopg2://{psqlUsername}:{psqlPassword}@{psqlHost}:{str(psqlPort)}/{psqlDb}')
    with engine.connect() as conn:
        try:
            conn.execute(f'CREATE SCHEMA {psqlSchema}')
        except ProgrammingError as e:
            print(f"An error occurred while creating schema {psqlSchema}: {e}")
        conn.execute(f'DROP TABLE IF EXISTS {mnistTableName}')

    block_size = 10000
    for i in range(int(len(images)/block_size)):
        persist(i*block_size, (i+1) * block_size)


An error occurred while creating schema feast: (psycopg2.errors.DuplicateSchema) schema "feast" already exists

[SQL: CREATE SCHEMA feast]
(Background on this error at: https://sqlalche.me/e/14/f405)
Persist images from 0 to 10000
Persist chunk from 0 to 5000
Persist chunk from 5000 to 10000
Persist images from 10000 to 20000
Persist chunk from 10000 to 15000
Persist chunk from 15000 to 20000
Persist images from 20000 to 30000
Persist chunk from 20000 to 25000
Persist chunk from 25000 to 30000
Persist images from 30000 to 40000
Persist chunk from 30000 to 35000
Persist chunk from 35000 to 40000
Persist images from 40000 to 50000
Persist chunk from 40000 to 45000
Persist chunk from 45000 to 50000
Persist images from 50000 to 60000
Persist chunk from 50000 to 55000
Persist chunk from 55000 to 60000
Persist images from 60000 to 70000
Persist chunk from 60000 to 65000
Persist chunk from 65000 to 70000


Verify row count

In [47]:
from sqlalchemy import create_engine

engine = create_engine(f'postgresql+psycopg2://{psqlUsername}:{psqlPassword}@{psqlHost}:{str(psqlPort)}/{psqlDb}')
row_count = engine.execute(f"SELECT COUNT(*) FROM {mnistTableName}").scalar()

assert row_count==70000, f"Row count is not 70000 but {row_count}"
print(f"Persisted {row_count} in {mnistTableName}")

Persisted 70000 in mnist_source


## Create table for push data source

This step is requested to setup the Feast repository.

Create an empty DataFrame for the online store.

In [33]:
online_df = images.copy()
online_df.drop(columns=['number'], inplace=True)
online_df.drop(online_df.index, inplace=True)

assert len(online_df) == 0
online_df.head()

Unnamed: 0,image_id,ts,created,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28


In [34]:
online_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 31 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   image_id    0 non-null      int64         
 1   ts          0 non-null      datetime64[ns]
 2   created     0 non-null      datetime64[ns]
 3   feature_1   0 non-null      object        
 4   feature_2   0 non-null      object        
 5   feature_3   0 non-null      object        
 6   feature_4   0 non-null      object        
 7   feature_5   0 non-null      object        
 8   feature_6   0 non-null      object        
 9   feature_7   0 non-null      object        
 10  feature_8   0 non-null      object        
 11  feature_9   0 non-null      object        
 12  feature_10  0 non-null      object        
 13  feature_11  0 non-null      object        
 14  feature_12  0 non-null      object        
 15  feature_13  0 non-null      object        
 16  feature_14  0 non-null      object    

In [35]:
engine = create_engine(f'postgresql+psycopg2://{psqlUsername}:{psqlPassword}@{psqlHost}:{str(psqlPort)}/{psqlDb}')
online_df.to_sql(mnistPushTableName, engine, if_exists='append', index=True, schema=psqlSchema)

0