In [2]:
!pip install --upgrade pip
!pip install -q -r requirements.txt



## Prerequisites
From previous notebook at [feast_showcase_notebook](../feast_showcase_notebook/):
* Install PSQL DB 
* Install Feature server

## Collect MNIST data

MNIST contains a collection of 70,000 images (28 x 28 pixels each) of handwritten digits from 0 to 9.
1. Since Feast does not support storage of list of lists, we model each image as a list of 28 features, each one containing a list of 28 values.
1. Feast allows users to build a training dataset from time-series feature data, so we need to add an arbitrary timestamp (`ts`) to each row.

Load the MNIST dataset and normalize the pixel values to floats between 0 and 1.

In [3]:
from tensorflow import keras

# Load the MNIST train/test split through the Keras dataset helper.
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# Rescale the pixel values so they become floats between 0 and 1.
x_train, x_test = x_train / 255, x_test / 255

print(
    f"x_train: {x_train.shape}",
    f"y_train: {y_train.shape}",
    f"x_test: {x_test.shape}",
    f"y_test: {y_test.shape}",
    sep="\n",
)

# Sanity-check the split sizes: 60,000 training and 10,000 test samples.
assert x_train.shape[0] == 60000
assert y_train.shape[0] == 60000
assert x_test.shape[0] == 10000
assert y_test.shape[0] == 10000

2024-04-10 20:00:51.986690: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


x_train: (60000, 28, 28)
y_train: (60000,)
x_test: (10000, 28, 28)
y_test: (10000,)


## Prepare data

Aggregate training and test data

In [4]:
import numpy as np

# Stack the training samples followed by the test samples along the first axis,
# keeping images (x) and labels (y) in the same order.
x_data = np.concatenate([x_train, x_test], axis=0)
y_data = np.concatenate([y_train, y_test], axis=0)

print(f"x_data: {x_data.shape}", f"y_data: {y_data.shape}", sep="\n")

# The combined dataset must hold all 70,000 samples.
assert x_data.shape[0] == 70000
assert y_data.shape[0] == 70000

x_data: (70000, 28, 28)
y_data: (70000,)


Generate DataFrame with image_id from 1 to data length and the given prediction labels (y_data)

In [5]:
import pandas as pd

# One row per image: a 1-based image_id and the digit label taken from y_data.
# y_data is converted to a plain Python list before it is handed to pandas.
image_ids = pd.DataFrame({'image_id': list(range(1, len(x_data) + 1)), 'number': y_data.tolist()})

assert len(image_ids)==70000

# Check both ends of the id range; iloc[-1] addresses the last row without
# hard-coding its position.
first_id = image_ids['image_id'].iloc[0]
last_id = image_ids['image_id'].iloc[-1]
assert first_id==1, f"First value is not 1 but {first_id}"
assert last_id==70000, f"Last value is not 70000 but {last_id}"
image_ids.head()


Unnamed: 0,image_id,number
0,1,5
1,2,0
2,3,4
3,4,1
4,5,9


Generate arbitrary timestamp for all the feature rows in the dataset.
All the features have the same `ts`.

In [6]:
from datetime import datetime, timedelta

# Arbitrary (but deterministic for a given day) reference timestamp:
# 12:30:00 on the date ten days before today, shared by every feature row.
ts = datetime.now().replace(hour=12, minute=30, second=0, microsecond=0) - timedelta(days=10)
print(f'Selected timestamp is {ts}')

Selected timestamp is 2024-03-31 12:30:00


In [7]:
# Event-timestamp column: every image gets the same `ts` value, one entry per image.
# (To spread the rows over time instead, the column could be built with
# pd.date_range(end=..., periods=len(x_data), freq='10min').)
timestamps = pd.DataFrame(data=[ts] * len(x_data), columns=["ts"])

# The column must line up row-for-row with image_ids before the frames are joined.
assert len(timestamps)==len(image_ids)
timestamps.head()

Unnamed: 0,ts
0,2024-03-31 12:30:00
1,2024-03-31 12:30:00
2,2024-03-31 12:30:00
3,2024-03-31 12:30:00
4,2024-03-31 12:30:00


In [8]:
# Creation-timestamp column: every image gets the same `ts` value, one entry per image.
# (To spread the rows over time instead, the column could be built with
# pd.date_range(end=..., periods=len(x_data), freq='10min').)
creations = pd.DataFrame(data=[ts] * len(x_data), columns=["created"])

# The column must line up row-for-row with image_ids before the frames are joined.
assert len(creations)==len(image_ids)
creations.head()

Unnamed: 0,created
0,2024-03-31 12:30:00
1,2024-03-31 12:30:00
2,2024-03-31 12:30:00
3,2024-03-31 12:30:00
4,2024-03-31 12:30:00


Combine the image ids and labels with the `ts` and `created` timestamp columns

In [9]:
# Join the three frames side by side (column-wise) into a single table.
frames = [image_ids, timestamps, creations]
images = pd.concat(frames, axis=1)
images.head()

Unnamed: 0,image_id,number,ts,created
0,1,5,2024-03-31 12:30:00,2024-03-31 12:30:00
1,2,0,2024-03-31 12:30:00,2024-03-31 12:30:00
2,3,4,2024-03-31 12:30:00,2024-03-31 12:30:00
3,4,1,2024-03-31 12:30:00,2024-03-31 12:30:00
4,5,9,2024-03-31 12:30:00,2024-03-31 12:30:00


Collect the features from the original dataset: for every image, `feature_N` holds the 28 values of the image's N-th row

In [10]:
# Add one column per image row: feature_<k> holds, for every image, the 1-D array
# of values found in row k-1 of that image.
# x_data[:, feature_id] selects that row for all images in a single NumPy slice;
# list() then splits it into one array per image, which is the same content the
# per-image Python loop produced.
for feature_id in range(x_data.shape[1]):
    images[f"feature_{feature_id+1}"] = list(x_data[:, feature_id])

In [11]:
# Summarise the assembled frame: column names, non-null counts and dtypes.
images.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 32 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   image_id    70000 non-null  int64         
 1   number      70000 non-null  int64         
 2   ts          70000 non-null  datetime64[ns]
 3   created     70000 non-null  datetime64[ns]
 4   feature_1   70000 non-null  object        
 5   feature_2   70000 non-null  object        
 6   feature_3   70000 non-null  object        
 7   feature_4   70000 non-null  object        
 8   feature_5   70000 non-null  object        
 9   feature_6   70000 non-null  object        
 10  feature_7   70000 non-null  object        
 11  feature_8   70000 non-null  object        
 12  feature_9   70000 non-null  object        
 13  feature_10  70000 non-null  object        
 14  feature_11  70000 non-null  object        
 15  feature_12  70000 non-null  object        
 16  feature_13  70000 non-

In [12]:
# Preview the first rows, now including the feature_* columns.
images.head()

Unnamed: 0,image_id,number,ts,created,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,...,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28
0,1,5,2024-03-31 12:30:00,2024-03-31 12:30:00,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.090...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0705882352941...","[0.0, 0.0, 0.0, 0.0, 0.21568627450980393, 0.67...","[0.0, 0.0, 0.0, 0.0, 0.5333333333333333, 0.992...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2,0,2024-03-31 12:30:00,2024-03-31 12:30:00,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3333333333333...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3372549019607...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3333333333333...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3333333333333...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1098039215686...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.09803921...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,3,4,2024-03-31 12:30:00,2024-03-31 12:30:00,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,4,1,2024-03-31 12:30:00,2024-03-31 12:30:00,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.188...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.250...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.250...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.094...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,5,9,2024-03-31 12:30:00,2024-03-31 12:30:00,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## Persist data in PSQL DB

In [13]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

import os

# PostgreSQL connection settings.
# Each value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook; the defaults are the
# values used by the demo deployment.
psqlHost = os.environ.get('PSQL_HOST', 'postgresql.feast.svc.cluster.local')
psqlPort = int(os.environ.get('PSQL_PORT', '5432'))
psqlUsername = os.environ.get('PSQL_USERNAME', 'feast')
psqlPassword = os.environ.get('PSQL_PASSWORD', 'feast')
psqlDb = os.environ.get('PSQL_DB', 'feast')
psqlSchema = os.environ.get('PSQL_SCHEMA', 'feast')

# Table that receives the MNIST rows.
mnistTableName = 'mnist_source'

In [14]:
from psycopg2.extensions import register_adapter, AsIs

def addapt_numpy_array(numpy_array):
    """Adapt a NumPy array so psycopg2 can send it as a query parameter.

    The array is converted to a tuple and wrapped in ``AsIs``, so the tuple's
    string form is placed into the SQL statement as-is, without quoting.
    """
    # NOTE(review): this relies on str() of a tuple of NumPy scalars being valid SQL.
    # NumPy 2 changed the scalar repr (e.g. ``np.float64(0.0)``) — confirm against
    # the NumPy version pinned in requirements.txt.
    return AsIs(tuple(numpy_array))

# Use the adapter above for every np.ndarray passed to psycopg2 as a parameter.
register_adapter(np.ndarray, addapt_numpy_array)


**Set this variable to `true` to skip the DB storage in case the table is already populated**

In [15]:
# Flag read by the persistence cell: set it to true to skip writing the images to the DB.
%env SKIP_IMAGE_COPY=false

env: SKIP_IMAGE_COPY=false


In [22]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, text

# A missing variable means "do not skip"; the comparison ignores case and
# surrounding whitespace so that 'True' / 'TRUE' behave like 'true'.
skip_image_copy = os.environ.get('SKIP_IMAGE_COPY', 'false').strip().lower() == 'true'

if skip_image_copy:
    print("Skipping storage as per configured variable SKIP_IMAGE_COPY")
else:
    # One engine (and connection pool) is shared by the DROP statement and by
    # every chunk insert, instead of creating a new engine per chunk.
    # User name and password are URL-escaped so special characters cannot
    # corrupt the connection URL.
    engine = create_engine(
        f'postgresql+psycopg2://{quote_plus(psqlUsername)}:{quote_plus(psqlPassword)}'
        f'@{psqlHost}:{str(psqlPort)}/{psqlDb}'
    )

    def persist(from_idx, to_idx, batch_size=5000):
        """Append the rows ``images[from_idx:to_idx]`` to the MNIST source table.

        Args:
            from_idx: Position of the first row to write (inclusive).
            to_idx: Position after the last row to write (exclusive); values
                beyond the end of ``images`` are clamped to ``len(images)``.
            batch_size: Maximum number of rows sent per ``to_sql`` call.

        The DataFrame index is written as well (``index=True``), and rows are
        appended to the table if it already exists.
        """
        to_idx = min(to_idx, len(images))
        print(f"Persist images from {from_idx} to {to_idx}")
        for start in range(from_idx, to_idx, batch_size):
            stop = min(start + batch_size, to_idx)
            print(f"Persist chunk from {start} to {stop}")
            images.iloc[start:stop].to_sql(mnistTableName, engine, if_exists='append', index=True)

    try:
        # Start from an empty table. engine.begin() commits on success, and text()
        # is required for raw SQL strings on SQLAlchemy 2.x (and accepted on 1.x).
        # The table name is a notebook constant; identifiers cannot be bound
        # parameters, hence the f-string.
        with engine.begin() as conn:
            conn.execute(text(f'DROP TABLE IF EXISTS {mnistTableName}'))

        # Step by block start so a trailing partial block is written too,
        # rather than being dropped by an integer division.
        block_size = 10000
        for block_start in range(0, len(images), block_size):
            persist(block_start, block_start + block_size)
    finally:
        # Release the pooled connections once the copy has finished or failed.
        engine.dispose()


Persist images from 0 to 10000
Persist chunk from 0 to 5000
Persist chunk from 5000 to 10000
Persist images from 10000 to 20000
Persist chunk from 10000 to 15000
Persist chunk from 15000 to 20000
Persist images from 20000 to 30000
Persist chunk from 20000 to 25000
Persist chunk from 25000 to 30000
Persist images from 30000 to 40000
Persist chunk from 30000 to 35000
Persist chunk from 35000 to 40000
Persist images from 40000 to 50000
Persist chunk from 40000 to 45000
Persist chunk from 45000 to 50000
Persist images from 50000 to 60000
Persist chunk from 50000 to 55000
Persist chunk from 55000 to 60000
Persist images from 60000 to 70000
Persist chunk from 60000 to 65000
Persist chunk from 65000 to 70000
