# dataset
Generate the dataset for the recommendation system problem.

In [1]:
# Run the dataset generator script in a shell subprocess; its printed output
# (samples of the users, items and interactions tables) appears below.
!python3 dataset_gen.py

Users sample:
   user_id  age gender signup_date          preferences
0        1   56      F  2023-09-28  Home,Books,Clothing
1        2   36      M  2023-03-29          Electronics
2        3   19      F  2023-10-21       Clothing,Books
3        4   39      M  2023-12-11          Electronics
4        5   45  Other  2023-09-28    Electronics,Books

Items sample:
   item_id  category  subcategory  ...  new_arrival  on_sale        arrival_date
0        1    Sports  Team Sports  ...        False    False 2023-03-13 04:48:00
1        2      Home      Kitchen  ...        False     True 2023-08-31 09:14:00
2        3     Books      Science  ...        False    False 2023-01-02 12:40:00
3        4  Clothing  Accessories  ...        False    False 2023-05-08 06:41:00
4        5    Sports  Team Sports  ...        False    False 2023-05-27 08:28:00

[5 rows x 10 columns]

Interactions sample:
   interaction_id  user_id  item_id  ... interaction_type rating  quantity
0               1      382   

# Setup Feature Store

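We first run `feast plan` to preview the pending changes, then run the `feast apply` command to register the feature definitions (entities, feature views and feature services) with the feature store.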

In [72]:
!cd feature_repo/ ; feast plan

  DUMMY_ENTITY = Entity(
Traceback (most recent call last):
  File "/home/ikatav/miniconda3/envs/we/bin/feast", line 8, in <module>
    sys.exit(cli())
             ^^^^^
  File "/home/ikatav/miniconda3/envs/we/lib/python3.11/site-packages/click/core.py", line 1161, in __call__
    return self.main(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ikatav/miniconda3/envs/we/lib/python3.11/site-packages/click/core.py", line 1082, in main
    rv = self.invoke(ctx)
         ^^^^^^^^^^^^^^^^
  File "/home/ikatav/miniconda3/envs/we/lib/python3.11/site-packages/click/core.py", line 1697, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ikatav/miniconda3/envs/we/lib/python3.11/site-packages/click/core.py", line 1443, in invoke
    return ctx.invoke(self.callback, **ctx.params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ikatav/miniconda3/envs/we/lib/python3.11/s

In [73]:
!cd feature_repo/ ; feast apply 

  DUMMY_ENTITY = Entity(
No project found in the repository. Using project name feast_edb_rec_sys defined in feature_store.yaml
Applying changes for project feast_edb_rec_sys
  entity = cls(
  entity = cls(
  entity = cls(
Updated feature view [1m[33muser_embedding[0m
	entities: [1m[33m['__dummy'][0m -> [1m[92m['user'][0m
	features: [1m[33m[name: "user_id"
value_type: INT64
, name: "embedding"
value_type: FLOAT_LIST
][0m -> [1m[92m[name: "embedding"
value_type: FLOAT_LIST
][0m
	entity_columns: [1m[33m[name: "__dummy_id"
value_type: STRING
][0m -> [1m[92m[name: "user_id"
value_type: INT64
][0m
Updated feature view [1m[33mitem_embedding[0m
	entities: [1m[33m['__dummy'][0m -> [1m[92m['item'][0m
	features: [1m[33m[name: "item_id"
value_type: INT64
, name: "embedding"
value_type: FLOAT_LIST
][0m -> [1m[92m[name: "embedding"
value_type: FLOAT_LIST
][0m
	entity_columns: [1m[33m[name: "__dummy_id"
value_type: STRING
][0m -> [1m[92m[name: "item_id"
value

In [74]:
from feast import FeatureStore
from datetime import datetime, timedelta

# Handle to the Feast feature repository located under feature_repo/ (path is
# relative to the notebook's working directory). Later cells use `store` for
# historical retrieval, pushing embeddings and materialization.
store = FeatureStore(repo_path="feature_repo/")

# Generating datasets using Feast

In [75]:
from feast import FeatureService
import pandas as pd
from itertools import product
# Look up the three feature services by name (item, user, interaction).
item_service, user_service, interaction_service = (
    store.get_feature_service(service_name)
    for service_name in ("item_service", "user_service", "interaction_service")
)

# range() excludes its stop value, so these are ids 1..999 and 1..4999.
# NOTE(review): confirm the generated dataset has no user 1000 / item 5000
# that should also be included.
user_ids = [*range(1, 1_000)]
item_ids = [*range(1, 5_000)]

# Entity frames: one row per id, all stamped with the same retrieval time
# (2025-01-01). Feast reports that it uses the `timestamp` column as the
# event timestamp.
item_entity_df = pd.DataFrame(
    {
        'item_id': item_ids,
        'timestamp': [datetime(2025, 1, 1) for _item_id in item_ids],
    }
)
user_entity_df = pd.DataFrame(
    {
        'user_id': user_ids,
        'timestamp': [datetime(2025, 1, 1) for _user_id in user_ids],
    }
)

# The interaction entity rows come from a parquet file in the feature repo
# and get the same retrieval timestamp.
item_user_interactions_df = pd.read_parquet(
    './feature_repo/data/interactions_item_user_ids.parquet'
).assign(timestamp=datetime(2025, 1, 1))

# Retrieve the datasets used for training, one frame per feature service.
item_df = store.get_historical_features(
    entity_df=item_entity_df,
    features=item_service,
).to_df()
user_df = store.get_historical_features(
    entity_df=user_entity_df,
    features=user_service,
).to_df()
interaction_df = store.get_historical_features(
    entity_df=item_user_interactions_df,
    features=interaction_service,
).to_df()

Using timestamp as the event timestamp. To specify a column explicitly, please name it event_timestamp.
Using timestamp as the event timestamp. To specify a column explicitly, please name it event_timestamp.
Using timestamp as the event timestamp. To specify a column explicitly, please name it event_timestamp.


# Training

In [None]:
from models import ItemEncoder, UserEncoder, TwoTowerModel, train_two_tower
# `dim` is handed to both encoder constructors
# (presumably the embedding dimension — confirm in models.py).
dim = 512

# Build the item tower, then the user tower, and combine them into one model.
item_encoder, user_encoder = ItemEncoder(dim), UserEncoder(dim)
two_tower_model = TwoTowerModel(
    item_encoder=item_encoder,
    user_encoder=user_encoder,
)

# Train on the item, user and interaction frames retrieved from Feast above.
train_two_tower(two_tower_model, item_df, user_df, interaction_df)

# Batch scoring
Encode the items and users into their vector (embedding) representations.

In [8]:
# Encode items and users with the towers and attach the result to each frame
# as an `embedding` column. This needs `item_encoder` / `user_encoder` from the
# Training cell to exist in the kernel (otherwise: NameError).
embeded_items = item_encoder(item_df)
item_df['embedding'] = embeded_items
embeded_users = user_encoder(user_df)
user_df['embedding'] = embeded_users

# Push the new embeddings through the matching push source:
# items -> item_embed_push_source, users -> user_embed_push_source.
# (The two frames used to be swapped, sending item rows to the user source.)
# NOTE: without `to=`, store.push() writes to the online store only; pass
# to=PushMode.ONLINE_AND_OFFLINE (from feast.data_source) to also write offline.
store.push('item_embed_push_source', item_df)
store.push('user_embed_push_source', user_df)
# Alternative: push only the key and embedding columns.
# store.push('item_embed_push_source', item_df[['item_id', 'embedding']])
# store.push('user_embed_push_source', user_df[['user_id', 'embedding']])

NameError: name 'item_encoder' is not defined

In [76]:
from feast.data_source import PushMode
import numpy as np
# Placeholder run: the encoder calls are disabled and every row receives the
# same two-element dummy vector so the push path can be exercised.
# embeded_items = item_encoder(item_df)
# embeded_users = user_encoder(user_df)
item_df['embedding'] = [[1.1, 2.2]] * len(item_df.index)
user_df['embedding'] = [[1.1, 2.2]] * len(user_df.index)
# Only the item frame has its `timestamp` column overwritten with "now".
item_df['timestamp'] = datetime.now()

# Push the new embeddings to the offline and online store.
# NOTE(review): the recorded output of this cell is
#   KeyError: 'Field "arrival_date" does not exist in schema'
# i.e. a field named `arrival_date` is looked up and not found; presumably the
# pushed item frame has to carry that column — confirm against the
# definitions in feature_repo/.
store.push(
    'item_embed_push_source',
    item_df,
    to=PushMode.ONLINE_AND_OFFLINE,
)
store.push(
    'user_embed_push_source',
    user_df,
    to=PushMode.ONLINE_AND_OFFLINE,
)
# Alternative: push only the key and embedding columns.
# store.push('item_embed_push_source', item_df[['item_id', 'embedding']])
# store.push('user_embed_push_source', user_df[['user_id', 'embedding']])

KeyError: 'Field "arrival_date" does not exist in schema'

In [66]:
# Display the item frame (features plus the placeholder `embedding` column)
# to inspect which columns are being pushed.
item_df

Unnamed: 0,item_id,timestamp,category,subcategory,price,avg_rating,num_ratings,popular,new_arrival,on_sale,embedding
0,744,2025-01-01 00:00:00+00:00,Electronics,Laptops,495.84,4.9,72,False,False,False,"[1.1, 2.2]"
1,4636,2025-01-01 00:00:00+00:00,Electronics,Audio,257.67,2.1,580,True,False,False,"[1.1, 2.2]"
2,3553,2025-01-01 00:00:00+00:00,Home,Kitchen,11.10,4.6,653,True,False,False,"[1.1, 2.2]"
3,4752,2025-01-01 00:00:00+00:00,Clothing,Shoes,262.53,2.9,945,False,False,False,"[1.1, 2.2]"
4,4682,2025-01-01 00:00:00+00:00,Books,Fiction,352.68,3.6,228,False,False,False,"[1.1, 2.2]"
...,...,...,...,...,...,...,...,...,...,...,...
4994,3980,2025-01-01 00:00:00+00:00,Home,Kitchen,138.48,3.7,14,False,False,True,"[1.1, 2.2]"
4995,957,2025-01-01 00:00:00+00:00,Sports,Equipment,143.59,1.1,976,False,False,True,"[1.1, 2.2]"
4996,3939,2025-01-01 00:00:00+00:00,Books,Fiction,135.99,3.0,545,False,False,True,"[1.1, 2.2]"
4997,2689,2025-01-01 00:00:00+00:00,Home,Decor,162.31,4.3,580,False,False,False,"[1.1, 2.2]"


In [53]:
# Inspect the embedding column on its own (one list per item row).
item_df['embedding']

0       [1.1, 2.2]
1       [1.1, 2.2]
2       [1.1, 2.2]
3       [1.1, 2.2]
4       [1.1, 2.2]
           ...    
4994    [1.1, 2.2]
4995    [1.1, 2.2]
4996    [1.1, 2.2]
4997    [1.1, 2.2]
4998    [1.1, 2.2]
Name: embedding, Length: 4999, dtype: object

# Materialize
Materialization generates the latest values for each entity key in the online store and creates a time-based index to enhance retrieval speed.
The `materialize-incremental` command materializes the offline store initially and, on subsequent runs, ingests only new data and updates the store.

In [None]:
# Load the latest embedding values into the online store.
# materialize_incremental() takes the END of the window as its first argument
# (the start is tracked by Feast), so "now" is passed here. The previous value,
# now - 5 years, ended the window five years in the past.
# For an explicit look-back window use
# store.materialize(start_date=..., end_date=...) instead.
store.materialize_incremental(
    end_date=datetime.now(),
    feature_views=['item_embedding', 'user_embedding'],
)

# Inferencing

## Existing User Case

## New User Case


In [None]:
from feast import FeatureStore

# Use the same feature repo as the rest of the notebook. The earlier
# repo_path="." rebound `store` to a different location.
store = FeatureStore(repo_path="feature_repo/")

import pandas as pd

# Get the latest feature values for a handful of users. This cell used to
# reference the `driver_id` entity and `model_v2` service of the Feast
# quickstart template, which do not match this notebook's `user_id` entity
# and `user_service` feature service.
entity_df = pd.DataFrame.from_dict({"user_id": [1, 2, 3, 4, 5],})
entity_df["event_timestamp"] = pd.to_datetime("now", utc=True)
training_df = store.get_historical_features(
    entity_df=entity_df, features=store.get_feature_service("user_service"),
).to_df()

# Make batch predictions
# TODO: plug in the trained model once it is available in this kernel.
# predictions = model.predict(training_df)
print(training_df)
