# dataset
Generate the dataset for the recommendation system problem.

In [5]:
# Run the dataset generation script; the output below shows the samples of the
# users, items and interactions tables that it prints.
!python3 dataset_gen.py

Users sample:
   user_id  age gender signup_date          preferences
0        1   56      F  2023-09-28  Home,Books,Clothing
1        2   36      M  2023-03-29          Electronics
2        3   19      F  2023-10-21       Clothing,Books
3        4   39      M  2023-12-11          Electronics
4        5   45  Other  2023-09-28    Electronics,Books

Items sample:
   item_id  category  subcategory  ...  new_arrival  on_sale        arrival_date
0        1    Sports  Team Sports  ...        False    False 2023-03-13 04:48:00
1        2      Home      Kitchen  ...        False     True 2023-08-31 09:14:00
2        3     Books      Science  ...        False    False 2023-01-02 12:40:00
3        4  Clothing  Accessories  ...        False    False 2023-05-08 06:41:00
4        5    Sports  Team Sports  ...        False    False 2023-05-27 08:28:00

[5 rows x 10 columns]

Interactions sample:
   interaction_id  user_id  item_id  ... interaction_type rating  quantity
0               1      382   

# Setup Feature Store

We first preview the pending changes with `feast plan`, then run the `feast apply` command to register the feature definitions from `feature_repo/`.

In [6]:
# Preview the registry/infrastructure changes for the repo in `feature_repo/`.
# `&&` (rather than `;`) stops the line if the `cd` fails, so feast is never
# run from the wrong directory.
!cd feature_repo/ && feast plan

[1m[94mNo changes to registry
[1m[94mNo changes to infrastructure


In [7]:
# Register the feature definitions of the repo in `feature_repo/`.
# `&&` (rather than `;`) stops the line if the `cd` fails, so `feast apply`
# can never modify a registry from the wrong directory.
!cd feature_repo/ && feast apply

[1m[94mNo changes to registry
[1m[94mNo changes to infrastructure


In [8]:
from feast import FeatureStore
from datetime import datetime, timedelta

# Open the feature repository located in `feature_repo/` (the same directory
# the `feast plan` / `feast apply` cells change into).
store = FeatureStore(repo_path="feature_repo/")

# Generating datasets using Feast

In [9]:
from feast import FeatureService
import pandas as pd
from itertools import product

# Look up the three feature services by name (item, user, interaction — in that order).
item_service, user_service, interaction_service = (
    store.get_feature_service(service_name)
    for service_name in ("item_service", "user_service", "interaction_service")
)

# Entity keys to fetch features for. range() excludes its stop value, so these
# cover user ids 1..999 and item ids 1..4999.
# NOTE(review): confirm this matches the ids produced by dataset_gen.py.
user_ids = list(range(1, 1_000))
item_ids = list(range(1, 5_000))

# Every entity row is looked up at this single point in time.
snapshot_ts = datetime(2025, 1, 1)

# Entity frames: one key column plus the lookup timestamp.
item_entity_df = pd.DataFrame(
    {'item_id': item_ids, 'timestamp': [snapshot_ts] * len(item_ids)}
)
user_entity_df = pd.DataFrame(
    {'user_id': user_ids, 'timestamp': [snapshot_ts] * len(user_ids)}
)
# Interaction entity rows are read from parquet; the `timestamp` column is
# set (or overwritten) with the same snapshot value.
item_user_interactions_df = pd.read_parquet(
    './feature_repo/data/interactions_item_user_ids.parquet'
).assign(timestamp=snapshot_ts)

# Retrieve the training datasets: one historical-feature query per service.
item_df = store.get_historical_features(
    features=item_service,
    entity_df=item_entity_df,
).to_df()
user_df = store.get_historical_features(
    features=user_service,
    entity_df=user_entity_df,
).to_df()
interaction_df = store.get_historical_features(
    features=interaction_service,
    entity_df=item_user_interactions_df,
).to_df()

Using timestamp as the event timestamp. To specify a column explicitly, please name it event_timestamp.
Using timestamp as the event timestamp. To specify a column explicitly, please name it event_timestamp.
Using timestamp as the event timestamp. To specify a column explicitly, please name it event_timestamp.


# Training

In [None]:
from models import (
    ItemEncoder,
    UserEncoder,
    TwoTowerModel,
    train_two_tower,
)

# Size argument passed to both encoders.
# NOTE(review): presumably the embedding width — confirm in models.py.
dim = 512

# Build the two towers (item first, then user) and combine them into one model.
item_encoder = ItemEncoder(dim)
user_encoder = UserEncoder(dim)
two_tower_model = TwoTowerModel(
    user_encoder=user_encoder,
    item_encoder=item_encoder,
)

# Train on the item, user and interaction frames retrieved from the feature store.
train_two_tower(two_tower_model, item_df, user_df, interaction_df)

# Batch scoring
Encode the item and user vector representations.

In [None]:
# Encode every item row and every user row with its tower and store the result
# in an `embedding` column on the corresponding frame.
embeded_items = item_encoder(item_df)
item_df['embedding'] = embeded_items
embeded_users = user_encoder(user_df)
user_df['embedding'] = embeded_users

# Push each frame to its matching push source: items to the item source,
# users to the user source.
# NOTE(review): `FeatureStore.push` writes to the online store only unless a
# `to=` push mode is passed — confirm whether the offline store must be
# written as well.
# NOTE(review): if the push sources declare only the key and `embedding`
# columns, push `item_df[['item_id', 'embedding']]` and
# `user_df[['user_id', 'embedding']]` instead of the full frames.
store.push('item_embed_push_source', item_df)
store.push('user_embed_push_source', user_df)

# Materialize
Materialization generates the latest values for each entity key in the online store and creates a time-based index to enhance retrieval speed.
The `materialize-incremental` command loads the data from the offline store into the online store on its first run and, on subsequent runs, ingests only the data that arrived since the previous run.

In [None]:
# The first argument of `materialize_incremental` is the END of the window;
# Feast works out the start for each feature view itself. Use the current time
# so the recent data falls inside the window (an end date five years in the
# past would exclude it).
store.materialize_incremental(
    end_date=datetime.now(),
    feature_views=['item_embedding', 'user_embedding'],
)

# Inferencing

## Existing User Case

## New User Case


In [None]:
from feast import FeatureStore
import pandas as pd

# NOTE(review): this cell rebinds `store` to repo_path="." (earlier cells use
# "feature_repo/") and queries `driver_id` / "model_v2", neither of which
# appears anywhere else in this notebook — it looks like placeholder code;
# confirm before relying on it.
store = FeatureStore(repo_path=".")

# Get the latest feature values for unique entities
driver_ids = [1001, 1002, 1003, 1004, 1005]
entity_df = pd.DataFrame({"driver_id": driver_ids}).assign(
    event_timestamp=pd.to_datetime("now", utc=True)
)
model_v2_service = store.get_feature_service("model_v2")
training_df = store.get_historical_features(
    features=model_v2_service,
    entity_df=entity_df,
).to_df()

# Make batch predictions
# predictions = model.predict(training_df)
print(training_df)
