In [9]:
import os
from datetime import datetime

import pandas as pd
from feast import FeatureStore

In [35]:
# Root of the Feast feature repository, relative to the notebook's working directory.
feature_store_path = os.path.join("feature_store", "feature_repo")
# Parquet snapshot with the raw transactions, stored under the repository's data folder.
raw_data_path = os.path.join(feature_store_path, "data", "fraud_snap.parquet")

### Check data

In [36]:
# Load the raw transaction snapshot into memory for a quick sanity check.
df = pd.read_parquet(path=raw_data_path)

In [37]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,transaction_id,customer_id,terminal_id,tx_amount,tx_time_seconds,tx_time_days,tx_fraud,tx_fraud_scenario,event_timestamp
0,754390469,580415,482,14.62,41591913,481,0,0,2020-12-15
1,850147586,714424,609,4.76,46881604,542,0,0,2021-02-14
2,99398570,456435,97,6.79,5480127,63,0,0,2019-10-24
3,1855892968,745104,474,50.32,102332522,1184,0,0,2022-11-18
4,988815484,239221,980,13.5,54561150,631,0,0,2021-05-14


In [13]:
# (row count, column count) of the loaded snapshot.
df.shape

(100000, 9)

In [38]:
# Column dtypes and non-null counts; entity frames built later cast customer_id to the dtype reported here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   transaction_id     100000 non-null  int32         
 1   customer_id        100000 non-null  int32         
 2   terminal_id        100000 non-null  int32         
 3   tx_amount          100000 non-null  float32       
 4   tx_time_seconds    100000 non-null  int32         
 5   tx_time_days       100000 non-null  int32         
 6   tx_fraud           100000 non-null  int32         
 7   tx_fraud_scenario  100000 non-null  int32         
 8   event_timestamp    100000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float32(1), int32(7)
memory usage: 3.8 MB


### Features inference

In [62]:
# Entity rows to look up: one row per (customer, timestamp) pair.
entity_df = pd.DataFrame(
    {
        # join key of the entity -> values to look up
        "customer_id": [580415, 456435],
        # reserved "event_timestamp" column -> timestamp attached to each row
        "event_timestamp": [datetime(2020, 12, 15), datetime(2019, 10, 24)],
        # optional label column; Feast does not process it
        "label_driver_reported_satisfaction": [1, 2],
    }
).astype(
    {
        # cast the join key to int32, the dtype df.info() reports for customer_id
        "customer_id": "int32",
        # the label only holds small integers, so int8 is used
        "label_driver_reported_satisfaction": "int8",
    }
)

In [63]:
# Display the entity frame that will be passed to the feature store.
entity_df

Unnamed: 0,customer_id,event_timestamp,label_driver_reported_satisfaction
0,580415,2020-12-15,1
1,456435,2019-10-24,2


In [64]:
# Confirm the dtype casts applied to the entity frame took effect.
entity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 3 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   customer_id                         2 non-null      int32         
 1   event_timestamp                     2 non-null      datetime64[ns]
 2   label_driver_reported_satisfaction  2 non-null      int8          
dtypes: datetime64[ns](1), int32(1), int8(1)
memory usage: 158.0 bytes


In [103]:
# Open the Feast feature store backed by the repository at feature_store_path.
store = FeatureStore(repo_path=feature_store_path)

In [66]:
# Fetch historical values of two features from the "driver_daily_transaction"
# feature view for every row of entity_df and load the result into pandas.
training_df = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "driver_daily_transaction:transaction_id",
        "driver_daily_transaction:terminal_id",
    ],
).to_df()

print("----- Feature schema -----\n")
# DataFrame.info() writes the summary itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
training_df.info()



----- Feature schema -----

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 5 columns):
 #   Column                              Non-Null Count  Dtype              
---  ------                              --------------  -----              
 0   customer_id                         2 non-null      int32              
 1   event_timestamp                     2 non-null      datetime64[ns, UTC]
 2   label_driver_reported_satisfaction  2 non-null      int8               
 3   transaction_id                      2 non-null      int32              
 4   terminal_id                         2 non-null      int32              
dtypes: datetime64[ns, UTC](1), int32(3), int8(1)
memory usage: 174.0 bytes
None


In [67]:
# Preview the entity columns joined with the retrieved feature columns.
training_df.head()

Unnamed: 0,customer_id,event_timestamp,label_driver_reported_satisfaction,transaction_id,terminal_id
0,456435,2019-10-24 00:00:00+00:00,2,99398570,97
1,580415,2020-12-15 00:00:00+00:00,1,754390469,482


### Features View - on demand

In [86]:
# Entity rows for the on-demand example: the same customers and timestamps,
# plus the extra request-time columns.
entity_df = pd.DataFrame(
    {
        # join key of the entity -> values to look up
        "customer_id": [580415, 456435],
        # reserved "event_timestamp" column -> timestamp attached to each row
        "event_timestamp": [datetime(2020, 12, 15), datetime(2019, 10, 24)],
        # optional label column; Feast does not process it
        "label_driver_reported_satisfaction": [1, 2],
        # inputs consumed by the on-demand transformation
        "val_to_add": [1, 2],
        "val_to_add_2": [10, 30],
    }
).astype(
    # cast the join key to int32, the dtype df.info() reports for customer_id
    {"customer_id": "int32"}
)

In [87]:
# Preview the extended entity frame (five rows is the default for head()).
entity_df.head()

Unnamed: 0,customer_id,event_timestamp,label_driver_reported_satisfaction,val_to_add,val_to_add_2
0,580415,2020-12-15,1,1,10
1,456435,2019-10-24,2,2,30


In [94]:
# Fetch historical features for entity_df: five columns from the
# "driver_daily_transaction" feature view plus one feature from
# "compute_avg_tx_amount_last_7d", then load the result into pandas.
# NOTE(review): in the recorded output avg_tx_amount_last_7d is empty for both
# rows — confirm the inputs/definition of compute_avg_tx_amount_last_7d.
training_df = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "driver_daily_transaction:transaction_id",
        "driver_daily_transaction:terminal_id",
        "driver_daily_transaction:tx_amount",
        "driver_daily_transaction:tx_fraud",
        "driver_daily_transaction:tx_fraud_scenario",
        "compute_avg_tx_amount_last_7d:avg_tx_amount_last_7d",
    ],
).to_df()



In [95]:
# Display the full result (two rows), including the on-demand feature column.
training_df

Unnamed: 0,customer_id,event_timestamp,label_driver_reported_satisfaction,val_to_add,val_to_add_2,transaction_id,terminal_id,tx_amount,tx_fraud,tx_fraud_scenario,avg_tx_amount_last_7d
0,456435,2019-10-24 00:00:00+00:00,2,2,30,99398570,97,6.79,0,0,
1,580415,2020-12-15 00:00:00+00:00,1,1,10,754390469,482,14.62,0,0,


Перед тем как запрашивать онлайн-фичи, необходимо перелить (материализовать) их из офлайн-хранилища в онлайн-хранилище.
Это можно сделать инкрементально с помощью команды `poetry run feast materialize-incremental $(date -u +"%Y-%m-%dT%H:%M:%S")` — она зальёт данные с момента последней материализации до текущего времени. Либо по диапазону дат, например: `poetry run feast materialize 2020-01-01T00:00:00 2020-12-31T23:59:59`.

In [105]:
# Online feature retrieval: read values for two customers from the online
# store and convert the response into a plain dict of lists.
# NOTE(review): the recorded output has None for customer 456435 — presumably
# no row for that customer was materialized into the online store; confirm the
# materialization date range covers its 2019 transaction.
online_features = store.get_online_features(
    features=[
        "driver_daily_transaction:transaction_id",
        "driver_daily_transaction:terminal_id",
        "driver_daily_transaction:tx_amount",
    ],
    entity_rows=[
        {"customer_id": 580415},
        {"customer_id": 456435},
    ],
).to_dict()

# Fixed the "cutomers" typo in the header line.
print("Online features for customers 580415, 456435:")
for key, value in online_features.items():
    print(f"{key}: {value}")



Online features for cutomers 580415, 456435:
customer_id: [580415, 456435]
tx_amount: [14.619999885559082, None]
terminal_id: [482, None]
transaction_id: [754390469, None]


In [115]:
# Look up the registered feature service by name.
feature_service = store.get_feature_service("daily_transaction_v1")

# Flatten the feature names across every feature-view projection of the service.
features = [
    feature.name
    for projection in feature_service.feature_view_projections
    for feature in projection.features
]

print(features)

['terminal_id', 'transaction_id', 'tx_fraud_plus_val1', 'tx_fraud_scenario_plus_val2']


In [116]:
# Retrieve historical features through a feature service, so the feature set
# is defined once in the repository instead of being listed inline.
# The variable keeps its original name (training_df_v1) so any later cells that
# reference it keep working, but the service fetched is "transaction_activity_v2".
training_df_v1 = store.get_historical_features(
    entity_df=entity_df,
    features=store.get_feature_service("transaction_activity_v2"),
).to_df()

# The header now names the service that is actually queried; it previously
# said "daily_transaction_v1".
print("\nFeatures from transaction_activity_v2 service:")
training_df_v1.head()




Features from daily_transaction_v1 service:


Unnamed: 0,customer_id,event_timestamp,label_driver_reported_satisfaction,val_to_add,val_to_add_2,transaction_id,terminal_id,tx_amount,tx_time_seconds,tx_time_days,tx_fraud,tx_fraud_scenario,tx_fraud_plus_val1,tx_fraud_scenario_plus_val2
0,456435,2019-10-24 00:00:00+00:00,2,2,30,99398570,97,6.79,5480127,63,0,0,2,30
1,580415,2020-12-15 00:00:00+00:00,1,1,10,754390469,482,14.62,41591913,481,0,0,1,10


In [118]:
# Inspect how the "driver_daily_transaction" feature view is registered.
feature_view = store.get_feature_view("driver_daily_transaction")

# One print call with a newline separator emits one line per field.
print(
    "\nFeature view metadata:",
    f"Name: {feature_view.name}",
    f"Entities: {feature_view.entities}",
    f"TTL: {feature_view.ttl}",
    f"Online: {feature_view.online}",
    f"Features: {[field.name for field in feature_view.features]}",
    sep="\n",
)


Feature view metadata:
Name: driver_daily_transaction
Entities: ['customer_id']
TTL: 30 days, 0:00:00
Online: True
Features: ['transaction_id', 'terminal_id', 'tx_amount', 'tx_time_seconds', 'tx_time_days', 'tx_fraud', 'tx_fraud_scenario']


In [119]:
# Retrieve historical features through the "transaction_activity_v4" feature
# service; the recorded output includes a derived combined_time column.
training_df_v4 = store.get_historical_features(
    entity_df=entity_df,
    features=store.get_feature_service("transaction_activity_v4"),
).to_df()

# The header now names the service that is actually queried; it previously
# said "driver_activity_v4".
print("\nFeatures from transaction_activity_v4 service:")
training_df_v4.head()




Features from driver_activity_v4 service:


Unnamed: 0,customer_id,event_timestamp,label_driver_reported_satisfaction,val_to_add,val_to_add_2,transaction_id,terminal_id,tx_amount,tx_time_seconds,tx_time_days,tx_fraud,tx_fraud_scenario,combined_time
0,456435,2019-10-24 00:00:00+00:00,2,2,30,99398570,97,6.79,5480127,63,0,0,2192088.6
1,580415,2020-12-15 00:00:00+00:00,1,1,10,754390469,482,14.62,41591913,481,0,0,16637053.8
