In [2]:
import os
import sys

sys.path.append("..")

import re
from collections import OrderedDict

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

import sqlalchemy
from sqlalchemy import func, select, desc
from sqlalchemy.sql.expression import and_, or_

from evidently import ColumnMapping
from evidently.dashboard import Dashboard
from evidently.dashboard.tabs import DataQualityTab
from evidently.model_profile import Profile
from evidently.model_profile.sections import DataQualityProfileSection

import config
import cyclops
from cyclops.orm import Database
from cyclops.queries import query_gemini_delirium_diagnosis, query_gemini_delirium_lab
from cyclops.processors.diagnosis_codes import DiagnosisProcessor
from cyclops.processors.column_names import (
    ENCOUNTER_ID,
    DIAGNOSIS_CODE,
    ADMIT_TIMESTAMP,
    LAB_TEST_RESULT_VALUE,
    LAB_TEST_TIMESTAMP,
    LAB_TEST_NAME,
    LAB_TEST_RESULT_UNIT,
    REFERENCE_RANGE,
    HOSPITAL_ID,
    AGE,
)
from cyclops.processors.labs import LabsProcessor
from cyclops.processors.feature_handler import FeatureHandler

import cyclops.query_utils as q_utils

import cyclops.query_mimic as qm

%load_ext autoreload
%autoreload 2 # Load when external files are updated
%load_ext nb_black

2022-03-30 10:04:06,170 [1;37mINFO[0m config          - Log file is /home/kmckeen/gemini/cyclops/log.log
2022-03-30 10:04:06,202 [1;37mINFO[0m cyclops.utils.profile - Log file is /home/kmckeen/gemini/cyclops/log.log
2022-03-30 10:04:06,203 [1;37mINFO[0m cyclops.query_utils - Log file is /home/kmckeen/gemini/cyclops/log.log
2022-03-30 10:04:06,205 [1;37mINFO[0m cyclops.orm     - Log file is /home/kmckeen/gemini/cyclops/log.log
2022-03-30 10:04:06,207 [1;37mINFO[0m cyclops.processors.base - Log file is /home/kmckeen/gemini/cyclops/log.log
2022-03-30 10:04:06,208 [1;37mINFO[0m cyclops.processors.diagnosis_codes - Log file is /home/kmckeen/gemini/cyclops/log.log
2022-03-30 10:04:06,210 [1;37mINFO[0m cyclops.processors.labs - Log file is /home/kmckeen/gemini/cyclops/log.log
2022-03-30 10:04:06,234 [1;37mINFO[0m cyclops.processors.feature_handler - Log file is /home/kmckeen/gemini/cyclops/log.log


<IPython.core.display.Javascript object>

# Setup ORM

In [3]:
os.environ["USER"] = "postgres"
os.environ["PGPASSWORD"] = "pwd"
cfg = config.read_config("../configs/default/*.yaml")
db = Database(cfg)

2022-03-30 10:04:07,107 [1;37mINFO[0m cyclops.orm     - Database setup, ready to run queries!


<IPython.core.display.Javascript object>

MIMIC EDA
See sample_code/delirium.ipynb

In sample_code/data_layer.ipynb:
from evidently.dashboard.tabs import DataQualityTab

# Feature extraction

## Target extraction

In [4]:
query = qm.patient_diagnoses(db, include_icd_title=True)
db.run_query(query, limit=10)

2022-03-30 10:04:25,200 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-30 10:04:25,201 [1;37mINFO[0m cyclops.utils.profile - Finished executing function wrapper_func in 18.066507 s


Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,icd_title
0,10427677,23641430,1,20,9,Typhoid fever
1,13455052,29627788,6,20,9,Typhoid fever
2,12916672,24528914,1,20,9,Typhoid fever
3,16992055,23051025,1,20,9,Typhoid fever
4,17476472,21398715,1,20,9,Typhoid fever
5,11088311,27128981,1,20,9,Typhoid fever
6,10158684,22072764,1,20,9,Typhoid fever
7,11816842,25448380,1,20,9,Typhoid fever
8,12051958,25585946,1,29,9,"Paratyphoid fever, unspecified"
9,11545281,28569779,1,30,9,Salmonella gastroenteritis


<IPython.core.display.Javascript object>

In [5]:
# Get all patient diagnoses with substring delirium
subquery = qm.patient_diagnoses_by_substring(db, "delirium").subquery()

# Include patient statics
query = qm.join_with_patients(db, subquery)

# Run the query
delirium_codes_df = db.run_query(query)
delirium_codes_df

2022-03-30 10:04:26,739 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-30 10:04:26,740 [1;37mINFO[0m cyclops.utils.profile - Finished executing function wrapper_func in 1.504623 s


Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,icd_title,icd_title_1,subject_id_1,gender,anchor_age,anchor_year,dod,anchor_year_group_start,anchor_year_group_end,year,anchor_year_difference
0,18439835,24904769,6,29011,9,Presenile dementia with delirium,Presenile dementia with delirium,18439835,F,62,2157,,2008,2010,2009,-148
1,13420749,21955018,4,29011,9,Presenile dementia with delirium,Presenile dementia with delirium,13420749,F,80,2126,,2008,2010,2009,-117
2,15347749,22166444,2,29011,9,Presenile dementia with delirium,Presenile dementia with delirium,15347749,M,72,2114,,2011,2013,2012,-102
3,12474382,24623202,3,29011,9,Presenile dementia with delirium,Presenile dementia with delirium,12474382,M,72,2164,,2008,2010,2009,-155
4,10404324,29231726,2,29011,9,Presenile dementia with delirium,Presenile dementia with delirium,10404324,F,81,2138,,2008,2010,2009,-129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9082,19195373,28814854,4,F19921,10,"Other psychoactive substance use, unspecified ...","Other psychoactive substance use, unspecified ...",19195373,F,25,2132,,2017,2019,2018,-114
9083,11402251,24504249,7,F19931,10,"Other psychoactive substance use, unspecified ...","Other psychoactive substance use, unspecified ...",11402251,F,45,2156,,2017,2019,2018,-138
9084,11793360,26278511,6,F19931,10,"Other psychoactive substance use, unspecified ...","Other psychoactive substance use, unspecified ...",11793360,M,60,2144,,2011,2013,2012,-132
9085,11717909,25077908,14,F19931,10,"Other psychoactive substance use, unspecified ...","Other psychoactive substance use, unspecified ...",11717909,M,30,2129,,2014,2016,2015,-114


<IPython.core.display.Javascript object>

In [6]:
# Get matrix of pat2 ient IDs, hadm (hospital stay) IDs
# of people diagnosed with delirium. These people will
# be the ones with label 1 in the final target matrix
target_df = delirium_codes_df[["subject_id", "hadm_id"]].drop_duplicates().to_numpy()
target_df

array([[18439835, 24904769],
       [13420749, 21955018],
       [15347749, 22166444],
       ...,
       [11793360, 26278511],
       [11717909, 25077908],
       [18496919, 29510882]])

<IPython.core.display.Javascript object>

## Input extraction

In [7]:
from cyclops.processors.column_names import ENCOUNTER_ID, DIAGNOSIS_CODE

<IPython.core.display.Javascript object>

In [8]:
subquery = qm.patient_diagnoses(db, version=10, include_icd_title=False).subquery()
query = q_utils.rename_attributes(
    subquery, {"hadm_id": ENCOUNTER_ID, "icd_code": DIAGNOSIS_CODE}
)
data = db.run_query(query)
data

2022-03-30 10:04:32,470 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-30 10:04:32,471 [1;37mINFO[0m cyclops.utils.profile - Finished executing function wrapper_func in 5.683654 s


Unnamed: 0,subject_id,encounter_id,seq_num,diagnosis_code,icd_version
0,11810816,27862869,6,Z3A39,10
1,18335503,21596415,2,E861,10
2,18335503,21596415,5,O99512,10
3,18335503,21596415,6,J45909,10
4,18335503,21596415,1,O99612,10
...,...,...,...,...,...
2189976,18947849,27742627,3,O700,10
2189977,19405804,21016241,4,Z370,10
2189978,19405804,21016241,1,O4413,10
2189979,19405804,21016241,2,O6981X0,10


<IPython.core.display.Javascript object>

In [9]:
must_have_columns = [ENCOUNTER_ID, DIAGNOSIS_CODE]
data_diagnosis = data[must_have_columns]

diagnosis_processor = DiagnosisProcessor(data_diagnosis, must_have_columns)
diagnosis_features = diagnosis_processor.process()
diagnosis_features

2022-03-30 10:04:32,570 [1;37mINFO[0m cyclops.processors.base - Processing raw diagnosis codes...
2022-03-30 10:04:32,586 [1;37mINFO[0m cyclops.processors.base - # samples: 2189981, # encounters: 185743
2022-03-30 10:05:28,832 [1;37mINFO[0m cyclops.processors.base - Converting diagnosis codes to ICD codes...
2022-03-30 10:05:28,849 [1;37mINFO[0m cyclops.processors.base - # samples: 2189981, # encounters: 185743
2022-03-30 10:05:28,968 [1;37mINFO[0m cyclops.processors.diagnosis_codes - # diagnosis features: 22, # encounters: 185743
2022-03-30 10:06:59,224 [1;37mINFO[0m cyclops.utils.profile - Finished executing function process in 146.653847 s


Unnamed: 0,Unnamed: 1,E00_E89,O00_O99,J00_J99,R00_R99,A00_B99,Z00_Z99,P00_P96,N00_N99,M00_M99,...,F01_F99,G00_G99,C00_D49,L00_L99,V00_Y99,K00_K95,H00_H59,S00_T88,Q00_Q99,H60_H95
27862869,1,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
21596415,0,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21283853,1,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20659410,1,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
20789372,1,0,1,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28721266,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
29566765,0,0,0,0,0,0,1,0,1,0,...,0,1,1,0,0,0,0,0,0,0
27197374,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
25594844,0,1,0,1,1,0,1,0,1,0,...,1,0,0,0,1,0,0,0,1,0


<IPython.core.display.Javascript object>

## Get Delirium Target

In [13]:
# Get delirium diagnoses
target_query = qm.patient_diagnoses_by_substring(db, "delirium").subquery()

# Join to get patient statics
target_query = qm.join_with_patients(db, target_query)

target_df = db.run_query(target_query)
target_df

2022-03-30 10:13:38,766 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-30 10:13:38,767 [1;37mINFO[0m cyclops.utils.profile - Finished executing function wrapper_func in 1.429173 s


Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,icd_title,icd_title_1,subject_id_1,gender,anchor_age,anchor_year,dod,anchor_year_group_start,anchor_year_group_end,year,anchor_year_difference
0,10404324,29231726,2,29011,9,Presenile dementia with delirium,Presenile dementia with delirium,10404324,F,81,2138,,2008,2010,2009,-129
1,18439835,24904769,6,29011,9,Presenile dementia with delirium,Presenile dementia with delirium,18439835,F,62,2157,,2008,2010,2009,-148
2,12471922,21556724,5,29011,9,Presenile dementia with delirium,Presenile dementia with delirium,12471922,F,58,2183,,2011,2013,2012,-171
3,15794450,21327955,2,29011,9,Presenile dementia with delirium,Presenile dementia with delirium,15794450,M,67,2156,2159-12-30,2008,2010,2009,-147
4,17963447,24001386,10,29011,9,Presenile dementia with delirium,Presenile dementia with delirium,17963447,M,79,2199,,2014,2016,2015,-184
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9082,15827938,27642031,7,F19921,10,"Other psychoactive substance use, unspecified ...","Other psychoactive substance use, unspecified ...",15827938,M,65,2116,,2017,2019,2018,-98
9083,11402251,24504249,7,F19931,10,"Other psychoactive substance use, unspecified ...","Other psychoactive substance use, unspecified ...",11402251,F,45,2156,,2017,2019,2018,-138
9084,11717909,25077908,14,F19931,10,"Other psychoactive substance use, unspecified ...","Other psychoactive substance use, unspecified ...",11717909,M,30,2129,,2014,2016,2015,-114
9085,18496919,29510882,1,F19931,10,"Other psychoactive substance use, unspecified ...","Other psychoactive substance use, unspecified ...",18496919,F,75,2116,,2017,2019,2018,-98


<IPython.core.display.Javascript object>

In [14]:
# Get matrix of patient IDs, hadm (hospital stay) IDs
sub_hadm_target = target_df[["subject_id", "hadm_id"]].drop_duplicates().to_numpy()
sub_hadm_target

array([[10404324, 29231726],
       [18439835, 24904769],
       [12471922, 21556724],
       ...,
       [11717909, 25077908],
       [18496919, 29510882],
       [11793360, 26278511]])

<IPython.core.display.Javascript object>

In [15]:
sub_hadm_target.shape

(8975, 2)

<IPython.core.display.Javascript object>

In [17]:
# Check to make sure none of the codes overlap with the delirium codes
target_set = set(target_df["icd_code"].values)
features_set = set(diagnosis_features)
assert len(target_set.intersection(features_set)) == 0

<IPython.core.display.Javascript object>

## Get Features

In [18]:
from cyclops.processors.feature_handler import FeatureHandler

<IPython.core.display.Javascript object>

### Baseline handling class

In [19]:
# Keep in here for now

from sklearn.model_selection import train_test_split


# Regression
from sklearn.linear_model import LinearRegression


# Classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC


# Both
from sklearn.neural_network import MLPClassifier


class BaselineHandler:
    def __init__(self, feature_handler):
        self.fh = feature_handler

    def run(self, verbose=True):
        features = self.fh.features_scaled
        targets = self.fh.targets_scaled

        X = features.values
        y_all = targets.values
        target_types = np.array(fh.types)[np.array(fh.is_target)]
        target_names = np.array(fh.names)[np.array(fh.is_target)]

        X_train, X_val, y_train_all, y_val_all = train_test_split(
            X, y_all, test_size=0.2, shuffle=True
        )

        for i in range(len(targets.columns)):
            y_train = y_train_all[:, i]
            y_val = y_val_all[:, i]
            target_type = target_types[i]
            target_name = target_names[i]

            if verbose:
                print("===", target_name, "===")

            if target_type == "binary":
                for Model in [LogisticRegression, SVC, LinearSVC, MLPClassifier]:
                    model = Model()
                    model.fit(X_train, y_train)

                    if verbose:
                        print(type(model).__name__, ":", model.score(X_val, y_val))

            if target_type == "numeric":
                for Model in [LinearRegression, MLPClassifier]:
                    model = Model()
                    model.fit(X_train, y_train)

                    if verbose:
                        print(type(model).__name__, ":", model.score(X_val, y_val))

<IPython.core.display.Javascript object>

In [22]:
arr_numeric = np.array(
    [[1.5, 5.2, 10], [23, 10.4, 9], [3.9, 15.1, 8], [4.9, 20, 7], [5, 25.5, 6]]
)
arr_numeric_string = np.array([["1"], ["2"], ["3"], ["4"], ["5"]])
arr_features = np.concatenate([arr_numeric, arr_numeric_string], axis=1)

arr_targets = np.array([1, 0, 1, 1, 0])

fh = FeatureHandler()
fh.add_features(pd.DataFrame(arr_features))
fh.add_features(pd.DataFrame(arr_targets), is_target=True)

TypeError: add_features() got an unexpected keyword argument 'is_target'

<IPython.core.display.Javascript object>

In [None]:
fh.df_unscaled

In [None]:
bh = BaselineHandler(fh)
bh.run()

## Binary ICD features

In [None]:
icd_feature_query = patient_diagnoses_by_icd_codes(diagnosis_features, version='10').subquery()

query = select(icd_feature_query, patient_anchors).where( \
    icd_feature_query.c.subject_id == patient_anchors.c.subject_id)

features_df = db.run_query(query)
features_df

In [None]:
features_df = features_df[[
    'subject_id',
    'hadm_id',
    'icd_code',
    'gender',
    'anchor_age',
    'anchor_year',
    'year'
]]

In [None]:
# Group the data by patients and their hospital visits
features_df_grouped = features_df.groupby(["subject_id", "hadm_id"])

# USE A GROUPBY APPLY INSTEAD OF THIS
features_binary = np.zeros((len(features_df_grouped), len(categories_distinct)))
sub_hadm_features = np.zeros((len(features_df_grouped), 2), dtype=np.int64)
count = 0
for key, item in features_df_grouped:
    vals = item.icd_code.str.strip().values
    groups = np.unique(code_to_category_vec(vals))
    sub_hadm_features[count] = list(key)
    features_binary[count][category_to_index_vec(groups)] = 1
    count += 1

In [None]:
# Get counts from each feature
counts = features_binary.sum(axis=0)
for i, c in enumerate(categories_distinct):
    print(c, "count:", counts[i])

### Static Features

In [None]:
grouped_firsts = features_df_grouped.first().reset_index()
static_feature_names = np.array(['gender', 'anchor_age'])
static_features = grouped_firsts[static_feature_names].to_numpy()
static_features.shape

In [None]:
static_features[0, :]

In [None]:
features = FeatureHandler()

In [None]:
features.add_features(features_binary, names=categories_distinct)

In [None]:
features.add_features(static_features, names=static_feature_names)

In [None]:
features.names

### AGE BROKEN - FIX

NOTE: I still think some of the ages shouldn't be 0 when they are.

This age doesn't actually correspond to the hospital visit... need to sort this out. Where is the approx year in the hospital table?

Get target

In [None]:
def multidim_intersect(arr1, arr2):
    intersected = set(map(tuple, arr1)).intersection(set(map(tuple, arr2)))  
    return np.array(list(intersected))

# Find common patient/visit between target and features
inters = multidim_intersect(sub_hadm_features, sub_hadm_target)
inds = np.where((sub_hadm_features == inters[:,None]).all(-1))[1]

## Dataset Prep

In [None]:
f = features.df_scaled.values

In [None]:
# Get samples with the target being true
features_with_target = f[inds]

# Get samples with the target being false
mask = np.ones(f.shape[0], dtype=bool)
mask[inds] = False
features_without_target = f[mask]

# Take the minimum number of samples from each
num = min([features_with_target.shape[0], features_without_target.shape[0]])

print(features_with_target.shape)
print(features_without_target.shape)

features_with_target = features_with_target[:num]
features_without_target = features_without_target[:num]
X = np.concatenate([features_with_target, features_without_target])
y = np.zeros(2*num, dtype=bool)
y[:num] = 1

Oversampling /Undersampling
Oversampling class with fewer, undersample class with more
Sklearn - class balance
Resamples the dataset such that, in a batch, you'll have roughly the same number per batch

Metrics: Precision, recall, F1, accuracy, confusion matrix, ROC curve/aROC
Trade off: No balancing = predict 0s only,

Sensitivity vs. Specificity

Create a couple functions (using sklearn resample)
Undersampling only, oversampling only, both, etc.


Class weighting - weight loss according to class balance? Later on perhaps

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split( \
    X, y, test_size=0.33, shuffle=True, random_state=42)

#X_test, X_val, y_test, y_val = train_test_split( \
#    X_test, y_test, test_size=0.5, shuffle=True, random_state=42)

In [None]:
X_train.shape

In [None]:
X_test.shape

# Evaluative Metrics

In [None]:
from sklearn.metrics import roc_curve

def plot_roc_curve(model):
    # predict probabilities
    yhat = model.predict_proba(testX)
    # keep probabilities for the positive outcome only
    yhat = yhat[:, 1]
    # calculate roc curves
    fpr, tpr, thresholds = roc_curve(testy, yhat)
    # plot the roc curve for the model
    plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
    plt.plot(fpr, tpr, marker='.', label='Logistic')

    # Plot
    plt.datalabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

#model.score
#model.predict_proba

# Baselines

In [None]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

reg = LogisticRegression()
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

In [None]:
# Get coefficients
coef = reg.coef_[0]
# summarize feature importance
for i,v in enumerate(coef):
	print("{} score: {}".format(features.names[i], np.round(v,3)))
# plot feature importance
plt.bar([x for x in range(len(coef))], coef)
plt.show()

## SVM

Try using min-max normalization vs standardization to see difference in results

Add None, 'standardize', 'min-max' as options for standardization

### Non-linear

In [None]:
from sklearn.svm import SVC

svc = SVC(tol=1e-5)
svc.fit(X_train, y_train)
svc.score(X_test, y_test)

### Linear

In [None]:
from sklearn.svm import LinearSVC

svc = LinearSVC(tol=1e-5)
svc.fit(X_train, y_train)
svc.score(X_test, y_test)

In [None]:
# Get coefficients
coef = svc.coef_[0]
# summarize feature importance
for i,v in enumerate(coef):
	print("{} score: {}".format(features.names[i], np.round(v,3)))
# plot feature importance
plt.bar([x for x in range(len(coef))], coef)
plt.show()

## MLP

In [None]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation="relu",
    alpha=0.001,
    learning_rate="adaptive",
    max_iter=500,
    validation_fraction=0.2,
    early_stopping=True,
)

acc = []
for train_indices, test_indices in kf.split(X_train):
    clf.fit(X_train[train_indices], y_train[train_indices])
    acc.append(clf.score(X_train[test_indices], y_train[test_indices]))
    print("Train loss:", clf.loss_)
    print("Val acc:", acc[-1])


# mlp.fit(X, y)
# plt.plot(mlp.loss_curve_, color='blue')
# plt.plot(mlp.validation_scores_, color='orange')

# print("Train Acc:", mlp.score_)
# print("Val Acc:", best_validation_score_)

In [None]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', alpha=0.001, max_iter=500)
mlp.fit(X_train, y_train)
plt.plot(mlp.loss_curve_)

print("Train Acc:", mlp.score(X_train, y_train))

y_train_pred = mlp.predict_proba(X_train)
train_acc = (np.argmax(y_train_pred, axis=1) == y_train).sum()/y_train.shape[0]
print("Train Acc:", train_acc)

y_test_pred = mlp.predict_proba(X_test)
test_acc = (np.argmax(y_test_pred, axis=1) == y_test).sum()/y_test.shape[0]
print("Test Acc:", test_acc)

This is a decently high accuracy... maybe just check to make sure none of the ICD codes we're using are any of the target values? Just as a sanity check!

Write function to find intersection of target ICD codes and feature ICD codes

Label leaking: Some feature which encodes something about the output we wouldn't actually have in the wild - it's cheating


Start to check out the lab data and think of adding features from it

Dealing with NaNs, remove for now
Impute? Imputations methods - a project later on, for sure!

For vitals data, nearest neighbours type of thing
Aggregate over multiple tests, potentially



# Feature Store

Introduction: https://docs.feast.dev/

Quickstart: https://docs.feast.dev/getting-started/quickstart

Quickstart Colab: https://colab.research.google.com/github/feast-dev/feast/blob/master/examples/quickstart/quickstart.ipynb

Feature Repository: https://docs.feast.dev/reference/feature-repository



In [None]:
%%sh
pip install wheel
pip install feast -U -q
pip install Pygments -q
echo "Please restart your runtime now (Runtime -> Restart runtime). This ensures that the correct dependencies are loaded."

In [None]:
!feast init feature_repo

In [None]:
%cd feature_repo
!ls -R

In [None]:
!pygmentize feature_store.yaml

In [None]:
import pandas as pd
pd.read_parquet("data/driver_stats.parquet")

In [None]:
!pygmentize -f terminal16m example.py

In [None]:
!feast apply

In [None]:
from datetime import datetime, timedelta
import pandas as pd

from feast import FeatureStore

# The entity dataframe is the dataframe we want to enrich with feature values
entity_df = pd.DataFrame.from_dict(
    {
        "driver_id": [1001, 1002, 1003],
        "label_driver_reported_satisfaction": [1, 5, 3], 
        "event_timestamp": [
            datetime.now() - timedelta(minutes=11),
            datetime.now() - timedelta(minutes=36),
            datetime.now() - timedelta(minutes=73),
        ],
    }
)

store = FeatureStore(repo_path=".")

training_df = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ],
).to_df()

print("----- Feature schema -----\n")
print(training_df.info())

print()
print("----- Example features -----\n")
print(training_df.head())

In [None]:
from datetime import datetime
!feast materialize-incremental {datetime.now().isoformat()}

In [None]:
print("--- Data directory ---")
!ls data

import sqlite3
import pandas as pd
con = sqlite3.connect("data/online_store.db")
print("\n--- Schema of online store ---")
print(
    pd.read_sql_query(
        "SELECT * FROM feature_repo_driver_hourly_stats", con).columns.tolist())
con.close()

In [None]:
from pprint import pprint
from feast import FeatureStore

store = FeatureStore(repo_path=".")

feature_vector = store.get_online_features(
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ],
    entity_rows=[
        {"driver_id": 1004},
        {"driver_id": 1005},
    ],
).to_dict()

pprint(feature_vector)