# Prepare data set and store in parquet format

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [3]:

#target and features variables
predictors_df = data.loc[:,data.columns!='Outcome']
target_df = data['Outcome']
predictors_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,1,85,66,29,0,26.6,0.351,31
1,8,183,64,0,0,23.3,0.672,32
2,1,89,66,23,94,28.1,0.167,21
3,0,137,40,35,168,43.1,2.288,33
4,5,116,74,0,0,25.6,0.201,30


# Create timestamp to be added as event_timestamp column in the data set.

In [4]:
timestamps = pd.date_range(end = pd.Timestamp.now(),
                           periods = len(data),freq = 'D').to_frame(name = 'event_timestamp', index = False)
timestamps

Unnamed: 0,event_timestamp
0,2021-05-08 14:31:28.830917
1,2021-05-09 14:31:28.830917
2,2021-05-10 14:31:28.830917
3,2021-05-11 14:31:28.830917
4,2021-05-12 14:31:28.830917
...,...
762,2023-06-09 14:31:28.830917
763,2023-06-10 14:31:28.830917
764,2023-06-11 14:31:28.830917
765,2023-06-12 14:31:28.830917


# add event_timestamp column to the predictors and target dataframes

In [5]:
predictors_df = pd.concat(objs = [predictors_df, timestamps], axis = 1)
target_df = pd.concat(objs = [target_df, timestamps], axis =1)
predictors_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,event_timestamp
0,1,85,66,29,0,26.6,0.351,31,2021-05-08 14:31:28.830917
1,8,183,64,0,0,23.3,0.672,32,2021-05-09 14:31:28.830917
2,1,89,66,23,94,28.1,0.167,21,2021-05-10 14:31:28.830917
3,0,137,40,35,168,43.1,2.288,33,2021-05-11 14:31:28.830917
4,5,116,74,0,0,25.6,0.201,30,2021-05-12 14:31:28.830917


In [6]:
target_df.head()

Unnamed: 0,Outcome,event_timestamp
0,0,2021-05-08 14:31:28.830917
1,1,2021-05-09 14:31:28.830917
2,0,2021-05-10 14:31:28.830917
3,1,2021-05-11 14:31:28.830917
4,0,2021-05-12 14:31:28.830917


# Create a patientID column to uniquely identify records with patientID and timestamp field together.

In [8]:
dataLen = len(data)
idsList = list(range(dataLen))
#idsList


In [9]:
#idsList
idsList = list(range(dataLen))



In [17]:
#patient_ids to uniquely identify the record
patient_ids = pd.DataFrame(data = idsList, columns = ['patient_id'])
patient_ids

Unnamed: 0,patient_id
0,0
1,1
2,2
3,3
4,4
...,...
762,762
763,763
764,764
765,765


In [11]:
#concat the patient id to both the df i.e target and fetures
predictors_df = pd.concat(objs = [predictors_df, patient_ids], axis = 1)
target_df = pd.concat(objs = [target_df, patient_ids], axis =1)

In [12]:
predictors_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,event_timestamp,patient_id
0,1,85,66,29,0,26.6,0.351,31,2021-05-08 14:31:28.830917,0
1,8,183,64,0,0,23.3,0.672,32,2021-05-09 14:31:28.830917,1
2,1,89,66,23,94,28.1,0.167,21,2021-05-10 14:31:28.830917,2
3,0,137,40,35,168,43.1,2.288,33,2021-05-11 14:31:28.830917,3
4,5,116,74,0,0,25.6,0.201,30,2021-05-12 14:31:28.830917,4


In [13]:
predictors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 767 entries, 0 to 766
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Pregnancies               767 non-null    int64         
 1   Glucose                   767 non-null    int64         
 2   BloodPressure             767 non-null    int64         
 3   SkinThickness             767 non-null    int64         
 4   Insulin                   767 non-null    int64         
 5   BMI                       767 non-null    float64       
 6   DiabetesPedigreeFunction  767 non-null    float64       
 7   Age                       767 non-null    int64         
 8   event_timestamp           767 non-null    datetime64[ns]
 9   patient_id                767 non-null    int64         
dtypes: datetime64[ns](1), float64(2), int64(7)
memory usage: 60.0 KB


In [14]:
#converting the df into parquet
predictors_df.to_parquet(path='predictors_df.parquet')
target_df.to_parquet(path='target_df.parquet')

In [15]:
!pip install feast


Collecting feast
  Downloading feast-0.31.1-py2.py3-none-any.whl (4.6 MB)
     ---------------------------------------- 0.0/4.6 MB ? eta -:--:--
     ---------------------------------------- 0.0/4.6 MB 991.0 kB/s eta 0:00:05
     -- ------------------------------------- 0.2/4.6 MB 3.0 MB/s eta 0:00:02
     --- ------------------------------------ 0.4/4.6 MB 2.9 MB/s eta 0:00:02
     ----- ---------------------------------- 0.6/4.6 MB 3.1 MB/s eta 0:00:02
     ------ --------------------------------- 0.8/4.6 MB 3.4 MB/s eta 0:00:02
     -------- ------------------------------- 0.9/4.6 MB 3.5 MB/s eta 0:00:02
     --------- ------------------------------ 1.1/4.6 MB 3.4 MB/s eta 0:00:02
     ----------- ---------------------------- 1.3/4.6 MB 3.4 MB/s eta 0:00:01
     ------------ --------------------------- 1.4/4.6 MB 3.3 MB/s eta 0:00:01
     -------------- ------------------------- 1.6/4.6 MB 3.4 MB/s eta 0:00:01
     --------------- ------------------------ 1.8/4.6 MB 3.5 MB/s eta 0:0

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\sairam\\anaconda3\\envs\\mlops\\Lib\\site-packages\\~-ndas\\_libs\\algos.cp38-win_amd64.pyd'
Consider using the `--user` option or check the permissions.



In [25]:
!feast version

Feast SDK Version: "feast 0.31.1"


# Do feast init

In [19]:
#this is option as it creates the feast repo directory structure. you can create a directory using mkdir and inside that create
#a feature_Store.yaml file and a feature_definitions.py file. but its better to use feast init and then modify the respective files.



In [26]:
!feast init feature_repo


Creating a new Feast repository in C:\Users\sairam\Data Science\feature store\feature_repo.



06/13/2023 03:26:39 PM root INFO: copying C:\Users\sairam\anaconda3\envs\mlops\lib\site-packages\feast\templates\local\bootstrap.py -> C:\Users\sairam\Data Science\feature store\feature_repo
06/13/2023 03:26:39 PM root INFO: creating C:\Users\sairam\Data Science\feature store\feature_repo\feature_repo
06/13/2023 03:26:39 PM root INFO: copying C:\Users\sairam\anaconda3\envs\mlops\lib\site-packages\feast\templates\local\feature_repo\example_repo.py -> C:\Users\sairam\Data Science\feature store\feature_repo\feature_repo
06/13/2023 03:26:39 PM root INFO: copying C:\Users\sairam\anaconda3\envs\mlops\lib\site-packages\feast\templates\local\feature_repo\feature_store.yaml -> C:\Users\sairam\Data Science\feature store\feature_repo\feature_repo
06/13/2023 03:26:39 PM root INFO: copying C:\Users\sairam\anaconda3\envs\mlops\lib\site-packages\feast\templates\local\feature_repo\test_workflow.py -> C:\Users\sairam\Data Science\feature store\feature_repo\feature_repo
06/13/2023 03:26:39 PM root INFO:

# Update feature store yaml file if needed

#you can update the online store and local store paths in feature_store.yaml file if needed.

# Define Feature definitions in a python file inside feature repo directory (created using feast init)
This step is known as register and deploy the features go inside the feature_repo folder in the github you will see the feature_definition.py file with updated code. modify as per to your dataset features.

# Do feast apply
do feast apply from inside the feature_repo directory

In [31]:
pwd

'C:\\Users\\sairam\\Data Science\\feature store\\feature_repo'

In [32]:
cd feature_repo

C:\Users\sairam\Data Science\feature store\feature_repo\feature_repo


In [33]:
pwd


'C:\\Users\\sairam\\Data Science\\feature store\\feature_repo\\feature_repo'

In [37]:
!feast apply

Created entity patient_id
Created feature view target_df_feature_view
Created feature view predictors_df_feature_view

Created sqlite table feature_repo_predictors_df_feature_view
Created sqlite table feature_repo_target_df_feature_view



# Generate Training Data Set

In [38]:
from feast import FeatureStore
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage

store = FeatureStore(repo_path='.')

entity_df = pd.read_parquet(path = 'data/target_df.parquet')

training_data = store.get_historical_features(
entity_df = entity_df,
    features = [
        "predictors_df_feature_view:Pregnancies",
        "predictors_df_feature_view:Glucose",
        "predictors_df_feature_view:BloodPressure",
        "predictors_df_feature_view:SkinThickness",
        "predictors_df_feature_view:Insulin",
        "predictors_df_feature_view:BMI",
        "predictors_df_feature_view:DiabetesPedigreeFunction",
        "predictors_df_feature_view:Age",
               ]
)

dataset = store.create_saved_dataset(
from_=training_data,
    name = "diabetes_dataset",
    storage = SavedDatasetFileStorage('data/diabetes_dataset.parquet')
)



In [39]:
training_data.to_df()

Unnamed: 0,Outcome,event_timestamp,patient_id,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0,2021-05-08 14:31:28.830917+00:00,0,1,85,66,29,0,26.6,0.351,31
1,1,2021-05-09 14:31:28.830917+00:00,1,8,183,64,0,0,23.3,0.672,32
2,0,2021-05-10 14:31:28.830917+00:00,2,1,89,66,23,94,28.1,0.167,21
3,1,2021-05-11 14:31:28.830917+00:00,3,0,137,40,35,168,43.1,2.288,33
4,0,2021-05-12 14:31:28.830917+00:00,4,5,116,74,0,0,25.6,0.201,30
...,...,...,...,...,...,...,...,...,...,...,...
762,0,2023-06-09 14:31:28.830917+00:00,762,10,101,76,48,180,32.9,0.171,63
763,0,2023-06-10 14:31:28.830917+00:00,763,2,122,70,27,0,36.8,0.340,27
764,0,2023-06-11 14:31:28.830917+00:00,764,5,121,72,23,112,26.2,0.245,30
765,1,2023-06-12 14:31:28.830917+00:00,765,1,126,60,0,0,30.1,0.349,47


# Model Training

In [40]:
# Importing dependencies
from feast import FeatureStore
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from joblib import dump

# Getting our FeatureStore
store = FeatureStore(repo_path=".")

# Retrieving the saved dataset and converting it to a DataFrame
training_df = store.get_saved_dataset(name="diabetes_dataset").to_df()

# Separating the features and labels
y = training_df['Outcome']
X = training_df.drop(
    labels=['Outcome', 'event_timestamp', "patient_id"], 
    axis=1)

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    stratify=y)

# Creating and training LogisticRegression
reg = LogisticRegression()
reg.fit(X=X_train[sorted(X_train)], y=y_train)

# Saving the model
dump(value=reg, filename="model.joblib")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['model.joblib']

# Prepare online feature store
(Loading the features to online store)

There are two ways you can use to load features to your online store

materialize
materialize loads the latest features between two dates.

feast materialize 2020–01–01T00:00:00 2022–01–01T00:00:00

materialize-incremental
materialize-incremental loads features up to the provided end date:

feast materialize-incremental 2022–01–01T00:00:00

In [41]:
# Importing dependencies
from feast import FeatureStore
from datetime import datetime, timedelta

# Getting our FeatureStore
store = FeatureStore(repo_path=".")

store.materialize_incremental(end_date = datetime.now())

Materializing [1m[32m2[0m feature views to [1m[32m2023-06-13 18:15:03+05:30[0m into the [1m[32msqlite[0m online store.

[1m[32mtarget_df_feature_view[0m from [1m[32m2023-06-11 12:45:03+05:30[0m to [1m[32m2023-06-13 18:15:03+05:30[0m:


100%|████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 164.25it/s]


[1m[32mpredictors_df_feature_view[0m from [1m[32m2023-06-11 12:45:03+05:30[0m to [1m[32m2023-06-13 23:45:03+05:30[0m:


100%|████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 240.18it/s]


# Get online features for prediction

In [47]:
# Importing dependencies
from feast import FeatureStore
import pandas as pd
from joblib import load

# Getting our FeatureStore
store = FeatureStore(repo_path=".")

# Defining our features names
feast_features = [
        "predictors_df_feature_view:Pregnancies",
        "predictors_df_feature_view:Glucose",
        "predictors_df_feature_view:BloodPressure",
        "predictors_df_feature_view:SkinThickness",
        "predictors_df_feature_view:Insulin",
        "predictors_df_feature_view:BMI",
        "predictors_df_feature_view:DiabetesPedigreeFunction",
        "predictors_df_feature_view:Age",
    ]

# Getting the latest features
features = store.get_online_features(
    features=feast_features,    
    entity_rows=[{"patient_id": 765}, {"patient_id": 765}]
).to_dict()

# Converting the features to a DataFrame
features_df = pd.DataFrame.from_dict(data=features)

In [48]:
features_df.head()

Unnamed: 0,patient_id,BloodPressure,Pregnancies,Age,Insulin,Glucose,DiabetesPedigreeFunction,BMI,SkinThickness
0,765,60,1,47,0,126,0.349,30.1,0
1,765,60,1,47,0,126,0.349,30.1,0


# Call the predict function and see the output

In [49]:
# Loading our model and doing inference
reg = load("model.joblib")
predictions = reg.predict(features_df[sorted(features_df.drop("patient_id", axis=1))])
print(predictions)

[0 0]
