In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('diabetes.csv')

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [3]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
predictors_df = data.loc[:,data.columns!='Outcome']
taregt_df = data['Outcome']

In [8]:

timestamps = pd.date_range(
    end=pd.Timestamp.now(), 
    periods=len(data), 
    freq='D').to_frame(name="event_timestamp", index=False)

In [9]:

predictors_df = pd.concat(objs=[predictors_df, timestamps], axis=1)
taregt_df = pd.concat(objs=[taregt_df, timestamps], axis=1)

In [12]:
patient_ids = pd.DataFrame(data=list(range(len(data))), columns=["patient_id"])

predictors_df = pd.concat(objs=[predictors_df, patient_ids], axis=1)
taregt_df = pd.concat(objs=[taregt_df, patient_ids], axis=1)

In [13]:
predictors_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,event_timestamp,patient_id
0,6,148,72,35,0,33.6,0.627,50,2020-05-20 20:20:23.701972,0
1,1,85,66,29,0,26.6,0.351,31,2020-05-21 20:20:23.701972,1
2,8,183,64,0,0,23.3,0.672,32,2020-05-22 20:20:23.701972,2
3,1,89,66,23,94,28.1,0.167,21,2020-05-23 20:20:23.701972,3
4,0,137,40,35,168,43.1,2.288,33,2020-05-24 20:20:23.701972,4
...,...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,2022-06-22 20:20:23.701972,763
764,2,122,70,27,0,36.8,0.340,27,2022-06-23 20:20:23.701972,764
765,5,121,72,23,112,26.2,0.245,30,2022-06-24 20:20:23.701972,765
766,1,126,60,0,0,30.1,0.349,47,2022-06-25 20:20:23.701972,766


In [14]:
taregt_df.head()

Unnamed: 0,Outcome,event_timestamp,patient_id
0,1,2020-05-20 20:20:23.701972,0
1,0,2020-05-21 20:20:23.701972,1
2,1,2020-05-22 20:20:23.701972,2
3,0,2020-05-23 20:20:23.701972,3
4,1,2020-05-24 20:20:23.701972,4


In [15]:
predictors_df.to_parquet(path='predictors_df.parquet')
taregt_df.to_parquet(path='target_df.parquet')

In [16]:
cd diabetes_pred_repo

C:\Users\Ashutosh Tripathi\Documents\mlops\diabetes_pred_repo


In [None]:
# init -m diabetes_pred_repo # -m to create an empty repo

In [24]:
!feast apply

Created entity patient_id
Created feature view predictors_df_feature_view
Created feature view target_feature_view

Created sqlite table diabetes_pred_repo_predictors_df_feature_view
Created sqlite table diabetes_pred_repo_target_feature_view





### Generating Training Data Set

In [26]:
from feast import FeatureStore
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage

# Getting our FeatureStore
store = FeatureStore(repo_path=".")

# Reading our targets as an entity DataFrame
entity_df = pd.read_parquet(path="data/target_df.parquet")    

# Getting the indicated historical features
# and joining them with our entity DataFrame
training_data = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "predictors_df_feature_view:Pregnancies",
        "predictors_df_feature_view:Glucose",
        "predictors_df_feature_view:BloodPressure",
        "predictors_df_feature_view:SkinThickness",
        "predictors_df_feature_view:Insulin",
        "predictors_df_feature_view:BMI",
        "predictors_df_feature_view:DiabetesPedigreeFunction",
        "predictors_df_feature_view:Age",        
    ]
)

# Storing the dataset as a local file
dataset = store.create_saved_dataset(
    from_=training_data,
    name="diabetes_dataset",
    storage=SavedDatasetFileStorage("data/diabetes_dataset.parquet")
)



In [27]:
training_data.to_df()

Unnamed: 0,Outcome,event_timestamp,patient_id,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,1,2020-05-20 20:20:23.701972+00:00,0,6,148,72,35,0,33.6,0.627,50
1,0,2020-05-21 20:20:23.701972+00:00,1,1,85,66,29,0,26.6,0.351,31
2,1,2020-05-22 20:20:23.701972+00:00,2,8,183,64,0,0,23.3,0.672,32
3,0,2020-05-23 20:20:23.701972+00:00,3,1,89,66,23,94,28.1,0.167,21
4,1,2020-05-24 20:20:23.701972+00:00,4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...,...,...,...
763,0,2022-06-22 20:20:23.701972+00:00,763,10,101,76,48,180,32.9,0.171,63
764,0,2022-06-23 20:20:23.701972+00:00,764,2,122,70,27,0,36.8,0.340,27
765,0,2022-06-24 20:20:23.701972+00:00,765,5,121,72,23,112,26.2,0.245,30
766,1,2022-06-25 20:20:23.701972+00:00,766,1,126,60,0,0,30.1,0.349,47


### Training Model

In [28]:
# Importing dependencies
from feast import FeatureStore
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from joblib import dump

# Getting our FeatureStore
store = FeatureStore(repo_path=".")

# Retrieving the saved dataset and converting it to a DataFrame
training_df = store.get_saved_dataset(name="diabetes_dataset").to_df()

# Separating the features and labels
y = training_df['Outcome']
X = training_df.drop(
    labels=['Outcome', 'event_timestamp', "patient_id"], 
    axis=1)

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    stratify=y)

# Creating and training LogisticRegression
reg = LogisticRegression()
reg.fit(X=X_train[sorted(X_train)], y=y_train)

# Saving the model
dump(value=reg, filename="model.joblib")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['model.joblib']

### materialize_incremental (Load features to online store)

In [29]:
# Importing dependencies
from feast import FeatureStore
from datetime import datetime, timedelta

# Getting our FeatureStore
store = FeatureStore(repo_path=".")

# Code for loading features to online store between two dates
"""store.materialize(
    end_date=datetime.now(),
    start_date=datetime.now() - timedelta(days=700))"""

# Loading the latest features after a previous materialize call or from the beginning of time
store.materialize_incremental(end_date=datetime.now())

Materializing [1m[32m2[0m feature views to [1m[32m2022-06-26 22:19:47+05:30[0m into the [1m[32msqlite[0m online store.

[1m[32mpredictors_df_feature_view[0m from [1m[32m2022-06-25 16:49:47+05:30[0m to [1m[32m2022-06-26 22:19:47+05:30[0m:


100%|████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 142.45it/s]


[1m[32mtarget_feature_view[0m from [1m[32m2022-06-25 16:49:47+05:30[0m to [1m[32m2022-06-27 03:49:47+05:30[0m:


100%|████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 287.05it/s]


### Prediction

In [37]:
# Importing dependencies
from feast import FeatureStore
import pandas as pd
from joblib import load

# Getting our FeatureStore
store = FeatureStore(repo_path=".")

# Defining our features names
feast_features = [
        "predictors_df_feature_view:Pregnancies",
        "predictors_df_feature_view:Glucose",
        "predictors_df_feature_view:BloodPressure",
        "predictors_df_feature_view:SkinThickness",
        "predictors_df_feature_view:Insulin",
        "predictors_df_feature_view:BMI",
        "predictors_df_feature_view:DiabetesPedigreeFunction",
        "predictors_df_feature_view:Age",
    ]

# Getting the latest features
features = store.get_online_features(
    features=feast_features,    
    entity_rows=[{"patient_id": 767}, {"patient_id": 766}]
).to_dict()

# Converting the features to a DataFrame
features_df = pd.DataFrame.from_dict(data=features)

# Loading our model and doing inference
reg = load("model.joblib")
predictions = reg.predict(features_df[sorted(features_df.drop("patient_id", axis=1))])

In [38]:
predictions

array([0, 0], dtype=int64)