In [1]:
import pandas as pd

In [2]:
# Load the raw diabetes table from a CSV file in the current working directory.
data = pd.read_csv('diabetes.csv')

In [7]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [3]:
# Preview the first rows of the raw table.
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Split the raw table into the label column and the model inputs
# (every column except the label).
# The name `taregt_df` (sic) is kept because later cells refer to it.
taregt_df = data['Outcome']
predictors_df = data.drop(columns='Outcome')

In [8]:

# One synthetic event time per row of `data`: daily steps that end at the
# moment this cell is executed, stored in a single "event_timestamp" column.
timestamps = pd.DataFrame(
    {
        "event_timestamp": pd.date_range(
            end=pd.Timestamp.now(),
            periods=len(data),
            freq='D',
        )
    }
)

In [9]:

# Attach the event timestamp to both tables.
# `assign` is used instead of `pd.concat(..., axis=1)` because it overwrites an
# existing column of the same name, so re-running this cell no longer appends
# a second `event_timestamp` column.
predictors_df = predictors_df.assign(event_timestamp=timestamps["event_timestamp"])
# `taregt_df` is a Series the first time this runs and a DataFrame afterwards;
# wrapping it in pd.DataFrame handles both cases.
taregt_df = pd.DataFrame(taregt_df).assign(event_timestamp=timestamps["event_timestamp"])

In [12]:
# Surrogate entity key: one integer id per row of `data`, counting from 0.
patient_ids = pd.DataFrame(data=list(range(len(data))), columns=["patient_id"])

# Attach the id to both tables.
# `assign` is used instead of `pd.concat(..., axis=1)` because it overwrites an
# existing column of the same name, so re-running this cell no longer appends
# a second `patient_id` column.
predictors_df = predictors_df.assign(patient_id=patient_ids["patient_id"])
# pd.DataFrame(...) accepts `taregt_df` whether it is still a Series or
# already a DataFrame.
taregt_df = pd.DataFrame(taregt_df).assign(patient_id=patient_ids["patient_id"])

In [13]:
# Display the predictors frame, now including event_timestamp and patient_id.
predictors_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,event_timestamp,patient_id
0,6,148,72,35,0,33.6,0.627,50,2020-05-20 20:20:23.701972,0
1,1,85,66,29,0,26.6,0.351,31,2020-05-21 20:20:23.701972,1
2,8,183,64,0,0,23.3,0.672,32,2020-05-22 20:20:23.701972,2
3,1,89,66,23,94,28.1,0.167,21,2020-05-23 20:20:23.701972,3
4,0,137,40,35,168,43.1,2.288,33,2020-05-24 20:20:23.701972,4
...,...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,2022-06-22 20:20:23.701972,763
764,2,122,70,27,0,36.8,0.340,27,2022-06-23 20:20:23.701972,764
765,5,121,72,23,112,26.2,0.245,30,2022-06-24 20:20:23.701972,765
766,1,126,60,0,0,30.1,0.349,47,2022-06-25 20:20:23.701972,766


In [14]:
# Preview the target frame: Outcome plus event_timestamp and patient_id.
taregt_df.head()

Unnamed: 0,Outcome,event_timestamp,patient_id
0,1,2020-05-20 20:20:23.701972,0
1,0,2020-05-21 20:20:23.701972,1
2,1,2020-05-22 20:20:23.701972,2
3,0,2020-05-23 20:20:23.701972,3
4,1,2020-05-24 20:20:23.701972,4


In [15]:
# Persist both tables as parquet files in the current working directory.
# NOTE(review): the training cell further down reads "data/target_df.parquet"
# after changing into diabetes_pred_repo, so these files presumably have to be
# moved there by hand -- confirm and align the paths.
for frame, filename in (
    (predictors_df, 'predictors_df.parquet'),
    (taregt_df, 'target_df.parquet'),
):
    frame.to_parquet(path=filename)

In [16]:
# Change the working directory into the feature repository so that the
# relative paths used by the following cells resolve inside it.
%cd diabetes_pred_repo

C:\Users\Ashutosh Tripathi\Documents\mlops\diabetes_pred_repo


In [None]:
# feast init -m diabetes_pred_repo  # -m (--minimal) creates an empty feature repository

In [24]:
# Register the entity and feature views defined in this repository with Feast.
!feast apply

Created entity patient_id
Created feature view predictors_df_feature_view
Created feature view target_feature_view

Created sqlite table diabetes_pred_repo_predictors_df_feature_view
Created sqlite table diabetes_pred_repo_target_feature_view





### Generating Training Data Set

In [None]:
from feast import FeatureStore
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage

# Open the feature repository in the current working directory
# (the notebook has already changed into diabetes_pred_repo).
store = FeatureStore(repo_path=".")

# The target table is used as the entity DataFrame for the historical lookup.
entity_df = pd.read_parquet(path="data/target_df.parquet")

# Request every predictor column from the predictors feature view and have
# Feast join them onto the rows of the entity DataFrame.
training_data = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "predictors_df_feature_view:Pregnancies",
        "predictors_df_feature_view:Glucose",
        "predictors_df_feature_view:BloodPressure",
        "predictors_df_feature_view:SkinThickness",
        "predictors_df_feature_view:Insulin",
        "predictors_df_feature_view:BMI",
        "predictors_df_feature_view:DiabetesPedigreeFunction",
        "predictors_df_feature_view:Age",
    ]
)

# Store the retrieved training set as a saved dataset backed by a local file.
# The name and path previously said "breast_cancer", left over from another
# project; they now match this diabetes repository and write into its
# existing data/ directory.
# NOTE(review): any later lookup of the dataset by name must use
# "diabetes_dataset".
dataset = store.create_saved_dataset(
    from_=training_data,
    name="diabetes_dataset",
    storage=SavedDatasetFileStorage("data/diabetes_dataset.parquet")
)