In [1]:
!feast init iris_project



Creating a new Feast repository in [1m[32m/home/jupyter/iris_project[0m.



In [2]:
import pandas as pd
from datetime import datetime

# Load iris dataset
df = pd.read_csv("dataset/iris.csv")


df["event_timestamp"] = datetime.utcnow()  
df["flower_id"] = df.index 


output_path = "iris_project/feature_repo/data"
import os
os.makedirs(output_path, exist_ok=True)
df.to_parquet(f"{output_path}/iris_data.parquet", index=False)

print("iris_data.parquet saved successfully!")


iris_data.parquet saved successfully!


In [3]:
from datetime import timedelta
from feast import Entity, FeatureView, Field, FileSource
from feast.value_type import ValueType  
from feast.types import Float32  

# 1. Entity definition
flower = Entity(
    name="flower_id",
    join_keys=["flower_id"],
    value_type=ValueType.INT64  
)

# 2. Source: offline store location (parquet file)
iris_source = FileSource(
    path="data/iris_data.parquet",
    timestamp_field="event_timestamp"
)

# 3. Feature View
iris_fv = FeatureView(
    name="iris_features",
    entities=[flower],
    ttl=timedelta(days=1),
    schema=[
        Field(name="sepal_length", dtype=Float32),
        Field(name="sepal_width", dtype=Float32),
        Field(name="petal_length", dtype=Float32),
        Field(name="petal_width", dtype=Float32),
    ],
    online=True,
    source=iris_source,
)




In [4]:
from feast import FeatureStore

# Initialize the store
store = FeatureStore(repo_path="iris_project/feature_repo")

# Apply the definitions
store.apply([flower, iris_fv])




In [5]:
from datetime import datetime

# Materialize features from past to now
store.materialize(
    start_date=datetime(2025, 1, 1),
    end_date=datetime.utcnow()
)


Materializing [1m[32m1[0m feature views from [1m[32m2025-01-01 00:00:00+00:00[0m to [1m[32m2025-06-22 17:56:19+00:00[0m into the [1m[32msqlite[0m online store.

[1m[32miris_features[0m:


100%|███████████████████████████████████████████████████████████| 150/150 [00:00<00:00, 7175.80it/s]


In [6]:
# Online retrieval of features for a specific flower_id
online_features = store.get_online_features(
    features=[
        "iris_features:sepal_length",
        "iris_features:sepal_width",
        "iris_features:petal_length",
        "iris_features:petal_width",
    ],
    entity_rows=[{"flower_id": 1}]  # Try 1 through 150
).to_df()

print(online_features)


   flower_id  sepal_width  petal_length  sepal_length  petal_width
0          1          3.0           1.4           4.9          0.2


In [7]:
training_df = store.get_historical_features(
    entity_df=df,  # DataFrame with flower_id + timestamp
    features=[
        "iris_features:sepal_length",
        "iris_features:sepal_width",
        "iris_features:petal_length",
        "iris_features:petal_width",
    ],
).to_df()

print(training_df.head())


   sepal_length  sepal_width  petal_length  petal_width     species  \
0           5.1          3.5           1.4          0.2      setosa   
1           5.7          2.9           4.2          1.3  versicolor   
2           6.2          2.9           4.3          1.3  versicolor   
3           5.1          2.5           3.0          1.1  versicolor   
4           5.7          2.8           4.1          1.3  versicolor   

                   event_timestamp  flower_id  sepal_length__  sepal_width__  \
0 2025-06-22 17:54:44.168951+00:00          0             5.1            3.5   
1 2025-06-22 17:54:44.168951+00:00         96             5.7            2.9   
2 2025-06-22 17:54:44.168951+00:00         97             6.2            2.9   
3 2025-06-22 17:54:44.168951+00:00         98             5.1            2.5   
4 2025-06-22 17:54:44.168951+00:00         99             5.7            2.8   

   petal_length__  petal_width__  
0             1.4            0.2  
1             4.2     

In [8]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


df = pd.read_csv("dataset/iris.csv")
df["flower_id"] = df.index  # 


entity_rows = [{"flower_id": int(i)} for i in df["flower_id"]]

feature_names = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
online_features = store.get_online_features(
    features=[f"iris_features:{f}" for f in feature_names],
    entity_rows=entity_rows
).to_df()


feast_data = online_features.merge(df[["flower_id", "species"]], on="flower_id")

# 4. Split into training and testing sets
X = feast_data[feature_names]
y = feast_data["species"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)

# 5. Train Decision Tree model
model = DecisionTreeClassifier(max_depth=3, random_state=1)
model.fit(X_train, y_train)

# 6. Predict and evaluate
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print(f" Accuracy using Feast features: {accuracy:.4f}")
print("\n Classification Report:")
print(classification_report(y_test, predictions))


 Accuracy using Feast features: 0.9833

 Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        20
  versicolor       1.00      0.95      0.97        20
   virginica       0.95      1.00      0.98        20

    accuracy                           0.98        60
   macro avg       0.98      0.98      0.98        60
weighted avg       0.98      0.98      0.98        60



In [9]:
import joblib
import os

os.makedirs("artifacts", exist_ok=True)

# Save the trained model
joblib.dump(model, "artifacts/model.joblib")

print(" Model saved to artifacts/model.joblib")


 Model saved to artifacts/model.joblib


In [10]:
import joblib
from sklearn.metrics import accuracy_score

model = joblib.load("artifacts/model.joblib")


feature_names = ["sepal_length", "sepal_width", "petal_length", "petal_width"]


In [11]:
data = df.copy()
data = data.reset_index(drop=True)
data["flower_id"] = data.index
entity_rows = [{"flower_id": int(fid)} for fid in data["flower_id"]]


In [12]:
from feast import FeatureStore

store = FeatureStore(repo_path="iris_project/feature_repo")

online_features = store.get_online_features(
    features=[f"iris_features:{f}" for f in feature_names],
    entity_rows=entity_rows
).to_df()




In [13]:
# Define the correct feature order
ordered_feature_names = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

# Drop non-feature columns and reorder features
X_online = online_features[ordered_feature_names]

# Predict
y_pred = model.predict(X_online)

# True labels (assumes same order as feature rows)
y_true = data["species"]

# Accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_true, y_pred)
print(f" Accuracy using online features: {accuracy:.4f}")


 Accuracy using online features: 0.9800


In [14]:
from google.cloud import storage


bucket_name = "week-3_assignment_feast"  # Change this to your GCS bucket
destination_blob_name = "models/iris_decision_tree.joblib"
source_file_name = "artifacts/model.joblib"

# Initialize client
client = storage.Client()
bucket = client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)

# Upload the model
blob.upload_from_filename(source_file_name)

print(f" Uploaded {source_file_name} to gs://{bucket_name}/{destination_blob_name}")


 Uploaded artifacts/model.joblib to gs://week-3_assignment_feast/models/iris_decision_tree.joblib
