# Feature Store Example

This notebook demonstrates how to use the Feature Store to register, compute, and serve features for the Mental Health Risk Assessment System.

## Setup

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from datetime import datetime

from src.ds.feature_store import FeatureStore, FeatureDefinition
from src.database.connection import get_db_connection

## Initialize Feature Store

In [None]:
# Initialize feature store
# `db` comes from the project's get_db_connection() helper; the same handle is
# passed to DataVersionControl further down in this notebook.
db = get_db_connection()
# NOTE(review): cache_backend=None presumably runs the store without a cache
# layer - confirm against FeatureStore's constructor.
feature_store = FeatureStore(
    db_connection=db,
    cache_backend=None  # Set to "redis" for production
)

print("✓ Feature Store initialized")

## Create Sample Data

In [None]:
# Generate synthetic patient data
#
# np.random.randint excludes its upper bound, so every bounded score below is
# drawn with `max + 1` to make the top value reachable. The maxima match the
# denominators used elsewhere in this notebook: the severity_score
# transformation divides by 27, 21 and 80, and protective_factors_score
# divides social_support_score by 10.
np.random.seed(42)
n_patients = 100

PHQ9_MAX = 27            # PHQ-9 total score range is 0-27
GAD7_MAX = 21            # GAD-7 total score range is 0-21
PCL5_MAX = 80            # PCL-5 total score range is 0-80
SOCIAL_SUPPORT_MAX = 10  # social support is normalised by 10 downstream

raw_data = pd.DataFrame({
    'patient_id': [f"P{i:04d}" for i in range(1, n_patients + 1)],
    'age': np.random.randint(18, 80, n_patients),
    'gender': np.random.choice(['M', 'F', 'Other'], n_patients),
    'phq9_score': np.random.randint(0, PHQ9_MAX + 1, n_patients),
    'gad7_score': np.random.randint(0, GAD7_MAX + 1, n_patients),
    'pcl5_score': np.random.randint(0, PCL5_MAX + 1, n_patients),
    'sleep_hours': np.random.uniform(3, 10, n_patients),
    'previous_episodes': np.random.randint(0, 5, n_patients),
    'social_support_score': np.random.randint(1, SOCIAL_SUPPORT_MAX + 1, n_patients)
})

# Cheap invariant: patient ids are the entity key used for online serving.
assert raw_data['patient_id'].is_unique

print(f"Raw data shape: {raw_data.shape}")
raw_data.head()

## Register Simple Features

In [None]:
# Feature 1: Normalized age
# The transformation source lives in its own variable so the definition below
# reads as plain metadata. It maps age 18 -> 0.0 and age 100 -> 1.0.
age_norm_code = """
def transform(df):
    return (df['age'] - 18) / (100 - 18)
"""

age_norm_def = FeatureDefinition(
    feature_name="age_normalized",
    feature_type="numeric",
    owner="data_science_team",
    version="v1.0",
    description="Age normalized to 0-1 scale (18-100 years)",
    input_schema={"age": "int"},
    output_schema={"age_normalized": "float"},
    dependencies=[],
    transformation_code=age_norm_code,
)

feature_store.register_feature("age_normalized", age_norm_def)
print("✓ Registered: age_normalized")

In [None]:
# Feature 2: Composite severity score
# Each instrument is scaled by its maximum (27 / 21 / 80), weighted
# 0.4 / 0.3 / 0.3, summed, and multiplied by 100.
severity_code = """
def transform(df):
    weights = {'phq9': 0.4, 'gad7': 0.3, 'pcl5': 0.3}
    score = (
        df['phq9_score'] / 27 * weights['phq9'] +
        df['gad7_score'] / 21 * weights['gad7'] +
        df['pcl5_score'] / 80 * weights['pcl5']
    )
    return score * 100  # Scale to 0-100
"""

severity_inputs = {
    "phq9_score": "int",
    "gad7_score": "int",
    "pcl5_score": "int"
}

severity_def = FeatureDefinition(
    feature_name="severity_score",
    feature_type="numeric",
    owner="clinical_team",
    version="v1.0",
    description="Weighted composite severity score from PHQ-9, GAD-7, and PCL-5",
    input_schema=severity_inputs,
    output_schema={"severity_score": "float"},
    dependencies=[],
    transformation_code=severity_code,
)

feature_store.register_feature("severity_score", severity_def)
print("✓ Registered: severity_score")

In [None]:
# Feature 3: Risk category
# Buckets severity_score into low (0-30], medium (30-60], high (60-100].
# include_lowest=True closes the first bin on the left so a score of exactly 0
# is labelled 'low'. Without it pd.cut returns NaN for 0, and the trailing
# .astype(str) would turn that into a bogus 'nan' category.
risk_category_def = FeatureDefinition(
    feature_name="risk_category",
    feature_type="categorical",
    description="Risk category based on severity score",
    transformation_code="""
def transform(df):
    import pandas as pd
    return pd.cut(
        df['severity_score'],
        bins=[0, 30, 60, 100],
        labels=['low', 'medium', 'high'],
        include_lowest=True
    ).astype(str)
""",
    input_schema={"severity_score": "float"},
    output_schema={"risk_category": "str"},
    version="v1.0",
    dependencies=["severity_score"],  # Depends on severity_score
    owner="clinical_team"
)

feature_store.register_feature("risk_category", risk_category_def)
print("✓ Registered: risk_category")

## Register Complex Features

In [None]:
# Feature 4: Sleep quality indicator
# Inside the 7-9 hour band the score is 1.0. Outside it the score is
# 1 - |hours - 8| / 5, floored at 0. Because distance is measured from 8
# rather than from the band edge, the score steps from 1.0 to 0.8 when
# crossing 7 or 9 hours.
# NOTE(review): confirm that step is intended.
# The transform returns the np.where result (an ndarray), not a Series.
sleep_quality_code = """
def transform(df):
    import numpy as np
    # Optimal sleep is 7-9 hours
    optimal_min, optimal_max = 7, 9
    quality = np.where(
        (df['sleep_hours'] >= optimal_min) & (df['sleep_hours'] <= optimal_max),
        1.0,
        1.0 - np.minimum(np.abs(df['sleep_hours'] - 8) / 5, 1.0)
    )
    return quality
"""

sleep_quality_def = FeatureDefinition(
    feature_name="sleep_quality_indicator",
    feature_type="numeric",
    owner="data_science_team",
    version="v1.0",
    description="Sleep quality score (0-1, higher is better)",
    input_schema={"sleep_hours": "float"},
    output_schema={"sleep_quality_indicator": "float"},
    dependencies=[],
    transformation_code=sleep_quality_code,
)

feature_store.register_feature("sleep_quality_indicator", sleep_quality_def)
print("✓ Registered: sleep_quality_indicator")

In [None]:
# Feature 5: Protective factors score
# social_support_score / 10 is weighted 0.6 and sleep_quality_indicator is
# weighted 0.4; the blend is multiplied by 100. The transform reads the
# sleep_quality_indicator column, which is why that feature is declared as a
# dependency.
protective_code = """
def transform(df):
    # Combine social support and sleep quality
    social_norm = df['social_support_score'] / 10
    protective = (social_norm * 0.6 + df['sleep_quality_indicator'] * 0.4) * 100
    return protective
"""

protective_inputs = {
    "social_support_score": "int",
    "sleep_quality_indicator": "float"
}

protective_def = FeatureDefinition(
    feature_name="protective_factors_score",
    feature_type="numeric",
    owner="clinical_team",
    version="v1.0",
    description="Composite score of protective factors (social support, sleep)",
    input_schema=protective_inputs,
    output_schema={"protective_factors_score": "float"},
    dependencies=["sleep_quality_indicator"],
    transformation_code=protective_code,
)

feature_store.register_feature("protective_factors_score", protective_def)
print("✓ Registered: protective_factors_score")

## Compute Features

In [None]:
# Compute all features
# The list names every feature registered above. risk_category and
# protective_factors_score declare dependencies on severity_score and
# sleep_quality_indicator respectively, and both of those are requested here
# and listed before their dependents.
# NOTE(review): whether compute_features resolves dependencies itself or relies
# on this ordering is not visible from the notebook - confirm.
feature_names = [
    "age_normalized",
    "severity_score",
    "risk_category",
    "sleep_quality_indicator",
    "protective_factors_score"
]

# `features` is used below as a pandas DataFrame (.shape, .columns, .head()).
features = feature_store.compute_features(
    feature_names=feature_names,
    input_data=raw_data
)

print(f"Computed features shape: {features.shape}")
print(f"\nFeature columns: {list(features.columns)}")
features.head()

## Analyze Computed Features

In [None]:
# Summary statistics
# float64 / int64 columns get mean, std, min and max to three decimals.
# Every other column gets its value counts.
print("Feature Statistics:")
print("=" * 60)
for col in features.columns:
    column_values = features[col]
    print(f"\n{col}:")

    if column_values.dtype not in ['float64', 'int64']:
        print(f"  Value counts:\n{column_values.value_counts()}")
        continue

    for stat_name, stat_value in (
        ("Mean", column_values.mean()),
        ("Std", column_values.std()),
        ("Min", column_values.min()),
        ("Max", column_values.max()),
    ):
        print(f"  {stat_name}: {stat_value:.3f}")

In [None]:
# Visualize feature distributions
# 2x2 grid: three histograms plus a bar chart of risk-category counts.
import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('Feature Distributions', fontsize=16)

# Age normalized
axes[0, 0].hist(features['age_normalized'], bins=20, edgecolor='black')
axes[0, 0].set_title('Age Normalized')
axes[0, 0].set_xlabel('Value')
axes[0, 0].set_ylabel('Frequency')

# Severity score
axes[0, 1].hist(features['severity_score'], bins=20, edgecolor='black', color='orange')
axes[0, 1].set_title('Severity Score')
axes[0, 1].set_xlabel('Score (0-100)')
axes[0, 1].set_ylabel('Frequency')

# Risk category
# value_counts() orders by frequency, so a positional colour list can paint
# 'high' green or 'low' red. Colours are looked up by category name instead,
# and bars are drawn in low -> medium -> high order. Any unexpected label
# (e.g. 'nan') is kept, drawn last, in gray.
risk_colors = {'low': 'green', 'medium': 'yellow', 'high': 'red'}
risk_counts = features['risk_category'].value_counts()
risk_order = [cat for cat in risk_colors if cat in risk_counts.index]
risk_order += [cat for cat in risk_counts.index if cat not in risk_colors]
risk_counts = risk_counts.reindex(risk_order)
axes[1, 0].bar(
    risk_counts.index,
    risk_counts.values,
    color=[risk_colors.get(cat, 'gray') for cat in risk_counts.index],
)
axes[1, 0].set_title('Risk Category Distribution')
axes[1, 0].set_xlabel('Category')
axes[1, 0].set_ylabel('Count')

# Protective factors
axes[1, 1].hist(features['protective_factors_score'], bins=20, edgecolor='black', color='green')
axes[1, 1].set_title('Protective Factors Score')
axes[1, 1].set_xlabel('Score (0-100)')
axes[1, 1].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## Feature Correlations

In [None]:
# Compute correlations
# Only float64 / int64 columns take part; corr() is called with its defaults.
numeric_features = features.select_dtypes(include=['float64', 'int64'])
correlations = numeric_features.corr()

# Visualize correlation matrix
import seaborn as sns

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlations,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    ax=corr_ax,
)
corr_ax.set_title('Feature Correlation Matrix')
corr_fig.tight_layout()
plt.show()

## Use Features for Training

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Create target variable (high risk if severity > 60)
y = (features['severity_score'] > 60).astype(int)

# Select features for training.
# severity_score is deliberately NOT a predictor: the target is a threshold on
# that very column, so including it leaks the label and the model would score
# perfectly while learning nothing. With the synthetic data in this notebook
# the remaining columns are generated independently of the target, so expect
# near-chance performance here.
X = features[[
    'age_normalized',
    'sleep_quality_indicator',
    'protective_factors_score'
]]

# Split data.
# Stratify so both splits keep the class ratio. Fall back to a plain split
# when a class is too small to stratify (fewer than 2 members).
can_stratify = y.nunique() > 1 and y.value_counts().min() >= 2
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
    stratify=y if can_stratify else None
)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate.
# labels=[0, 1] keeps target_names aligned even if one class is missing from
# the test split or the predictions (otherwise classification_report raises).
# zero_division=0 reports 0 instead of warning when a class is never predicted.
y_pred = model.predict(X_test)
print("Model Performance:")
print(classification_report(
    y_test,
    y_pred,
    labels=[0, 1],
    target_names=['Low Risk', 'High Risk'],
    zero_division=0
))

## Feature Importance

In [None]:
# Plot feature importance
# Pair each training column with the fitted model's importance, ranked from
# most to least important.
importance_df = (
    pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    })
    .sort_values('importance', ascending=False)
)

imp_fig, imp_ax = plt.subplots(figsize=(10, 6))
imp_ax.barh(importance_df['feature'], importance_df['importance'])
imp_ax.set_xlabel('Importance')
imp_ax.set_title('Feature Importance for Risk Prediction')
imp_fig.tight_layout()
plt.show()

print("\nFeature Importance:")
print(importance_df)

## Materialize Features

In [None]:
# Materialize features for faster access
# Two steps: register the raw input frame as a versioned dataset, then ask the
# feature store to materialize the registered features against that version.
from src.ds.data_versioning import DataVersionControl
from src.ds.storage import FileSystemStorage

# Version the input dataset
# The storage path is relative to the notebook's working directory.
# NOTE(review): presumably register_dataset writes under this path - confirm.
storage = FileSystemStorage(base_path="../data/versions")
dvc = DataVersionControl(storage_backend=storage, db_connection=db)

dataset_version = dvc.register_dataset(
    dataset=raw_data,
    dataset_name="patient_data",
    source="synthetic"
)

# Materialize features
# version_id is converted to str before being handed to the feature store.
# NOTE(review): its original type (UUID? int?) is not visible here.
materialized_path = feature_store.materialize_features(
    feature_names=feature_names,
    dataset_version_id=str(dataset_version.version_id)
)

print(f"✓ Features materialized to: {materialized_path}")

## Online Feature Serving

In [None]:
# Simulate online serving for a single patient
# "P0001" is the first id generated in the synthetic raw_data frame.
patient_id = "P0001"

# In production, this would fetch from cache or compute on-demand
# NOTE(review): how mode="online" locates this entity's data (materialized
# output, cache, or recomputation) is not visible from the notebook - confirm
# against FeatureStore.get_features.
patient_features = feature_store.get_features(
    feature_names=feature_names,
    entity_ids=[patient_id],
    mode="online"
)

print(f"Features for {patient_id}:")
print(patient_features)

## Summary

This notebook demonstrated:

1. **Registering features** with transformation code and metadata
2. **Computing features** from raw data
3. **Feature dependencies** (e.g., risk_category depends on severity_score)
4. **Analyzing features** with statistics and visualizations
5. **Using features** for model training
6. **Materializing features** for faster access
7. **Online serving** for real-time predictions

### Key Benefits

- **Consistency**: Same features in training and inference
- **Reusability**: Share features across projects
- **Versioning**: Track feature evolution
- **Performance**: Cache and materialize for speed

### Next Steps

- Add more complex features (embeddings, time-series)
- Set up Redis for production caching
- Implement feature monitoring
- Create feature groups for different use cases