# Feature Engineering for Portfolio Management

This notebook demonstrates the FeatureEngineeringAgent functionality:
1. Import and initialize the FeatureEngineeringAgent
2. Process sample tickers (AAPL, MSFT) to create financial features
3. Validate output and display sample data from Unity Catalog tables
4. Analyze the generated features for ML readiness

## 1. Setup and Imports

Import the required libraries and add the project's `src` directory to the Python path. The Spark session itself is initialized in the next section.

In [None]:
# Import required libraries
import sys
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from datetime import datetime

# Make the project's src directory importable.
# The membership check keeps this cell idempotent: re-running it does not
# append another copy of the same entry to sys.path each time.
# NOTE(review): the path is relative, so it resolves against the kernel's
# current working directory — assumed to be the notebook's own folder.
SRC_PATH = '../src'
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

# Import our custom FeatureEngineeringAgent (resolved via the path added above)
from agents.feature_engineering_agent import FeatureEngineeringAgent, FeatureEngineeringError

print("‚úÖ Libraries imported successfully")
# Echo every sys.path entry containing 'src' so a wrong working directory is easy to spot
print(f"Python path includes: {[p for p in sys.path if 'src' in p]}")

## 2. Initialize Spark Session and FeatureEngineeringAgent

Create Spark session and initialize the feature engineering agent with Unity Catalog configuration.

In [None]:
# Obtain a Spark session.
# If a global named `spark` already exists (as is usual in Databricks) it is
# reused; otherwise referencing it raises NameError and a new session is built
# with the Delta Lake SQL extension and catalog configured.
try:
    spark_session = spark
except NameError:
    # No pre-existing session: build one locally
    delta_builder = (
        SparkSession.builder
        .appName("FeatureEngineering")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    )
    spark_session = delta_builder.getOrCreate()
    print("‚úÖ Created new Spark session")
else:
    print("‚úÖ Using existing Spark session from Databricks")

# Report which Spark runtime and application this notebook is attached to
print(f"Spark version: {spark_session.version}")
print(f"Spark application: {spark_session.sparkContext.appName}")

In [None]:
# Build the feature engineering agent and echo its configuration
print("üöÄ Initializing FeatureEngineeringAgent...")

# The agent is pointed at the `main` catalog and `finance` schema
feature_agent = FeatureEngineeringAgent(catalog="main", schema="finance")

print("‚úÖ FeatureEngineeringAgent initialized:")
# Read the settings back from the agent rather than repeating the literals
config_lines = (
    f"   - Catalog: {feature_agent.catalog}",
    f"   - Schema: {feature_agent.schema}",
    f"   - Target namespace: {feature_agent.catalog}.{feature_agent.schema}",
)
for config_line in config_lines:
    print(config_line)

## 3. Check Available Raw Data

Before running feature engineering, let's verify that the raw data tables exist in Unity Catalog.

In [None]:
# Check available tables in the finance schema
print("üìä Checking available tables in main.finance schema...")

# Tables whose name contains one of these fragments get a short data preview
preview_tickers = ('aapl', 'msft')
preview_columns = ("ticker", "date", "close", "volume")

# Step 1: list the schema. Only a failure here points at a catalog/schema problem.
try:
    tables = spark_session.sql("SHOW TABLES IN main.finance").collect()
except Exception as e:
    tables = None  # distinguishes "listing failed" from "schema is empty"
    print(f"‚ùå Error checking tables: {str(e)}")
    print("   This might indicate Unity Catalog is not properly configured")

# Step 2: print the table names and preview the ticker-related ones.
if tables:
    print("\nüìã Available tables:")
    for table in tables:
        table_name = table['tableName']
        print(f"   - {table_name}")

        # Skip tables that are unrelated to the target tickers
        lowered_name = table_name.lower()
        if not any(fragment in lowered_name for fragment in preview_tickers):
            continue

        print(f"     Sample data for {table_name}:")
        try:
            sample_df = spark_session.table(f"main.finance.{table_name}")
            sample_df.select(*preview_columns).limit(3).show()
        except Exception as preview_error:
            # A table without the preview columns must not abort the listing
            # or be misreported as a Unity Catalog configuration problem.
            print(f"     Could not preview {table_name}: {preview_error}")
elif tables is not None:
    print("‚ö†Ô∏è No tables found in main.finance schema")
    print("   Make sure to run the data ingestion notebook first")

## 4. Run Feature Engineering

Process AAPL and MSFT tickers to create financial features.

In [None]:
# Tickers to run through the feature engineering pipeline
target_tickers = ["AAPL", "MSFT"]

print(f"üîß Starting feature engineering for: {', '.join(target_tickers)}")
started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"   Start time: {started_at}")

try:
    # Hand the tickers to the agent; the summary below reads its result mapping
    results = feature_agent.process_tickers(target_tickers)

    print("\n‚úÖ Feature engineering completed!")
    elapsed_seconds = (results['end_time'] - results['start_time']).total_seconds()
    print(f"   Duration: {elapsed_seconds:.2f} seconds")
    print(f"   Total features created: {results['total_features_created']}")

    # Report successes and failures separately, only when there is something to list
    succeeded = results['processed_tickers']
    if succeeded:
        print(f"   ‚úÖ Successfully processed: {', '.join(succeeded)}")

    failed = results['failed_tickers']
    if failed:
        print(f"   ‚ùå Failed tickers: {', '.join(failed)}")

except Exception as error:
    # Any failure (in the pipeline or while reading its result) leaves
    # `results` as None so later cells can detect that the run did not finish.
    results = None
    print(f"‚ùå Feature engineering failed: {error}")
    print("   Check the logs for detailed error information")

## 5. Validate Output Tables

Check that the feature tables were created successfully in Unity Catalog.

In [None]:
# Confirm that a feature table exists for every target ticker
print("üîç Validating output tables...")

# Fully qualified names of the tables that were found; reused by later cells
feature_tables = []
for ticker in target_tickers:
    table_name = f"main.finance.features_{ticker}"

    try:
        # Guard clause: nothing more to report for a missing table
        if not spark_session.catalog.tableExists(table_name):
            print(f"‚ùå {table_name} does not exist")
            continue

        print(f"‚úÖ {table_name} exists")
        feature_tables.append(table_name)

        # Basic shape of the table: row count and column list
        df = spark_session.table(table_name)
        row_count = df.count()
        column_names = df.columns

        print(f"   üìä Table stats: {row_count:,} rows, {len(column_names)} columns")
        print(f"   üìã Columns: {', '.join(column_names)}")

    except Exception as error:
        print(f"‚ùå Error checking {table_name}: {error}")

print(f"\nüìà Total feature tables created: {len(feature_tables)}")

## 6. Display Sample Feature Data

Show sample data from the generated feature tables to verify quality.

In [None]:
# Show the latest rows and summary statistics for each feature table
print("üìã Sample Feature Data")
print("=" * 50)

for table_name in feature_tables:
    # The ticker is whatever follows the last underscore in the table name
    ticker = table_name.rsplit('_', 1)[-1]

    print(f"\nüè∑Ô∏è {ticker} Features ({table_name})")
    print("-" * 40)

    try:
        df = spark_session.table(table_name)

        # Newest five rows, ordered by date descending
        print("\nüìÖ Most Recent 5 Records:")
        latest_rows = df.orderBy(F.desc("date")).limit(5)
        latest_rows.show(truncate=False)

        # Spark's built-in summary() over the engineered feature columns
        print("\nüìä Feature Statistics:")
        feature_cols = ['daily_return', 'moving_avg_7', 'moving_avg_30', 'volatility_7', 'momentum']
        feature_summary = df.select(feature_cols).summary()
        feature_summary.show()

    except Exception as error:
        print(f"‚ùå Error displaying data for {table_name}: {error}")

## 7. Feature Quality Analysis

Analyze the quality and completeness of generated features.

In [None]:
# Analyze feature quality
print("üî¨ Feature Quality Analysis")
print("=" * 50)


def _format_stat(value):
    """Render a numeric aggregate with 4 decimals, or 'n/a' when it is None.

    Spark returns NULL (None in Python) for min/max/avg over an empty table or
    an all-NULL column; formatting None with ':.4f' would raise TypeError.
    """
    return "n/a" if value is None else f"{value:.4f}"


for table_name in feature_tables:
    ticker = table_name.split('_')[-1]

    print(f"\nüìà Analysis for {ticker}")
    print("-" * 30)

    try:
        df = spark_session.table(table_name)

        # Check for null values in key features
        feature_cols = ['daily_return', 'moving_avg_7', 'moving_avg_30', 'volatility_7', 'momentum']

        # Single aggregation pass: total row count plus the NULL count of every
        # feature column. when() without otherwise() yields NULL for rows that
        # do not match, and count() skips NULLs, so each count is the number
        # of NULL rows (0 on an empty table).
        null_count_exprs = [
            F.count(F.when(F.col(feature).isNull(), 1)).alias(f"{feature}_nulls")
            for feature in feature_cols
        ]
        completeness_row = df.agg(
            F.count(F.lit(1)).alias("total_rows"),
            *null_count_exprs,
        ).collect()[0]
        total_rows = completeness_row["total_rows"]

        print(f"üìä Data Completeness (out of {total_rows:,} total rows):")
        for feature in feature_cols:
            non_null_count = total_rows - completeness_row[f"{feature}_nulls"]
            # Guard against division by zero on an empty table
            completeness = (non_null_count / total_rows) * 100 if total_rows > 0 else 0

            status = "‚úÖ" if completeness >= 95 else "‚ö†Ô∏è" if completeness >= 80 else "‚ùå"
            print(f"   {status} {feature}: {completeness:.1f}% complete ({non_null_count:,} values)")

        # Check date range
        date_stats = df.select(
            F.min("date").alias("min_date"),
            F.max("date").alias("max_date"),
            F.count("date").alias("total_days")
        ).collect()[0]

        print(f"\nüìÖ Date Range:")
        print(f"   From: {date_stats['min_date']}")
        print(f"   To: {date_stats['max_date']}")
        print(f"   Total trading days: {date_stats['total_days']:,}")

        # Feature value ranges
        print(f"\nüìè Feature Ranges:")
        for feature in ['daily_return', 'volatility_7', 'momentum']:
            stats = df.select(
                F.min(feature).alias('min_val'),
                F.max(feature).alias('max_val'),
                F.avg(feature).alias('avg_val')
            ).collect()[0]

            # _format_stat keeps the report going when an aggregate is NULL
            print(
                f"   {feature}: [{_format_stat(stats['min_val'])}, {_format_stat(stats['max_val'])}] "
                f"(avg: {_format_stat(stats['avg_val'])})"
            )

    except Exception as e:
        print(f"‚ùå Error analyzing {table_name}: {str(e)}")

## 8. Verification Queries

Run some verification queries to ensure data consistency and feature correctness.

In [None]:
# Run verification queries
print("üîç Data Verification Queries")
print("=" * 50)

# In both queries the window function is computed in a CTE over the full
# table and the date filter is applied afterwards. If the filter sat in the
# same SELECT as the window function, it would run first: LAG and the 7-row
# average would then only see the last few days, giving a NULL prev_close on
# the oldest row and averages built from fewer than 7 rows.

if feature_tables:
    # Query 1: Check if daily returns are calculated correctly
    print("\nüìä Query 1: Daily Return Calculation Verification")
    print("Checking if daily_return = (close - prev_close) / prev_close")

    for table_name in feature_tables[:1]:  # Check first table only
        ticker = table_name.split('_')[-1]

        verification_query = f"""
        WITH with_prev_close AS (
            SELECT
                ticker,
                date,
                close,
                LAG(close, 1) OVER (PARTITION BY ticker ORDER BY date) AS prev_close,
                daily_return
            FROM {table_name}
        )
        SELECT
            ticker,
            date,
            close,
            prev_close,
            daily_return,
            ROUND((close - prev_close) / prev_close, 6) AS calculated_return
        FROM with_prev_close
        WHERE date >= (SELECT MAX(date) - INTERVAL 7 DAYS FROM {table_name})
        ORDER BY date DESC
        LIMIT 5
        """

        try:
            result = spark_session.sql(verification_query)
            print(f"\n{ticker} - Recent daily returns:")
            result.show(truncate=False)
        except Exception as e:
            print(f"‚ùå Error in verification query: {str(e)}")

    # Query 2: Check moving averages
    print("\nüìà Query 2: Moving Average Verification")
    print("Checking 7-day and 30-day moving averages")

    for table_name in feature_tables[:1]:  # Check first table only
        ticker = table_name.split('_')[-1]

        ma_query = f"""
        WITH with_ma7 AS (
            SELECT
                date,
                close,
                moving_avg_7,
                moving_avg_30,
                ROUND(
                    AVG(close) OVER (
                        PARTITION BY ticker
                        ORDER BY date
                        ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
                    ), 2
                ) AS calculated_ma7
            FROM {table_name}
        )
        SELECT
            date,
            close,
            moving_avg_7,
            moving_avg_30,
            calculated_ma7
        FROM with_ma7
        WHERE date >= (SELECT MAX(date) - INTERVAL 10 DAYS FROM {table_name})
        ORDER BY date DESC
        LIMIT 5
        """

        try:
            result = spark_session.sql(ma_query)
            print(f"\n{ticker} - Recent moving averages:")
            result.show(truncate=False)
        except Exception as e:
            print(f"‚ùå Error in moving average query: {str(e)}")

else:
    print("‚ö†Ô∏è No feature tables available for verification")

## 9. Summary and Next Steps

Summarize the feature engineering results and provide guidance for next steps.

In [None]:
# Final summary
print("üìã Feature Engineering Summary")
print("=" * 50)

# `results` is missing if the feature engineering cell was never run and is
# None if that cell failed; globals().get covers both without raising.
run_results = globals().get('results')
processed_tickers = run_results['processed_tickers'] if run_results else []

# Only report success when at least one ticker was actually processed; a
# result with zero processed tickers is not a successful run.
if processed_tickers:
    print(f"\n‚úÖ Feature Engineering Completed Successfully")
    print(f"   - Target tickers: {', '.join(target_tickers)}")
    print(f"   - Successfully processed: {len(processed_tickers)} tickers")
    print(f"   - Failed: {len(run_results['failed_tickers'])} tickers")
    print(f"   - Total features created: {run_results['total_features_created']:,}")
    print(f"   - Processing time: {(run_results['end_time'] - run_results['start_time']).total_seconds():.2f} seconds")

    print(f"\nüìä Available Feature Tables:")
    for ticker in processed_tickers:
        print(f"   - main.finance.features_{ticker}")

    print(f"\nüéØ Created Features:")
    feature_list = [
        "daily_return (Daily price return)",
        "moving_avg_7 (7-day moving average)",
        "moving_avg_30 (30-day moving average)",
        "volatility_7 (7-day rolling volatility)",
        "momentum (Price momentum indicator)",
        "feature_timestamp (Feature creation date)"
    ]

    for feature in feature_list:
        print(f"   ‚úÖ {feature}")

else:
    print("‚ùå Feature engineering did not complete successfully")
    # The pipeline ran but every ticker failed: name them for the retry
    if run_results and run_results['failed_tickers']:
        print(f"   Failed tickers: {', '.join(run_results['failed_tickers'])}")
    print("   Please check the error messages above and retry")

print("\nüöÄ Next Steps:")
print("   1. Review the generated features for data quality")
print("   2. Use these feature tables for ML model training")
print("   3. Set up scheduled jobs for regular feature updates")
print("   4. Consider adding more advanced features (technical indicators, etc.)")
print("   5. Implement feature monitoring and alerting")

print(f"\n‚ú® Feature engineering notebook completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")