In [0]:
# Install the Databricks Feature Engineering library.
# It provides the FeatureEngineeringClient used later in this notebook to
# create feature tables in Unity Catalog.
# NOTE(review): no version is pinned here, so the installed release may differ
# between runs — consider pinning for reproducibility.
%pip install databricks-feature-engineering

# Restart the Python process so the freshly installed package is picked up.
# NOTE(review): a restart presumably discards any variables defined before it —
# keep this cell first in the notebook.
dbutils.library.restartPython()

# Feature Tables Creation

## Overview
In this notebook, we will create feature tables from CSV files hosted on GitHub.

## Dataset Description
The data represents an e-commerce business with the following entities:
- **customers**: Customer information and profiles
- **orders**: Order transactions and details
- **order_items**: Individual items within each order
- **products**: Product catalog and specifications
- **shipping_zones**: Geographic shipping information

## Purpose
These feature tables will be used for:
- Machine Learning model training
- Real-time feature serving
- Analytics and reporting
- Data lineage tracking through Unity Catalog

In [0]:
%run ../_config/config_unity_catalog

In [0]:
# ====================================================================
# STEP 1: LOAD CSV FILES FROM GITHUB TO UNITY CATALOG VOLUME
# ====================================================================

# Base URL of the GitHub repository that hosts the raw CSV files
prefix = "https://github.com/O-Faraday/databricks_genai_demo/raw/main/"

# Staging area for raw data before creating feature tables.
# `catalog` and `schema` are not defined in this notebook; they are presumably
# provided by the configuration notebook executed via %run above.
path_volume = f"/Volumes/{catalog}/{schema}/raw_data/"

# Names (without extension) of the CSV files to download and process
l_csv_names = [
    "customers",        # Customer master data (demographics, contact info)
    "orders",           # Order transactions (order_id, date, status, total)
    "order_items",      # Line items for each order (product_id, quantity, price)
    "products",         # Product catalog (name, category, price, inventory)
    "shipping_zones"    # Shipping zones and rates by geography
]

# Copy each CSV file from GitHub into the `csv/` folder of the volume
for csv_name in l_csv_names:
    # Remote location of the file in the GitHub repository
    source_path = f"{prefix}data/csv/{csv_name}.csv"

    # `path_volume` already ends with "/", so strip it before joining;
    # otherwise the destination would contain a double slash ("raw_data//csv").
    destination_path = f"{path_volume.rstrip('/')}/csv/{csv_name}.csv"

    # Copy the file using Databricks File System utilities
    dbutils.fs.cp(source_path, destination_path)

## Load CSV Data into DataFrames

### Purpose
In this step, we define the helper functions that load the CSV files from the Unity Catalog volume into Spark DataFrames and prepare them for use as feature tables.



In [0]:
# ====================================================================
# UTILITY FUNCTIONS FOR DATA TRANSFORMATION
# ====================================================================

from pyspark.sql.functions import monotonically_increasing_id

def addIdColumn(dataframe, id_column_name):
    """
    Prepend a generated ID column to a Spark DataFrame.

    The ID values come from ``monotonically_increasing_id()`` and are meant to
    act as the primary key when the result is registered as a feature table.

    Args:
        dataframe: Spark DataFrame to extend.
        id_column_name: Name of the generated ID column
            (e.g. 'customer_feature_id').

    Returns:
        A DataFrame whose first column is ``id_column_name``, followed by the
        columns of ``dataframe`` in their original order.

    Note:
        - The generated IDs are increasing and may contain gaps.
        - NOTE(review): the IDs presumably depend on how the data is
          partitioned, so they should not be assumed stable across runs.
    """
    # Remember the input layout before the ID column is attached
    original_columns = list(dataframe.columns)

    # Attach the generated ID (withColumn appends it after the existing columns)
    with_id = dataframe.withColumn(id_column_name, monotonically_increasing_id())

    # Re-project so the ID column comes first
    return with_id.select(id_column_name, *original_columns)


def renameColumns(df):
    """
    Replace spaces in column names with underscores.

    The sanitized names are intended to make the DataFrame acceptable as a
    Unity Catalog feature table. Only spaces are handled; any other special
    characters in column names are left untouched.

    Args:
        df: Input Spark DataFrame with potentially non-compliant column names

    Returns:
        DataFrame with the same columns in the same order, where every space
        in a column name has been replaced by an underscore. If no column
        name contains a space, ``df`` itself is returned.

    Example:
        Input columns: ['Customer Name', 'Order Date', 'Total Amount']
        Output columns: ['Customer_Name', 'Order_Date', 'Total_Amount']
    """
    current_names = list(df.columns)
    sanitized_names = [name.replace(' ', '_') for name in current_names]

    # Nothing to rename: avoid adding a useless projection to the query plan
    if sanitized_names == current_names:
        return df

    # Rename every column in a single projection instead of chaining one
    # withColumnRenamed() call (and one extra plan node) per column.
    return df.toDF(*sanitized_names)


def compute_customer_features(path_volume, table_name):
    """
    Load one CSV file and turn it into a feature-ready DataFrame.

    Despite its name, this function is not specific to customers: it is
    applied to any CSV stored under ``<path_volume>/csv/``.

    Pipeline:
    1. Read ``<path_volume>/csv/<table_name>.csv`` with header and schema inference
    2. Sanitize column names with ``renameColumns`` (spaces -> underscores)
    3. Prepend a primary key column with ``addIdColumn``

    Args:
        path_volume: Root path of the volume holding the ``csv`` folder.
            A trailing slash is accepted and ignored.
        table_name: Name of the CSV file, without the ``.csv`` extension

    Returns:
        Spark DataFrame with:
        - Primary key column: {table_name}_feature_id (first column)
        - All original columns with sanitized names
        - Data types inferred from the CSV content

    Note:
        - inferSchema=True: Automatically detects column data types
        - header=True: First row contains column names
        - sep=",": Comma-separated values
    """
    # Normalize the root so callers may pass it with or without a trailing
    # slash; a plain f-string join would otherwise yield ".../raw_data//csv/...".
    csv_path = f"{path_volume.rstrip('/')}/csv/{table_name}.csv"

    # Load the CSV file from the Unity Catalog volume
    df = spark.read.csv(
        csv_path,
        header=True,        # First row contains column names
        inferSchema=True,   # Automatically detect data types
        sep=","             # Comma delimiter (standard CSV)
    )

    # Step 1: Rename columns to replace spaces with underscores
    renamed_df = renameColumns(df)

    # Step 2: Add the primary key column as the first column
    # Primary key format: {table_name}_feature_id
    # Example: 'customers_feature_id', 'orders_feature_id'
    return addIdColumn(renamed_df, f'{table_name}_feature_id')

## Create Feature Tables in Unity Catalog




In [0]:
from databricks.feature_engineering import FeatureEngineeringClient

# Client used to register feature tables in Unity Catalog
fe = FeatureEngineeringClient()


# ====================================================================
# MAIN LOOP: CREATE FEATURE TABLE FOR EACH CSV FILE
# ====================================================================

# NOTE(review): create_table is expected to fail when the target table already
# exists — confirm, and drop the tables before re-running this cell.
for csv_name in l_csv_names:
    print(f"\n{'='*10}")
    print(f"Processing table: {csv_name}")

    # Fully qualified name of the target table: catalog.schema.<csv_name>_feature
    feature_table_name = f'{catalog}.{schema}.{csv_name}_feature'

    # Step 1: Load and transform CSV data into feature DataFrame
    features_df = compute_customer_features(path_volume=path_volume, table_name=csv_name)

    # Step 2: Create feature table in Unity Catalog
    feature_table = fe.create_table(
        name=feature_table_name,
        primary_keys=f'{csv_name}_feature_id',       # Key column added by compute_customer_features
        df=features_df,                               # DataFrame containing the feature data
        schema=features_df.schema,                    # Schema taken from the DataFrame
        description=f'{csv_name} features'            # Human-readable description for documentation
    )

    # Step 3: Log the schema for verification and documentation
    print(f"\n✓ Created feature table: {feature_table_name}")
    print("\nTable schema:")
    print(f"{features_df.schema}")




# ====================================================================
# RESULT: Feature tables are now available at:
# - {catalog}.{schema}.customers_feature
# - {catalog}.{schema}.orders_feature
# - {catalog}.{schema}.order_items_feature
# - {catalog}.{schema}.products_feature
# - {catalog}.{schema}.shipping_zones_feature
# ====================================================================