# 🔧 Feature Engineering Pipeline

**Project:** Predicting Paid Amount for Medical Claims  
**Stage:** Feature Engineering & Data Preparation  

---

## Overview

1. **Data Cleaning** - Handle missing values and remove invalid records
2. **Feature Transformation** - Encode categorical variables, normalize numerics
3. **Feature Creation** - Create derived features (diagnosis counts, ICD categories)
4. **Feature Selection** - Select most predictive features
5. **Data Preparation** - Prepare final dataset for modeling

In [None]:
# Imports
import sys
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np

from src.utils.logger import setup_logging, get_logger, PipelineLogger
from src.data.data_loader import DataLoader
from src.data.data_processor import DataCleaner, DataProcessor, DataPipeline
from src.features.feature_engineering import FeatureEngineer, FeatureSelector

# Set up logging and obtain the logger used by the pipeline cells.
setup_logging(log_level="INFO")
logger = get_logger(__name__)

# Data locations, all resolved relative to the project root.
DATA_DIR = project_root / "data"
INTERIM_DIR = DATA_DIR / "interim"
PROCESSED_DIR = DATA_DIR / "processed"

# Create the output directory (and any missing parents) up front.
PROCESSED_DIR.mkdir(exist_ok=True, parents=True)

print("âœ“ Setup complete")

In [None]:
# Load data
# Reads the sampled claims from the interim directory. When that file does not
# exist, a seeded synthetic dataset with the same column names is generated so
# the notebook can still run end to end.
parquet_path = INTERIM_DIR / "sampled_claims.parquet"

if parquet_path.exists():
    df = pd.read_parquet(parquet_path)
else:
    # Announce the fallback explicitly: every later cell (including the one that
    # writes the processed parquet) would otherwise operate on random values
    # without any visible sign that real claims were never loaded.
    print(f"WARNING: {parquet_path} not found - generating synthetic demo data")

    # Create demo data (fixed seed keeps the synthetic frame reproducible)
    np.random.seed(42)
    n = 50000
    df = pd.DataFrame({
        'CLAIM_ID_KEY': np.random.randint(1, 20000, n),
        'AGE': np.random.choice(['25', '35', '45', '55', '65', '75', '90+'], n),
        'SEX': np.random.choice(['M', 'F'], n),
        'AMT_BILLED': np.abs(np.random.exponential(1000, n)),
        'AMT_PAID': np.abs(np.random.exponential(500, n)),
        'AMT_DEDUCT': np.abs(np.random.exponential(100, n)),
        'AMT_COINS': np.abs(np.random.exponential(50, n)),
        'FORM_TYPE': np.random.choice(['P', 'I', 'O'], n),
        'SV_STAT': np.random.choice(['P', 'D', 'R'], n),
        'PRODUCT_TYPE': np.random.choice(['HMO', 'PPO', 'POS'], n),
        'ICD_DIAG_01_PRIMARY': np.random.choice(['Z00', 'J06', 'M54', 'I10', 'K21'], n),
        # Roughly 5% of rows get a missing CLIENT_LOS (see the probability vector)
        'CLIENT_LOS': np.random.choice([0, 1, 2, 3, np.nan], n, p=[0.7, 0.1, 0.08, 0.07, 0.05]),
    })

print(f"âœ“ Loaded data: {len(df):,} rows, {len(df.columns)} columns")
df.head()

## 2. Data Cleaning

In [None]:
# Helper objects: the cleaner is used in this cell, the processor in the
# feature-transformation cell.
cleaner = DataCleaner(missing_threshold=1000000)
processor = DataProcessor()

with PipelineLogger("Data Cleaning", logger):
    # Remember the starting shape so the effect of cleaning can be reported.
    initial_shape = df.shape

    # Let the cleaner handle missing values, supplying 0 as the fill for CLIENT_LOS.
    df = cleaner.handle_missing_values(df, fill_values={'CLIENT_LOS': 0})

    # Negative-value removal targets only the amount columns that are present.
    amount_cols = [
        col
        for col in ('AMT_BILLED', 'AMT_PAID', 'AMT_DEDUCT', 'AMT_COINS')
        if col in df.columns
    ]
    df = cleaner.remove_negative_values(df, amount_cols)

    # Whatever rows still contain a missing value are discarded.
    df = df.dropna()

    rows_removed = initial_shape[0] - len(df)
    print(f"Shape: {initial_shape} -> {df.shape}")
    print(f"Removed {rows_removed:,} rows")

## 3. Feature Transformation

In [None]:
# Transformation steps as (source column, transform) pairs. A step runs only
# when its source column exists in the frame at the moment it is reached.
transform_steps = (
    # Encode gender
    ('SEX',
     lambda frame: processor.encode_gender(frame, column='SEX', new_column='Gender_Code')),
    # Encode age
    ('AGE',
     lambda frame: processor.encode_age(frame, column='AGE', new_column='Age')),
    # Extract ICD category
    ('ICD_DIAG_01_PRIMARY',
     lambda frame: processor.extract_code_category(frame, 'ICD_DIAG_01_PRIMARY', 'ICD_Category')),
)

with PipelineLogger("Feature Transformation", logger):
    for source_col, apply_step in transform_steps:
        if source_col in df.columns:
            df = apply_step(df)

print(f"âœ“ Transformed data: {len(df.columns)} columns")
df.head()

## 4. Feature Engineering

In [None]:
# Feature engineer instance; later cells reuse it for scaling and for saving state.
feature_engineer = FeatureEngineer()

with PipelineLogger("Creating Features", logger):
    # Drop the claim identifier column when the frame has one.
    if 'CLAIM_ID_KEY' in df.columns:
        df = df.drop(columns='CLAIM_ID_KEY')

    # Column groups, restricted to the names actually present in the frame.
    categorical_cols = [
        col
        for col in ('FORM_TYPE', 'SV_STAT', 'PRODUCT_TYPE', 'ICD_Category')
        if col in df.columns
    ]
    numerical_cols = [
        col
        for col in ('AMT_BILLED', 'AMT_DEDUCT', 'AMT_COINS', 'CLIENT_LOS', 'Age', 'Gender_Code')
        if col in df.columns
    ]

    # Expand the categorical columns into dummy variables.
    df = feature_engineer.create_dummy_variables(df, categorical_cols)

    # Add a log1p-transformed copy of the billed amount.
    if 'AMT_BILLED' in df.columns:
        df['AMT_BILLED_log'] = np.log1p(df['AMT_BILLED'])

print(f"âœ“ Final features: {len(df.columns)} columns")
print(f"Columns: {list(df.columns)[:15]}...")

## 5. Prepare Training Data

In [None]:
# Separate features and target
TARGET_COLUMN = 'AMT_PAID'

# Fail fast when the target is missing. The save cell that follows reads X and y
# unconditionally, so merely printing a warning here would end in a NameError
# there or, in a kernel that has already run once, silently re-save stale
# X / y left over from the earlier run.
if TARGET_COLUMN not in df.columns:
    raise KeyError(
        f"Target column {TARGET_COLUMN!r} not found in df; "
        f"available columns: {list(df.columns)}"
    )

y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

# Standardize numerical features (excluding target).
# Only columns whose dtype is exactly float64 or int64 are passed to the scaler.
num_cols_to_scale = [c for c in X.columns if X[c].dtype in ['float64', 'int64']]
X = feature_engineer.fit_scalers(X, num_cols_to_scale, method='zscore')

print(f"âœ“ Features (X): {X.shape}")
print(f"âœ“ Target (y): {y.shape}")
print(f"  Target mean: ${y.mean():,.2f}")
print(f"  Target std: ${y.std():,.2f}")

In [None]:
# Write the model-ready table: feature columns followed by the target column.
output_path = PROCESSED_DIR / "processed_claims.parquet"
processed_df = pd.concat([X, y], axis=1)
processed_df.to_parquet(output_path, index=False)

# Persist the feature engineer's state next to the processed data.
state_path = PROCESSED_DIR / "transformer_state.pkl"
feature_engineer.save_state(str(state_path))

print(f"\nðŸ’¾ Saved:")
print(f"  Processed data: {output_path}")
print(f"  Transformer state: {state_path}")

# Closing summary banner.
banner = "=" * 60
print("\n" + banner)
print("ðŸ“Š FEATURE ENGINEERING SUMMARY")
print(banner)
print(f"  Final shape: {processed_df.shape}")
print(f"  Feature columns: {X.shape[1]}")
print(f"\nâœ… Feature engineering completed! Next: Run 04_model_training.ipynb")