# HEDIS GSD Prediction Engine - Data Exploration & Model Development

## 🎯 Project Overview

**Goal:** Build a production-ready AI system for predicting diabetic patients at risk of poor glycemic control to improve HEDIS GSD measure performance.

**HEDIS Specification:** MY2023 Volume 2  
**Measure:** HBD - Hemoglobin A1c Control for Patients with Diabetes (renamed GSD - Glycemic Status Assessment for Patients with Diabetes starting with MY2024)  
**Target:** Members with most recent HbA1c >9.0% (poor control)

## 📊 Current Performance Baseline
- **Model:** Logistic Regression
- **AUC-ROC:** 0.91
- **Features:** 25 engineered features
- **Population:** 24,935 diabetic members

## 🔬 Analysis Objectives
1. **Data Exploration:** Understand CMS DE-SynPUF data structure
2. **Feature Engineering:** Create HEDIS-compliant features
3. **Model Training:** Reproduce and improve baseline performance
4. **SHAP Analysis:** Interpret model predictions for clinical insights
5. **Validation:** Use temporal validation and verify that there is no data leakage


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
import sys
import os

# Add src to path for imports
sys.path.append('../src')

# Import our custom modules
from data.data_loader import load_cms_data
from data.data_preprocessing import preprocess_cms_data
from data.feature_engineering import create_hedis_gsd_features

# --- Notebook-wide runtime configuration -----------------------------------
# Plot appearance: matplotlib's stock style combined with seaborn's "husl"
# colour palette.
plt.style.use('default')
sns.set_palette("husl")

# Silence every warning category for the remainder of the session so that
# cell output stays readable.
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator so random draws repeat across runs.
np.random.seed(42)

# --- Environment report -----------------------------------------------------
# Record when and with which interpreter/library versions the notebook ran.
run_timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
environment_report = [
    "✅ Libraries imported successfully",
    f"📅 Analysis Date: {run_timestamp}",
    f"🐍 Python Version: {sys.version}",
    f"📊 Pandas Version: {pd.__version__}",
    f"🔢 NumPy Version: {np.__version__}",
]
for report_line in environment_report:
    print(report_line)


## 📁 Data Loading & Initial Exploration

### Step 1: Load Raw CMS DE-SynPUF Data

We'll load the CMS DE-SynPUF data using our HIPAA-compliant data loader that:
- ✅ Never logs raw member identifiers
- ✅ Uses SHA-256 hashing for audit trails
- ✅ Validates schemas and data types
- ✅ Provides comprehensive error handling


In [None]:
# Pull the raw CMS DE-SynPUF tables into memory and report their size.
# Only aggregate figures (row count, column count, memory footprint) are
# printed for each table; no row-level values are displayed.
print("🔄 Loading CMS DE-SynPUF data...")
try:
    raw_data = load_cms_data()
    print("✅ Data loaded successfully!")

    print("\n📊 Data Summary:")
    bytes_per_mb = 1024**2
    for table_name, table in raw_data.items():
        row_count = len(table)
        print(f"  {table_name.title()}: {row_count:,} records")

        column_count = len(table.columns)
        print(f"    Columns: {column_count}")

        # deep=True also measures the contents of object columns.
        size_mb = table.memory_usage(deep=True).sum() / bytes_per_mb
        print(f"    Memory usage: {size_mb:.1f} MB")

except Exception as load_error:
    # Report the failure together with the most likely remedy; the cell
    # itself does not re-raise.
    print(f"❌ Error loading data: {load_error}")
    print("Please ensure CMS data files are in data/raw/ directory")


In [None]:
# Explore beneficiary data structure (PHI-safe)
#
# Prints aggregate-only statistics for the beneficiary table: shape, dtype
# mix, a numeric summary, non-null counts for the date columns, and the
# distribution of the SP_DIABETES flag. No row-level values are displayed.
print("🔍 Beneficiary Data Exploration:")
beneficiary_df = raw_data['beneficiary']

print("\n📋 Column Information:")
print(f"Total columns: {len(beneficiary_df.columns)}")
print(f"Total records: {len(beneficiary_df):,}")

print("\n📊 Data Types:")
print(beneficiary_df.dtypes.value_counts())

print("\n🔢 Numeric Columns Summary:")
numeric_cols = beneficiary_df.select_dtypes(include=[np.number]).columns
print(beneficiary_df[numeric_cols].describe())

print("\n📅 Date Columns:")
date_cols = ['BENE_BIRTH_DT', 'BENE_DEATH_DT']
for col in date_cols:
    # Columns that are absent from the table are skipped rather than raising.
    if col in beneficiary_df.columns:
        # notna() counts populated cells; it does not verify that the
        # values actually parse as dates.
        print(f"  {col}: {beneficiary_df[col].notna().sum():,} valid dates")

print("\n🏥 Diabetes Indicator (SP_DIABETES):")
if 'SP_DIABETES' in beneficiary_df.columns:
    diabetes_counts = beneficiary_df['SP_DIABETES'].value_counts()
    # The CMS DE-SynPUF codebook encodes chronic-condition flags as
    # 1 = yes, 2 = no. Looking up only code 0 would report zero members
    # without diabetes on raw data. The loader may already recode "no" to 0
    # (TODO confirm against data_loader), so both codes are counted; with a
    # 0/1 encoding the result is unchanged.
    no_diabetes_count = diabetes_counts.get(0, 0) + diabetes_counts.get(2, 0)
    print(f"  No Diabetes (0 or 2): {no_diabetes_count:,}")
    print(f"  Diabetes (1): {diabetes_counts.get(1, 0):,}")
    print(f"  Missing: {beneficiary_df['SP_DIABETES'].isna().sum():,}")


In [None]:
# Walk the inpatient and outpatient claim tables and print aggregate-only
# statistics for each one (size, payments, diagnosis-code columns, distinct
# members, service-date span). No row-level values are displayed.
print("🔍 Claims Data Exploration:")

for claim_type in ('inpatient', 'outpatient'):
    # Claim tables that were not loaded are skipped silently.
    if claim_type not in raw_data:
        continue

    claims_df = raw_data[claim_type]
    print(f"\n📋 {claim_type.title()} Claims:")
    print(f"  Total records: {len(claims_df):,}")
    print(f"  Total columns: {len(claims_df.columns)}")
    footprint_mb = claims_df.memory_usage(deep=True).sum() / 1024**2
    print(f"  Memory usage: {footprint_mb:.1f} MB")

    # Payment totals and central tendency, when the amount column exists.
    if 'CLM_PMT_AMT' in claims_df.columns:
        amount_summary = claims_df['CLM_PMT_AMT'].describe()
        print("  Payment Statistics:")
        total_paid = claims_df['CLM_PMT_AMT'].sum()
        print(f"    Total: ${total_paid:,.2f}")
        print(f"    Average: ${amount_summary['mean']:.2f}")
        print(f"    Median: ${amount_summary['50%']:.2f}")

    # Count the columns whose name carries the ICD-9 diagnosis prefix.
    dx_columns = [
        name for name in claims_df.columns
        if name.startswith('ICD9_DGNS_CD_')
    ]
    print(f"  Diagnosis Code Columns: {len(dx_columns)}")

    # Distinct member identifiers: only the count is shown, never the IDs.
    member_count = claims_df['DESYNPUF_ID'].nunique()
    print(f"  Unique Members: {member_count:,}")

    # Earliest and latest claim start date, when the column exists.
    if 'CLM_FROM_DT' in claims_df.columns:
        service_span = claims_df['CLM_FROM_DT'].agg(['min', 'max'])
        earliest = service_span['min']
        latest = service_span['max']
        print(f"  Date Range: {earliest} to {latest}")


## 🔧 Data Preprocessing & Feature Engineering

### Step 2: Clean and Preprocess Data

We'll use our HEDIS-compliant preprocessing pipeline that:
- ✅ Parses dates correctly (the measurement year ends on Dec 31)
- ✅ Handles missing values appropriately
- ✅ Creates diabetes diagnosis flags
- ✅ Validates data quality


In [None]:
# Run the preprocessing pipeline for measurement year 2008 and print an
# aggregate-only recap of every table it returns.
print("🔄 Preprocessing CMS data...")
try:
    processed_data = preprocess_cms_data(raw_data, measurement_year=2008)
    print("✅ Data preprocessing completed!")

    print("\n📊 Preprocessed Data Summary:")
    for data_type, df in processed_data.items():
        print(f"  {data_type.title()}: {len(df):,} records")

        # Report on the columns this cell expects preprocessing to add,
        # but only when they are actually present.
        if data_type == 'beneficiary':
            if 'age_at_my_end' in df.columns:
                age_stats = df['age_at_my_end'].describe()
                youngest, oldest = age_stats['min'], age_stats['max']
                print(f"    Age Range: {youngest:.0f} - {oldest:.0f} years")
                print(f"    Average Age: {age_stats['mean']:.1f} years")
        elif data_type in ('inpatient', 'outpatient'):
            if 'has_diabetes_dx' in df.columns:
                diabetes_claims = df['has_diabetes_dx'].sum()
                print(f"    Claims with Diabetes DX: {diabetes_claims:,}")

except Exception as preprocessing_error:
    # Show the message plus the full traceback; the cell does not re-raise.
    print(f"❌ Error in preprocessing: {preprocessing_error}")
    import traceback
    traceback.print_exc()


In [None]:
# Build the member-level feature table for measurement year 2008 and print
# an aggregate-only overview: size, diabetes prevalence, age distribution
# and a keyword-based breakdown of the feature columns.
print("🔄 Creating HEDIS GSD features...")
try:
    features_df = create_hedis_gsd_features(processed_data, measurement_year=2008)
    print("✅ Feature engineering completed!")

    print("\n📊 Feature Summary:")
    # One column is subtracted for the DESYNPUF_ID identifier.
    feature_count = len(features_df.columns) - 1
    print(f"  Total Features: {feature_count}")
    member_count = len(features_df)
    print(f"  Total Members: {member_count:,}")

    # Diabetes prevalence, when the comprehensive flag is available.
    if 'has_diabetes_comprehensive' in features_df.columns:
        diabetes_members = features_df['has_diabetes_comprehensive'].sum()
        print(f"  Members with Diabetes: {diabetes_members:,}")
        diabetes_rate = diabetes_members / member_count * 100
        print(f"  Diabetes Rate: {diabetes_rate:.1f}%")

    # Age distribution, when the age column is available.
    if 'age_at_my_end' in features_df.columns:
        age_stats = features_df['age_at_my_end'].describe()
        print("\n👥 Age Distribution:")
        print(f"  Range: {age_stats['min']:.0f} - {age_stats['max']:.0f} years")
        print(f"  Average: {age_stats['mean']:.1f} years")
        print(f"  Median: {age_stats['50%']:.1f} years")

        # Members aged 18 through 75 inclusive at measurement-year end.
        hedis_age_mask = features_df['age_at_my_end'].between(18, 75)
        hedis_members = hedis_age_mask.sum()
        print(f"  HEDIS Age Range (18-75): {hedis_members:,} members")

    # Keyword-based grouping of columns. Matching is by case-insensitive
    # substring, so one column can fall into several groups.
    print("\n🔧 Feature Categories:")
    keyword_groups = {
        'Demographics': ('age', 'sex', 'race', 'state', 'esrd'),
        'Comorbidities': ('diabetes', 'ckd', 'cvd', 'retinopathy'),
        'Utilization': ('claim', 'payment', 'utilization'),
    }

    def _mentions_any(column_name, keywords):
        """Return True when any keyword occurs in the lower-cased column name."""
        lowered = column_name.lower()
        return any(keyword in lowered for keyword in keywords)

    feature_categories = {
        group: [col for col in features_df.columns if _mentions_any(col, keywords)]
        for group, keywords in keyword_groups.items()
    }

    # "Other" holds every non-identifier column that matched no keyword.
    all_keywords = [
        keyword
        for keywords in keyword_groups.values()
        for keyword in keywords
    ]
    feature_categories['Other'] = [
        col for col in features_df.columns
        if col != 'DESYNPUF_ID' and not _mentions_any(col, all_keywords)
    ]

    for category, matched_columns in feature_categories.items():
        # Empty groups are left out of the report.
        if not matched_columns:
            continue
        print(f"  {category}: {len(matched_columns)} features")

except Exception as feature_error:
    # Show the message plus the full traceback; the cell does not re-raise.
    print(f"❌ Error in feature engineering: {feature_error}")
    import traceback
    traceback.print_exc()
