In [7]:
claims_df = pd.read_csv("uk_pmi_claims_200k.csv")
memebrship_df = pd.read_csv("uk_pmi_membership_120k.csv")

In [8]:
claims_df = claims_df.rename(columns = {'Claimant Unique ID':'claimant unique ID'})

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Define disease categories based on ICD-10 or your Condition Category
DISEASE_DEFINITIONS = {
    'diabetes': {
        'keywords': ['DIABETES', 'DIABETIC', 'HYPERGLYCEMIA', 'E10', 'E11'],
        'exclude_keywords': ['PRE-DIABETES'],  # Don't count existing pre-diabetes
        'condition_categories': ['Endocrine', 'Metabolic']
    },
    'cardiovascular': {
        'keywords': ['CARDIAC', 'HEART', 'CARDIOVASCULAR', 'HYPERTENSION', 
                     'CORONARY', 'MYOCARDIAL', 'ANGINA', 'I20', 'I21', 'I50'],
        'condition_categories': ['Cardiovascular', 'Circulatory']
    },
    'cancer': {
        'keywords': ['CANCER', 'CARCINOMA', 'MALIGNANT', 'NEOPLASM', 'TUMOR', 
                     'ONCOLOGY', 'C0', 'C1', 'C2', 'C3', 'C4', 'C5'],
        'condition_categories': ['Neoplasms', 'Cancer']
    },
    'respiratory': {
        'keywords': ['COPD', 'ASTHMA', 'PNEUMONIA', 'BRONCHITIS', 'RESPIRATORY',
                     'J40', 'J41', 'J42', 'J43', 'J44', 'J45'],
        'condition_categories': ['Respiratory']
    },
    'mental_health': {
        'keywords': ['DEPRESSION', 'ANXIETY', 'MENTAL', 'PSYCHIATRIC', 
                     'BIPOLAR', 'SCHIZOPHRENIA', 'F32', 'F33', 'F41'],
        'condition_categories': ['Mental Health', 'Behavioral']
    },
    'musculoskeletal': {
        'keywords': ['ARTHRITIS', 'OSTEOPOROSIS', 'BACK PAIN', 'JOINT', 
                     'MUSCULOSKELETAL', 'M15', 'M16', 'M17', 'M80', 'M81'],
        'condition_categories': ['Musculoskeletal']
    },
    'kidney_disease': {
        'keywords': ['KIDNEY', 'RENAL', 'NEPHROPATHY', 'CKD', 'N18', 'N19'],
        'condition_categories': ['Genitourinary', 'Kidney']
    },
    'gastrointestinal': {
        'keywords': ['GASTRO', 'INTESTINAL', 'COLITIS', 'CROHN', 'IBS',
                     'K50', 'K51', 'K58'],
        'condition_categories': ['Digestive', 'Gastrointestinal']
    }
}

In [9]:
def create_disease_targets(claims_df, membership_df, observation_date='2024-06-01', 
                          prediction_window_months=12):
    """
    Create target variables for disease prediction
    
    Returns: DataFrame with one row per member and binary targets for each disease
    """
    
    observation_date = pd.to_datetime(observation_date)
    future_start = observation_date
    future_end = observation_date + pd.DateOffset(months=prediction_window_months)
    
    print(f"=== CREATING DISEASE TARGETS ===")
    print(f"Observation Date: {observation_date.strftime('%Y-%m-%d')}")
    print(f"Prediction Window: {prediction_window_months} months")
    print(f"Future Period: {future_start.strftime('%Y-%m-%d')} to {future_end.strftime('%Y-%m-%d')}\n")
    
    # Historical claims (before observation date) - to identify existing conditions
    historical_claims = claims_df[
        pd.to_datetime(claims_df['Paid Date']) < observation_date
    ].copy()
    
    # Future claims (after observation date) - to identify NEW diseases
    future_claims = claims_df[
        (pd.to_datetime(claims_df['Paid Date']) >= future_start) &
        (pd.to_datetime(claims_df['Paid Date']) < future_end)
    ].copy()
    
    # Initialize target dataframe
    targets = membership_df[['Unique ID']].copy()
    
    # For each disease, create target
    for disease_name, disease_config in DISEASE_DEFINITIONS.items():
        
        print(f"Processing: {disease_name}...")
        
        # Step 1: Identify members with EXISTING condition (exclude from target)
        historical_with_disease = identify_disease_in_claims(
            historical_claims, 
            disease_config
        )
        
        # Step 2: Identify members who DEVELOP condition in future
        future_with_disease = identify_disease_in_claims(
            future_claims,
            disease_config
        )
        
        # Step 3: Target = 1 if developed NEW disease (not existing)
        # Target = 0 if didn't develop OR already had it
        targets[f'will_develop_{disease_name}'] = targets['Unique ID'].apply(
            lambda x: 1 if (x in future_with_disease and x not in historical_with_disease) else 0
        )
        
        # Statistics
        existing_count = len(historical_with_disease)
        new_cases = targets[f'will_develop_{disease_name}'].sum()
        rate = targets[f'will_develop_{disease_name}'].mean()
        
        print(f"  - Existing cases (excluded): {existing_count:,}")
        print(f"  - NEW cases in future: {new_cases:,} ({rate:.2%})\n")
    
    return targets


def identify_disease_in_claims(claims_df, disease_config):
    """
    Identify members with a specific disease based on keywords and categories
    """
    
    member_ids = set()
    
    # Search in Condition Code and Condition Category
    for idx, row in claims_df.iterrows():
        
        # Get condition information (convert to uppercase for matching)
        condition_code = str(row.get('Condition Code', '')).upper()
        condition_category = str(row.get('Condition Category', '')).upper()
        
        # Check if any keyword matches
        keyword_match = any(
            keyword in condition_code or keyword in condition_category
            for keyword in disease_config['keywords']
        )
        
        # Check if category matches
        category_match = any(
            cat.upper() in condition_category
            for cat in disease_config.get('condition_categories', [])
        )
        
        # Check exclude keywords
        exclude_match = any(
            keyword in condition_code or keyword in condition_category
            for keyword in disease_config.get('exclude_keywords', [])
        )
        
        # Add member if keyword/category matches and NOT excluded
        if (keyword_match or category_match) and not exclude_match:
            member_ids.add(row['claimant unique ID'])
    
    return member_ids


# Execute
targets_df = create_disease_targets(
    claims_df, 
    memebrship_df,
    observation_date='2024-06-01',
    prediction_window_months=12
)

print("\n=== TARGET SUMMARY ===")
print(targets_df[[col for col in targets_df.columns if 'will_develop' in col]].sum())


=== CREATING DISEASE TARGETS ===
Observation Date: 2024-06-01
Prediction Window: 12 months
Future Period: 2024-06-01 to 2025-06-01

Processing: diabetes...
  - Existing cases (excluded): 3,821
  - NEW cases in future: 960 (0.80%)

Processing: cardiovascular...
  - Existing cases (excluded): 8,094
  - NEW cases in future: 2,044 (1.70%)

Processing: cancer...
  - Existing cases (excluded): 9,707
  - NEW cases in future: 2,311 (1.93%)

Processing: respiratory...
  - Existing cases (excluded): 2,824
  - NEW cases in future: 771 (0.64%)

Processing: mental_health...
  - Existing cases (excluded): 6,679
  - NEW cases in future: 1,644 (1.37%)

Processing: musculoskeletal...
  - Existing cases (excluded): 12,786
  - NEW cases in future: 3,163 (2.64%)

Processing: kidney_disease...
  - Existing cases (excluded): 0
  - NEW cases in future: 0 (0.00%)

Processing: gastrointestinal...
  - Existing cases (excluded): 6,803
  - NEW cases in future: 1,672 (1.39%)


=== TARGET SUMMARY ===
will_develop_d

In [None]:
def create_disease_risk_features(claims_df, membership_df, observation_date):
    """
    Create features that predict disease development
    """
    
    observation_date = pd.to_datetime(observation_date)
    historical_claims = claims_df[
        pd.to_datetime(claims_df['Paid Date']) < observation_date
    ]
    
    features = membership_df[['Unique ID']].copy()
    
    # === DEMOGRAPHIC RISK FACTORS ===
    features['age'] = 2024 - membership_df['Year of Birth']
    features['is_male'] = (membership_df['Gender'] == 'Male').astype(int)
    features['is_female'] = (membership_df['Gender'] == 'Femal').astype(int)
    features['is_other'] = (membership_df['Gender'] == 'Other').astype(int)
    features['age_squared'] = features['age'] ** 2  # Non-linear age effect
    
    # === GENERAL HEALTH INDICATORS ===
    member_claims = historical_claims.groupby('claimant unique ID')
    
    features['total_claims_count'] = member_claims.size()
    features['total_claim_amount'] = member_claims['Claim Amount'].sum()
    features['unique_conditions_count'] = member_claims['Condition Code'].nunique()
    features['avg_claim_amount'] = member_claims['Claim Amount'].mean()
    
    # === DISEASE-SPECIFIC RISK FACTORS ===
    
    # Diabetes Risk Factors
    features['has_prediabetes'] = identify_condition_presence(
        historical_claims, ['PRE-DIABETES', 'PREDIABETIC', 'IMPAIRED GLUCOSE']
    )
    features['has_obesity'] = identify_condition_presence(
        historical_claims, ['OBESITY', 'BMI', 'OVERWEIGHT']
    )
    features['has_metabolic_syndrome'] = identify_condition_presence(
        historical_claims, ['METABOLIC SYNDROME', 'DYSLIPIDEMIA']
    )
    
    # Cardiovascular Risk Factors
    features['has_hypertension'] = identify_condition_presence(
        historical_claims, ['HYPERTENSION', 'HIGH BLOOD PRESSURE', 'I10']
    )
    features['has_high_cholesterol'] = identify_condition_presence(
        historical_claims, ['HYPERLIPIDEMIA', 'CHOLESTEROL', 'E78']
    )
    features['family_history_cvd'] = 0  # Would need external data source
    
    # Cancer Risk Factors
    features['has_precancerous_lesion'] = identify_condition_presence(
        historical_claims, ['POLYP', 'DYSPLASIA', 'PRECANCEROUS']
    )
    features['screening_claims_count'] = count_screening_claims(historical_claims)
    features['biopsy_count'] = count_procedure_claims(
        historical_claims, ['BIOPSY']
    )
    
    # Respiratory Risk Factors
    features['has_smoking_history'] = identify_condition_presence(
        historical_claims, ['SMOKING', 'TOBACCO', 'NICOTINE']
    )
    features['has_chronic_cough'] = identify_condition_presence(
        historical_claims, ['CHRONIC COUGH', 'PERSISTENT COUGH']
    )
    
    # Mental Health Risk Factors
    features['has_stress_related'] = identify_condition_presence(
        historical_claims, ['STRESS', 'BURNOUT', 'ADJUSTMENT DISORDER']
    )
    features['has_sleep_disorder'] = identify_condition_presence(
        historical_claims, ['INSOMNIA', 'SLEEP DISORDER', 'APNEA']
    )
    
    # Musculoskeletal Risk Factors
    features['has_chronic_pain'] = identify_condition_presence(
        historical_claims, ['CHRONIC PAIN', 'PAIN SYNDROME']
    )
    features['physiotherapy_count'] = count_treatment_type(
        historical_claims, ['PHYSIOTHERAPY', 'PHYSICAL THERAPY']
    )
    
    # Kidney Disease Risk Factors
    features['has_diabetes'] = identify_condition_presence(
        historical_claims, ['DIABETES', 'DIABETIC']
    )
    features['has_hypertension_kidney'] = features['has_hypertension']
    
    # Gastrointestinal Risk Factors
    features['has_ibs'] = identify_condition_presence(
        historical_claims, ['IBS', 'IRRITABLE BOWEL']
    )
    features['endoscopy_count'] = count_procedure_claims(
        historical_claims, ['ENDOSCOPY', 'COLONOSCOPY']
    )
    
    # === LIFESTYLE PROXIES (from claims data) ===
    features['gp_visit_frequency'] = count_treatment_type(
        historical_claims, ['GENERAL PRACTICE', 'GP', 'CONSULTATION']
    )
    features['specialist_visit_count'] = count_treatment_type(
        historical_claims, ['SPECIALIST', 'CONSULTANT']
    )
    features['emergency_visit_count'] = count_treatment_type(
        historical_claims, ['EMERGENCY', 'A&E', 'ER']
    )
    
    # === MEDICATION INDICATORS ===
    features['medication_claims_count'] = count_treatment_type(
        historical_claims, ['MEDICATION', 'PRESCRIPTION', 'PHARMACY']
    )
    
    # === TEMPORAL FEATURES ===
    features['days_since_last_claim'] = calculate_days_since_last_claim(
        historical_claims, observation_date
    )
    features['claim_frequency_6m'] = calculate_recent_frequency(
        historical_claims, observation_date, months=6
    )
    
    # Fill NaN with 0
    features = features.fillna(0)
    
    return features


def identify_condition_presence(claims_df, keywords):
    """Check if member has any claim with specified keywords"""
    members_with_condition = set()
    
    for idx, row in claims_df.iterrows():
        condition_text = (
            str(row.get('Condition Code', '')) + ' ' + 
            str(row.get('Condition Category', ''))
        ).upper()
        
        if any(keyword.upper() in condition_text for keyword in keywords):
            members_with_condition.add(row['claimant unique ID'])
    
    return claims_df['claimant unique ID'].map(
        lambda x: 1 if x in members_with_condition else 0
    )


def count_screening_claims(claims_df):
    """Count preventive screening claims"""
    screening_keywords = [
        'SCREENING', 'MAMMOGRAM', 'COLONOSCOPY', 'PSA',
        'HEALTH CHECK', 'ANNUAL EXAM'
    ]
    
    screening_claims = claims_df[
        claims_df['Treatment Type'].str.upper().str.contains(
            '|'.join(screening_keywords), na=False
        )
    ]
    
    return screening_claims.groupby('claimant unique ID').size()


def count_treatment_type(claims_df, keywords):
    """Count claims with specific treatment types"""
    matching_claims = claims_df[
        claims_df['Treatment Type'].str.upper().str.contains(
            '|'.join([k.upper() for k in keywords]), na=False
        )
    ]
    return matching_claims.groupby('claimant unique ID').size()


def count_procedure_claims(claims_df, keywords):
    """Count specific procedures"""
    # Similar to count_treatment_type
    return count_treatment_type(claims_df, keywords)


def calculate_days_since_last_claim(claims_df, observation_date):
    """Calculate days since most recent claim"""
    last_claim_dates = claims_df.groupby('claimant unique ID')['Paid Date'].max()
    days_since = (observation_date - pd.to_datetime(last_claim_dates)).dt.days
    return days_since


def calculate_recent_frequency(claims_df, observation_date, months=6):
    """Calculate claim frequency in recent months"""
    cutoff_date = observation_date - pd.DateOffset(months=months)
    recent_claims = claims_df[pd.to_datetime(claims_df['Paid Date']) >= cutoff_date]
    return recent_claims.groupby('claimant unique ID').size()


# Execute
features_df = create_disease_risk_features(
    claims_df,
    memebrship_df,
    observation_date='2024-06-01'
)

print(f"Features created: {features_df.shape[1]}")
print(features_df.head())

Features created: 33
        Unique ID  age  is_male  age_squared  total_claims_count  \
0  MEM00000001-01   56        1         3136                 0.0   
1  MEM00000001-02   57        0         3249                 0.0   
2  MEM00000002-01   48        0         2304                 0.0   
3  MEM00000002-03   19        1          361                 0.0   
4  MEM00000003-01   38        1         1444                 0.0   

   total_claim_amount  unique_conditions_count  avg_claim_amount  \
0                 0.0                      0.0               0.0   
1                 0.0                      0.0               0.0   
2                 0.0                      0.0               0.0   
3                 0.0                      0.0               0.0   
4                 0.0                      0.0               0.0   

   has_prediabetes  has_obesity  ...  has_diabetes  has_hypertension_kidney  \
0              0.0          0.0  ...           0.0                      0.0   
1  

In [12]:
features_df.head(5)

Unnamed: 0,Unique ID,age,is_male,age_squared,total_claims_count,total_claim_amount,unique_conditions_count,avg_claim_amount,has_prediabetes,has_obesity,...,has_diabetes,has_hypertension_kidney,has_ibs,endoscopy_count,gp_visit_frequency,specialist_visit_count,emergency_visit_count,medication_claims_count,days_since_last_claim,claim_frequency_6m
0,MEM00000001-01,56,1,3136,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,MEM00000001-02,57,0,3249,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,MEM00000002-01,48,0,2304,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,MEM00000002-03,19,1,361,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,MEM00000003-01,38,1,1444,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
targets_df.sample(5)

Unnamed: 0,Unique ID,will_develop_diabetes,will_develop_cardiovascular,will_develop_cancer,will_develop_respiratory,will_develop_mental_health,will_develop_musculoskeletal,will_develop_kidney_disease,will_develop_gastrointestinal
90783,MEM00040188-03,0,0,0,0,0,0,0,0
27697,MEM00012197-01,0,0,0,0,0,0,0,0
87757,MEM00038856-03,0,0,0,0,0,0,0,0
47126,MEM00020777-01,0,0,1,0,0,0,0,0
16325,MEM00007182-03,0,0,0,0,0,0,0,0


In [13]:
from sklearn.multioutput import MultiOutputClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report

# Merge features and targets
model_data = features_df.merge(targets_df, on='Unique ID', how='inner')

# Separate features and targets
feature_cols = [col for col in features_df.columns if col != 'Unique ID']
target_cols = [col for col in targets_df.columns if 'will_develop' in col]

X = model_data[feature_cols]
y = model_data[target_cols]

print("=== MODEL DATA ===")
print(f"Features: {X.shape[1]}")
print(f"Target Diseases: {y.shape[1]}")
print(f"Samples: {X.shape[0]:,}\n")

# Train-test split (temporal if possible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train separate model for each disease
models = {}
results = {}

for disease in target_cols:
    disease_name = disease.replace('will_develop_', '')
    
    print(f"\n{'='*60}")
    print(f"Training model for: {disease_name.upper()}")
    print(f"{'='*60}")
    
    # Check class balance
    positive_rate = y_train[disease].mean()
    print(f"Positive rate in training: {positive_rate:.2%}")
    
    if positive_rate < 0.01:  # Less than 1%
        print(f"âš  Very rare disease (<1%). Consider different approach or collect more data.")
        continue
    
    # Train model
    model = LGBMClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=5,
        num_leaves=31,
        class_weight='balanced',
        random_state=42,
        verbose=-1
    )
    
    model.fit(X_train, y_train[disease])
    
    # Predict
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    # Evaluate
    roc_auc = roc_auc_score(y_test[disease], y_pred_proba)
    
    # Store
    models[disease_name] = model
    results[disease_name] = {
        'roc_auc': roc_auc,
        'positive_rate': positive_rate,
        'y_pred_proba': y_pred_proba
    }
    
    print(f"ROC-AUC: {roc_auc:.4f}")
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(f"\nTop 5 Risk Factors for {disease_name}:")
    print(feature_importance.head(5).to_string(index=False))

# Summary
print(f"\n\n{'='*60}")
print("OVERALL MODEL PERFORMANCE")
print(f"{'='*60}")

summary = pd.DataFrame({
    'Disease': list(results.keys()),
    'ROC-AUC': [results[d]['roc_auc'] for d in results.keys()],
    'Base Rate': [results[d]['positive_rate'] for d in results.keys()]
}).sort_values('ROC-AUC', ascending=False)

print(summary.to_string(index=False))

=== MODEL DATA ===
Features: 32
Target Diseases: 8
Samples: 120,000


Training model for: DIABETES
Positive rate in training: 0.81%
âš  Very rare disease (<1%). Consider different approach or collect more data.

Training model for: CARDIOVASCULAR
Positive rate in training: 1.74%
ROC-AUC: 0.6276

Top 5 Risk Factors for cardiovascular:
                feature  importance
                    age        2997
                is_male         845
  days_since_last_claim           0
medication_claims_count           0
  emergency_visit_count           0

Training model for: CANCER
Positive rate in training: 1.92%
ROC-AUC: 0.5612

Top 5 Risk Factors for cancer:
                feature  importance
                    age        2837
                is_male         717
  days_since_last_claim           0
medication_claims_count           0
  emergency_visit_count           0

Training model for: RESPIRATORY
Positive rate in training: 0.64%
âš  Very rare disease (<1%). Consider different approach 

In [75]:
def generate_disease_predictions(models, X_new, member_ids):
    """
    Generate disease risk predictions for new members
    """
    
    predictions = pd.DataFrame({'Member_ID': member_ids})
    
    print(f"=== GENERATING DISEASE PREDICTIONS ===")
    print(f"Members to score: {len(member_ids):,}")
    print(f"Diseases modeled: {len(models)}\n")
    
    for disease_name, model in models.items():
        print(f"Scoring: {disease_name}...", end=' ')
        
        # Predict probability
        proba = model.predict_proba(X_new)[:, 1]
        
        # Add to dataframe with correct column names
        predictions[f'{disease_name}_risk_prob'] = proba.round(4)
        predictions[f'{disease_name}_risk_score'] = (proba * 100).round(2)
        predictions[f'{disease_name}_risk_category'] = pd.cut(
            proba * 100,
            bins=[0, 20, 40, 60, 100],
            labels=['Low', 'Medium', 'High', 'Very High']
        )
        
        print(f"âœ“ (Avg risk: {proba.mean()*100:.1f}%)")
    
    return predictions


def assign_disease_interventions(predictions_df):
    """
    Assign preventive interventions based on disease risks
    """
    
    print("\n=== ASSIGNING INTERVENTIONS ===\n")
    
    interventions = predictions_df.copy()
    interventions['recommended_interventions'] = ''
    interventions['priority_level'] = 0
    
    # Get all disease columns (those ending with _risk_score)
    disease_cols = [col for col in predictions_df.columns if col.endswith('_risk_score')]
    
    for idx, row in interventions.iterrows():
        recommended = []
        priority = 0
        
        # Check each disease
        for disease_col in disease_cols:
            disease_name = disease_col.replace('_risk_score', '')
            risk_score = row[disease_col]
            
            # Diabetes prevention
            if 'diabetes' in disease_name and risk_score > 40:
                recommended.append(f"ðŸ”¹ Diabetes Prevention: Nutrition counseling, HbA1c test, exercise plan")
                priority = max(priority, 2 if risk_score > 60 else 1)
            
            # Cardiovascular prevention
            elif 'cardiovascular' in disease_name and risk_score > 40:
                recommended.append(f"Cardiac Health Check: Lipid panel, blood pressure monitoring, ECG")
                priority = max(priority, 3 if risk_score > 60 else 2)
            
            # Cancer screening
            elif 'cancer' in disease_name and risk_score > 30:
                recommended.append(f"Enhanced Cancer Screening: Age-appropriate screenings (mammogram/colonoscopy/PSA)")
                priority = max(priority, 3 if risk_score > 50 else 2)
            
            # Mental health support
            elif 'mental_health' in disease_name and risk_score > 50:
                recommended.append(f"Mental Health Assessment: Counseling referral, stress management program")
                priority = max(priority, 2)
            
            # Respiratory monitoring
            elif 'respiratory' in disease_name and risk_score > 40:
                recommended.append(f"Pulmonary Function Test: Smoking cessation support if applicable")
                priority = max(priority, 2)
            
            # Musculoskeletal care
            elif 'musculoskeletal' in disease_name and risk_score > 50:
                recommended.append(f" Musculoskeletal Assessment: Physiotherapy, pain management")
                priority = max(priority, 1)
            
            # Kidney monitoring
            elif 'kidney' in disease_name and risk_score > 40:
                recommended.append(f"Kidney Function Test: Creatinine, eGFR monitoring")
                priority = max(priority, 2)
            
            # Gastrointestinal care
            elif 'gastrointestinal' in disease_name and risk_score > 40:
                recommended.append(f"GI Assessment: Dietary review, endoscopy if indicated")
                priority = max(priority, 1)
        
        # Store recommendations
        if recommended:
            interventions.at[idx, 'recommended_interventions'] = '\n'.join(recommended)
            interventions.at[idx, 'priority_level'] = priority
        else:
            interventions.at[idx, 'recommended_interventions'] = 'Routine wellness check'
            interventions.at[idx, 'priority_level'] = 0
    
    # Add priority labels
    interventions['priority_label'] = interventions['priority_level'].map({
        0: 'Routine',
        1: 'Low Priority',
        2: 'Medium Priority',
        3: 'High Priority'
    })
    
    # Count interventions
    priority_counts = interventions['priority_label'].value_counts()
    print("Intervention Priority Distribution:")
    print(priority_counts)
    print()
    
    return interventions


# Generate predictions
print("=" * 70)
print(" " * 20 + "DISEASE PREDICTION PIPELINE")
print("=" * 70)

predictions = generate_disease_predictions(
    models,
    X_test,
    model_data.loc[X_test.index, 'Unique ID'].values
)

print(f"\nâœ“ Predictions generated for {len(predictions):,} members")

# Assign interventions
interventions = assign_disease_interventions(predictions)

# Analyze results
print("\n=== RISK DISTRIBUTION ===\n")

# Get disease score columns
disease_score_cols = [col for col in predictions.columns if col.endswith('_risk_score')]

for col in disease_score_cols:
    disease_name = col.replace('_risk_score', '').replace('_', ' ').title()
    
    high_risk_count = (predictions[col] > 60).sum()
    medium_risk_count = (predictions[col].between(40, 60)).sum()
    avg_risk = predictions[col].mean()
    
    print(f"{disease_name:25} | Avg: {avg_risk:>5.1f}% | High Risk: {high_risk_count:>4} | Medium Risk: {medium_risk_count:>4}")

# Identify high-risk members (any disease > 60)
high_risk_mask = (predictions[disease_score_cols] > 60).any(axis=1)
high_risk_members = interventions[high_risk_mask].copy()

print(f"\n{'='*70}")
print(f"HIGH-RISK MEMBERS IDENTIFIED: {len(high_risk_members):,}")
print(f"{'='*70}\n")

# Display sample
display_cols = ['Member_ID'] + disease_score_cols[:4] + ['priority_label']
print("Sample of High-Risk Members:")
print(high_risk_members[display_cols].head(10).to_string(index=False))

# Save outputs
print("\n=== SAVING OUTPUTS ===\n")

# 1. All predictions
predictions.to_csv('disease_risk_predictions_all.csv', index=False)
print("âœ“ Saved: disease_risk_predictions_all.csv")

# 2. High-risk members only
high_risk_members.to_csv('disease_risk_high_priority.csv', index=False)
print(f"âœ“ Saved: disease_risk_high_priority.csv ({len(high_risk_members):,} members)")

# 3. Intervention summary by priority
intervention_summary = interventions.groupby('priority_label').agg({
    'Member_ID': 'count'
}).reset_index()
intervention_summary.columns = ['Priority', 'Member Count']
intervention_summary.to_csv('intervention_summary.csv', index=False)
print("âœ“ Saved: intervention_summary.csv")

# 4. Detailed report for top 100 highest risk
# Calculate max risk across all diseases
interventions['max_risk_score'] = interventions[disease_score_cols].max(axis=1)
interventions['highest_risk_disease'] = interventions[disease_score_cols].idxmax(axis=1).str.replace('_risk_score', '').str.replace('_', ' ').str.title()

top_100 = interventions.nlargest(100, 'max_risk_score')
top_100_export = top_100[[
    'Member_ID', 
    'max_risk_score', 
    'highest_risk_disease',
    'priority_label',
    'recommended_interventions'
] + disease_score_cols]

top_100_export.to_csv('top_100_highest_risk_members.csv', index=False)
print("âœ“ Saved: top_100_highest_risk_members.csv\n")

print("=" * 70)
print("DISEASE PREDICTION PIPELINE COMPLETE")
print("=" * 70)

                    DISEASE PREDICTION PIPELINE
=== GENERATING DISEASE PREDICTIONS ===
Members to score: 24,000
Diseases modeled: 5

Scoring: cardiovascular... âœ“ (Avg risk: 46.7%)
Scoring: cancer... âœ“ (Avg risk: 48.4%)
Scoring: mental_health... âœ“ (Avg risk: 49.2%)
Scoring: musculoskeletal... âœ“ (Avg risk: 49.3%)
Scoring: gastrointestinal... âœ“ (Avg risk: 48.0%)

âœ“ Predictions generated for 24,000 members

=== ASSIGNING INTERVENTIONS ===

Intervention Priority Distribution:
priority_label
Medium Priority    14110
High Priority       9723
Low Priority         152
Routine               15
Name: count, dtype: int64


=== RISK DISTRIBUTION ===

Cardiovascular            | Avg:  46.7% | High Risk: 2920 | Medium Risk: 10288
Cancer                    | Avg:  48.4% | High Risk: 1225 | Medium Risk: 20643
Mental Health             | Avg:  49.2% | High Risk:  666 | Medium Risk: 21774
Musculoskeletal           | Avg:  49.3% | High Risk:  436 | Medium Risk: 23030
Gastrointestinal          

In [76]:
top_100_export.sample(5)

Unnamed: 0,Member_ID,max_risk_score,highest_risk_disease,priority_label,recommended_interventions,cardiovascular_risk_score,cancer_risk_score,mental_health_risk_score,musculoskeletal_risk_score,gastrointestinal_risk_score
16813,MEM00006392-01,93.36,Cardiovascular,High Priority,"Cardiac Health Check: Lipid panel, blood press...",93.36,81.71,0.59,47.88,85.02
4993,MEM00044323-02,80.44,Mental Health,High Priority,"Cardiac Health Check: Lipid panel, blood press...",79.6,62.25,80.44,65.29,67.73
23298,MEM00005280-02,94.1,Cancer,High Priority,"Cardiac Health Check: Lipid panel, blood press...",78.35,94.1,0.59,15.32,0.98
6208,MEM00022240-02,80.11,Cardiovascular,High Priority,"Cardiac Health Check: Lipid panel, blood press...",80.11,64.27,0.17,64.07,43.02
12941,MEM00039989-02,92.76,Gastrointestinal,High Priority,"Cardiac Health Check: Lipid panel, blood press...",90.86,82.8,4.22,8.72,92.76


In [77]:
predictions['Member_ID'].nunique()

24000