# In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import SelectKBest, f_classif, RFE, SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from itertools import combinations

# Algorithm imports
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

def prepare_enhanced_data(df, target_col='early_sexual_debut', test_size=0.15, val_size=0.15, random_state=42):
    """
    Clean the raw frame, build stratified train/validation/test splits and
    impute missing feature values without leaking information across splits.

    Steps, in order:
      1. Drop rows whose target is missing.
      2. Remove ID, leakage and target columns from the feature set.
      3. Cast the target to int and report its class distribution.
      4. Split into train/validation/test, stratified on the target.
      5. If any feature has missing values, fit the imputers on the TRAINING
         split only and apply them to all three splits. Fitting them on the
         full data (before splitting) would let validation/test rows influence
         the training features and inflate the reported scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing ``target_col`` and the candidate features.
    target_col : str
        Name of the target column.
    test_size, val_size : float
        Fractions of the cleaned data reserved for the test and validation
        splits. ``val_size`` is rescaled internally because the validation
        split is carved out of what remains after the test split.
    random_state : int
        Seed passed to both ``train_test_split`` calls.

    Returns
    -------
    dict
        Keys ``X_train``, ``X_val``, ``X_test``, ``y_train``, ``y_val``,
        ``y_test``, ``feature_columns``, ``class_distribution`` and
        ``excluded_columns``.

    Raises
    ------
    ValueError
        If an excluded column is found among the feature columns.
    """

    print("="*80)
    print("PHASE 3.1: ENHANCED DATA PREPARATION WITH FEATURE ENGINEERING")
    print("="*80)

    # Remove rows with missing target
    df_clean = df.dropna(subset=[target_col]).copy()
    print(f"Dataset after removing missing targets: {df_clean.shape}")

    # CRITICAL: Define all columns to exclude from features
    id_columns = ['caseid', 'household_id', 'v001', 'v002']
    leakage_variables = ['v525', 'v512', 'v511', 'v212']
    target_variables = [target_col]
    exclude_columns = id_columns + leakage_variables + target_variables

    print(f"\nCOLUMNS EXCLUDED FROM FEATURES:")
    print(f"  ID variables: {id_columns}")
    print(f"  Leakage variables: {leakage_variables}")
    print(f"  Target variable: {target_variables}")
    print(f"  Total excluded: {len(exclude_columns)}")

    # Create feature column list
    feature_columns = [col for col in df_clean.columns
                       if col not in exclude_columns]

    # VERIFICATION: sanity check that nothing excluded slipped into the features
    problematic_vars = [var for var in exclude_columns if var in feature_columns]
    if problematic_vars:
        raise ValueError(f"CRITICAL ERROR: Problematic variables found in features: {problematic_vars}")

    print(f"✓ Confirmed: All problematic variables excluded from features")
    print(f"✓ Total features available: {len(feature_columns)}")

    # Prepare features (X) and target (y) separately
    X = df_clean[feature_columns].copy()
    y = df_clean[target_col].astype(int)

    # Class distribution of the full (cleaned) target
    class_counts = pd.Series(y).value_counts().sort_index()
    class_props = class_counts / len(y)

    print(f"\nTARGET VARIABLE DISTRIBUTION:")
    for class_val, count in class_counts.items():
        prop = class_props[class_val]
        label = "Early debut" if class_val == 1 else "Late debut"
        print(f"  {label} ({class_val}): {count:,} ({prop:.1%})")

    # Stratified data splitting (done BEFORE imputation to avoid leakage)
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y,
        test_size=test_size,
        stratify=y,
        random_state=random_state
    )

    # val_size is a fraction of the whole data set; rescale it to the remainder
    adjusted_val_size = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp,
        test_size=adjusted_val_size,
        stratify=y_temp,
        random_state=random_state
    )

    # Missing value handling: imputers learn from the training split only
    missing_features = X.isnull().sum()
    if missing_features.sum() > 0:
        print(f"\nHandling missing values in {(missing_features > 0).sum()} features...")
        from sklearn.impute import KNNImputer, SimpleImputer

        # Work on private copies so the column assignments below never write
        # into frames that share data with X.
        X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()
        partitions = (X_train, X_val, X_test)

        numeric_cols = X_train.select_dtypes(include=[np.number]).columns
        categorical_cols = X_train.select_dtypes(exclude=[np.number]).columns

        if len(numeric_cols) > 0:
            knn_imputer = KNNImputer(n_neighbors=5, weights='distance')
            knn_imputer.fit(X_train[numeric_cols])
            for part in partitions:
                part[numeric_cols] = pd.DataFrame(
                    knn_imputer.transform(part[numeric_cols]),
                    columns=numeric_cols,
                    index=part.index
                )

        if len(categorical_cols) > 0:
            cat_imputer = SimpleImputer(strategy='most_frequent')
            cat_imputer.fit(X_train[categorical_cols])
            for part in partitions:
                part[categorical_cols] = cat_imputer.transform(part[categorical_cols])

        print(f"✓ Advanced missing value imputation completed")

    print(f"\nDATA SPLIT SUMMARY:")
    print(f"Train: {X_train.shape[0]}, Val: {X_val.shape[0]}, Test: {X_test.shape[0]}")

    return {
        'X_train': X_train,
        'X_val': X_val,
        'X_test': X_test,
        'y_train': y_train,
        'y_val': y_val,
        'y_test': y_test,
        'feature_columns': feature_columns,
        'class_distribution': class_counts,
        'excluded_columns': exclude_columns
    }

def advanced_feature_engineering(X_train, y_train, X_val, X_test=None, *,
                                 corr_threshold=0.85, n_top_features=25,
                                 n_interaction_features=5, random_state=42):
    """
    Prune correlated features, keep the most important ones and add pairwise
    interaction terms. Everything is learned from the training split only.

    Steps:
      1. For every pair of numeric features whose absolute correlation exceeds
         ``corr_threshold``, drop the member that is less correlated with the
         target. Pairs in which one member has already been dropped are
         skipped, so a feature is never discarded in favour of one that is
         itself gone.
      2. Rank the remaining features by random-forest importance and keep the
         top ``n_top_features``.
      3. Build pairwise products (``PolynomialFeatures`` with
         ``interaction_only=True``) of the top ``n_interaction_features``
         features and append them to the selected features.

    Parameters
    ----------
    X_train, X_val : pandas.DataFrame
        Training and validation features with identical columns.
    y_train : array-like
        Training target aligned with ``X_train``.
    X_test : pandas.DataFrame or None
        Optional test features; transformed the same way when given.
    corr_threshold : float
        Absolute correlation above which two features count as redundant.
    n_top_features : int
        Number of features kept after importance ranking.
    n_interaction_features : int
        Number of top features used to build interaction terms.
    random_state : int
        Seed for the random forest used for importance ranking.

    Returns
    -------
    tuple
        ``(X_train_final, X_val_final, X_test_final, final_features)`` where
        ``X_test_final`` is ``None`` when no test set was supplied.
    """
    print(f"\nADVANCED FEATURE ENGINEERING")
    print("-" * 50)

    # Step 1: Remove highly correlated features
    print("Step 1: Removing highly correlated features...")
    numeric_features = X_train.select_dtypes(include=[np.number])
    correlation_matrix = numeric_features.corr().abs()
    corr_columns = correlation_matrix.columns

    # Upper triangle only (i < j); np.nonzero walks it row by row, which is the
    # same pair order as a nested i/j loop. NaN correlations compare False.
    above_threshold = np.triu(correlation_matrix.to_numpy() > corr_threshold, k=1)
    row_idx, col_idx = np.nonzero(above_threshold)
    high_corr_pairs = [(corr_columns[i], corr_columns[j]) for i, j in zip(row_idx, col_idx)]

    target_values = np.asarray(y_train)
    target_corr_cache = {}

    def _target_corr(name):
        """Absolute correlation of one feature with the target, computed once."""
        if name not in target_corr_cache:
            column = X_train[name]
            filled = column.fillna(column.median())
            target_corr_cache[name] = abs(np.corrcoef(filled, target_values)[0, 1])
        return target_corr_cache[name]

    # Remove one feature from each highly correlated pair
    features_to_remove = set()
    for feat1, feat2 in high_corr_pairs:
        # The redundancy is already resolved if either member is gone
        if feat1 in features_to_remove or feat2 in features_to_remove:
            continue
        # Keep the feature with higher correlation to target
        if _target_corr(feat1) >= _target_corr(feat2):
            features_to_remove.add(feat2)
        else:
            features_to_remove.add(feat1)

    print(f"  Removing {len(features_to_remove)} highly correlated features")

    # Step 2: Feature selection by random-forest importance
    print("Step 2: Advanced feature selection...")

    remaining_features = [col for col in X_train.columns if col not in features_to_remove]
    X_train_filtered = X_train[remaining_features]
    X_val_filtered = X_val[remaining_features]
    X_test_filtered = X_test[remaining_features] if X_test is not None else None

    rf_selector = RandomForestClassifier(n_estimators=200, random_state=random_state, n_jobs=-1)
    rf_selector.fit(X_train_filtered, y_train)

    feature_importance = pd.DataFrame({
        'feature': X_train_filtered.columns,
        'importance': rf_selector.feature_importances_
    }).sort_values('importance', ascending=False)

    top_features = feature_importance.head(n_top_features)['feature'].tolist()
    print(f"  Selected top {len(top_features)} features")
    print(f"  Top 10 features: {top_features[:10]}")

    # Step 3: Create interaction features for top predictors
    print("Step 3: Creating interaction features...")

    X_train_selected = X_train_filtered[top_features]
    X_val_selected = X_val_filtered[top_features]
    X_test_selected = X_test_filtered[top_features] if X_test_filtered is not None else None

    # Pairwise products of the most important features only, to keep the
    # number of generated columns small
    interaction_inputs = top_features[:n_interaction_features]
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

    X_train_poly = poly.fit_transform(X_train_filtered[interaction_inputs])
    X_val_poly = poly.transform(X_val_filtered[interaction_inputs])
    X_test_poly = poly.transform(X_test_filtered[interaction_inputs]) if X_test_filtered is not None else None

    poly_feature_names = poly.get_feature_names_out(interaction_inputs)

    X_train_poly_df = pd.DataFrame(X_train_poly, columns=poly_feature_names, index=X_train_selected.index)
    X_val_poly_df = pd.DataFrame(X_val_poly, columns=poly_feature_names, index=X_val_selected.index)

    # Keep only the interaction terms: the degree-1 columns produced by
    # PolynomialFeatures duplicate features that are already selected
    original_feature_names = set(interaction_inputs)
    poly_only_features = [col for col in poly_feature_names if col not in original_feature_names]

    # Final feature combination: selected features + interaction terms
    X_train_final = pd.concat([
        X_train_selected,
        X_train_poly_df[poly_only_features]
    ], axis=1)

    X_val_final = pd.concat([
        X_val_selected,
        X_val_poly_df[poly_only_features]
    ], axis=1)

    if X_test_selected is not None:
        X_test_poly_df = pd.DataFrame(X_test_poly, columns=poly_feature_names, index=X_test_selected.index)
        X_test_final = pd.concat([
            X_test_selected,
            X_test_poly_df[poly_only_features]
        ], axis=1)
    else:
        X_test_final = None

    print(f"  Final feature count: {X_train_final.shape[1]}")
    print(f"  Added {len(poly_only_features)} interaction features")

    final_features = X_train_final.columns.tolist()

    return X_train_final, X_val_final, X_test_final, final_features

def precision_focused_sampling(X_train, y_train, strategy='conservative_oversample', random_state=42):
    """
    Oversample the positive class (label 1) of the training data by bootstrap.

    Strategies
    ----------
    'conservative_oversample'
        Duplicate randomly chosen positive rows until there are
        ``int(n_negative / 1.3)`` positives (a 1.3:1 negative:positive ratio).
        Nothing is added when the positives already reach that count.
    'stratified_oversample'
        Cluster the positive rows with KMeans (at most 5 clusters) and add
        ``max(1, int(0.3 * cluster_size))`` bootstrap rows to every cluster.
    anything else
        No resampling; the inputs are returned unchanged.

    All random draws come from a private ``numpy.random.RandomState`` seeded
    with ``random_state``, so both strategies are reproducible and the global
    NumPy random state is left untouched.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training labels (0/1) sharing the index of ``X_train``.
    strategy : str
        One of the strategies listed above.
    random_state : int or None
        Seed for the bootstrap draws and for KMeans.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` with a fresh 0..n-1 index, ordered as
        negatives first, then positives, then the added rows. For an unknown
        strategy the original ``(X_train, y_train)`` objects are returned.
    """
    print(f"\nPRECISION-FOCUSED SAMPLING: {strategy.upper()}")
    print("-" * 50)

    original_counts = pd.Series(y_train).value_counts()
    print(f"Original distribution: {dict(original_counts)}")

    if strategy not in ('conservative_oversample', 'stratified_oversample'):
        print("No sampling applied")
        return X_train, y_train

    # Local generator: same stream as np.random.seed(random_state) followed by
    # np.random.choice, but without reseeding the process-wide RNG.
    rng = np.random.RandomState(random_state)

    minority_mask = y_train == 1
    majority_mask = y_train == 0

    minority_X = X_train[minority_mask]
    minority_y = y_train[minority_mask]
    majority_X = X_train[majority_mask]
    majority_y = y_train[majority_mask]

    n_minority = len(minority_y)
    n_majority = len(majority_y)

    # Pieces are concatenated once at the end: majority first, then minority
    X_parts = [majority_X]
    y_parts = [majority_y]

    if strategy == 'conservative_oversample':
        X_parts.append(minority_X)
        y_parts.append(minority_y)

        target_minority = int(n_majority / 1.3)  # 1.3:1 ratio
        n_extra = target_minority - n_minority

        # Nothing can be bootstrapped from an empty minority class
        if n_extra > 0 and n_minority > 0:
            bootstrap_indices = rng.choice(n_minority, size=n_extra, replace=True)
            X_parts.append(minority_X.iloc[bootstrap_indices])
            y_parts.append(minority_y.iloc[bootstrap_indices])

    elif n_minority > 0:  # 'stratified_oversample'
        from sklearn.cluster import KMeans

        # KMeans cannot form more clusters than there are samples
        n_clusters = min(5, n_minority)
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        clusters = kmeans.fit_predict(minority_X)

        for cluster_id in np.unique(clusters):
            cluster_mask = clusters == cluster_id
            cluster_X = minority_X[cluster_mask]
            cluster_y = minority_y[cluster_mask]

            # 30% additional rows per cluster, but always at least one
            n_samples = len(cluster_X)
            additional_samples = max(1, int(n_samples * 0.3))
            bootstrap_indices = rng.choice(n_samples, size=additional_samples, replace=True)

            X_parts.extend([cluster_X, cluster_X.iloc[bootstrap_indices]])
            y_parts.extend([cluster_y, cluster_y.iloc[bootstrap_indices]])

    X_resampled = pd.concat(X_parts).reset_index(drop=True)
    y_resampled = pd.concat(y_parts).reset_index(drop=True)

    resampled_counts = pd.Series(y_resampled).value_counts()
    print(f"Resampled distribution: {dict(resampled_counts)}")
    print(f"✓ Precision-focused sampling completed")

    return X_resampled, y_resampled

def implement_precision_optimized_models(data_splits, random_state=42):
    """
    Fit four classifiers on engineered, oversampled training data and score
    each one on the validation split at a fixed probability threshold.

    Pipeline: feature engineering -> conservative oversampling of the
    training split -> class weights derived from the resampled counts ->
    calibrated logistic regression (grid-searched), random forest, XGBoost
    and a stacking ensemble.

    Parameters
    ----------
    data_splits : dict
        Must contain ``X_train``, ``y_train``, ``X_val`` and ``y_val``;
        ``X_test`` is optional.
    random_state : int
        Seed forwarded to the sampler and to every estimator.

    Returns
    -------
    dict
        One entry per model (``logistic_regression``, ``random_forest``,
        ``xgboost``, ``stacking_ensemble``), each a dict with keys ``model``,
        ``scaler``, ``val_predictions``, ``val_probabilities``, ``threshold``,
        ``features`` and ``requires_scaling``.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("PHASE 3.2: PRECISION-OPTIMIZED MODEL IMPLEMENTATION")
    print(rule)

    X_train, y_train = data_splits['X_train'], data_splits['y_train']
    X_val, y_val = data_splits['X_val'], data_splits['y_val']
    X_test = data_splits.get('X_test', None)

    # Feature engineering is learned on the training split; the engineered
    # test frame is produced but not used in this function.
    X_train_eng, X_val_eng, _X_test_eng, engineered_features = advanced_feature_engineering(
        X_train, y_train, X_val, X_test
    )

    # Oversample the positive class of the engineered training data
    X_train_balanced, y_train_balanced = precision_focused_sampling(
        X_train_eng, y_train, strategy='conservative_oversample', random_state=random_state
    )

    # Class weights are based on the class counts AFTER resampling
    class_counts = pd.Series(y_train_balanced).value_counts().sort_index()
    negative_to_positive = class_counts[0] / class_counts[1]
    class_weights = {0: 1.0, 1: negative_to_positive * 1.5}

    print(f"Conservative class weights: {class_weights}")

    models = {}

    def _score_and_store(key, label, estimator, scaler, probabilities, threshold):
        """Threshold the validation probabilities, record the model, print its metrics."""
        predictions = (probabilities >= threshold).astype(int)
        models[key] = {
            'model': estimator,
            'scaler': scaler,
            'val_predictions': predictions,
            'val_probabilities': probabilities,
            'threshold': threshold,
            'features': engineered_features,
            'requires_scaling': scaler is not None
        }
        recall = recall_score(y_val, predictions)
        precision = precision_score(y_val, predictions)
        f1 = f1_score(y_val, predictions)
        print(f"{label} - Recall: {recall:.4f}, Precision: {precision:.4f}, F1: {f1:.4f}")

    # ----------------------------------------------------------------
    # 3.2.1 Logistic regression: grid search, then isotonic calibration
    # ----------------------------------------------------------------
    print("\n3.2.1 PRECISION-OPTIMIZED LOGISTIC REGRESSION")
    print("-" * 55)

    scaler_lr = RobustScaler()
    X_train_scaled = scaler_lr.fit_transform(X_train_balanced)
    X_val_scaled = scaler_lr.transform(X_val_eng)

    lr_grid = GridSearchCV(
        LogisticRegression(
            class_weight=class_weights,
            random_state=random_state,
            max_iter=3000
        ),
        {
            'C': [0.5, 0.8, 1.0, 1.2, 1.5],
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear']
        },
        cv=3,
        scoring='f1',
        n_jobs=-1
    )
    lr_grid.fit(X_train_scaled, y_train_balanced)

    # Calibrate the best grid-search estimator on the same scaled data
    lr_model = CalibratedClassifierCV(lr_grid.best_estimator_, method='isotonic', cv=3)
    lr_model.fit(X_train_scaled, y_train_balanced)

    _score_and_store(
        'logistic_regression', 'Tuned LR', lr_model, scaler_lr,
        lr_model.predict_proba(X_val_scaled)[:, 1],
        0.45
    )
    print(f"Best params: {lr_grid.best_params_}")

    # ----------------------------------------------------------------
    # 3.2.2 Random forest with constrained tree growth
    # ----------------------------------------------------------------
    print("\n3.2.2 PRECISION-OPTIMIZED RANDOM FOREST")
    print("-" * 50)

    rf_model = RandomForestClassifier(
        n_estimators=600,
        max_depth=15,
        min_samples_split=25,
        min_samples_leaf=10,
        max_features='log2',
        class_weight=class_weights,
        random_state=random_state,
        n_jobs=-1,
        criterion='gini',
        bootstrap=True,
        oob_score=True,
        max_samples=0.7      # each tree sees 70% of the rows
    )
    rf_model.fit(X_train_balanced, y_train_balanced)

    _score_and_store(
        'random_forest', 'Conservative RF', rf_model, None,
        rf_model.predict_proba(X_val_eng)[:, 1],
        0.40
    )
    print(f"OOB Score: {rf_model.oob_score_:.4f}")

    # ----------------------------------------------------------------
    # 3.2.3 XGBoost with shallow trees and L1/L2 regularisation
    # ----------------------------------------------------------------
    print("\n3.2.3 PRECISION-OPTIMIZED XGBOOST")
    print("-" * 45)

    scale_pos_weight = negative_to_positive * 1.8

    xgb_model = xgb.XGBClassifier(
        n_estimators=600,
        learning_rate=0.06,
        max_depth=4,
        min_child_weight=6,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=scale_pos_weight,
        random_state=random_state,
        eval_metric='aucpr',
        reg_alpha=0.4,
        reg_lambda=0.4,
        tree_method='hist',
        objective='binary:logistic',
        gamma=0.3
    )
    # The validation set is passed for metric tracking only
    xgb_model.fit(
        X_train_balanced, y_train_balanced,
        eval_set=[(X_val_eng, y_val)],
        verbose=False
    )

    _score_and_store(
        'xgboost', 'Regularized XGB', xgb_model, None,
        xgb_model.predict_proba(X_val_eng)[:, 1],
        0.38
    )

    # ----------------------------------------------------------------
    # 3.2.4 Stacking ensemble: RF + XGB + GBM under a logistic meta-learner
    # ----------------------------------------------------------------
    print("\n3.2.4 STACKING ENSEMBLE FOR PRECISION OPTIMIZATION")
    print("-" * 60)

    rf_base = RandomForestClassifier(
        n_estimators=400, max_depth=12, min_samples_split=30,
        min_samples_leaf=15, class_weight=class_weights,
        random_state=random_state, n_jobs=-1
    )
    xgb_base = xgb.XGBClassifier(
        n_estimators=400, learning_rate=0.08, max_depth=3,
        min_child_weight=8, scale_pos_weight=scale_pos_weight,
        reg_alpha=0.5, reg_lambda=0.5, random_state=random_state
    )
    gbm_base = GradientBoostingClassifier(
        n_estimators=400, learning_rate=0.08, max_depth=3,
        min_samples_split=30, min_samples_leaf=15,
        random_state=random_state, subsample=0.8
    )
    meta_learner = LogisticRegression(
        C=0.8, penalty='l2', class_weight=class_weights,
        random_state=random_state
    )

    stacking_model = StackingClassifier(
        estimators=[
            ('rf_conservative', rf_base),
            ('xgb_conservative', xgb_base),
            ('gbm_conservative', gbm_base)
        ],
        final_estimator=meta_learner,
        cv=3,
        stack_method='predict_proba',
        n_jobs=-1
    )
    stacking_model.fit(X_train_balanced, y_train_balanced)

    _score_and_store(
        'stacking_ensemble', 'Stacking Ensemble', stacking_model, None,
        stacking_model.predict_proba(X_val_eng)[:, 1],
        0.42
    )

    return models

def tiered_threshold_optimization(models, data_splits, target_recall=0.90):
    """
    Score every model at three fixed risk-tier thresholds on the validation set.

    Tiers: high risk (probability >= 0.70), medium risk (>= 0.40) and low
    risk (>= 0.20). For each tier the recall, precision, F1 and coverage
    (share of validation rows flagged) are computed. Precision and F1 are
    reported as 0.0 when a tier flags no rows at all.

    Side effect: each entry of ``models`` gains a ``'tiered_predictions'`` key
    holding the per-tier thresholds, predictions and metrics.

    Parameters
    ----------
    models : dict
        Mapping of model name to a dict containing ``'val_probabilities'``.
    data_splits : dict
        Must contain ``'y_val'``.
    target_recall : float
        Minimum recall a low-risk tier must reach to be listed as a
        high-recall tier in the summary.

    Returns
    -------
    pandas.DataFrame
        One row per (model, tier) with columns Model, Tier, Threshold, Recall,
        Precision, F1, Coverage and AUC. Empty (but with these columns) when
        ``models`` is empty.
    """
    print(f"\n{'='*80}")
    print("TIERED INTERVENTION THRESHOLD OPTIMIZATION")
    print(f"{'='*80}")

    y_val = data_splits['y_val']
    n_val = len(y_val)

    # (tier key, probability threshold), highest precision first
    tier_thresholds = (
        ('high_risk', 0.70),
        ('medium_risk', 0.40),
        ('low_risk', 0.20),
    )
    min_tier_precision = 0.70
    result_columns = ['Model', 'Tier', 'Threshold', 'Recall', 'Precision', 'F1', 'Coverage', 'AUC']
    tiered_results = []

    for model_name, model_info in models.items():
        y_proba = model_info['val_probabilities']
        display_name = model_name.replace('_', ' ').title()

        print(f"\nOptimizing {display_name}...")

        # AUC does not depend on the threshold: compute it once per model
        auc = roc_auc_score(y_val, y_proba)

        tiers = {}
        for tier_name, threshold in tier_thresholds:
            predictions = (y_proba >= threshold).astype(int)
            n_flagged = predictions.sum()
            # Precision/F1 are undefined without positive predictions
            has_positives = n_flagged > 0
            tiers[tier_name] = {
                'threshold': threshold,
                'predictions': predictions,
                'recall': recall_score(y_val, predictions),
                'precision': precision_score(y_val, predictions) if has_positives else 0.0,
                'f1': f1_score(y_val, predictions) if has_positives else 0.0,
                'coverage': n_flagged / n_val
            }

        for tier_name, tier_metrics in tiers.items():
            tiered_results.append({
                'Model': display_name,
                'Tier': tier_name.replace('_', ' ').title(),
                'Threshold': tier_metrics['threshold'],
                'Recall': tier_metrics['recall'],
                'Precision': tier_metrics['precision'],
                'F1': tier_metrics['f1'],
                'Coverage': tier_metrics['coverage'],
                'AUC': auc
            })

        # Update model with tiered predictions
        models[model_name]['tiered_predictions'] = tiers

        # Display tier results
        for tier_name, tier_metrics in tiers.items():
            tier_label = tier_name.replace('_', ' ').title()
            print(f"  {tier_label} Tier (≥{tier_metrics['threshold']}): "
                  f"Precision={tier_metrics['precision']:.4f}, "
                  f"Recall={tier_metrics['recall']:.4f}, "
                  f"Coverage={tier_metrics['coverage']:.2%}")

    # Explicit columns keep the frame usable even when no model was supplied
    results_df = pd.DataFrame(tiered_results, columns=result_columns)

    print(f"\n{'='*80}")
    print("TIERED INTERVENTION RESULTS:")
    print(f"{'='*80}")
    print(results_df.round(4).to_string(index=False))

    # Analysis
    high_precision_tiers = results_df[
        (results_df['Tier'] == 'High Risk') & (results_df['Precision'] >= min_tier_precision)
    ]
    high_recall_tiers = results_df[
        (results_df['Tier'] == 'Low Risk') & (results_df['Recall'] >= target_recall)
    ]

    print(f"\n{'='*60}")
    print("TIERED STRATEGY SUMMARY:")
    print(f"{'='*60}")
    print(f"High-precision tiers (≥{min_tier_precision:.0%} precision): {len(high_precision_tiers)}")
    print(f"High-recall tiers (≥{target_recall:.0%} recall): {len(high_recall_tiers)}")

    if len(high_precision_tiers) > 0:
        best_precision_tier = high_precision_tiers.loc[high_precision_tiers['Precision'].idxmax()]
        print(f"\nBest high-precision tier: {best_precision_tier['Model']} - {best_precision_tier['Tier']}")
        print(f"  Precision: {best_precision_tier['Precision']:.4f}")
        print(f"  Recall: {best_precision_tier['Recall']:.4f}")
        print(f"  Population Coverage: {best_precision_tier['Coverage']:.2%}")

    if len(high_recall_tiers) > 0:
        best_recall_tier = high_recall_tiers.loc[high_recall_tiers['Recall'].idxmax()]
        print(f"\nBest high-recall tier: {best_recall_tier['Model']} - {best_recall_tier['Tier']}")
        print(f"  Recall: {best_recall_tier['Recall']:.4f}")
        print(f"  Precision: {best_recall_tier['Precision']:.4f}")
        print(f"  Population Coverage: {best_recall_tier['Coverage']:.2%}")

    return results_df

def comprehensive_evaluation(models, data_splits, tiered_results):
    """
    Summarise validation performance of every model at its stored threshold.

    Builds one table row per model (AUC, precision, recall, F1, accuracy,
    threshold), prints the table, the best model per metric, and how many
    models reach at least 85% recall and/or at least 65% precision.

    Parameters
    ----------
    models : dict
        Mapping of model name to a dict with ``'val_predictions'``,
        ``'val_probabilities'`` and ``'threshold'``.
    data_splits : dict
        Must contain ``'y_val'``.
    tiered_results : pandas.DataFrame
        Accepted for interface compatibility; not read by this function.

    Returns
    -------
    pandas.DataFrame
        The per-model performance table.
    """
    print("\n" + "="*80)
    print("COMPREHENSIVE EVALUATION - REALISTIC TARGETS")
    print("="*80)

    y_val = data_splits['y_val']

    def _summary_row(name, info):
        """Collect the validation metrics of a single model."""
        predicted = info['val_predictions']
        scores = info['val_probabilities']
        return {
            'Model': name.replace('_', ' ').title(),
            'AUC': roc_auc_score(y_val, scores),
            'Precision': precision_score(y_val, predicted),
            'Recall': recall_score(y_val, predicted),
            'F1': f1_score(y_val, predicted),
            'Accuracy': accuracy_score(y_val, predicted),
            'Threshold': info['threshold']
        }

    standard_df = pd.DataFrame([_summary_row(name, info) for name, info in models.items()])

    print("STANDARD MODEL PERFORMANCE:")
    print("=" * 85)
    print(standard_df.round(4).to_string(index=False))

    # Row of the top-scoring model for each headline metric
    leaders = {
        metric: standard_df.loc[standard_df[metric].idxmax()]
        for metric in ('Precision', 'Recall', 'F1')
    }

    print("\nBEST INDIVIDUAL PERFORMANCES:")
    for metric, leader in leaders.items():
        print(f"Highest {metric}: {leader['Model']} - {leader[metric]:.4f}")

    # How many models clear the recall / precision bars, separately and jointly
    meets_recall = standard_df['Recall'] >= 0.85
    meets_precision = standard_df['Precision'] >= 0.65
    balanced_models = standard_df[meets_recall & meets_precision]
    n_models = len(standard_df)

    print("\nREALISTIC PERFORMANCE ANALYSIS:")
    print(f"Models with ≥85% recall: {len(standard_df[meets_recall])}/{n_models}")
    print(f"Models with ≥65% precision: {len(standard_df[meets_precision])}/{n_models}")
    print(f"Models achieving BOTH (≥85% recall & ≥65% precision): {len(balanced_models)}/{n_models}")

    if len(balanced_models) > 0:
        print("\nACHIEVABLE BALANCED MODELS:")
        rows = zip(
            balanced_models['Model'],
            balanced_models['Recall'],
            balanced_models['Precision'],
            balanced_models['F1']
        )
        for model_label, recall, precision, f1 in rows:
            print(f"  {model_label}: Recall={recall:.4f}, Precision={precision:.4f}, F1={f1:.4f}")

    return standard_df

# Main execution function
def main(dataset_path=None):
    """
    Execute the comprehensive precision-recall optimization pipeline.

    Loads the dataset, runs Phase 3.1 (data preparation), Phase 3.2 (model
    implementation), Phase 3.3 (tiered threshold optimization) and Phase 3.4
    (evaluation), then prints a final summary of the results.

    Parameters
    ----------
    dataset_path : str or path-like, optional
        CSV file to load. When omitted, the original hard-coded thesis
        location is used, so existing ``main()`` calls behave as before.

    Returns
    -------
    tuple
        ``(data_splits, models, standard_results, tiered_results)`` on
        success, or ``(None, None, None, None)`` when the dataset file is
        missing or any phase raises (the error is printed, not re-raised).
    """

    if dataset_path is None:
        # Original hard-coded location, kept as the default for backward compatibility.
        dataset_path = r"C:\Users\USER\Desktop\MUKABUGINGO_THESIS_CODES\ANALYSIS\rwanda_dhs_processed.csv"

    try:
        # Load dataset
        df = pd.read_csv(dataset_path)
        print(f"Dataset loaded successfully: {df.shape}")

        target_col = 'early_sexual_debut'
        if target_col not in df.columns:
            raise ValueError(f"Target variable '{target_col}' not found in dataset!")

        # Phase 3.1: Enhanced data preparation
        print("\nStarting Phase 3.1: Enhanced Data Preparation...")
        data_splits = prepare_enhanced_data(df, target_col=target_col)

        # Phase 3.2: Precision-optimized model implementation
        print("\nStarting Phase 3.2: Precision-Optimized Model Implementation...")
        models = implement_precision_optimized_models(data_splits)

        # Phase 3.3: Tiered threshold optimization
        print("\nStarting Phase 3.3: Tiered Intervention Strategy...")
        tiered_results = tiered_threshold_optimization(models, data_splits, target_recall=0.90)

        # Phase 3.4: Comprehensive evaluation
        print("\nStarting Phase 3.4: Comprehensive Evaluation...")
        standard_results = comprehensive_evaluation(models, data_splits, tiered_results)

        # FINAL SUMMARY
        print("\n" + "="*80)
        print("PRECISION-RECALL OPTIMIZATION COMPLETE!")
        print("="*80)

        # Find best achievable performance: models meeting both targets at once
        best_balanced = standard_results.loc[
            (standard_results['Recall'] >= 0.85) &
            (standard_results['Precision'] >= 0.65)
        ]

        if len(best_balanced) > 0:
            # Among the balanced candidates, report the one with the highest F1
            optimal_model = best_balanced.loc[best_balanced['F1'].idxmax()]
            print("\nOPTIMAL ACHIEVABLE PERFORMANCE:")
            print(f"Model: {optimal_model['Model']}")
            print(f"  Recall: {optimal_model['Recall']:.4f} ({optimal_model['Recall']*100:.1f}%)")
            print(f"  Precision: {optimal_model['Precision']:.4f} ({optimal_model['Precision']*100:.1f}%)")
            print(f"  F1 Score: {optimal_model['F1']:.4f}")
            print(f"  AUC: {optimal_model['AUC']:.4f}")
            print(f"  Cases missed: {(1-optimal_model['Recall'])*100:.1f}%")
            # 1 - precision is FP / (TP + FP), i.e. the false discovery rate;
            # the false positive rate would be FP / (FP + TN), which is not
            # derivable from precision alone.
            print(f"  False discovery rate (1 - precision): {(1-optimal_model['Precision'])*100:.1f}%")
        else:
            print("\nRECOMMENDED APPROACH: Use tiered intervention strategy")
            print("Different thresholds for different intervention intensities")

        # Tiered strategy recommendations
        high_precision_tiers = tiered_results[(tiered_results['Tier'] == 'High Risk') &
                                            (tiered_results['Precision'] >= 0.70)]

        if len(high_precision_tiers) > 0:
            best_precision_tier = high_precision_tiers.loc[high_precision_tiers['Precision'].idxmax()]
            print("\nTIERED STRATEGY RECOMMENDATION:")
            print(f"High-Precision Tier: {best_precision_tier['Model']} at threshold {best_precision_tier['Threshold']}")
            print(f"  Precision: {best_precision_tier['Precision']:.4f}")
            print(f"  Recall: {best_precision_tier['Recall']:.4f}")
            print(f"  Population Coverage: {best_precision_tier['Coverage']:.2%}")
            print("  Intervention: Intensive individual support")

        print("\nREALISTIC TARGET ACHIEVEMENT:")
        # NOTE(review): the precision cut-off here is 0.68 while the balanced
        # filter above uses 0.65 — confirm which target is intended.
        recall_85_count = int((standard_results['Recall'] >= 0.85).sum())
        precision_68_count = int((standard_results['Precision'] >= 0.68).sum())
        print(f"  Models achieving ≥85% recall: {recall_85_count}/{len(standard_results)}")
        print(f"  Models achieving ≥68% precision: {precision_68_count}/{len(standard_results)}")

        print("\nDEPLOYMENT READINESS:")
        print("  Advanced feature engineering completed")
        print("  Conservative sampling strategy applied")
        print("  Tiered intervention thresholds optimized")
        print("  Realistic performance targets identified")
        print("  Ready for pilot implementation")

        return data_splits, models, standard_results, tiered_results

    except FileNotFoundError:
        print("ERROR: Dataset not found at specified path")
        return None, None, None, None

    except Exception as e:
        # Top-level boundary: report the failure with a traceback and hand
        # back the all-None tuple so callers can still unpack four values.
        print(f"ERROR in precision-recall optimization: {e}")
        import traceback
        traceback.print_exc()
        return None, None, None, None

# Script entry point: run the full pipeline and keep its four outputs as
# module-level names so they can be inspected afterwards.
if __name__ == "__main__":
    (
        data_splits,
        models,
        standard_results,
        tiered_results,
    ) = main()

Dataset loaded successfully: (14634, 66)

Starting Phase 3.1: Enhanced Data Preparation...
PHASE 3.1: ENHANCED DATA PREPARATION WITH FEATURE ENGINEERING
Dataset after removing missing targets: (14634, 66)

COLUMNS EXCLUDED FROM FEATURES:
  ID variables: ['caseid', 'household_id', 'v001', 'v002']
  Leakage variables: ['v525', 'v512', 'v511', 'v212']
  Target variable: ['early_sexual_debut']
  Total excluded: 9
✓ Confirmed: All problematic variables excluded from features
✓ Total features available: 61

TARGET VARIABLE DISTRIBUTION:
  Late debut (0): 7,919 (54.1%)
  Early debut (1): 6,715 (45.9%)

DATA SPLIT SUMMARY:
Train: 10243, Val: 2195, Test: 2196

Starting Phase 3.2: Precision-Optimized Model Implementation...

PHASE 3.2: PRECISION-OPTIMIZED MODEL IMPLEMENTATION

ADVANCED FEATURE ENGINEERING
--------------------------------------------------
Step 1: Removing highly correlated features...
  Removing 22 highly correlated features
Step 2: Advanced feature selection...
  Selected top 2