In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report
from scipy import stats
from scipy.stats import chi2_contingency
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.mediation import Mediation
import warnings
warnings.filterwarnings('ignore')

class EducationalProtectionAnalysis:
    """
    Analysis pipeline for H2, the educational-protection hypothesis.

    The pipeline loads a CSV, derives a few indicator columns and then runs
    five descriptive analyses (education gradient, education -> wealth ->
    outcome mediation, random-forest models, wealth-stratified gradients and
    a dose-response correlation). Each analysis stores its output in
    ``self.results`` under its own key; ``generate_h2_assessment`` turns those
    outputs into a single status string.

    All "significant"/"confirmed" flags produced here are fixed effect-size
    thresholds, not statistical tests, and they only fire when the effect
    points in the protective direction (more education -> less early debut).
    """

    def __init__(self, data_path):
        """Remember the CSV location; nothing is read until load_and_prepare_data()."""
        self.data_path = data_path
        self.data = None
        self.results = {}

    @staticmethod
    def _debut_rate(frame, mask):
        """Return ``(rate, n)`` for ``early_sexual_debut`` over the masked rows.

        ``n`` counts only rows with a non-missing outcome so that it matches
        the denominator actually used by the mean.
        """
        outcome = frame.loc[mask, 'early_sexual_debut'].dropna()
        n_valid = int(outcome.shape[0])
        rate = float(outcome.mean()) if n_valid else float('nan')
        return rate, n_valid

    @staticmethod
    def _fit_forest_auc(X_train, X_test, y_train, y_test, features):
        """Fit a balanced random forest on ``features``; return (test AUC, importances)."""
        model = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
        model.fit(X_train[features], y_train)
        auc = roc_auc_score(y_test, model.predict_proba(X_test[features])[:, 1])
        return auc, dict(zip(features, model.feature_importances_))

    def load_and_prepare_data(self):
        """Read the CSV at ``self.data_path`` and add derived indicator columns.

        Returns:
            The loaded DataFrame (also stored on ``self.data``).
        """
        print("=" * 80)
        print("H2: EDUCATIONAL PROTECTION HYPOTHESIS - ADVANCED ANALYSIS")
        print("=" * 80)

        self.data = pd.read_csv(self.data_path)
        print(f"Dataset loaded: {self.data.shape[0]:,} rows, {self.data.shape[1]} columns")

        # Create refined education variables for protection analysis.
        # Comparisons against NaN are False, so a missing v106/v150/v190 value
        # ends up as 0 in the indicators below.
        self.data['education_primary_plus'] = (self.data['v106'] >= 1).astype(int)
        self.data['education_secondary_plus'] = (self.data['v106'] >= 2).astype(int)
        self.data['education_years_scaled'] = self.data['v107'] / self.data['v107'].max()
        # NOTE(review): in the standard DHS recode v150 is "relationship to
        # household head" and literacy is v155 - confirm which variable the
        # processed file really stores under v150 before relying on this flag.
        self.data['literacy_full'] = (self.data['v150'] == 3).astype(int)  # Can read whole sentence

        # Create wealth controls
        self.data['wealth_high'] = (self.data['v190'] >= 4).astype(int)  # Richer + Richest
        self.data['wealth_medium'] = (self.data['v190'] == 3).astype(int)  # Middle

        # Create urban/regional controls
        self.data['urban'] = (self.data['v102'] == 1).astype(int)
        self.data['kigali'] = (self.data['v101'] == 1).astype(int)

        print("Educational protection variables created successfully")
        return self.data

    def educational_gradient_analysis(self):
        """Tabulate the early-debut rate (with a Wald 95% CI) per education level.

        Levels with 100 or fewer non-missing outcomes are skipped.

        Returns:
            dict mapping level label -> ``debut_rate``, ``sample_size``,
            ``ci_lower``, ``ci_upper``; inserted in ascending education order.
        """
        print("\nSTEP 1: EDUCATIONAL GRADIENT ANALYSIS")
        print("-" * 50)

        # Create educational levels (ascending order is relied on downstream)
        education_levels = {
            'No Education': self.data['v106'] == 0,
            'Primary': self.data['v106'] == 1,
            'Secondary': self.data['v106'] == 2,
            'Higher': self.data['v106'] == 3
        }

        gradient_results = {}
        print("EARLY SEXUAL DEBUT BY EDUCATION LEVEL:")
        print("-" * 40)

        for level, mask in education_levels.items():
            debut_rate, n_sample = self._debut_rate(self.data, mask)
            if n_sample <= 100:  # Insufficient sample size
                continue

            # Wald interval; clipped because a proportion cannot leave [0, 1]
            se = np.sqrt(debut_rate * (1 - debut_rate) / n_sample)
            ci_lower = max(0.0, debut_rate - 1.96 * se)
            ci_upper = min(1.0, debut_rate + 1.96 * se)

            gradient_results[level] = {
                'debut_rate': debut_rate,
                'sample_size': n_sample,
                'ci_lower': ci_lower,
                'ci_upper': ci_upper
            }

            print(f"{level:12s}: {debut_rate:.3f} ({debut_rate*100:.1f}%) "
                  f"[95% CI: {ci_lower:.3f}-{ci_upper:.3f}] (n={n_sample:,})")

        # Calculate protection gradient (positive = less early debut with more education)
        if 'No Education' in gradient_results and 'Higher' in gradient_results:
            protection_effect = (gradient_results['No Education']['debut_rate'] -
                                 gradient_results['Higher']['debut_rate'])
            print(f"\nEducational Protection Gradient: {protection_effect:.3f} ({protection_effect*100:.1f} percentage points)")

        self.results['gradient'] = gradient_results
        return gradient_results

    def mediation_analysis(self):
        """Decompose the education -> early-debut association through wealth.

        Uses standardized path coefficients derived from the three pairwise
        Pearson correlations, so that ``total = direct + indirect`` holds
        exactly and "proportion mediated" is a share of the total effect.

        Returns:
            dict of path estimates, or ``None`` when fewer than 1000 complete
            rows are available or the inputs are degenerate.
        """
        print("\nSTEP 2: MEDIATION ANALYSIS - EDUCATION THROUGH WEALTH")
        print("-" * 60)

        # X = Education (predictor), M = Wealth (mediator), Y = Early debut (outcome)
        analysis_data = self.data[['v106', 'v190', 'early_sexual_debut']].dropna()

        if len(analysis_data) < 1000:
            print("Insufficient data for mediation analysis")
            return None

        X = analysis_data['v106']  # Education level
        M = analysis_data['v190']  # Wealth quintile
        Y = analysis_data['early_sexual_debut']  # Early debut

        # A constant column has no defined correlation
        if min(X.nunique(), M.nunique(), Y.nunique()) < 2:
            print("Mediation analysis requires variation in education, wealth and outcome")
            return None

        print(f"Mediation analysis sample: {len(analysis_data):,} observations")

        # Path a: X -> M (Education -> Wealth)
        corr_XM = stats.pearsonr(X, M)[0]
        corr_XY = stats.pearsonr(X, Y)[0]
        corr_MY = stats.pearsonr(M, Y)[0]

        # Share of mediator variance not explained by education; zero means
        # X and M are collinear and their effects cannot be told apart.
        unexplained = 1 - corr_XM ** 2
        if unexplained <= 1e-12:
            print("Education and wealth are perfectly collinear; effects cannot be separated")
            return None

        # Path b: standardized coefficient of M in Y ~ X + M
        partial_MY_X = (corr_MY - corr_XY * corr_XM) / unexplained

        # Path c: X -> Y (Total effect)
        corr_XY_total = corr_XY

        # Path c': standardized coefficient of X in Y ~ X + M (Direct effect)
        partial_XY_M = (corr_XY - corr_MY * corr_XM) / unexplained

        # Indirect effect (mediation): a * b, equal to c - c'
        indirect_effect = corr_XM * partial_MY_X

        # Proportion mediated
        if abs(corr_XY_total) > 0.001:
            prop_mediated = indirect_effect / corr_XY_total
        else:
            prop_mediated = 0

        # Protective means a negative direct effect of education on early debut
        direct_protective = bool(partial_XY_M < -0.05)

        mediation_results = {
            'path_a_education_to_wealth': corr_XM,
            'path_b_wealth_to_debut': partial_MY_X,
            'path_c_total_effect': corr_XY_total,
            'path_c_prime_direct_effect': partial_XY_M,
            'indirect_effect': indirect_effect,
            'proportion_mediated': prop_mediated,
            'direct_effect_significant': direct_protective
        }

        print("MEDIATION ANALYSIS RESULTS:")
        print("-" * 30)
        print(f"Path a (Education → Wealth):     {corr_XM:+.4f}")
        print(f"Path b (Wealth → Debut | Edu):   {partial_MY_X:+.4f}")
        print(f"Path c (Total Effect):           {corr_XY_total:+.4f}")
        print(f"Path c' (Direct Effect):         {partial_XY_M:+.4f}")
        print(f"Indirect Effect (Mediation):     {indirect_effect:+.4f}")
        print(f"Proportion Mediated:             {prop_mediated:.2%}")

        # Interpretation
        if direct_protective:
            print("\n✓ DIRECT EDUCATIONAL PROTECTION CONFIRMED")
            print("  Education has protective effect independent of wealth")
        elif partial_XY_M > 0.05:
            print("\n⚠ Direct effect of education is positive (higher risk) - not protective")
        else:
            print("\n⚠ Educational effect mainly mediated through wealth")

        self.results['mediation'] = mediation_results
        return mediation_results

    def controlled_modeling_analysis(self):
        """Compare random-forest AUCs for education, wealth and combined feature sets.

        Only columns present in the data are used. Returns ``None`` when there
        are no usable features, fewer than 1000 complete rows, or a single
        outcome class.

        Returns:
            dict keyed by model name (``education_only``, ``wealth_only``,
            ``combined``, ``full_model``) with AUC, feature list and, except
            for ``wealth_only``, feature importances.
        """
        print("\nSTEP 3: CONTROLLED MODELING - EDUCATION INDEPENDENT OF SES")
        print("-" * 65)

        # Prepare features
        education_features = ['v106', 'v107', 'v149', 'v150']
        wealth_controls = ['v190', 'v191', 'hv270', 'hv271']
        demographic_controls = ['v012', 'v101', 'v102', 'hv009']

        # Available features
        all_features = education_features + wealth_controls + demographic_controls
        available_features = [f for f in all_features if f in self.data.columns]

        if not available_features:
            print("Insufficient data for controlled modeling")
            return None

        # Prepare analysis data
        analysis_data = self.data[available_features + ['early_sexual_debut']].dropna()

        if len(analysis_data) < 1000:
            print("Insufficient data for controlled modeling")
            return None

        X = analysis_data[available_features]
        y = analysis_data['early_sexual_debut']

        # Stratified splitting and AUC both need two outcome classes
        if y.nunique() < 2:
            print("Insufficient data for controlled modeling")
            return None

        print(f"Analysis sample: {len(analysis_data):,} observations")

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42, stratify=y
        )
        split = (X_train, X_test, y_train, y_test)

        controlled_results = {}

        # Model 1: Education features only
        education_only = [f for f in education_features if f in available_features]
        if len(education_only) >= 2:
            auc_edu_only, edu_only_importance = self._fit_forest_auc(*split, education_only)
            controlled_results['education_only'] = {
                'auc': auc_edu_only,
                'features': education_only,
                'feature_importance': edu_only_importance
            }

        # Model 2: Wealth controls only
        wealth_only = [f for f in wealth_controls if f in available_features]
        if len(wealth_only) >= 2:
            auc_wealth_only, _ = self._fit_forest_auc(*split, wealth_only)
            controlled_results['wealth_only'] = {
                'auc': auc_wealth_only,
                'features': wealth_only
            }

        # Model 3: Education + Wealth controls (skipped if neither group exists)
        edu_wealth = education_only + wealth_only
        if edu_wealth:
            auc_combined, combined_importance = self._fit_forest_auc(*split, edu_wealth)
            controlled_results['combined'] = {
                'auc': auc_combined,
                'features': edu_wealth,
                'feature_importance': combined_importance
            }

        # Model 4: Full model with all controls
        auc_full, full_importance = self._fit_forest_auc(*split, available_features)
        controlled_results['full_model'] = {
            'auc': auc_full,
            'features': available_features,
            'feature_importance': full_importance
        }

        # Results summary
        print("CONTROLLED MODELING RESULTS:")
        print("-" * 35)
        for model_name, results in controlled_results.items():
            print(f"{model_name:15s}: AUC = {results['auc']:.4f}")

        # Education contribution analysis.
        # NOTE: an education-only AUC measures marginal predictive power; it is
        # not adjusted for wealth.
        if 'education_only' in controlled_results and 'wealth_only' in controlled_results:
            edu_contribution = controlled_results['education_only']['auc'] - 0.5
            wealth_contribution = controlled_results['wealth_only']['auc'] - 0.5

            print("\nCONTRIBUTION ANALYSIS:")
            print(f"Education alone contribution: {edu_contribution:.4f}")
            print(f"Wealth alone contribution:    {wealth_contribution:.4f}")

            if edu_contribution > 0.05:  # Meaningful threshold
                print("✓ EDUCATION HAS SUBSTANTIAL INDEPENDENT PREDICTIVE POWER")

        # Feature importance in full model
        print("\nTOP EDUCATION FEATURES IN FULL MODEL:")
        edu_importance = {k: v for k, v in full_importance.items() if k in education_features}
        for feature, importance in sorted(edu_importance.items(), key=lambda x: x[1], reverse=True):
            print(f"  {feature}: {importance:.4f}")

        self.results['controlled_modeling'] = controlled_results
        return controlled_results

    def stratified_protection_analysis(self):
        """Measure the education gradient separately inside each wealth stratum.

        A stratum needs at least 500 rows, and an education level more than 50
        non-missing outcomes, to be reported. The protection effect is the
        no-education rate minus the rate of the highest level present
        (secondary or above); it is flagged only when it exceeds +0.03.

        Returns:
            dict mapping stratum label -> ``education_levels``,
            ``protection_effect``, ``protection_significant``.
        """
        print("\nSTEP 4: STRATIFIED ANALYSIS - EDUCATION WITHIN WEALTH LEVELS")
        print("-" * 65)

        # Create wealth strata
        wealth_strata = {
            'Low Wealth (Q1-Q2)': self.data['v190'] <= 2,
            'Medium Wealth (Q3)': self.data['v190'] == 3,
            'High Wealth (Q4-Q5)': self.data['v190'] >= 4
        }
        edu_labels = {0: 'No Edu', 1: 'Primary', 2: 'Secondary', 3: 'Higher'}

        stratified_results = {}

        for stratum_name, stratum_mask in wealth_strata.items():
            stratum_data = self.data[stratum_mask]

            if len(stratum_data) < 500:
                continue

            print(f"\n{stratum_name} (n={len(stratum_data):,}):")
            print("-" * (len(stratum_name) + 15))

            # Education gradient within this wealth stratum
            education_protection = {}

            for edu_level in [0, 1, 2, 3]:  # No, Primary, Secondary, Higher
                debut_rate, n_sample = self._debut_rate(
                    stratum_data, stratum_data['v106'] == edu_level
                )

                if n_sample > 50:  # Minimum sample size
                    education_protection[edu_level] = {
                        'debut_rate': debut_rate,
                        'sample_size': n_sample
                    }
                    print(f"  {edu_labels[edu_level]:9s}: {debut_rate:.3f} ({debut_rate*100:.1f}%) (n={n_sample:,})")

            # Calculate protection within stratum
            if 0 in education_protection and max(education_protection.keys()) >= 2:
                no_edu_rate = education_protection[0]['debut_rate']
                highest_edu_level = max(education_protection.keys())
                high_edu_rate = education_protection[highest_edu_level]['debut_rate']

                protection_within_stratum = no_edu_rate - high_edu_rate
                print(f"  Protection within stratum: {protection_within_stratum:+.3f} ({protection_within_stratum*100:+.1f}pp)")

                stratified_results[stratum_name] = {
                    'education_levels': education_protection,
                    'protection_effect': protection_within_stratum,
                    # Signed on purpose: a higher rate among the educated is not protection
                    'protection_significant': bool(protection_within_stratum > 0.03)
                }

        # Summary of stratified effects
        print("\nSTRATIFIED PROTECTION SUMMARY:")
        print("-" * 35)

        significant_strata = 0
        for stratum, results in stratified_results.items():
            effect = results['protection_effect']
            if results['protection_significant']:
                significant_strata += 1
                print(f"✓ {stratum}: {effect:+.3f} (SIGNIFICANT)")
            elif effect < -0.03:
                print(f"✗ {stratum}: {effect:+.3f} (REVERSED - higher risk with education)")
            else:
                print(f"○ {stratum}: {effect:+.3f} (weak)")

        if significant_strata >= 2:
            print("\n✓ EDUCATIONAL PROTECTION CONFIRMED ACROSS WEALTH STRATA")
            print(f"  Education shows independent protective effects in {significant_strata} wealth levels")

        self.results['stratified'] = stratified_results
        return stratified_results

    def education_dose_response(self):
        """Relate ``v107`` (years of education) to early debut.

        Prints the debut rate per year band (bands with 100 or fewer
        non-missing outcomes are skipped) and, when more than 1000 complete
        rows exist, the Pearson correlation between years and outcome.

        Returns:
            dict of per-band results plus an optional ``'correlation'`` entry,
            or ``None`` when ``v107`` is absent.
        """
        print("\nSTEP 5: DOSE-RESPONSE ANALYSIS - YEARS OF EDUCATION")
        print("-" * 60)

        if 'v107' not in self.data.columns:
            print("Years of education variable not available")
            return None

        # NOTE(review): in the standard DHS recode v107 is the highest year
        # completed within the v106 level (v133 holds total years) - confirm
        # that the processed file stores total years here.
        # Bands are inclusive on both ends; values above 20 fall in no band
        # even though the last label reads "13+".
        education_bins = [
            (0, 0, 'No Education'),
            (1, 6, 'Primary (1-6 years)'),
            (7, 12, 'Secondary (7-12 years)'),
            (13, 20, 'Higher (13+ years)')
        ]

        dose_results = {}

        print("DOSE-RESPONSE: Early Sexual Debut by Years of Education")
        print("-" * 55)

        years = self.data['v107']
        for min_years, max_years, label in education_bins:
            in_band = (years >= min_years) & (years <= max_years)
            debut_rate, n_sample = self._debut_rate(self.data, in_band)

            if n_sample > 100:
                dose_results[label] = {
                    'debut_rate': debut_rate,
                    'sample_size': n_sample,
                    'years_range': (min_years, max_years)
                }

                print(f"{label:25s}: {debut_rate:.3f} ({debut_rate*100:.1f}%) (n={n_sample:,})")

        # Linear trend analysis
        valid_data = self.data[['v107', 'early_sexual_debut']].dropna()

        if len(valid_data) > 1000:
            correlation, p_value = stats.pearsonr(valid_data['v107'], valid_data['early_sexual_debut'])

            print("\nDOSE-RESPONSE CORRELATION:")
            print(f"Years Education ↔ Early Debut: r = {correlation:+.4f}, p = {p_value:.6f}")

            if p_value < 0.05 and correlation < -0.05:
                print("✓ SIGNIFICANT DOSE-RESPONSE PROTECTION CONFIRMED")
                print("  Each additional year of education reduces early debut risk")

            dose_results['correlation'] = {
                'correlation': correlation,
                'p_value': p_value,
                'sample_size': len(valid_data)
            }

        self.results['dose_response'] = dose_results
        return dose_results

    def generate_h2_assessment(self):
        """Combine the stored results into an overall H2 verdict.

        Each of the five analyses contributes one boolean piece of evidence,
        counted only when the effect is in the protective direction. Analyses
        that were not run (or returned nothing) count as no evidence.

        Returns:
            ``(status, evidence_summary)`` where ``status`` is one of
            "FULLY ACHIEVED", "SUBSTANTIALLY ACHIEVED", "MODERATELY ACHIEVED",
            "PARTIALLY ACHIEVED" or "NOT ACHIEVED".
        """
        print("\n" + "=" * 80)
        print("H2: EDUCATIONAL PROTECTION HYPOTHESIS - FINAL ASSESSMENT")
        print("=" * 80)

        evidence_summary = {
            'gradient_evidence': False,
            'direct_effect_evidence': False,
            'independent_prediction_evidence': False,
            'stratified_evidence': False,
            'dose_response_evidence': False
        }

        # 1. Educational gradient: the least-educated group must have a rate
        #    at least 5 points above the most-educated group. Entries are
        #    stored in ascending education order, so first/last are the ends.
        gradient_data = self.results.get('gradient') or {}
        if len(gradient_data) >= 3:  # Multiple education levels
            rates = [data['debut_rate'] for data in gradient_data.values()]
            if rates[0] - rates[-1] > 0.05:
                evidence_summary['gradient_evidence'] = True

        # 2. Direct effect from mediation
        mediation = self.results.get('mediation') or {}
        if mediation.get('direct_effect_significant', False):
            evidence_summary['direct_effect_evidence'] = True

        # 3. Independent predictive power
        modeling = self.results.get('controlled_modeling') or {}
        if 'education_only' in modeling and modeling['education_only']['auc'] > 0.55:
            evidence_summary['independent_prediction_evidence'] = True

        # 4. Stratified effects
        stratified = self.results.get('stratified') or {}
        significant_strata = sum(1 for r in stratified.values()
                                 if r.get('protection_significant', False))
        if significant_strata >= 2:
            evidence_summary['stratified_evidence'] = True

        # 5. Dose-response
        dose_data = self.results.get('dose_response') or {}
        if 'correlation' in dose_data:
            corr_data = dose_data['correlation']
            if corr_data['p_value'] < 0.05 and corr_data['correlation'] < -0.05:
                evidence_summary['dose_response_evidence'] = True

        # Count evidence types
        evidence_count = sum(evidence_summary.values())
        total_possible = len(evidence_summary)

        print("EVIDENCE SUMMARY:")
        print("-" * 20)
        for evidence_type, found in evidence_summary.items():
            status = "✓ CONFIRMED" if found else "○ Limited"
            print(f"{evidence_type.replace('_', ' ').title():30s}: {status}")

        print(f"\nOVERALL EVIDENCE STRENGTH: {evidence_count}/{total_possible} ({evidence_count/total_possible:.1%})")

        # Final determination
        if evidence_count >= 4:
            h2_status = "FULLY ACHIEVED"
            explanation = "Multiple independent lines of evidence confirm educational protection"
        elif evidence_count >= 3:
            h2_status = "SUBSTANTIALLY ACHIEVED"
            explanation = "Strong evidence for educational protection with minor limitations"
        elif evidence_count >= 2:
            h2_status = "MODERATELY ACHIEVED"
            explanation = "Some evidence for educational protection, but effects may be mediated"
        elif evidence_count >= 1:
            h2_status = "PARTIALLY ACHIEVED"
            explanation = "Limited evidence for independent educational protection"
        else:
            # No supporting evidence at all must not be reported as partial support
            h2_status = "NOT ACHIEVED"
            explanation = "No line of evidence supports independent educational protection"

        print(f"\nH2 HYPOTHESIS STATUS: {h2_status}")
        print(f"Explanation: {explanation}")

        # Specific recommendations
        if evidence_count >= 3:
            print("\nCONCLUSION:")
            print("Educational protection hypothesis is supported by multiple analytical approaches.")
            print("Education shows protective effects beyond socioeconomic mediation pathways.")

        return h2_status, evidence_summary

    def run_complete_h2_analysis(self):
        """Run loading, all five analyses and the final assessment in order.

        Returns:
            dict with ``status``, ``evidence_summary`` and ``detailed_results``,
            or ``None`` if any step raised (the traceback is printed).
        """
        try:
            # Load data
            self.load_and_prepare_data()

            # Run all analyses
            self.educational_gradient_analysis()
            self.mediation_analysis()
            self.controlled_modeling_analysis()
            self.stratified_protection_analysis()
            self.education_dose_response()

            # Final assessment
            h2_status, evidence = self.generate_h2_assessment()

            return {
                'status': h2_status,
                'evidence_summary': evidence,
                'detailed_results': self.results
            }

        except Exception as e:
            # Top-level boundary of the pipeline: report and signal failure via None
            print(f"Error in H2 analysis: {str(e)}")
            import traceback
            traceback.print_exc()
            return None

# Main execution
# Default input location; pass a different path to main() to analyse another file.
DEFAULT_DATA_PATH = r"C:\Users\USER\Desktop\MUKABUGINGO_THESIS_CODES\ANALYSIS\rwanda_dhs_processed.csv"


def main(data_path=DEFAULT_DATA_PATH):
    """Run the complete H2 pipeline on ``data_path`` and print a short summary.

    Args:
        data_path: CSV file to analyse; defaults to ``DEFAULT_DATA_PATH``.

    Returns:
        The result dict from ``run_complete_h2_analysis()``, or ``None`` when
        the pipeline failed.
    """
    print("EXECUTING H2: EDUCATIONAL PROTECTION HYPOTHESIS ANALYSIS")
    print("Advanced mediation and stratified analysis to demonstrate independent effects")

    analyzer = EducationalProtectionAnalysis(data_path)
    results = analyzer.run_complete_h2_analysis()

    # The pipeline reports failure by returning None (or nothing usable)
    if not results:
        print("Analysis failed - please check data and parameters")
        return None

    print("\nANALYSIS COMPLETE!")
    print(f"H2 Status: {results['status']}")

    if results['status'] in ['FULLY ACHIEVED', 'SUBSTANTIALLY ACHIEVED']:
        print("Educational protection hypothesis successfully demonstrated!")

    return results

# Execute the pipeline only when this file is run as a script, not on import.
# The return value of main() is bound to the module-level name `results`.
if __name__ == "__main__":
    results = main()

EXECUTING H2: EDUCATIONAL PROTECTION HYPOTHESIS ANALYSIS
Advanced mediation and stratified analysis to demonstrate independent effects
H2: EDUCATIONAL PROTECTION HYPOTHESIS - ADVANCED ANALYSIS
Dataset loaded: 14,634 rows, 66 columns
Educational protection variables created successfully

STEP 1: EDUCATIONAL GRADIENT ANALYSIS
--------------------------------------------------
EARLY SEXUAL DEBUT BY EDUCATION LEVEL:
----------------------------------------
No Education: 0.357 (35.7%) [95% CI: 0.332-0.383] (n=1,352)
Primary     : 0.413 (41.3%) [95% CI: 0.402-0.423] (n=8,500)
Secondary   : 0.621 (62.1%) [95% CI: 0.606-0.636] (n=4,110)
Higher      : 0.254 (25.4%) [95% CI: 0.222-0.287] (n=672)

Educational Protection Gradient: 0.103 (10.3 percentage points)

STEP 2: MEDIATION ANALYSIS - EDUCATION THROUGH WEALTH
------------------------------------------------------------
Mediation analysis sample: 14,634 observations
MEDIATION ANALYSIS RESULTS:
------------------------------
Path a (Education 