In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.inspection import partial_dependence, PartialDependenceDisplay
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from itertools import combinations
import warnings
warnings.filterwarnings('ignore')

class ComplexInteractionsAnalysis:
    """
    H5: Complex Interactions Hypothesis Testing
    Tests whether non-linear tree-based ensemble methods outperform linear models
    by capturing complex feature interactions in early sexual debut prediction
    """
    
    def __init__(self, data_path):
        self.data_path = data_path
        self.data = None
        self.results = {}
        
    def load_and_prepare_data(self):
        """Load data and prepare for interaction analysis"""
        print("=" * 80)
        print("H5: COMPLEX INTERACTIONS HYPOTHESIS - TESTING LINEAR VS NON-LINEAR")
        print("=" * 80)
        
        self.data = pd.read_csv(self.data_path)
        print(f"Dataset loaded: {self.data.shape[0]:,} rows, {self.data.shape[1]} columns")
        
        # Prepare clean feature set (avoid leakage)
        safe_features = [
            # Demographics
            'v012', 'v013', 'v101', 'v102', 'hv009',
            # Education (for interactions)
            'v106', 'v107', 'v149', 'v150',
            # Socioeconomic
            'v190', 'v191', 'hv270', 'hv271', 'v130',
            # Health knowledge
            'v157', 'v158', 'v384a', 'v384b',
            # Assets
            'hv206', 'hv207', 'hv208', 'v714',
            # Engineered features
            'has_education', 'has_secondary_plus', 'total_assets',
            'age_education_interaction', 'age_wealth_interaction', 'education_wealth_interaction'
        ]
        
        # Select available features
        self.features = [f for f in safe_features if f in self.data.columns]
        
        # Prepare analysis dataset
        analysis_data = self.data[self.features + ['early_sexual_debut']].dropna()
        
        self.X = analysis_data[self.features]
        self.y = analysis_data['early_sexual_debut'].astype(int)
        
        print(f"Analysis dataset: {len(self.X):,} observations, {len(self.features)} features")
        print(f"Target distribution: {self.y.value_counts().to_dict()}")
        
        return self.X, self.y
    
    def detect_potential_interactions(self):
        """Rank features by mutual information and list moderately correlated pairs.

        The eight features with the highest mutual information against the
        target are kept. Every pair among them whose absolute Pearson
        correlation lies strictly between 0.1 and 0.8 is recorded as a
        candidate interaction, strongest correlation first.

        Side effects:
            Stores the first ten candidate pairs in
            ``self.results['interactions']`` and the eight selected feature
            names in ``self.results['top_features']``.

        Returns:
            tuple: ``(potential_interactions, top_features)`` - the full list
            of candidate pair dicts (``feature1``, ``feature2``,
            ``correlation``) and the list of selected feature names.
        """
        print("\nSTEP 1: DETECTING POTENTIAL FEATURE INTERACTIONS")
        print("-" * 55)

        # Mutual information of each feature with the target.
        scores = mutual_info_classif(self.X, self.y, random_state=42)
        ranking = pd.DataFrame(
            {'feature': self.features, 'mutual_info': scores}
        ).sort_values('mutual_info', ascending=False)

        print("TOP 10 FEATURES BY MUTUAL INFORMATION:")
        for rank, (_, row) in enumerate(ranking.head(10).iterrows(), start=1):
            print(f"  {rank:2d}. {row['feature']:25s}: {row['mutual_info']:.4f}")

        # The eight most informative features feed the pairwise screen.
        top_features = ranking.head(8)['feature'].tolist()
        abs_corr = self.X[top_features].corr().abs()

        # Keep pairs that are neither (nearly) independent nor (nearly)
        # redundant: 0.1 < |r| < 0.8.
        potential_interactions = [
            {
                'feature1': top_features[a],
                'feature2': top_features[b],
                'correlation': abs_corr.iloc[a, b],
            }
            for a, b in combinations(range(len(top_features)), 2)
            if 0.1 < abs_corr.iloc[a, b] < 0.8
        ]

        # Strongest correlation first.
        potential_interactions.sort(key=lambda pair: pair['correlation'], reverse=True)

        print("\nTOP 5 POTENTIAL INTERACTIONS:")
        for rank, pair in enumerate(potential_interactions[:5], start=1):
            print(f"  {rank}. {pair['feature1']} × {pair['feature2']}: "
                  f"r = {pair['correlation']:.3f}")

        self.results['interactions'] = potential_interactions[:10]
        self.results['top_features'] = top_features

        return potential_interactions, top_features
    
    def create_interaction_features(self, top_features):
        """Return a copy of ``self.X`` extended with explicit interaction columns.

        Two kinds of columns are appended:

        * pairwise products of the first five entries of ``top_features``
          (degree-2, interaction-only polynomial expansion);
        * up to three hand-written terms, each added only when its source
          columns exist: ``age_edu_squared`` = ``(v012 * v106) ** 2``,
          ``wealth_urban_interact`` = ``v190 * (v102 == 1)`` and
          ``edu_knowledge_interact`` = ``v106 * v157``.

        Args:
            top_features: Feature names ordered by relevance; only the first
                five are expanded.

        Side effects:
            Stores the names of all added columns in
            ``self.results['interaction_features']``.

        Returns:
            tuple: ``(X_with_interactions, total_interactions)`` - the widened
            DataFrame and the list of added column names.
        """
        print("\nSTEP 2: CREATING EXPLICIT INTERACTION FEATURES")
        print("-" * 50)

        base = self.X
        X_with_interactions = base.copy()

        # Pairwise products among the five leading features.
        leading = top_features[:5]
        expander = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
        expanded = expander.fit_transform(base[leading])
        expanded_names = expander.get_feature_names_out(leading)

        interaction_names = []
        for col_idx, col_name in enumerate(expanded_names):
            # Product terms are named "a b"; names without a space are the
            # untouched inputs, which X already contains.
            if ' ' not in col_name:
                continue
            interaction_names.append(col_name)
            X_with_interactions[col_name] = expanded[:, col_idx]

        print(f"Created {len(interaction_names)} polynomial interaction features")

        def present(*names):
            """Tell whether every named column exists in the base matrix."""
            return all(name in base.columns for name in names)

        domain_interactions = []

        # Age x education, squared (distinct from the precomputed interaction).
        if present('v012', 'v106'):
            X_with_interactions['age_edu_squared'] = (base['v012'] * base['v106']) ** 2
            domain_interactions.append('age_edu_squared')

        # Wealth counted only where v102 equals 1.
        if present('v190', 'v102'):
            X_with_interactions['wealth_urban_interact'] = base['v190'] * (base['v102'] == 1)
            domain_interactions.append('wealth_urban_interact')

        # Education x knowledge.
        if present('v106', 'v157'):
            X_with_interactions['edu_knowledge_interact'] = base['v106'] * base['v157']
            domain_interactions.append('edu_knowledge_interact')

        print(f"Created {len(domain_interactions)} domain-specific interactions")

        total_interactions = interaction_names + domain_interactions
        print(f"Total interaction features: {len(total_interactions)}")

        self.results['interaction_features'] = total_interactions

        return X_with_interactions, total_interactions
    
    def compare_linear_vs_nonlinear_models(self):
        """Fit three linear and five non-linear classifiers on one split and score them.

        The interaction-augmented matrix is split 75/25 (stratified, seed 42).
        A ``RobustScaler`` is fitted on the training part only. Models are
        fed one of two inputs:

        * original features, unscaled: simple logistic regression, random
          forest, gradient boosting, XGBoost, extra trees;
        * all columns (original + interactions), scaled: regularised logistic
          regression, ridge classifier, MLP.

        Each model is scored by ROC AUC on the held-out part.

        Side effects:
            Stores the result mapping in ``self.results['model_comparison']``.

        Returns:
            dict: model key -> ``{'model', 'auc', 'type', 'complexity',
            'features_used'}``.
        """
        print("\nSTEP 3: LINEAR VS NON-LINEAR MODEL COMPARISON")
        print("-" * 55)

        # Original columns plus explicit interaction columns.
        X_interactions, _ = self.create_interaction_features(
            self.results['top_features']
        )

        X_train, X_test, y_train, y_test = train_test_split(
            X_interactions, self.y, test_size=0.25, random_state=42, stratify=self.y
        )

        # Scaling statistics come from the training portion only.
        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Views restricted to the original (non-interaction) features.
        base_train = X_train[self.features]
        base_test = X_test[self.features]
        n_base = len(self.features)
        n_augmented = X_train.shape[1]

        models_results = {}

        def record(key, estimator, scores, family, complexity, n_features):
            """Score held-out predictions and file the fitted model under *key*."""
            models_results[key] = {
                'model': estimator,
                'auc': roc_auc_score(y_test, scores),
                'type': family,
                'complexity': complexity,
                'features_used': n_features,
            }

        print("TRAINING AND EVALUATING MODELS:")
        print("-" * 40)

        # ----- Linear family -----

        print("1. Simple Logistic Regression (original features only)...")
        lr_simple = LogisticRegression(random_state=42, max_iter=2000, class_weight='balanced')
        lr_simple.fit(base_train, y_train)
        record('Linear_Simple', lr_simple,
               lr_simple.predict_proba(base_test)[:, 1],
               'Linear', 'Low', n_base)

        print("2. Logistic Regression with interaction features...")
        # C=0.1 applies stronger regularisation to cope with the extra columns.
        lr_interactions = LogisticRegression(random_state=42, max_iter=2000,
                                             class_weight='balanced', C=0.1)
        lr_interactions.fit(X_train_scaled, y_train)
        record('Linear_Interactions', lr_interactions,
               lr_interactions.predict_proba(X_test_scaled)[:, 1],
               'Linear', 'Medium', n_augmented)

        print("3. Ridge Classifier (regularized linear)...")
        ridge = RidgeClassifier(alpha=1.0, class_weight='balanced', random_state=42)
        ridge.fit(X_train_scaled, y_train)
        # The decision margin is used as the ranking score for AUC.
        record('Ridge_Linear', ridge,
               ridge.decision_function(X_test_scaled),
               'Linear', 'Medium', n_augmented)

        # ----- Non-linear family -----

        forest_like_params = dict(
            n_estimators=300,
            max_depth=15,
            min_samples_split=10,
            min_samples_leaf=5,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1,
        )

        print("4. Random Forest (tree-based ensemble)...")
        rf = RandomForestClassifier(**forest_like_params)
        rf.fit(base_train, y_train)
        record('Random_Forest', rf,
               rf.predict_proba(base_test)[:, 1],
               'Non-Linear', 'High', n_base)

        print("5. Gradient Boosting (tree-based ensemble)...")
        gb = GradientBoostingClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=6,
            min_samples_split=20,
            min_samples_leaf=10,
            subsample=0.8,
            random_state=42,
        )
        gb.fit(base_train, y_train)
        record('Gradient_Boosting', gb,
               gb.predict_proba(base_test)[:, 1],
               'Non-Linear', 'High', n_base)

        print("6. XGBoost (advanced tree-based ensemble)...")
        xgb_model = xgb.XGBClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=6,
            min_child_weight=5,
            subsample=0.8,
            colsample_bytree=0.8,
            scale_pos_weight=1.2,
            random_state=42,
            eval_metric='logloss',
        )
        xgb_model.fit(base_train, y_train)
        record('XGBoost', xgb_model,
               xgb_model.predict_proba(base_test)[:, 1],
               'Non-Linear', 'High', n_base)

        print("7. Extra Trees (randomized tree ensemble)...")
        et = ExtraTreesClassifier(**forest_like_params)
        et.fit(base_train, y_train)
        record('Extra_Trees', et,
               et.predict_proba(base_test)[:, 1],
               'Non-Linear', 'High', n_base)

        print("8. Neural Network (multi-layer perceptron)...")
        mlp = MLPClassifier(
            hidden_layer_sizes=(100, 50),
            activation='relu',
            solver='adam',
            alpha=0.01,
            learning_rate='adaptive',
            max_iter=500,
            random_state=42,
        )
        mlp.fit(X_train_scaled, y_train)
        record('Neural_Network', mlp,
               mlp.predict_proba(X_test_scaled)[:, 1],
               'Non-Linear', 'High', n_augmented)

        self.results['model_comparison'] = models_results

        return models_results
    
    def analyze_interaction_importance(self):
        """Analyze feature interactions in tree-based models"""
        print(f"\nSTEP 4: ANALYZING FEATURE INTERACTIONS IN TREE MODELS")
        print("-" * 60)
        
        # Get best tree-based model
        model_results = self.results['model_comparison']
        tree_models = {k: v for k, v in model_results.items() 
                      if v['type'] == 'Non-Linear' and 'Forest' in k or 'Boosting' in k or 'XGBoost' in k}
        
        if not tree_models:
            print("No tree-based models available for interaction analysis")
            return None
        
        best_tree_model_name = max(tree_models.keys(), key=lambda x: tree_models[x]['auc'])
        best_model = tree_models[best_tree_model_name]['model']
        
        print(f"Analyzing interactions in best tree model: {best_tree_model_name}")
        print(f"Model AUC: {tree_models[best_tree_model_name]['auc']:.4f}")
        
        # Feature importance analysis
        if hasattr(best_model, 'feature_importances_'):
            feature_importance = pd.DataFrame({
                'feature': self.features,
                'importance': best_model.feature_importances_
            }).sort_values('importance', ascending=False)
            
            print(f"\nTOP 10 FEATURE IMPORTANCES IN TREE MODEL:")
            for i, (_, row) in enumerate(feature_importance.head(10).iterrows()):
                print(f"  {i+1:2d}. {row['feature']:25s}: {row['importance']:.4f}")
            
            self.results['tree_importance'] = feature_importance
        
        # Analyze two-way interactions using partial dependence
        top_features = self.results['top_features'][:4]  # Limit for computational efficiency
        
        interaction_effects = []
        
        print(f"\nANALYZING INTERACTION EFFECTS:")
        for i, feat1 in enumerate(top_features):
            for feat2 in top_features[i+1:]:
                try:
                    # Skip if features are too correlated
                    if feat1 in self.X.columns and feat2 in self.X.columns:
                        corr = np.corrcoef(self.X[feat1].fillna(0), self.X[feat2].fillna(0))[0, 1]
                        if abs(corr) > 0.8:
                            continue
                        
                        interaction_effects.append({
                            'feature1': feat1,
                            'feature2': feat2,
                            'correlation': corr,
                            'interaction_strength': abs(corr) * 0.5  # Proxy measure
                        })
                
                except Exception as e:
                    continue
        
        # Sort by interaction strength
        interaction_effects = sorted(interaction_effects, 
                                   key=lambda x: x['interaction_strength'], reverse=True)
        
        print(f"TOP 5 POTENTIAL INTERACTIONS DETECTED:")
        for i, interaction in enumerate(interaction_effects[:5]):
            print(f"  {i+1}. {interaction['feature1']} × {interaction['feature2']}: "
                  f"Strength = {interaction['interaction_strength']:.3f}")
        
        self.results['detected_interactions'] = interaction_effects
        
        return interaction_effects
    
    def cross_validation_comparison(self):
        """Score one linear and two tree-based models with 5-fold stratified CV.

        Logistic regression, random forest and XGBoost are each evaluated on
        the original features with shuffled stratified folds (seed 42) and
        ROC AUC as the metric.

        Side effects:
            Stores the result mapping in ``self.results['cross_validation']``.

        Returns:
            dict: model name -> ``{'mean_auc', 'std_auc', 'scores'}``.
        """
        print("\nSTEP 5: CROSS-VALIDATION STABILITY COMPARISON")
        print("-" * 55)

        n_folds = 5

        # One representative per family keeps the run time manageable.
        candidates = {
            'Logistic_Regression': LogisticRegression(random_state=42, class_weight='balanced', max_iter=2000),
            'Random_Forest': RandomForestClassifier(n_estimators=200, class_weight='balanced',
                                                    random_state=42, n_jobs=-1),
            'XGBoost': xgb.XGBClassifier(n_estimators=200, random_state=42, eval_metric='logloss'),
        }

        print(f"CROSS-VALIDATION RESULTS ({n_folds}-fold):")
        print("-" * 40)

        # Every model sees the same original feature columns.
        base_matrix = self.X[self.features]

        cv_results = {}
        for label, estimator in candidates.items():
            # Same seed for each model, so all of them see identical folds.
            splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
            fold_aucs = cross_val_score(
                estimator, base_matrix, self.y,
                cv=splitter,
                scoring='roc_auc',
                n_jobs=-1,
            )
            mean_auc = fold_aucs.mean()
            std_auc = fold_aucs.std()

            cv_results[label] = {
                'mean_auc': mean_auc,
                'std_auc': std_auc,
                'scores': fold_aucs,
            }

            print(f"{label:18s}: {mean_auc:.4f} ± {std_auc:.4f}")

        self.results['cross_validation'] = cv_results

        return cv_results
    
    def generate_h5_assessment(self):
        """Generate final assessment of H5: Complex Interactions"""
        print(f"\n" + "=" * 80)
        print("H5: COMPLEX INTERACTIONS HYPOTHESIS - FINAL ASSESSMENT")
        print("=" * 80)
        
        model_results = self.results['model_comparison']
        
        # Separate linear and non-linear models
        linear_models = {k: v for k, v in model_results.items() if v['type'] == 'Linear'}
        nonlinear_models = {k: v for k, v in model_results.items() if v['type'] == 'Non-Linear'}
        
        # Best performance in each category
        best_linear = max(linear_models.values(), key=lambda x: x['auc'])
        best_nonlinear = max(nonlinear_models.values(), key=lambda x: x['auc'])
        
        best_linear_name = [k for k, v in linear_models.items() if v['auc'] == best_linear['auc']][0]
        best_nonlinear_name = [k for k, v in nonlinear_models.items() if v['auc'] == best_nonlinear['auc']][0]
        
        print("MODEL PERFORMANCE SUMMARY:")
        print("-" * 30)
        print(f"Best Linear Model:     {best_linear_name:20s} AUC = {best_linear['auc']:.4f}")
        print(f"Best Non-Linear Model: {best_nonlinear_name:20s} AUC = {best_nonlinear['auc']:.4f}")
        
        # Calculate performance differences
        auc_difference = best_nonlinear['auc'] - best_linear['auc']
        relative_improvement = (auc_difference / best_linear['auc']) * 100
        
        print(f"\nPERFORMANCE COMPARISON:")
        print("-" * 25)
        print(f"AUC Difference:        {auc_difference:+.4f}")
        print(f"Relative Improvement:  {relative_improvement:+.2f}%")
        
        # Evidence criteria for H5
        evidence_criteria = {
            'nonlinear_superiority': auc_difference > 0.02,  # 2% improvement
            'substantial_improvement': relative_improvement > 3.0,  # 3% relative improvement
            'tree_ensemble_best': best_nonlinear_name in ['Random_Forest', 'XGBoost', 'Gradient_Boosting', 'Extra_Trees'],
            'interaction_detection': len(self.results.get('detected_interactions', [])) > 0,
            'cross_validation_stable': True  # Will check CV results
        }
        
        # Check cross-validation stability
        if 'cross_validation' in self.results:
            cv_results = self.results['cross_validation']
            nonlinear_cv = cv_results.get('XGBoost', cv_results.get('Random_Forest', {}))
            linear_cv = cv_results.get('Logistic_Regression', {})
            
            if nonlinear_cv and linear_cv:
                cv_difference = nonlinear_cv['mean_auc'] - linear_cv['mean_auc']
                evidence_criteria['cross_validation_stable'] = cv_difference > 0.01
        
        print(f"\nEVIDENCE ASSESSMENT:")
        print("-" * 20)
        evidence_count = 0
        for criterion, met in evidence_criteria.items():
            status = "✓ CONFIRMED" if met else "○ Not Met"
            print(f"{criterion.replace('_', ' ').title():25s}: {status}")
            if met:
                evidence_count += 1
        
        total_criteria = len(evidence_criteria)
        evidence_strength = evidence_count / total_criteria
        
        print(f"\nOVERALL EVIDENCE STRENGTH: {evidence_count}/{total_criteria} ({evidence_strength:.1%})")
        
        # Final determination
        if evidence_strength >= 0.8:  # 4/5 criteria
            h5_status = "FULLY ACHIEVED"
            explanation = "Strong evidence that non-linear models capture complex interactions better than linear models"
        elif evidence_strength >= 0.6:  # 3/5 criteria
            h5_status = "SUBSTANTIALLY ACHIEVED"
            explanation = "Good evidence for complex interactions with some limitations"
        elif evidence_strength >= 0.4:  # 2/5 criteria
            h5_status = "MODERATELY ACHIEVED"
            explanation = "Some evidence for complex interactions but not decisive"
        else:
            h5_status = "NOT ACHIEVED"
            explanation = "Limited evidence that non-linear models significantly outperform linear models"
        
        print(f"\nH5 HYPOTHESIS STATUS: {h5_status}")
        print(f"Explanation: {explanation}")
        
        # Detailed model rankings
        print(f"\nDETAILED MODEL RANKINGS:")
        print("-" * 30)
        all_models = sorted(model_results.items(), key=lambda x: x[1]['auc'], reverse=True)
        
        for i, (model_name, results) in enumerate(all_models):
            model_type = results['type']
            auc_score = results['auc']
            print(f"{i+1:2d}. {model_name:20s} ({model_type:10s}): {auc_score:.4f}")
        
        # Tree ensemble performance
        tree_models = [name for name, result in model_results.items() 
                      if 'Forest' in name or 'Boosting' in name or 'XGBoost' in name or 'Trees' in name]
        
        if tree_models:
            tree_aucs = [model_results[name]['auc'] for name in tree_models]
            avg_tree_auc = np.mean(tree_aucs)
            
            linear_models_list = [name for name, result in model_results.items() if result['type'] == 'Linear']
            linear_aucs = [model_results[name]['auc'] for name in linear_models_list]
            avg_linear_auc = np.mean(linear_aucs)
            
            print(f"\nENSEMBLE ANALYSIS:")
            print("-" * 20)
            print(f"Average Tree Ensemble AUC: {avg_tree_auc:.4f}")
            print(f"Average Linear Model AUC:   {avg_linear_auc:.4f}")
            print(f"Tree Ensemble Advantage:    {avg_tree_auc - avg_linear_auc:+.4f}")
        
        return {
            'status': h5_status,
            'evidence_strength': evidence_strength,
            'auc_difference': auc_difference,
            'best_linear': (best_linear_name, best_linear['auc']),
            'best_nonlinear': (best_nonlinear_name, best_nonlinear['auc']),
            'evidence_criteria': evidence_criteria
        }
    
    def run_complete_h5_analysis(self):
        """Execute complete H5 analysis pipeline"""
        try:
            # Load and prepare data
            self.load_and_prepare_data()
            
            # Detect potential interactions
            self.detect_potential_interactions()
            
            # Compare linear vs non-linear models
            self.compare_linear_vs_nonlinear_models()
            
            # Analyze interactions in tree models
            self.analyze_interaction_importance()
            
            # Cross-validation comparison
            self.cross_validation_comparison()
            
            # Final assessment
            assessment = self.generate_h5_assessment()
            
            return {
                'assessment': assessment,
                'detailed_results': self.results
            }
            
        except Exception as e:
            print(f"Error in H5 analysis: {str(e)}")
            import traceback
            traceback.print_exc()
            return None

# Main execution
def main(data_path=None):
    """Run the complete H5 analysis and print a short closing summary.

    Args:
        data_path: CSV file to analyse. When omitted, the original author's
            local dataset location is used, so calling ``main()`` with no
            argument behaves as before.

    Returns:
        dict | None: The mapping returned by
        ``ComplexInteractionsAnalysis.run_complete_h5_analysis`` or ``None``
        when the analysis failed.
    """
    if data_path is None:
        # Machine-specific fallback kept for backward compatibility; pass
        # data_path explicitly to run on another machine.
        data_path = r"C:\Users\USER\Desktop\MUKABUGINGO_THESIS_CODES\ANALYSIS\rwanda_dhs_processed.csv"

    print("EXECUTING H5: COMPLEX INTERACTIONS HYPOTHESIS ANALYSIS")
    print("Testing whether tree-based ensemble methods outperform linear models")

    analyzer = ComplexInteractionsAnalysis(data_path)
    results = analyzer.run_complete_h5_analysis()

    # A falsy result means the pipeline reported an error and gave up.
    if not results:
        print("Analysis failed - please check data and parameters")
        return None

    status = results['assessment']['status']
    print(f"\nANALYSIS COMPLETE!")
    print(f"H5 Status: {status}")

    # Only the two strongest verdicts count as support for the hypothesis.
    if status in ('FULLY ACHIEVED', 'SUBSTANTIALLY ACHIEVED'):
        print("Complex interactions hypothesis successfully demonstrated!")
        print("Tree-based ensemble methods show superior performance for capturing")
        print("non-linear relationships and feature interactions.")

    return results

# Run the full H5 pipeline only when this code is executed directly, i.e. when
# __name__ is "__main__"; importing it as a module does not start the analysis.
# The outcome (a dict, or None on failure) is kept in the module-level name
# `results` for later inspection.
if __name__ == "__main__":
    results = main()

EXECUTING H5: COMPLEX INTERACTIONS HYPOTHESIS ANALYSIS
Testing whether tree-based ensemble methods outperform linear models
H5: COMPLEX INTERACTIONS HYPOTHESIS - TESTING LINEAR VS NON-LINEAR
Dataset loaded: 14,634 rows, 66 columns
Analysis dataset: 14,634 observations, 28 features
Target distribution: {0: 7919, 1: 6715}

STEP 1: DETECTING POTENTIAL FEATURE INTERACTIONS
-------------------------------------------------------
TOP 10 FEATURES BY MUTUAL INFORMATION:
   1. v012                     : 0.2297
   2. v013                     : 0.2154
   3. v150                     : 0.1673
   4. age_education_interaction: 0.1474
   5. age_wealth_interaction   : 0.0367
   6. v714                     : 0.0322
   7. v149                     : 0.0300
   8. v106                     : 0.0225
   9. hv009                    : 0.0154
  10. v107                     : 0.0154

TOP 5 POTENTIAL INTERACTIONS:
  1. v012 × age_education_interaction: r = 0.692
  2. v013 × age_education_interaction: r = 0.682
  3.