In [24]:
# =============================================================================
# REVISED CODE: DYNAMIC PHASE 3 BASED ON EXISTING EXPERIMENT 1 & 2 RESULTS
# =============================================================================

import json
import numpy as np
from scipy import stats as scipy_stats

class DynamicPhase3ConfigGenerator:
    """Generate Phase 3 configuration based on existing Phase 1 & 2 results.

    Each phase's results are expected to be a dict mapping a configuration
    name to a dict with a 'config' entry (holding 'edge_types' or
    'feature_types') and a 'statistics' entry mapping metric names to summary
    statistics ('mean', and optionally 'std' and 'n').
    """

    def __init__(self, results_file_path=None, phase1_results=None, phase2_results=None):
        """
        Initialize with either:
        1. Path to JSON results file from previous experiments, OR
        2. Direct results dictionaries from Phase 1 & 2

        Args:
            results_file_path: Path to a JSON file with the combined results;
                when given, it takes precedence over the two dict arguments.
            phase1_results: Phase 1 results dict (used only without a file path).
            phase2_results: Phase 2 results dict (used only without a file path).
        """
        if results_file_path:
            # JSON is UTF-8 by specification; don't depend on the platform default.
            with open(results_file_path, 'r', encoding='utf-8') as f:
                all_results = json.load(f)
            self.phase1_results, self.phase2_results = self._extract_phases(all_results)
        else:
            self.phase1_results = phase1_results
            self.phase2_results = phase2_results

    def _extract_phases(self, all_results):
        """Extract Phase 1 and Phase 2 results from combined results.

        Entries are assigned to a phase purely by their configuration name;
        names outside the two fixed lists below are ignored.

        Returns:
            Tuple ``(phase1_results, phase2_results)`` of dicts.
        """
        phase1_configs = ['baseline', 'with_artists', 'with_users', 'full_graph']
        phase2_configs = ['features_basic', 'features_audio']

        phase1_results = {k: v for k, v in all_results.items() if k in phase1_configs}
        phase2_results = {k: v for k, v in all_results.items() if k in phase2_configs}

        return phase1_results, phase2_results

    def determine_best_edge_types(self, metric='ndcg@10', significance_threshold=0.05, min_effect_size=0.2):
        """
        Determine best edge types from Phase 1 results

        The configuration with the highest mean is selected. If a 'baseline'
        configuration exists and is not itself the winner, the winner is kept
        only when it beats the baseline both statistically (two-sample t-test
        estimated from summary statistics) and practically (Cohen's d);
        otherwise the baseline's edge types are returned.

        Args:
            metric: Performance metric to optimize ('ndcg@10', 'precision@10', etc.)
            significance_threshold: p-value threshold for statistical significance
            min_effect_size: Minimum Cohen's d for practical significance

        Returns:
            Tuple ``(edge_types, analysis)`` where ``analysis`` is a dict
            describing the selected configuration and why it was chosen.

        Raises:
            ValueError: If no Phase 1 configuration provides statistics for
                ``metric``.
        """
        print(f"🔍 Analyzing Phase 1 results to determine best edge types...")
        print(f"   📊 Metric: {metric}")
        print(f"   📈 Significance threshold: {significance_threshold}")
        print(f"   📏 Minimum effect size: {min_effect_size}")

        # Extract performance scores for each configuration
        config_scores = {}
        config_details = {}

        # A generator built without Phase 1 results holds None; treat as empty.
        for config_name, results in (self.phase1_results or {}).items():
            if 'statistics' in results and metric in results['statistics']:
                metric_stats = results['statistics'][metric]
                mean_score = metric_stats['mean']
                std_score = metric_stats.get('std', 0)
                config_scores[config_name] = mean_score
                config_details[config_name] = {
                    'edge_types': results['config']['edge_types'],
                    'mean': mean_score,
                    'std': std_score,
                    # Number of runs falls back to 5 when it was not recorded.
                    'n': metric_stats.get('n', 5)
                }
                print(f"      {config_name}: {mean_score:.4f} ± {std_score:.4f}")

        if not config_scores:
            # max() on an empty dict would raise an opaque ValueError; keep the
            # exception type but say what is actually wrong.
            raise ValueError(
                f"No Phase 1 results contain statistics for metric '{metric}'; "
                "cannot determine best edge types"
            )

        # Find the configuration with highest performance
        best_config_name = max(config_scores, key=config_scores.get)
        best_edge_types = config_details[best_config_name]['edge_types']

        print(f"\n   🏆 Highest performing: {best_config_name} ({config_scores[best_config_name]:.4f})")
        print(f"   🔗 Edge types: {best_edge_types}")

        # Validate statistical significance vs baseline
        if 'baseline' in config_details and best_config_name != 'baseline':
            # Simulate t-test (since we only have summary statistics)
            baseline_mean = config_details['baseline']['mean']
            baseline_std = config_details['baseline']['std']
            best_mean = config_details[best_config_name]['mean']
            best_std = config_details[best_config_name]['std']
            # The best configuration's run count is used for both groups.
            n = config_details[best_config_name]['n']

            # Calculate effect size (Cohen's d)
            pooled_std = np.sqrt((baseline_std**2 + best_std**2) / 2)
            cohens_d = (best_mean - baseline_mean) / pooled_std if pooled_std > 0 else 0

            # Estimate p-value using effect size and sample size
            df = 2 * n - 2
            se_diff = pooled_std * np.sqrt(2 / n) if n > 0 else 0
            if se_diff > 0 and df > 0:
                t_stat = (best_mean - baseline_mean) / se_diff
                # Survival function keeps precision for very small p-values,
                # where 1 - cdf would round to zero.
                p_value = 2 * scipy_stats.t.sf(abs(t_stat), df=df)
            else:
                # No variance information or fewer than two runs per group:
                # the test is undefined, so report "not significant".
                p_value = 1.0

            improvement_pct = ((best_mean - baseline_mean) / baseline_mean * 100) if baseline_mean > 0 else 0

            print(f"\n   📊 Statistical Analysis vs Baseline:")
            print(f"      Improvement: {improvement_pct:+.1f}%")
            print(f"      Cohen's d: {cohens_d:.3f}")
            print(f"      Estimated p-value: {p_value:.4f}")

            # Check if improvement is significant and meaningful
            if p_value < significance_threshold and cohens_d >= min_effect_size:
                print(f"   ✅ {best_config_name} is statistically and practically significant!")
                return best_edge_types, {
                    'selected_config': best_config_name,
                    'performance': best_mean,
                    'improvement_vs_baseline': improvement_pct,
                    'statistical_significance': True,
                    'p_value': p_value,
                    'effect_size': cohens_d
                }
            else:
                print(f"   ⚠️  Improvement not significant enough, falling back to baseline")
                return config_details['baseline']['edge_types'], {
                    'selected_config': 'baseline',
                    'performance': baseline_mean,
                    'fallback_reason': f"p={p_value:.4f} > {significance_threshold} or d={cohens_d:.3f} < {min_effect_size}"
                }
        else:
            return best_edge_types, {
                'selected_config': best_config_name,
                'performance': config_scores[best_config_name],
                'note': 'No baseline comparison available'
            }

    def determine_best_features(self, metric='ndcg@10', min_improvement_threshold=0.05):
        """
        Determine best feature types from Phase 2 results

        The feature configuration with the highest mean is selected. When a
        non-zero Phase 1 'baseline' mean is available, features are used only
        if the relative improvement over it reaches the threshold.

        Args:
            metric: Performance metric to optimize
            min_improvement_threshold: Minimum improvement to justify using features (5% default)

        Returns:
            Tuple ``(feature_types, analysis)``; ``feature_types`` is an empty
            list when no features should be used.
        """
        print(f"\n🔍 Analyzing Phase 2 results to determine best features...")
        print(f"   📊 Metric: {metric}")
        print(f"   📈 Minimum improvement threshold: {min_improvement_threshold*100}%")

        # Extract feature performance
        feature_scores = {}
        feature_details = {}

        # A generator built without Phase 2 results holds None; treat as empty.
        for config_name, results in (self.phase2_results or {}).items():
            if 'statistics' in results and metric in results['statistics']:
                metric_stats = results['statistics'][metric]
                mean_score = metric_stats['mean']
                std_score = metric_stats.get('std', 0)
                feature_scores[config_name] = mean_score
                feature_details[config_name] = {
                    'feature_types': results['config']['feature_types'],
                    'mean': mean_score,
                    'std': std_score
                }
                print(f"      {config_name}: {mean_score:.4f} ± {std_score:.4f}")

        if not feature_scores:
            print("   ❌ No Phase 2 results found!")
            return [], {'selected_config': 'no_features', 'reason': 'No Phase 2 results available'}

        # Find best feature configuration
        best_feature_config = max(feature_scores, key=feature_scores.get)
        best_feature_types = feature_details[best_feature_config]['feature_types']
        best_score = feature_scores[best_feature_config]

        print(f"\n   🏆 Highest performing: {best_feature_config} ({best_score:.4f})")
        print(f"   🎯 Feature types: {best_feature_types}")

        # Check if features provide meaningful improvement
        # (Compare against baseline performance from Phase 1 if available)
        baseline_performance = None
        if self.phase1_results and 'baseline' in self.phase1_results:
            baseline_stats = self.phase1_results['baseline'].get('statistics', {})
            baseline_performance = baseline_stats.get(metric, {}).get('mean', 0)

        # A missing or zero baseline cannot serve as a denominator, so both are
        # handled as "no comparison available".
        if baseline_performance:
            improvement = (best_score - baseline_performance) / baseline_performance
            print(f"   📈 Improvement vs structure-only baseline: {improvement*100:+.1f}%")

            if improvement >= min_improvement_threshold:
                print(f"   ✅ Features provide meaningful improvement!")
                return best_feature_types, {
                    'selected_config': best_feature_config,
                    'performance': best_score,
                    'improvement_vs_baseline': improvement*100,
                    'use_features': True
                }
            else:
                print(f"   ⚠️  Feature improvement below threshold, not using features")
                return [], {
                    'selected_config': 'no_features',
                    'reason': f'Improvement {improvement*100:.1f}% < {min_improvement_threshold*100}% threshold'
                }
        else:
            # If no baseline comparison, use features if they exist
            return best_feature_types, {
                'selected_config': best_feature_config,
                'performance': best_score,
                'note': 'No baseline comparison available, using best feature config'
            }

    def generate_dynamic_phase3_config(self, metric='ndcg@10'):
        """Generate the optimal Phase 3 configuration based on empirical results

        Args:
            metric: Performance metric used for both edge and feature selection.

        Returns:
            Tuple ``(dynamic_config, analysis_summary)``: the configuration
            dict for the Phase 3 run and a dict recording how it was chosen.

        Raises:
            ValueError: Propagated from ``determine_best_edge_types`` when no
                Phase 1 statistics exist for ``metric``.
        """
        print("="*80)
        print("🔬 GENERATING DYNAMIC PHASE 3 CONFIGURATION")
        print("="*80)

        # Determine best edge types from Phase 1
        best_edge_types, edge_analysis = self.determine_best_edge_types(metric=metric)

        # Determine best feature types from Phase 2
        best_feature_types, feature_analysis = self.determine_best_features(metric=metric)

        use_features = len(best_feature_types) > 0

        # Create the dynamic configuration
        dynamic_config = {
            "name": "Dynamic Best Combined",
            "description": f"Empirically determined best configuration based on {metric}",
            "edge_types": best_edge_types,
            "use_features": use_features,
            "feature_types": best_feature_types
        }

        # Create analysis summary
        analysis_summary = {
            'optimization_metric': metric,
            'edge_selection': edge_analysis,
            'feature_selection': feature_analysis,
            'final_config': dynamic_config
        }

        print(f"\n🎯 FINAL DYNAMIC CONFIGURATION:")
        print(f"   📊 Optimization metric: {metric}")
        print(f"   🔗 Selected edge types: {best_edge_types}")
        print(f"   🎯 Selected feature types: {best_feature_types}")
        print(f"   🔄 Use features: {use_features}")

        return dynamic_config, analysis_summary

# =============================================================================
# REVISED EXPERIMENT RUNNER FOR PHASE 3 ONLY
# =============================================================================

class Phase3OnlyExperimentRunner:
    """Run only Phase 3 with dynamically determined configuration.

    The configuration is derived from stored Phase 1 & 2 results via
    ``DynamicPhase3ConfigGenerator`` and then trained and evaluated once per
    seed in ``config.random_seeds``.
    """

    def __init__(self, config, data, existing_results_path=None):
        """
        Args:
            config: Experiment configuration; ``random_seeds`` is read from it
                here, and it is also handed to the trainer.
            data: Dataset object passed through to the trainer.
            existing_results_path: Path to the JSON file with Phase 1 & 2
                results; required for ``run_dynamic_phase3_experiment``.
        """
        self.config = config
        self.data = data
        self.trainer = ImprovedExperimentTrainer(config, data)
        self.existing_results_path = existing_results_path

    def run_dynamic_phase3_experiment(self, optimization_metric='ndcg@10'):
        """Run Phase 3 experiment with dynamically determined configuration

        Args:
            optimization_metric: Metric used to select the configuration.

        Returns:
            Dict with 'results', 'analysis_summary' and 'optimization_metric',
            or ``None`` when no results path was given or every seed failed.
        """

        print("🚀 STARTING DYNAMIC PHASE 3 EXPERIMENT")
        print("="*60)

        # 1. Generate dynamic configuration based on existing results
        if not self.existing_results_path:
            print("❌ Error: No existing results file provided!")
            return None

        config_generator = DynamicPhase3ConfigGenerator(
            results_file_path=self.existing_results_path
        )

        # 2. Determine optimal configuration
        dynamic_config, analysis_summary = config_generator.generate_dynamic_phase3_config(
            metric=optimization_metric
        )

        # 3. Run the experiment with the dynamic configuration
        print(f"\n🧪 Running experiment with dynamic configuration...")

        phase3_results = []
        seeds = self.config.random_seeds

        # Run with multiple seeds for statistical robustness
        for seed_idx, seed in enumerate(seeds):
            print(f"\n🎲 Seed {seed_idx+1}/{len(seeds)} (seed={seed}):")

            # Train model
            training_result = self.trainer.train_model(dynamic_config, seed=seed)

            if training_result is None:
                # A failed seed is skipped rather than aborting the whole run.
                print(f"   ❌ Training failed for seed {seed}")
                continue

            # Evaluate on test set
            test_metrics = self.trainer.evaluate_model(
                training_result['model'],
                training_result['adj_matrix'],
                'test',
                seed=seed
            )

            # Store results (only scalars are kept; the model itself is dropped)
            phase3_results.append({
                'seed': seed,
                'metrics': test_metrics,
                'training_time': training_result['training_time'],
                'final_loss': training_result['final_loss'],
                'best_val_loss': training_result['best_val_loss']
            })

            # Print immediate results
            print(f"      📊 NDCG@10: {test_metrics.get('ndcg@10', 0):.4f}")
            print(f"      📊 AUC: {test_metrics.get('auc', 0):.4f}")
            print(f"      ⏱️ Time: {training_result['training_time']:.1f}s")

            # Memory cleanup before the next seed allocates a new model
            del training_result
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        # 4. Calculate statistics for Phase 3 results
        if not phase3_results:
            print("❌ No successful runs in Phase 3!")
            return None

        final_results = {
            'dynamic_best_combined': {
                'config': dynamic_config,
                'runs': phase3_results,
                'statistics': self._calculate_statistics(phase3_results)
            }
        }

        # 5. Print summary
        self._print_phase3_summary(final_results, analysis_summary)

        return {
            'results': final_results,
            'analysis_summary': analysis_summary,
            'optimization_metric': optimization_metric
        }

    def _calculate_statistics(self, phase3_results):
        """Calculate statistics for Phase 3 results

        Args:
            phase3_results: List of per-seed run dicts, each with a 'metrics'
                dict mapping metric name to a numeric value.

        Returns:
            Dict mapping each metric name to 'mean', 'std' (sample standard
            deviation), 'min', 'max', 'ci_lower', 'ci_upper' (95% t-interval)
            and 'n'. With a single run, 'std' is 0.0; with fewer than three
            runs the interval collapses to the mean.
        """
        statistics = {}

        # Get all metric names
        all_metrics = set()
        for run in phase3_results:
            all_metrics.update(run['metrics'].keys())

        # Calculate statistics for each metric
        for metric in all_metrics:
            # A run that lacks this metric contributes 0 to it.
            values = [run['metrics'].get(metric, 0) for run in phase3_results]
            n = len(values)
            mean_val = np.mean(values)

            # The sample standard deviation is undefined for a single run;
            # report 0.0 instead of dropping the metric, so one surviving seed
            # still shows its scores in the summary.
            std_val = np.std(values, ddof=1) if n > 1 else 0.0

            # Calculate confidence intervals
            if n > 2:
                t_value = scipy_stats.t.ppf(0.975, n-1)
                margin_error = t_value * std_val / np.sqrt(n)
                ci_lower = mean_val - margin_error
                ci_upper = mean_val + margin_error
            else:
                ci_lower = mean_val
                ci_upper = mean_val

            statistics[metric] = {
                'mean': mean_val,
                'std': std_val,
                'min': np.min(values),
                'max': np.max(values),
                'ci_lower': ci_lower,
                'ci_upper': ci_upper,
                'n': n
            }

        return statistics

    def _print_phase3_summary(self, results, analysis_summary):
        """Print comprehensive Phase 3 summary

        Args:
            results: Dict with a 'dynamic_best_combined' entry holding
                'config' and 'statistics'.
            analysis_summary: Dict with 'optimization_metric',
                'edge_selection' and 'feature_selection'.
        """
        print(f"\n" + "="*80)
        print(f"📊 DYNAMIC PHASE 3 RESULTS SUMMARY")
        print(f"="*80)

        config_result = results['dynamic_best_combined']
        stats = config_result['statistics']
        used_config = config_result['config']
        opt_metric = analysis_summary['optimization_metric']

        # Print configuration details
        print(f"\n🎯 CONFIGURATION USED:")
        print(f"   📊 Optimization metric: {opt_metric}")
        print(f"   🔗 Edge types: {used_config['edge_types']}")
        print(f"   🎯 Feature types: {used_config['feature_types']}")
        print(f"   🔄 Uses features: {used_config['use_features']}")

        # Print performance results
        ndcg_stats = stats.get('ndcg@10', {})
        ndcg_mean = ndcg_stats.get('mean', 0)
        ndcg_std = ndcg_stats.get('std', 0)
        ndcg_ci_lower = ndcg_stats.get('ci_lower', 0)
        ndcg_ci_upper = ndcg_stats.get('ci_upper', 0)

        print(f"\n📈 PERFORMANCE RESULTS:")
        print(f"   NDCG@10: {ndcg_mean:.4f} ± {ndcg_std:.4f}")
        print(f"   95% CI: [{ndcg_ci_lower:.4f}, {ndcg_ci_upper:.4f}]")

        if 'auc' in stats:
            auc_mean = stats['auc']['mean']
            auc_std = stats['auc']['std']
            print(f"   AUC: {auc_mean:.4f} ± {auc_std:.4f}")

        # When the run was optimised for some other metric, show that one too;
        # otherwise the metric that drove the selection never appears.
        if opt_metric not in ('ndcg@10', 'auc') and opt_metric in stats:
            print(f"   {opt_metric}: {stats[opt_metric]['mean']:.4f} ± {stats[opt_metric]['std']:.4f}")

        # Print selection rationale
        print(f"\n🔍 SELECTION RATIONALE:")
        edge_rationale = analysis_summary['edge_selection']
        feature_rationale = analysis_summary['feature_selection']

        print(f"   🔗 Edge Selection: {edge_rationale.get('selected_config', 'unknown')}")
        if 'improvement_vs_baseline' in edge_rationale:
            print(f"      Improvement vs baseline: {edge_rationale['improvement_vs_baseline']:+.1f}%")

        print(f"   🎯 Feature Selection: {feature_rationale.get('selected_config', 'unknown')}")
        if 'improvement_vs_baseline' in feature_rationale:
            print(f"      Improvement vs baseline: {feature_rationale['improvement_vs_baseline']:+.1f}%")

# =============================================================================
# USAGE EXAMPLE
# =============================================================================

def run_dynamic_phase3_only(existing_results_file, target_playlists=1500, optimization_metric='ndcg@10'):
    """
    Execute Phase 3 alone, deriving its configuration from stored Phase 1 & 2 results.

    On success the results are also written to
    ``dynamic_phase3_results_<optimization_metric>.json`` in the working directory.

    Args:
        existing_results_file: Path to JSON file containing Phase 1 & 2 results
        target_playlists: Number of playlists for the experiment
        optimization_metric: Metric to optimize for ('ndcg@10', 'precision@10', etc.)

    Returns:
        The dict produced by the Phase 3 runner, or ``None`` when the run
        produced nothing or any exception occurred (the traceback is printed).
    """

    try:
        # Experiment settings, overridden with the requested playlist count
        experiment_config = FixedExperimentConfig()
        experiment_config.target_playlists = target_playlists

        # Synthetic dataset, generated with a fixed seed
        print(f"🎵 Generating data for {target_playlists} playlists...")
        generator = ImprovedSyntheticMusicDataGenerator(experiment_config)
        dataset = generator.generate_heterogeneous_data(seed=42)

        # Build the runner and execute the dynamic Phase 3 experiment
        phase3_runner = Phase3OnlyExperimentRunner(
            config=experiment_config,
            data=dataset,
            existing_results_path=existing_results_file
        )
        outcome = phase3_runner.run_dynamic_phase3_experiment(
            optimization_metric=optimization_metric
        )

        if not outcome:
            print(f"❌ Dynamic Phase 3 experiment failed!")
            return None

        # Persist the outcome; default=str stringifies anything json cannot encode
        destination = f"dynamic_phase3_results_{optimization_metric}.json"
        payload = {
            'dynamic_phase3': outcome['results'],
            'analysis_summary': outcome['analysis_summary'],
            'optimization_metric': optimization_metric
        }

        with open(destination, 'w') as out_handle:
            json.dump(payload, out_handle, indent=2, default=str)

        print(f"\n💾 Results saved to: {destination}")
        print(f"🎉 DYNAMIC PHASE 3 EXPERIMENT COMPLETED SUCCESSFULLY!")

        return outcome

    except Exception as e:
        # Top-level boundary: report the failure with its traceback instead of raising
        print(f"❌ Error in dynamic Phase 3 experiment: {e}")
        import traceback
        traceback.print_exc()
        return None

# =============================================================================
# EXAMPLE USAGE
# =============================================================================

if __name__ == "__main__":
    # Script entry point: run Phase 3 alone, reusing the stored Phase 1 & 2
    # results file and optimizing for NDCG@10.
    results = run_dynamic_phase3_only(
        existing_results_file="../results/fixed_lightgcn_experiments/improved_experiment_results.json",
        target_playlists=1500,
        optimization_metric='ndcg@10'
    )

    # To optimize for a different metric, call run_dynamic_phase3_only again
    # with the same results file and e.g. optimization_metric='precision@10'.

# Usage banner, emitted whenever this module/cell is executed.
print(
    "\n" + "=" * 80,
    "🎯 DYNAMIC PHASE 3 FRAMEWORK READY",
    "=" * 80,
    "🚀 USAGE:",
    "   # Run Phase 3 only with empirically determined 'best' configuration",
    "   results = run_dynamic_phase3_only(",
    "       existing_results_file='path/to/your/phase1_phase2_results.json',",
    "       optimization_metric='ndcg@10'",
    "   )",
    "",
    "🔧 FEATURES:",
    "   ✅ Empirically determines best edge types from Phase 1 results",
    "   ✅ Empirically determines best feature types from Phase 2 results",
    "   ✅ Statistical significance testing for configuration selection",
    "   ✅ Effect size analysis for practical significance",
    "   ✅ Configurable optimization metrics",
    "   ✅ Comprehensive analysis and rationale reporting",
    "   ✅ Only runs Phase 3 - no need to repeat Phase 1 & 2",
    "=" * 80,
    sep="\n",
)

🎯 Fixed Configuration loaded:
   📱 Device: cpu
   🎲 Seeds: 5 seeds
   📊 Target playlists: 1,500
   🧠 Embedding dim: 128
   ⚡ Learning rate: 0.0005
   🛡️ Regularization: 0.001
🎵 Generating data for 1500 playlists...
🎵 Improved Synthetic Data Generator:
   📊 Playlists: 1,500
   🎵 Tracks: 4,500
   🎤 Artists: 500
   💿 Albums: 500
   👥 Users: 187
🔧 Generating improved heterogeneous music data (seed=42)...
   🔢 Total nodes: 7,187
   🔗 Generating improved edge distributions...
      ✅ playlist_track: 27,478 edges
      ✅ track_artist: 5,171 edges
      ✅ track_album: 4,500 edges
      ✅ user_playlist: 981 edges
   📊 Generating balanced splits...
      ✅ train: 18,579 edges
      ✅ val: 4,024 edges
      ✅ test: 4,875 edges
   🎯 Generating correlated features...
      ✅ Generated correlated features for all node types
🎯 Improved Experiment Trainer initialized:
   👥 Playlists: 1,500
   🎵 Tracks: 4,500
   🔢 Total nodes: 7,187
🚀 STARTING DYNAMIC PHASE 3 EXPERIMENT
🔬 GENERATING DYNAMIC PHASE 3 CON