In [1]:
#!pip install -q transformers accelerate bitsandbytes sentence-transformers datasets scipy torch torchaudio tqdm


In [2]:
"""
PROJECT NIKA: PHASE 6-E - MULTI-GEODESIC SEARCH
================================================================
"The Many Paths to Truth"

OBJECTIVE:
Discover MULTIPLE semantic paths between concepts, revealing:
1. Path Diversity (How many ways can concepts connect?)
2. Semantic Richness (What domains do paths traverse?)
3. Cognitive Flexibility (Which paths are most "natural"?)

INNOVATION:
Instead of finding ONE geodesic, we find TOP-K diverse paths
using beam search with diversity penalties.
"""

import torch
import torch.nn.functional as F
import numpy as np
import json
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional
from transformers import AutoTokenizer, AutoModelForCausalLM

# ============================================================================
# SECTION 1: DATA STRUCTURES
# ============================================================================

@dataclass
class SemanticPath:
    """Represents a single geodesic path through semantic space"""
    concepts: List[str]
    scores: List[float]  # Score at each step
    total_score: float
    total_entropy: float
    path_type: str  # "direct", "experiential", "abstract", etc.

    def __repr__(self):
        return f"Path({' → '.join(self.concepts[:3])}... score={self.total_score:.3f})"

# ============================================================================
# SECTION 2: THE MULTI-GEODESIC NAVIGATOR
# ============================================================================

# Stopword filter (from Phase 6-D)
STOPWORDS = {
    "the", "of", "and", "to", "a", "in", "is", "that", "it", "as", "was", "for",
    "with", "on", "by", "this", "are", "be", "or", "at", "from", "an", "which",
    "concept", "implies", "leads", "more", "most", "some", "any", "all",
    "connect", "connection", "between", "relating", "related", "there", "here",
    "having", "existence", "something", "its", "every"
}

class MultiGeodesicNavigator:
    """
    Finds multiple diverse paths between concepts using beam search.
    """

    def __init__(self, model_name: str = "Qwen/Qwen2.5-7B-Instruct"):
        print(f"🔧 Initializing Multi-Geodesic Navigator...")

        # Try to reuse existing model
        if 'model' in globals() and 'tokenizer' in globals():
            self.model = globals()['model']
            self.tokenizer = globals()['tokenizer']
            print("✓ Using cached model from memory")
        else:
            print(f"⚠️ Loading {model_name}...")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16,
                device_map="auto"
            )

        self.device = self.model.device
        print(f"✓ Navigator ready on {self.device}\n")

    def get_embedding(self, word: str) -> torch.Tensor:
        """Extract embedding vector for a concept"""
        inputs = self.tokenizer(word, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)
        return outputs.hidden_states[-1][0, -1, :].float()

    def calculate_entropy(self, probs: torch.Tensor) -> float:
        """Shannon entropy of distribution"""
        return -torch.sum(probs * torch.log(probs + 1e-9)).item()

    def get_next_candidates(
        self,
        current_word: str,
        start_concept: str,
        target_vec: torch.Tensor,
        used_words: set,
        top_k: int = 100
    ) -> List[Tuple[str, float, float]]:
        """
        Get candidate next steps with scores.
        Returns: [(word, score, entropy), ...]
        """
        # List-based prompt (Phase 6-D innovation)
        prompt = f"Concepts: {start_concept}, {current_word},"

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)

        logits = outputs.logits[0, -1, :].float()
        probs = F.softmax(logits, dim=-1)

        top_k_probs, top_k_ids = torch.topk(probs, top_k)

        candidates = []

        for i, token_id in enumerate(top_k_ids):
            word = self.tokenizer.decode(token_id).strip()
            word_clean = word.lower().replace(",", "").replace(".", "")

            # Strict filtering
            if not word_clean.isalpha() or len(word_clean) < 3:
                continue
            if word_clean in STOPWORDS:
                continue
            if word_clean in used_words:
                continue

            # Calculate metrics
            cand_vec = F.normalize(self.get_embedding(word_clean), dim=0)

            # Alignment with target
            alignment = torch.dot(cand_vec, target_vec).item()

            # Probability (manifold support)
            prob = top_k_probs[i].item()

            # Resistance
            resistance = 1.0 - prob

            # Score (Bhagavad Gita formula)
            score = (alignment * 0.7 + prob * 0.3) / (resistance + 0.5)

            candidates.append((word_clean, score, resistance))

        # Sort by score
        candidates.sort(key=lambda x: x[1], reverse=True)

        return candidates

    def calculate_path_diversity(self, path1: List[str], path2: List[str]) -> float:
        """
        Calculate diversity between two paths.
        Returns: 0.0 (identical) to 1.0 (completely different)
        """
        set1 = set(path1)
        set2 = set(path2)

        intersection = len(set1 & set2)
        union = len(set1 | set2)

        # Jaccard distance
        if union == 0:
            return 0.0
        return 1.0 - (intersection / union)

    def find_multiple_geodesics(
        self,
        start_concept: str,
        target_concept: str,
        num_paths: int = 5,
        max_steps: int = 8,
        diversity_weight: float = 0.3
    ) -> List[SemanticPath]:
        """
        Find multiple diverse paths using beam search with diversity penalty.

        Args:
            start_concept: Starting concept
            target_concept: Target concept
            num_paths: Number of diverse paths to find
            max_steps: Maximum steps per path
            diversity_weight: How much to penalize similar paths (0.0-1.0)

        Returns:
            List of SemanticPath objects, sorted by score
        """
        print(f"\n{'='*70}")
        print(f"MULTI-GEODESIC SEARCH: {start_concept} ⟶ {target_concept}")
        print(f"Searching for {num_paths} diverse paths (max {max_steps} steps)")
        print(f"{'='*70}\n")

        # Get target embedding
        target_vec = F.normalize(self.get_embedding(target_concept), dim=0)

        # Completed paths
        completed_paths: List[SemanticPath] = []

        # Active beams: [(current_word, path, total_score, total_entropy, used_words)]
        beams = [(start_concept, [start_concept], 0.0, 0.0, {start_concept.lower()})]

        for step in range(max_steps):
            new_beams = []

            for current_word, path, total_score, total_entropy, used_words in beams:
                # Check convergence
                curr_vec = F.normalize(self.get_embedding(current_word), dim=0)
                similarity = torch.dot(curr_vec, target_vec).item()

                if similarity > 0.85 or current_word.lower() == target_concept.lower():
                    # Path completed
                    completed_paths.append(SemanticPath(
                        concepts=path,
                        scores=[total_score],
                        total_score=total_score,
                        total_entropy=total_entropy,
                        path_type="unknown"  # Will classify later
                    ))
                    continue

                # Get candidates for next step
                candidates = self.get_next_candidates(
                    current_word,
                    start_concept,
                    target_vec,
                    used_words,
                    top_k=50
                )

                # Take top candidates
                for word, score, entropy in candidates[:10]:
                    new_path = path + [word]
                    new_used = used_words | {word}

                    # Apply diversity penalty
                    diversity_penalty = 0.0
                    if diversity_weight > 0:
                        for completed_path in completed_paths:
                            diversity = self.calculate_path_diversity(new_path, completed_path.concepts)
                            diversity_penalty += (1.0 - diversity) * diversity_weight

                    adjusted_score = total_score + score - diversity_penalty

                    new_beams.append((
                        word,
                        new_path,
                        adjusted_score,
                        total_entropy + entropy,
                        new_used
                    ))

            if not new_beams:
                break

            # Keep top beams
            new_beams.sort(key=lambda x: x[2], reverse=True)
            beams = new_beams[:num_paths * 3]  # Keep more beams than target paths

            # Early stopping if we have enough diverse paths
            if len(completed_paths) >= num_paths:
                # Check if they're diverse enough
                min_diversity = min(
                    self.calculate_path_diversity(completed_paths[i].concepts, completed_paths[j].concepts)
                    for i in range(len(completed_paths))
                    for j in range(i+1, len(completed_paths))
                    if i != j
                ) if len(completed_paths) > 1 else 1.0

                if min_diversity > 0.4:  # Threshold for "diverse enough"
                    print(f"✓ Found {len(completed_paths)} diverse paths (min diversity: {min_diversity:.2f})")
                    break

        # Add remaining beams as incomplete paths
        for current_word, path, total_score, total_entropy, _ in beams[:num_paths]:
            if len(completed_paths) < num_paths:
                completed_paths.append(SemanticPath(
                    concepts=path,
                    scores=[total_score],
                    total_score=total_score,
                    total_entropy=total_entropy,
                    path_type="incomplete"
                ))

        # Classify path types
        completed_paths = self._classify_paths(completed_paths, start_concept, target_concept)

        # Sort by score
        completed_paths.sort(key=lambda x: x.total_score, reverse=True)

        return completed_paths[:num_paths]

    def _classify_paths(
        self,
        paths: List[SemanticPath],
        start_concept: str,
        target_concept: str
    ) -> List[SemanticPath]:
        """
        Classify paths into types based on characteristics.
        Types: direct, abstract, experiential, poetic, analytical
        """
        for path in paths:
            if len(path.concepts) <= 3:
                path.path_type = "direct"
            elif any(word in ["feeling", "sense", "experience", "emotion"] for word in path.concepts):
                path.path_type = "experiential"
            elif any(word in ["theory", "principle", "logic", "reason"] for word in path.concepts):
                path.path_type = "analytical"
            elif any(word in ["beauty", "art", "music", "poetry"] for word in path.concepts):
                path.path_type = "poetic"
            else:
                path.path_type = "abstract"

        return paths

    def visualize_paths(
        self,
        paths: List[SemanticPath],
        start_concept: str,
        target_concept: str
    ):
        """Pretty print the discovered paths"""
        print(f"\n{'='*70}")
        print(f"DISCOVERED PATHS: {start_concept} ⟶ {target_concept}")
        print(f"{'='*70}\n")

        for i, path in enumerate(paths, 1):
            print(f"PATH {i} [{path.path_type.upper()}] (score: {path.total_score:.3f}, entropy: {path.total_entropy:.3f})")
            print("  " + " → ".join(path.concepts))
            print()

        # Calculate diversity matrix
        if len(paths) > 1:
            print(f"{'='*70}")
            print("PATH DIVERSITY MATRIX")
            print(f"{'='*70}\n")

            print("     ", end="")
            for i in range(len(paths)):
                print(f"  P{i+1}  ", end="")
            print()

            for i in range(len(paths)):
                print(f"P{i+1}  ", end="")
                for j in range(len(paths)):
                    if i == j:
                        print("  -   ", end="")
                    else:
                        diversity = self.calculate_path_diversity(
                            paths[i].concepts,
                            paths[j].concepts
                        )
                        print(f" {diversity:.2f} ", end="")
                print()
            print()

# ============================================================================
# SECTION 3: ADVANCED ANALYSIS TOOLS
# ============================================================================

class PathAnalyzer:
    """Analyzes semantic paths for patterns and insights"""

    @staticmethod
    def analyze_path_characteristics(paths: List[SemanticPath]) -> Dict:
        """
        Analyze characteristics of discovered paths.
        """
        analysis = {
            "num_paths": len(paths),
            "avg_length": np.mean([len(p.concepts) for p in paths]),
            "avg_score": np.mean([p.total_score for p in paths]),
            "avg_entropy": np.mean([p.total_entropy for p in paths]),
            "path_types": {},
            "common_concepts": {},
        }

        # Count path types
        for path in paths:
            analysis["path_types"][path.path_type] = \
                analysis["path_types"].get(path.path_type, 0) + 1

        # Find common concepts across paths
        all_concepts = []
        for path in paths:
            all_concepts.extend(path.concepts[1:-1])  # Exclude start/end

        from collections import Counter
        concept_counts = Counter(all_concepts)
        analysis["common_concepts"] = dict(concept_counts.most_common(5))

        return analysis

    @staticmethod
    def find_semantic_hubs(paths: List[SemanticPath]) -> List[Tuple[str, int]]:
        """
        Identify concepts that appear in multiple paths (semantic hubs).
        """
        concept_freq = {}
        for path in paths:
            for concept in path.concepts[1:-1]:  # Exclude endpoints
                concept_freq[concept] = concept_freq.get(concept, 0) + 1

        # Return concepts appearing in multiple paths
        hubs = [(k, v) for k, v in concept_freq.items() if v > 1]
        hubs.sort(key=lambda x: x[1], reverse=True)
        return hubs

# ============================================================================
# SECTION 4: EXPERIMENTAL SCENARIOS
# ============================================================================

def run_multi_geodesic_experiments():
    """Run comprehensive multi-geodesic experiments"""

    navigator = MultiGeodesicNavigator()
    analyzer = PathAnalyzer()

    # Define test scenarios
    scenarios = [
        {
            "name": "The Philosophical Divide",
            "start": "Logic",
            "target": "Emotion",
            "description": "Can we bridge reason and feeling?"
        },
        {
            "name": "The Creative Leap",
            "start": "Science",
            "target": "Art",
            "description": "How do analytical and creative domains connect?"
        },
        {
            "name": "The Spiritual Bridge",
            "start": "Matter",
            "target": "Spirit",
            "description": "Connecting physical and metaphysical"
        },
        {
            "name": "The Social Transformation",
            "start": "Individual",
            "target": "Society",
            "description": "From self to collective"
        }
    ]

    all_results = {}

    for scenario in scenarios:
        print(f"\n{'#'*70}")
        print(f"# SCENARIO: {scenario['name']}")
        print(f"# {scenario['description']}")
        print(f"{'#'*70}")

        # Find multiple paths
        paths = navigator.find_multiple_geodesics(
            start_concept=scenario['start'],
            target_concept=scenario['target'],
            num_paths=5,
            max_steps=8,
            diversity_weight=0.3
        )

        # Visualize
        navigator.visualize_paths(paths, scenario['start'], scenario['target'])

        # Analyze
        analysis = analyzer.analyze_path_characteristics(paths)
        hubs = analyzer.find_semantic_hubs(paths)

        print(f"{'='*70}")
        print("ANALYSIS")
        print(f"{'='*70}\n")
        print(f"Average path length: {analysis['avg_length']:.1f} steps")
        print(f"Average score: {analysis['avg_score']:.3f}")
        print(f"Average entropy: {analysis['avg_entropy']:.3f}")
        print(f"\nPath type distribution:")
        for ptype, count in analysis['path_types'].items():
            print(f"  {ptype}: {count}")

        if hubs:
            print(f"\nSemantic hubs (concepts appearing in multiple paths):")
            for concept, freq in hubs[:3]:
                print(f"  {concept}: appears in {freq} paths")

        all_results[scenario['name']] = {
            "paths": [{"concepts": p.concepts, "score": p.total_score, "type": p.path_type}
                     for p in paths],
            "analysis": analysis,
            "hubs": hubs
        }

    # Save results
    output_file = "/tmp/multi_geodesic_results.json"
    with open(output_file, "w") as f:
        json.dump(all_results, f, indent=2)

    print(f"\n{'='*70}")
    print(f"✓ Results saved to {output_file}")
    print(f"{'='*70}\n")

    return all_results

# ============================================================================
# SECTION 5: MAIN EXECUTION
# ============================================================================

if __name__ == "__main__":
    print("""
    ╔═══════════════════════════════════════════════════════════════╗
    ║  PROJECT NIKA: PHASE 6-E                                      ║
    ║  MULTI-GEODESIC SEARCH                                        ║
    ║  "The Many Paths to Truth"                                    ║
    ╚═══════════════════════════════════════════════════════════════╝
    """)

    # Run experiments
    results = run_multi_geodesic_experiments()

    print("""
    ╔═══════════════════════════════════════════════════════════════╗
    ║  EXPERIMENT COMPLETE                                          ║
    ║  Discovered multiple semantic geodesics across 4 scenarios    ║
    ╚═══════════════════════════════════════════════════════════════╝
    """)


    ╔═══════════════════════════════════════════════════════════════╗
    ║  PROJECT NIKA: PHASE 6-E                                      ║
    ║  MULTI-GEODESIC SEARCH                                        ║
    ║  "The Many Paths to Truth"                                    ║
    ╚═══════════════════════════════════════════════════════════════╝
    
🔧 Initializing Multi-Geodesic Navigator...
⚠️ Loading Qwen/Qwen2.5-7B-Instruct...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]



✓ Navigator ready on cuda:0


######################################################################
# SCENARIO: The Philosophical Divide
# Can we bridge reason and feeling?
######################################################################

MULTI-GEODESIC SEARCH: Logic ⟶ Emotion
Searching for 5 diverse paths (max 8 steps)


DISCOVERED PATHS: Logic ⟶ Emotion

PATH 1 [ABSTRACT] (score: 2.539, entropy: 7.522)
  Logic → language → communication → empathy → creativity → problem → solution → understand → relationship

PATH 2 [ABSTRACT] (score: 2.539, entropy: 7.478)
  Logic → language → communication → creativity → problem → solution → thinking → intelligence → relationship

PATH 3 [ABSTRACT] (score: 2.537, entropy: 7.451)
  Logic → language → communication → creativity → problem → solution → formula → relation → structure

PATH 4 [ABSTRACT] (score: 2.536, entropy: 7.464)
  Logic → language → communication → creativity → problem → solution → understand → relationship → structure

PATH 5