In [None]:
# Install required packages
!pip install spacy networkx node2vec torch transformers requests beautifulsoup4
!pip install nebula3-python
!python -m spacy download en_core_web_sm
# Import essential libraries
import spacy
import networkx as nx
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from node2vec import Node2Vec
import requests
import re
from collections import defaultdict, deque
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_recall_fscore_support
import json
import pickle


Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
class DataCollector:
    """Downloads and preprocesses War and Peace from Project Gutenberg."""

    def __init__(self):
        # Plain-text edition of the novel on Project Gutenberg.
        self.url = "https://www.gutenberg.org/files/2600/2600-0.txt"
        # Raw text after download_text(); replaced by the cleaned body after clean_text().
        self.text = None

    def download_text(self):
        """Download War and Peace text.

        On any failure a short built-in excerpt is stored instead, so the
        rest of the notebook can still run.

        Returns:
            bool: True if the download succeeded, False if the fallback
            excerpt was used.
        """
        try:
            # Without a timeout a stalled connection would hang the cell forever.
            response = requests.get(self.url, timeout=30)
            response.raise_for_status()
            self.text = response.text
            print("✅ Successfully downloaded War and Peace!")
            return True
        except Exception as e:
            # Deliberately best-effort: report the problem and fall back.
            print(f"❌ Error downloading text: {e}")
            # Fallback sample text for demonstration
            self.text = """
            Anna Pavlovna smiled and promised to take care of Pierre. She knew his father.
            The guests began to disperse, some without taking leave of Anna Pavlovna.
            He was taller than anyone in the room. His expression was intelligent and his gaze was kind.
            """
            return False

    def clean_text(self, max_chars=50000):
        """Strip the Gutenberg header/footer and normalise whitespace.

        The cleaned, untruncated text is stored back on ``self.text``.

        Args:
            max_chars: Maximum number of characters to return. ``None``
                returns the whole cleaned text. Defaults to 50000 (demo size).

        Returns:
            str: The cleaned text truncated to ``max_chars``, or ``""`` when
            no text has been loaded.
        """
        if not self.text:
            return ""

        # Remove Project Gutenberg header/footer
        start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK"
        end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"
        raw = self.text

        body_start = 0
        start_idx = raw.find(start_marker)
        if start_idx != -1:
            # Skip the whole marker line (it also carries the book title and a
            # trailing "***"), not just up to the marker's first character.
            line_end = raw.find("\n", start_idx)
            body_start = line_end + 1 if line_end != -1 else start_idx + len(start_marker)

        # Only look for the footer after the header so the slice is never inverted.
        end_idx = raw.find(end_marker, body_start)
        body_end = end_idx if end_idx != -1 else len(raw)

        # Collapse every whitespace run (newlines included) into one space.
        self.text = re.sub(r'\s+', ' ', raw[body_start:body_end]).strip()

        return self.text[:max_chars]  # Default keeps the first 50k characters for the demo


# ✅ Initialize and run data collection
# download_text() stores the raw book on the collector (or a short built-in
# fallback excerpt if the request fails); clean_text() then returns the
# whitespace-normalised text truncated to the first 50,000 characters.
collector = DataCollector()
collector.download_text()
clean_text = collector.clean_text()
print(f"📊 Text length: {len(clean_text)} characters")

✅ Successfully downloaded War and Peace!
📊 Text length: 50000 characters


In [None]:
class SVOExtractor:
    """Extracts Subject-Verb-Object triplets using spaCy."""

    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        # Every triplet extracted so far, accumulated across process_text() calls.
        self.triplets = []
        # Maps triplets to sentence IDs. A triplet that occurs in several
        # sentences keeps only the ID of the last sentence it was seen in.
        self.sentence_mapping = {}

    @staticmethod
    def _child_phrase(token, label):
        """Return the subtree text of the first child whose dependency label contains ``label``."""
        for child in token.children:
            if label in child.dep_:
                return " ".join(w.text for w in child.subtree)
        return None

    def extract_subject(self, token):
        """Return the subject phrase attached to ``token``, or None.

        The first direct child whose dependency label contains "subj" is
        used, and its whole subtree is joined with single spaces.
        """
        return self._child_phrase(token, "subj")

    def extract_object(self, token):
        """Return the object phrase attached to ``token``, or None.

        The first direct child whose dependency label contains "obj" is
        used, and its whole subtree is joined with single spaces.
        """
        return self._child_phrase(token, "obj")

    def extract_triplets_from_sentence(self, sentence, sentence_id):
        """Extract SVO triplets from a single sentence.

        Args:
            sentence: Sentence text; it is parsed on its own with spaCy.
            sentence_id: ID recorded in ``self.sentence_mapping`` for every
                triplet found in this sentence.

        Returns:
            list[tuple[str, str, str]]: ``(subject, verb_lemma, object)``
            tuples, all lower-cased. Verbs lacking either a subject or an
            object child are skipped.
        """
        doc = self.nlp(sentence)
        sentence_triplets = []

        for token in doc:
            if token.pos_ != "VERB":
                continue
            subject = self.extract_subject(token)
            obj = self.extract_object(token)
            if subject and obj:
                triplet = (subject.lower().strip(), token.lemma_.lower(), obj.lower().strip())
                sentence_triplets.append(triplet)
                self.sentence_mapping[triplet] = sentence_id
        return sentence_triplets

    def process_text(self, text, max_sentences=100):
        """Split ``text`` into sentences and extract triplets from them.

        Args:
            text: Text to analyse.
            max_sentences: Only the first ``max_sentences`` sentences are
                scanned for triplets; ``None`` scans all of them. Defaults
                to 100 (demo limit).

        Returns:
            tuple: ``(triplets, sentences)`` where ``triplets`` holds the
            triplets found in the scanned sentences and ``sentences`` is the
            full list of sentence strings, including unscanned ones.
        """
        doc = self.nlp(text)
        sentences = [sent.text.strip() for sent in doc.sents]
        scanned = sentences if max_sentences is None else sentences[:max_sentences]
        all_triplets = []

        # Sentence IDs are positions in `sentences`.
        for i, sentence in enumerate(scanned):
            all_triplets.extend(self.extract_triplets_from_sentence(sentence, i))

        self.triplets.extend(all_triplets)
        # Report how many sentences were actually scanned, not just how many exist.
        print(f"🔍 Extracted {len(all_triplets)} SVO triplets from {len(scanned)} of {len(sentences)} sentences")
        return all_triplets, sentences


# ✅ Extract SVO triplets
# process_text() returns the extracted triplets together with the list of ALL
# sentences spaCy found; with its default limit only the first 100 sentences
# are actually scanned for triplets.
extractor = SVOExtractor()
triplets, sentences = extractor.process_text(clean_text)

# Show the first few (subject, verb lemma, object) tuples as a sanity check.
print("📝 Sample triplets:")
for i, triplet in enumerate(triplets[:5]):
    print(f" {i+1}. {triplet}")

🔍 Extracted 52 SVO triplets from 462 sentences
📝 Sample triplets:
 1. ('chapter xxiv chapter xxv chapter xxvi', 'book', 'seven')
 2. ('i', 'warn', 'you')
 3. ('you', 'tell', 'me')
 4. ('this', 'mean', 'war')
 5. ('i', 'have', 'nothing more to do with you')


In [None]:
class KnowledgeGraphBuilder:
    """Builds knowledge graph from SVO triplets"""

    def __init__(self):
        # Directed graph: subject -> object, one edge per ordered pair.
        self.graph = nx.DiGraph()
        # node -> set of sentence IDs in which the node occurs
        self.node_to_sentences = defaultdict(set)
        # (subject, object) -> set of sentence IDs that produced the edge
        self.edge_to_sentences = defaultdict(set)

    def build_graph(self, triplets, sentence_mapping):
        """Build NetworkX graph from triplets.

        Args:
            triplets: Iterable of ``(subject, verb, object)`` tuples.
            sentence_mapping: Dict from triplet to sentence ID; a triplet
                missing from it is recorded with the sentinel ID ``-1``.

        Returns:
            The graph stored on ``self.graph``. Each edge carries
            ``relation`` and ``sentence_id`` (from the most recent triplet
            for that pair) plus ``relations``, the list of every distinct
            verb seen for the pair in first-seen order.
        """
        for triplet in triplets:
            subject, verb, obj = triplet
            sentence_id = sentence_mapping.get(triplet, -1)

            # Add nodes
            self.graph.add_node(subject, type='entity')
            self.graph.add_node(obj, type='entity')

            # A DiGraph holds a single edge per (subject, object) pair, so a
            # second verb would silently replace the first. Keep every verb
            # in `relations` while `relation` stays the latest one, as before.
            previous = self.graph.get_edge_data(subject, obj, default=None) or {}
            relations = list(previous.get('relations', []))
            if verb not in relations:
                relations.append(verb)

            # Add edge with verb as relationship
            self.graph.add_edge(
                subject, obj,
                relation=verb, sentence_id=sentence_id, relations=relations,
            )

            # Track sentence membership
            self.node_to_sentences[subject].add(sentence_id)
            self.node_to_sentences[obj].add(sentence_id)
            self.edge_to_sentences[(subject, obj)].add(sentence_id)

        print(f"🕸️ Built graph with {self.graph.number_of_nodes()} nodes and {self.graph.number_of_edges()} edges")
        return self.graph

    def get_node_features(self):
        """Generate node features for entropy calculation.

        Returns:
            dict: node -> dict with keys ``degree``, ``in_degree``,
            ``out_degree``, ``sentence_count`` and ``neighbors`` (the number
            of nodes yielded by ``graph.neighbors(node)``).
        """
        features = {}
        for node in self.graph.nodes():
            features[node] = {
                'degree': self.graph.degree(node),
                'in_degree': self.graph.in_degree(node),
                'out_degree': self.graph.out_degree(node),
                # .get() avoids inserting empty entries into the defaultdict on read.
                'sentence_count': len(self.node_to_sentences.get(node, ())),
                'neighbors': len(list(self.graph.neighbors(node)))
            }
        return features


# ✅ Build knowledge graph
# Subjects and objects become nodes and each triplet adds a directed
# subject -> object edge labelled with its verb; the extractor's
# sentence_mapping supplies the sentence ID stored on each edge.
kg_builder = KnowledgeGraphBuilder()
knowledge_graph = kg_builder.build_graph(triplets, extractor.sentence_mapping)
# Per-node degree and sentence-count statistics (not referenced again in the
# visible part of this notebook).
node_features = kg_builder.get_node_features()

🕸️ Built graph with 68 nodes and 50 edges


In [None]:
class EntropyModel(nn.Module):
    """BLT-inspired entropy model for word-level boundary detection.

    An embedding layer feeds an LSTM whose hidden states are projected to one
    sigmoid-squashed score per position. The scores therefore lie in [0, 1];
    they are learned scores rather than an information-theoretic entropy.
    """

    def __init__(self, vocab_size=10000, embedding_dim=128, hidden_dim=256):
        super(EntropyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.entropy_head = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_sequence):
        """Score every position of a batch of index sequences.

        Args:
            input_sequence: Integer tensor of shape ``(batch, seq_len)`` with
                values in ``[0, vocab_size)``.

        Returns:
            Tensor of shape ``(batch, seq_len, 1)`` with values in [0, 1].
        """
        embedded = self.embedding(input_sequence)
        lstm_out, _ = self.lstm(embedded)
        entropy_scores = self.sigmoid(self.entropy_head(lstm_out))
        return entropy_scores

    def calculate_sequence_entropy(self, node_sequence):
        """Calculate per-position scores for a sequence of nodes.

        Nodes are mapped to indices in order of first appearance, so the
        mapping no longer depends on set iteration order and is the same on
        every run. Indices wrap around modulo the embedding table size, so
        a sequence with more distinct nodes than ``vocab_size`` shares
        embeddings instead of raising an IndexError.

        Args:
            node_sequence: Sequence of hashable node identifiers.

        Returns:
            numpy.ndarray: 1-D array with one score per element of
            ``node_sequence`` (empty for an empty sequence).
        """
        node_sequence = list(node_sequence)
        if not node_sequence:
            return np.empty(0, dtype=np.float32)

        # Convert nodes to indices (simplified tokenization)
        node_to_idx = {}
        for node in node_sequence:
            node_to_idx.setdefault(node, len(node_to_idx))

        table_size = self.embedding.num_embeddings
        device = self.embedding.weight.device
        sequence_indices = torch.tensor(
            [node_to_idx[node] % table_size for node in node_sequence],
            dtype=torch.long,
            device=device,
        ).unsqueeze(0)  # Add batch dimension

        with torch.no_grad():
            entropy_scores = self.forward(sequence_indices)
        # (1, seq_len, 1) -> (seq_len,): stays 1-D even for a single node.
        return entropy_scores.reshape(-1).cpu().numpy()


class GraphTraverser:
    """Implements graph traversal with entropy-based stopping"""

    def __init__(self, graph, entropy_model, threshold=0.7):
        self.graph = graph
        self.entropy_model = entropy_model
        # Traversal stops once the mean score of the current path exceeds this value.
        self.threshold = threshold

    def traverse_from_node(self, start_node, max_steps=10):
        """Walk the graph greedily from ``start_node``.

        At each step the walk moves to the first not-yet-visited neighbour of
        the current node. It stops when there is no such neighbour, when
        ``max_steps`` moves have been made, or (once the path holds more than
        two nodes) when the mean score of the path exceeds
        ``self.threshold``.

        Args:
            start_node: Node to start from.
            max_steps: Maximum number of moves.

        Returns:
            tuple: ``(path, visited)`` where ``path`` is the ordered list of
            nodes walked and ``visited`` is the set of exactly those nodes.
        """
        path = [start_node]
        # Nodes are marked visited when they join the path, so `visited`
        # always matches `path`, including the last node reached when
        # max_steps runs out and the start node when max_steps is 0.
        visited = {start_node}
        current_node = start_node

        for _ in range(max_steps):
            # Get unvisited neighbors
            unvisited_neighbors = [
                n for n in self.graph.neighbors(current_node) if n not in visited
            ]
            if not unvisited_neighbors:
                break

            # Calculate entropy
            if len(path) > 2:
                entropy_scores = self.entropy_model.calculate_sequence_entropy(path)
                current_entropy = np.mean(entropy_scores)

                if current_entropy > self.threshold:
                    print(f"🛑 Stopping traversal at entropy {current_entropy:.3f}")
                    break

            # Move to next node
            next_node = unvisited_neighbors[0]  # You could use [-1] or random.choice(...)
            path.append(next_node)
            visited.add(next_node)
            current_node = next_node

        return path, visited


# ✅ Initialize model and traverser
# The entropy model is freshly constructed here (no training step appears in
# this section), and the traverser uses its default stopping threshold of 0.7.
entropy_model = EntropyModel()
traverser = GraphTraverser(knowledge_graph, entropy_model)

In [None]:
class NebulaGraphConnector:
    """Stand-in for a NebulaGraph client in which every operation is simulated.

    None of the methods talk to a database: each one prints a status line
    and returns a canned value.
    """

    def __init__(self, host='127.0.0.1', port=9669, username='root', password='nebula'):
        # The settings are only recorded. A real deployment needs a reachable
        # NebulaGraph instance (locally, or via Docker when running in Colab).
        self.config = dict(host=host, port=port, username=username, password=password)
        self.connected = False

    def connect(self):
        """Pretend to open a session and mark the connector as connected.

        Returns:
            bool: True once the connected flag is set, False if an exception
            was raised along the way.
        """
        try:
            # A real implementation would open a nebula3-python session here.
            print("🔌 NebulaGraph connection simulated (use Docker setup for real deployment)")
            self.connected = True
        except Exception as err:
            print(f"❌ Connection failed: {err}")
            return False
        return True

    def create_schema(self):
        """Return the nGQL statements that would create the schema.

        Nothing is executed; the statements are only handed back.

        Returns:
            list[str]: Statements for the space, the entity tag and the
            relation edge type, in execution order.
        """
        ddl = (
            "CREATE SPACE IF NOT EXISTS sentence_kg(vid_type=FIXED_STRING(256));",
            "USE sentence_kg;",
            "CREATE TAG IF NOT EXISTS entity(name string);",
            "CREATE EDGE IF NOT EXISTS relation(verb string, sentence_id int);",
        )
        print("📋 Schema creation simulated")
        return list(ddl)

    def insert_graph_data(self, graph):
        """Report how many nodes and edges would be written.

        Args:
            graph: Object exposing ``number_of_nodes()`` and
                ``number_of_edges()``.

        Returns:
            bool: Always True.
        """
        node_total = graph.number_of_nodes()
        edge_total = graph.number_of_edges()
        # A real implementation would batch-insert through the NebulaGraph client.
        print(f"💾 Simulating insertion of {node_total} nodes and {edge_total} edges")
        return True


# ✅ Setup NebulaGraph connection (simulated)
# All three calls below only print status messages; no database is contacted.
nebula_conn = NebulaGraphConnector()
nebula_conn.connect()
nebula_conn.create_schema()
nebula_conn.insert_graph_data(knowledge_graph)

🔌 NebulaGraph connection simulated (use Docker setup for real deployment)
📋 Schema creation simulated
💾 Simulating insertion of 68 nodes and 50 edges


True

In [None]:
class BoundaryDetectionEvaluator:
    """Evaluates sentence boundary detection performance.

    Caveat: the ground truth for a start node is the set of sentences that
    node occurs in, while the prediction is the union of the sentences of
    every node the traversal visited. Whenever the visited set contains the
    start node itself, the prediction is a superset of the truth and recall
    is 1.0 by construction; only precision carries information.
    """

    def __init__(self, kg_builder, sentences):
        self.kg_builder = kg_builder
        self.sentences = sentences
        self.ground_truth = self._create_ground_truth()

    def _create_ground_truth(self):
        """Create ground truth sentence groupings.

        Returns:
            dict: node -> sorted list of the sentence IDs recorded for that
            node in ``kg_builder.node_to_sentences``.
        """
        node_to_sentences = self.kg_builder.node_to_sentences
        # .get() keeps a read from inserting empty entries into a defaultdict;
        # sorting makes the lists deterministic.
        return {
            node: sorted(node_to_sentences.get(node, ()))
            for node in self.kg_builder.graph.nodes()
        }

    def evaluate_traversal(self, start_nodes, traverser):
        """Evaluate boundary detection performance.

        Args:
            start_nodes: Nodes to start a traversal from; nodes that are not
                in the graph are skipped.
            traverser: Object whose ``traverse_from_node(node)`` returns
                ``(path, visited)``.

        Returns:
            dict: ``precision``, ``recall`` and ``f1`` pooled over all
            evaluated start nodes.
        """
        predictions = {}
        for start_node in start_nodes:
            if start_node in self.kg_builder.graph.nodes():
                path, visited = traverser.traverse_from_node(start_node)
                predicted_sentences = set()
                for node in visited:
                    if node in self.kg_builder.node_to_sentences:
                        predicted_sentences.update(self.kg_builder.node_to_sentences[node])
                predictions[start_node] = list(predicted_sentences)
        return self._calculate_metrics(predictions)

    def _calculate_metrics(self, predictions):
        """Calculate F1, precision, recall.

        For every evaluated node a 0/1 vector over sentence IDs
        ``0..max_sentence_id`` is built for both truth and prediction; the
        vectors of all nodes are concatenated and scored as one binary
        classification problem.

        Args:
            predictions: dict node -> iterable of predicted sentence IDs.

        Returns:
            dict: ``precision``, ``recall``, ``f1``; all 0.0 when nothing
            could be evaluated.
        """
        # The largest sentence ID depends only on the ground truth, so compute
        # it once instead of once per evaluated node.
        max_sentence_id = max(
            (max(ids) if ids else 0 for ids in self.ground_truth.values()),
            default=0,
        )

        all_true = []
        all_pred = []
        for node, predicted_sentences in predictions.items():
            if node not in self.ground_truth:
                continue
            true_sentences = set(self.ground_truth[node])
            pred_sentences = set(predicted_sentences)
            for sent_id in range(max_sentence_id + 1):
                all_true.append(1 if sent_id in true_sentences else 0)
                all_pred.append(1 if sent_id in pred_sentences else 0)

        if all_true and all_pred:
            precision, recall, f1, _ = precision_recall_fscore_support(
                all_true, all_pred, average='binary', zero_division=0
            )
            return {'precision': precision, 'recall': recall, 'f1': f1}
        else:
            return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}


# ✅ Run evaluation
# Caveat when reading the numbers: the ground truth for a start node is the
# set of sentences that node occurs in, and the prediction is the union over
# all visited nodes. With the default traversal settings the start node is
# itself visited, so recall is 1.0 by construction.
evaluator = BoundaryDetectionEvaluator(kg_builder, sentences)
start_nodes = list(knowledge_graph.nodes())[:5]  # Test with first 5 nodes
metrics = evaluator.evaluate_traversal(start_nodes, traverser)

print("📊 Evaluation Results:")
print(f" Precision: {metrics['precision']:.3f}")
print(f" Recall:    {metrics['recall']:.3f}")
print(f" F1-Score:  {metrics['f1']:.3f}")


📊 Evaluation Results:
 Precision: 0.724
 Recall:    1.000
 F1-Score:  0.840


In [None]:
def run_complete_pipeline():
    """Execute every stage of the boundary-detection demo in one call.

    Stages: download and clean the text, extract SVO triplets, build the
    knowledge graph, create the entropy model and traverser, traverse from
    the first five graph nodes, and evaluate. Progress is printed along
    the way.

    Returns:
        dict: ``graph`` (the knowledge graph), ``metrics`` (precision /
        recall / f1), ``results`` (per start node: ``path`` and ``visited``
        lists) and ``sentences`` (all sentence strings).
    """
    print("🚀 Starting Entropy-Based Sentence Boundary Detection Pipeline")
    print("=" * 60)

    # Stage 1: fetch the raw book and normalise it.
    print("\n📥 Step 1: Data Collection")
    source = DataCollector()
    source.download_text()
    corpus = source.clean_text()

    # Stage 2: turn the text into (subject, verb, object) triplets.
    print("\n🔍 Step 2: SVO Triplet Extraction")
    svo = SVOExtractor()
    svo_triplets, sentence_list = svo.process_text(corpus)

    # Stage 3: triplets -> directed graph.
    print("\n🕸️ Step 3: Knowledge Graph Construction")
    builder = KnowledgeGraphBuilder()
    kg = builder.build_graph(svo_triplets, svo.sentence_mapping)

    # Stage 4: the model is only instantiated here, not trained.
    print("\n🧠 Step 4: Entropy Model Initialization")
    walker = GraphTraverser(kg, EntropyModel())

    # Stage 5: one traversal per seed node.
    print("\n🎯 Step 5: Boundary Detection")
    seeds = list(kg.nodes())[:5]
    traversals = {}
    for seed in seeds:
        walked_path, seen = walker.traverse_from_node(seed)
        traversals[seed] = {'path': walked_path, 'visited': list(seen)}
        print(f" From '{seed}': visited {len(seen)} nodes")

    # Stage 6: score the traversals (this walks from every seed once more).
    print("\n📊 Step 6: Evaluation")
    scorer = BoundaryDetectionEvaluator(builder, sentence_list)
    scores = scorer.evaluate_traversal(seeds, walker)

    summary_lines = (
        f" Nodes in Graph: {kg.number_of_nodes()}",
        f" Edges in Graph: {kg.number_of_edges()}",
        f" F1-Score: {scores['f1']:.3f}",
        f" Precision: {scores['precision']:.3f}",
        f" Recall: {scores['recall']:.3f}",
    )
    print("\n🎉 Final Results:")
    for line in summary_lines:
        print(line)

    return dict(
        graph=kg,
        metrics=scores,
        results=traversals,
        sentences=sentence_list,
    )

# Run the complete pipeline
# The returned dict exposes the graph, the metrics, the per-start-node
# traversal results and the sentence list under the keys 'graph', 'metrics',
# 'results' and 'sentences'.
pipeline_results = run_complete_pipeline()

🚀 Starting Entropy-Based Sentence Boundary Detection Pipeline

📥 Step 1: Data Collection
✅ Successfully downloaded War and Peace!

🔍 Step 2: SVO Triplet Extraction
🔍 Extracted 52 SVO triplets from 462 sentences

🕸️ Step 3: Knowledge Graph Construction
🕸️ Built graph with 68 nodes and 50 edges

🧠 Step 4: Entropy Model Initialization

🎯 Step 5: Boundary Detection
 From 'chapter xxiv chapter xxv chapter xxvi': visited 2 nodes
 From 'seven': visited 1 nodes
 From 'i': visited 3 nodes
 From 'you': visited 2 nodes
 From 'me': visited 1 nodes

📊 Step 6: Evaluation

🎉 Final Results:
 Nodes in Graph: 68
 Edges in Graph: 50
 F1-Score: 0.840
 Precision: 0.724
 Recall: 1.000
