<a href="https://colab.research.google.com/github/Sidhtang/dynamic-context-management-in-llms-/blob/main/hierachical_text_qualification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
from typing import List, Dict, Any
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
from sklearn.cluster import KMeans
from collections import Counter

class HierarchicalChunker:
    def __init__(self, model_name: str = "sentence-transformers/all-mpnet-base-v2"):
        """Initialize the hierarchical chunker with a specified embedding model."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.levels = 3

    def _get_embeddings(self, sentences: List[str]) -> np.ndarray:
        """Generate embeddings for a list of sentences."""
        encoded = self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**encoded)
        return outputs.last_hidden_state.mean(dim=1).numpy()

    def _extract_key_phrases(self, sentences: List[str], num_phrases: int = 5) -> List[str]:
        """Extract key phrases from sentences using word frequency."""
        words = []
        for sentence in sentences:
            words.extend([word.lower() for word in word_tokenize(sentence)
                        if word.isalnum() and len(word) > 3])

        word_freq = Counter(words)
        return [word for word, _ in word_freq.most_common(num_phrases)]

    def _create_level_summary(self, sentences: List[str], level: int) -> str:
        """Create a summary appropriate for the specified level."""
        if level == 1:
            # Level 1: Single most representative sentence + key concepts
            key_phrases = self._extract_key_phrases(sentences, 3)
            embeddings = self._get_embeddings(sentences)
            center = np.mean(embeddings, axis=0)
            distances = np.linalg.norm(embeddings - center, axis=1)
            main_sentence = sentences[np.argmin(distances)]
            return f"{main_sentence} Key concepts: {', '.join(key_phrases)}."

        elif level == 2:
            # Level 2: Topic-based summary using clustering
            num_sentences = max(3, len(sentences) // 4)
            embeddings = self._get_embeddings(sentences)
            kmeans = KMeans(n_clusters=num_sentences, random_state=42)
            clusters = kmeans.fit_predict(embeddings)

            # Select most central sentence from each cluster
            summary_sentences = []
            for i in range(num_sentences):
                cluster_mask = clusters == i
                if np.any(cluster_mask):
                    cluster_embeddings = embeddings[cluster_mask]
                    cluster_center = kmeans.cluster_centers_[i]
                    distances = np.linalg.norm(cluster_embeddings - cluster_center, axis=1)
                    cluster_sentences = np.array(sentences)[cluster_mask]
                    summary_sentences.append(cluster_sentences[np.argmin(distances)])

            return " ".join(summary_sentences)

        else:
            # Level 3: Detailed content organized by topics
            return sentences

    def chunk_text(self, text: str) -> Dict[str, Any]:
        """Break text into hierarchical chunks with different levels of abstraction."""
        sentences = sent_tokenize(text)

        hierarchy = {
            "level_1": self._create_level_summary(sentences, 1),
            "level_2": self._create_level_summary(sentences, 2),
            "level_3": sentences,
            "embeddings": self._get_embeddings(sentences)
        }

        return hierarchy

    def query_context(self, query: str, hierarchy: Dict[str, Any], level: int = None) -> str:
        """Query the hierarchical context at a specific level or automatically determine the best level."""
        query_embedding = self._get_embeddings([query])[0]

        if level is None:
            # Analyze query complexity and specificity
            words = word_tokenize(query.lower())
            question_words = set(['what', 'who', 'when', 'where', 'why', 'how'])
            specificity_words = set(['specifically', 'detail', 'explain', 'describe', 'elaborate'])

            if len(set(words) & question_words) == 1 and len(words) < 6:
                level = 1  # Simple, single-concept queries
            elif len(set(words) & specificity_words) > 0 or len(words) > 10:
                level = 3  # Detailed queries
            else:
                level = 2  # Medium complexity queries

        if level == 1:
            return hierarchy["level_1"]
        elif level == 2:
            return hierarchy["level_2"]
        else:
            # Find most relevant detailed chunks and organize them
            similarities = np.dot(hierarchy["embeddings"], query_embedding)
            top_indices = np.argsort(similarities)[-4:][::-1]  # Get top 4 most relevant chunks
            relevant_chunks = [hierarchy["level_3"][i] for i in top_indices]
            return " ".join(relevant_chunks)

def demonstrate_chunking():
    text = """
    Machine learning is a subset of artificial intelligence that focuses on developing systems that can learn from data.
    These systems can identify patterns and make decisions with minimal human intervention.
    The field has seen rapid growth in recent years due to increased data availability and computational power.
    Deep learning, a subset of machine learning, uses neural networks with many layers to process complex patterns.
    These neural networks are inspired by the human brain's structure and function.
    They can be trained on large datasets to perform tasks like image recognition, natural language processing, and game playing.
    The training process involves adjusting the network's parameters to minimize prediction errors.
    Modern applications include autonomous vehicles, recommendation systems, and medical diagnosis tools.
    Each application requires careful consideration of data quality, model architecture, and ethical implications.
    The future of machine learning promises even more advanced applications and integration with other technologies.
    """

    chunker = HierarchicalChunker()
    hierarchy = chunker.chunk_text(text)

    queries = {
        "simple": "What is machine learning?",
        "medium": "How does deep learning relate to machine learning?",
        "complex": "Explain the training process and applications of neural networks in detail."
    }

    results = {}
    for query_type, query in queries.items():
        results[query_type] = chunker.query_context(query, hierarchy)

    return results

if __name__ == "__main__":
    results = demonstrate_chunking()
    for query_type, response in results.items():
        print(f"\n{query_type.upper()} QUERY RESPONSE:")
        print(response)




SIMPLE QUERY RESPONSE:
Deep learning, a subset of machine learning, uses neural networks with many layers to process complex patterns. Key concepts: learning, machine, systems.

MEDIUM QUERY RESPONSE:
Deep learning, a subset of machine learning, uses neural networks with many layers to process complex patterns. Modern applications include autonomous vehicles, recommendation systems, and medical diagnosis tools. These systems can identify patterns and make decisions with minimal human intervention.

COMPLEX QUERY RESPONSE:
The training process involves adjusting the network's parameters to minimize prediction errors. Deep learning, a subset of machine learning, uses neural networks with many layers to process complex patterns. These neural networks are inspired by the human brain's structure and function. 
    Machine learning is a subset of artificial intelligence that focuses on developing systems that can learn from data.


In [6]:
def demonstrate_improved_chunking():
    # Sample text about machine learning and AI
    text = """
    Artificial Intelligence (AI) represents the broad field of making machines intelligent.
    Machine learning is a subset of AI that focuses on developing systems that can learn from data.
    These systems automatically identify patterns and make decisions with minimal human intervention.
    Deep learning is a specialized type of machine learning using neural networks with many layers.
    Neural networks are computing systems inspired by biological brains and their interconnected neurons.
    The training process involves feeding large amounts of data through these neural networks.
    As data flows through the network, weights and biases are adjusted to minimize prediction errors.
    Modern applications of deep learning include computer vision, natural language processing, and robotics.
    Computer vision systems can recognize objects, faces, and even emotions in images and videos.
    Natural language processing enables machines to understand and generate human language.
    Robotics applications combine perception, planning, and control for autonomous behavior.
    Ethical considerations in AI include privacy, bias, and the impact on employment.
    Data quality and model transparency are crucial for building trustworthy AI systems.
    The future of AI promises even more advanced applications across various industries.
    """

    chunker = HierarchicalChunker()
    hierarchy = chunker.chunk_text(text)

    # Different types of queries to demonstrate level differentiation
    queries = {
        "simple": "What is artificial intelligence?",
        "medium": "How does deep learning work with neural networks?",
        "complex": "Explain the applications and ethical considerations of deep learning in detail."
    }

    results = {}
    print("\n=== DEMONSTRATION OF HIERARCHICAL RESPONSES ===\n")

    for query_type, query in queries.items():
        print(f"\n--- {query_type.upper()} QUERY: '{query}' ---")

        # Show responses at each level explicitly
        for level in [1, 2, 3]:
            response = chunker.query_context(query, hierarchy, level)
            print(f"\nLEVEL {level} RESPONSE:")
            print(response)

        print("\n" + "="*50)

    return results

if __name__ == "__main__":
    demonstrate_improved_chunking()




=== DEMONSTRATION OF HIERARCHICAL RESPONSES ===


--- SIMPLE QUERY: 'What is artificial intelligence?' ---

LEVEL 1 RESPONSE:

    Artificial Intelligence (AI) represents the broad field of making machines intelligent. Key concepts: systems, learning, data.

LEVEL 2 RESPONSE:
Neural networks are computing systems inspired by biological brains and their interconnected neurons. The future of AI promises even more advanced applications across various industries. Machine learning is a subset of AI that focuses on developing systems that can learn from data.

LEVEL 3 RESPONSE:

    Artificial Intelligence (AI) represents the broad field of making machines intelligent. Machine learning is a subset of AI that focuses on developing systems that can learn from data. Deep learning is a specialized type of machine learning using neural networks with many layers. Neural networks are computing systems inspired by biological brains and their interconnected neurons.


--- MEDIUM QUERY: 'How does dee