<a href="https://colab.research.google.com/github/Sidhtang/dynamic-context-management-in-llms-/blob/main/adaptive_sliding_window_adaptation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
from collections import deque
from typing import List, Dict, Any, Optional, Set
from dataclasses import dataclass
from datetime import datetime
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModel,
    pipeline,
    T5ForConditionalGeneration,
    T5Tokenizer
)
import torch
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

@dataclass
class ContextChunk:
    """Represents a chunk of context with metadata"""
    text: str
    timestamp: datetime
    importance_score: float
    embedding: Optional[np.ndarray] = None
    summary: Optional[str] = None
    entities: Optional[List[Dict]] = None
    keywords: Optional[Set[str]] = None

class EnhancedNLPProcessor:
    def __init__(self):
        # Initialize SpaCy pipeline with custom settings
        self.nlp = spacy.load('en_core_web_sm')
        # Add custom entity patterns
        ruler = self.nlp.get_pipe("entity_ruler") if "entity_ruler" in self.nlp.pipe_names else self.nlp.add_pipe("entity_ruler")
        patterns = [
            {"label": "ORG", "pattern": "OpenAI"},
            {"label": "ORG", "pattern": "DeepMind"},
            {"label": "TECH", "pattern": "AI model"},
            {"label": "TECH", "pattern": "software library"}
        ]
        ruler.add_patterns(patterns)

        # Initialize transformers
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = T5Tokenizer.from_pretrained('t5-small')
        self.summarizer = T5ForConditionalGeneration.from_pretrained('t5-small').to(self.device)
        self.sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')

    def extract_entities(self, text: str) -> List[Dict]:
        """Extract named entities with improved handling"""
        doc = self.nlp(text)
        entities = []
        seen = set()  # To prevent duplicate entities

        for ent in doc.ents:
            # Normalize entity text
            normalized_text = ent.text.strip()

            # Skip if we've seen this entity before
            if (normalized_text, ent.label_) in seen:
                continue

            # Add entity with additional context
            entities.append({
                'text': normalized_text,
                'label': ent.label_,
                'start_char': ent.start_char,
                'end_char': ent.end_char,
                'context': text[max(0, ent.start_char-20):min(len(text), ent.end_char+20)]
            })

            seen.add((normalized_text, ent.label_))

        return entities

    def generate_summary(self, text: str, max_length: int = 50) -> str:
        """Generate improved summary with better formatting"""
        # Add explicit summarization prompt
        prompt = "summarize concisely: " + text

        inputs = self.tokenizer.encode(prompt,
                                     return_tensors="pt",
                                     max_length=512,
                                     truncation=True).to(self.device)

        summary_ids = self.summarizer.generate(
            inputs,
            max_length=max_length,
            min_length=10,
            num_beams=4,
            length_penalty=2.0,  # Encourage slightly longer summaries
            no_repeat_ngram_size=2,  # Avoid repetition
            early_stopping=True
        )

        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Post-process summary
        summary = summary.strip()
        if not summary.endswith('.'):
            summary += '.'

        # Ensure first letter is capitalized
        summary = summary[0].upper() + summary[1:]

        return summary

class EnhancedAdaptiveSlidingWindow:
    def __init__(self, max_chunks: int = 5, similarity_threshold: float = 0.7, importance_threshold: float = 0.3):
        """
        Initialize the Enhanced Adaptive Sliding Window.

        Args:
            max_chunks (int): Maximum number of context chunks to maintain
            similarity_threshold (float): Threshold for determining similar content
            importance_threshold (float): Minimum importance score for retention
        """
        self.max_chunks = max_chunks
        self.similarity_threshold = similarity_threshold
        self.importance_threshold = importance_threshold
        self.context_chunks = deque(maxlen=max_chunks)
        self.nlp_processor = EnhancedNLPProcessor()

    def add_chunk(self, text: str) -> None:
        """Add a new chunk of text to the context window"""
        # Create embedding
        embedding = self.nlp_processor.sentence_transformer.encode([text])[0]

        # Calculate importance score (simplified version)
        importance_score = len(text.split()) / 100  # Basic score based on length

        # Extract entities
        entities = self.nlp_processor.extract_entities(text)

        # Generate summary
        summary = self.nlp_processor.generate_summary(text)

        # Create new chunk
        chunk = ContextChunk(
            text=text,
            timestamp=datetime.now(),
            importance_score=importance_score,
            embedding=embedding,
            summary=summary,
            entities=entities,
            keywords=set(word.lower() for word in text.split())
        )

        # Check similarity with existing chunks
        if self.context_chunks:
            similarities = [
                cosine_similarity(
                    [chunk.embedding],
                    [existing_chunk.embedding]
                )[0][0]
                for existing_chunk in self.context_chunks
            ]

            # If too similar to existing content, update importance scores
            if max(similarities) > self.similarity_threshold:
                return

        # Add new chunk
        self.context_chunks.append(chunk)

        # Remove least important chunks if over capacity
        while len(self.context_chunks) > self.max_chunks:
            min_importance = min(c.importance_score for c in self.context_chunks)
            if min_importance < self.importance_threshold:
                self.context_chunks.remove(min(
                    self.context_chunks,
                    key=lambda x: x.importance_score
                ))

    def get_important_entities(self) -> Dict[str, List[Dict]]:
        """Get all named entities with improved organization and context"""
        entities_by_type = {}

        for chunk in self.context_chunks:
            if chunk.entities:
                for entity in chunk.entities:
                    if entity['label'] not in entities_by_type:
                        entities_by_type[entity['label']] = []

                    # Add entity with frequency and context
                    existing = next((e for e in entities_by_type[entity['label']]
                                   if e['text'].lower() == entity['text'].lower()), None)

                    if existing:
                        existing['frequency'] += 1
                        if entity['context'] not in existing['contexts']:
                            existing['contexts'].append(entity['context'])
                    else:
                        entities_by_type[entity['label']].append({
                            'text': entity['text'],
                            'frequency': 1,
                            'contexts': [entity['context']]
                        })

        # Sort entities by frequency within each type
        for entity_type in entities_by_type:
            entities_by_type[entity_type].sort(key=lambda x: x['frequency'], reverse=True)

        return entities_by_type

    def get_context_summary(self) -> str:
        """Get improved context summary"""
        if not self.context_chunks:
            return ""

        # Collect all summaries with their importance scores
        summaries = [(chunk.summary, chunk.importance_score)
                    for chunk in self.context_chunks if chunk.summary]

        # Sort by importance score
        summaries.sort(key=lambda x: x[1], reverse=True)

        # Combine summaries, giving more weight to important ones
        combined_text = " ".join(summary for summary, _ in summaries)

        # Generate final summary
        return self.nlp_processor.generate_summary(combined_text)

def main():
    window = EnhancedAdaptiveSlidingWindow(
        max_chunks=5,
        similarity_threshold=0.7,
        importance_threshold=0.3
    )

    texts = [
        """The new AI model has achieved breakthrough performance on multiple
        benchmarks. Researchers at OpenAI and DeepMind contributed to this
        development.""",

        """Weather conditions remain stable with mild temperatures and clear
        skies.""",

        """Critical security vulnerability discovered in widely-used software
        library. Users urged to update immediately.""",

        """Project team meeting scheduled for tomorrow at 2 PM to discuss Q4
        deliverables and strategy."""
    ]

    for text in texts:
        print("\nProcessing new text:", text)
        window.add_chunk(text)

        print("\nCurrent context summary:")
        print(window.get_context_summary())

        print("\nImportant entities:")
        entities = window.get_important_entities()
        for entity_type, entity_list in entities.items():
            print(f"\n{entity_type}:")
            for entity in entity_list:
                print(f"- {entity['text']} (mentioned {entity['frequency']} times)")
                for context in entity['contexts']:
                    print(f"  Context: ...{context}...")

if __name__ == "__main__":
    main()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565



Processing new text: The new AI model has achieved breakthrough performance on multiple 
        benchmarks. Researchers at OpenAI and DeepMind contributed to this 
        development.

Current context summary:
The new AI model has achieved breakthrough performance on multiple benchmarks. researchers at OpenAI and DeepMind contributed to this development.

Important entities:

ORG:
- AI (mentioned 1 times)
  Context: ...The new AI model has achieved ...

GPE:
- OpenAI (mentioned 1 times)
  Context: ...rks. Researchers at OpenAI and DeepMind contri...

PRODUCT:
- DeepMind (mentioned 1 times)
  Context: ...chers at OpenAI and DeepMind contributed to this...

Processing new text: Weather conditions remain stable with mild temperatures and clear 
        skies.

Current context summary:
New AI model has achieved breakthrough performance on multiple benchmarks. weather conditions remain stable with mild temperatures and clear skies.

Important entities:

ORG:
- AI (mentioned 1 times)
  Co