## Loading Dependencies

In [3]:
import os
import re
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from langchain_google_vertexai import VertexAIEmbeddings
import vertexai
import langchain
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import tiktoken
import voyageai
from tqdm import tqdm
import pyarrow.parquet as pq
import spacy
from torch.utils.data import DataLoader, Dataset
from vertexai.language_models import TextEmbeddingModel
try:
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"C:\Users\mkolla1\LawChatBot\gcpservicekey.json"
except:
    print("Error at Try block")
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"gcpservicekey.json"
    
PROJECT_ID = "lawrag"
LOCATION = "us-central1"
vertexai.init(project=PROJECT_ID, location=LOCATION)


voyageai.api_key = os.getenv("VOYAGE_API")

# Load NLP pipeline for query analysis
nlp = spacy.load("en_core_web_lg")

# Legal dictionary terms - expand as needed
LEGAL_TERMS = {
    "habeas corpus", "mens rea", "actus reus", "stare decisis", 
    "prima facie", "de novo", "res judicata", "certiorari",
    "statutory", "U.S.C.", "CFR", "jurisdiction", "adjudicate"
}

# Regex patterns for legal citations
CITATION_PATTERNS = [
    r'\d+\s+U\.S\.C\.\s+§*\s*\d+',  # US Code
    r'\d+\s+C\.F\.R\.\s+§*\s*\d+',   # Code of Federal Regulations
    r'[A-Za-z]+\s+v\.\s+[A-Za-z]+',  # Case names
]


## Embedding Generator for Gemini and Voyage

In [4]:
class EmbeddingGenerator:
    def __init__(self, gemini_model_name="text-embedding-005", voyage_model_name="voyage-law-2"):
        """
        Initializes the embedding generator with Gemini and VoyageAI models.
        """
        self.gemini_model = VertexAIEmbeddings(gemini_model_name)
        self.voyage_model_name = voyage_model_name
        self.voyage_client = voyageai.Client()

    def get_embeddings_gemini(self, texts, batch_size=32):
        """
        Compute embeddings using VertexAIEmbeddings in batches.

        Args:
            texts (list of str): List of text data to embed.
            batch_size (int): Number of texts to process per batch.

        Returns:
            list: List of embedding vectors.
        """
        embeddings = []
        
        for i in tqdm(range(0, len(texts), batch_size), desc="Generating Embeddings"):
            batch = texts[i:i + batch_size]  # Get batch of texts
            batch_embeddings = self.gemini_model.embed_documents(batch)  # Generate embeddings
            embeddings.extend(batch_embeddings)  # Store results

        return embeddings

    def get_embeddings_voyage(self, texts, batch_size=32):
        """
        Compute embeddings using the VoyageAI Python client in batches.

        Args:
            texts (list of str): List of text data to embed.
            batch_size (int): Number of texts per batch.

        Returns:
            list: List of embedding vectors.
        """
        embeddings = []
        texts = [str(text) for text in texts]  

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size] 
            
            try:
                response = self.voyage_client.embed(batch, model=self.voyage_model_name)
                batch_embeddings = response.embeddings  
                embeddings.extend(batch_embeddings)
            except Exception as e:
                print(f"Error processing batch {i // batch_size + 1}: {e}")

        return embeddings


## Saved Embeddings Loader

In [5]:
class LegalEmbeddingLoader:
    """Loads embeddings from parquet files for both models and all granularity levels."""
    
    def __init__(self, base_path):
        self.base_path = base_path
        self.gemini_embeddings = {}
        self.voyager_embeddings = {}
        self.metadata = {}
        
    def load_embeddings(self):
        """Load the six specified embedding files."""
        file_mappings = {
            "gemini_chapters": "embeddings_gemini_text-005_chapters_semchunk.parquet",
            "voyager_chapters": "embeddings_voyage_per_chapter_semchunked.parquet",
            "gemini_pages": "embeddings_gemini_text-005_pages_semchunk.parquet",
            "voyager_pages": "embeddings_voyage_per_pages_semchunked.parquet",
            "gemini_sections": "embeddings_gemini_text-005.parquet",
            "voyager_sections": "embeddings_voyage.parquet",
        }

        for key, file_name in file_mappings.items():
            print(self.base_path)
            file_path = os.path.join(self.base_path, key.split("_")[-1], file_name)
            print(file_path)
            if not os.path.exists(file_path):
                print(f"File {file_name} not found. Skipping...")
                continue

            # Read parquet file
            table = pq.read_table(file_path)
            df = table.to_pandas()
            print(f"\nColumns in {file_name}: {df.columns.tolist()}")
            # Extract embeddings and metadata
            embeddings = np.stack(df["Embedding"].values)

            # Determine model and granularity
            model, granularity = key.split("_")

            # Store embeddings
            if model == "gemini":
                self.gemini_embeddings[granularity] = torch.tensor(embeddings, dtype=torch.float32)
            else:
                self.voyager_embeddings[granularity] = torch.tensor(embeddings, dtype=torch.float32)

            # Store metadata
            self.metadata[key] = df.drop('Embedding', axis=1)

            print(f"Loaded {file_name} ({model} - {granularity})")
        return self.gemini_embeddings, self.voyager_embeddings, self.metadata

    def get_embedding_dimensions(self):
        """Return the dimensions of embeddings for both models."""
        gemini_dim = {k: v.shape[1] for k, v in self.gemini_embeddings.items()}
        voyager_dim = {k: v.shape[1] for k, v in self.voyager_embeddings.items()}
        return gemini_dim, voyager_dim

In [6]:
# # Assuming LegalEmbeddingLoader class has been defined as in the code you provided
# loader = LegalEmbeddingLoader(base_path="New_Embeddings_2025")
# gemini_embeddings, voyager_embeddings, metadata = loader.load_embeddings()

# # Get the embedding dimensions
# gemini_dim, voyager_dim = loader.get_embedding_dimensions()

# # Print the dimensions for both models
# print("Gemini Embedding Dimensions:")
# print(gemini_dim)

# print("\nVoyager Embedding Dimensions:")
# print(voyager_dim)


## Query Analyser and Intent Recognization

In [10]:
class LegalQueryAnalyzer:
    """Analyzes legal queries to determine intent and model weights."""
    
    def __init__(self, legal_terms=LEGAL_TERMS, citation_patterns=CITATION_PATTERNS):
        self.legal_terms = legal_terms
        self.citation_patterns = citation_patterns
        self.embedder = EmbeddingGenerator()
        
    def analyze_query(self, query):
        """
        Analyze query characteristics to determine model weights.
        Returns a dictionary of features and recommended weights.
        """
        # Process with spaCy
        doc = nlp(query)
        
        # Feature extraction
        features = {
            'legal_term_density': self._calculate_legal_term_density(query),
            'citation_count': self._count_citations(query),
            'structural_complexity': self._assess_complexity(doc),
            'query_length': len(doc),
            'jurisdiction_signals': self._detect_jurisdiction(doc)
        }
        
        # Calculate recommended weights
        weights = self._determine_weights(features)
        
        return {
            'features': features,
            'weights': weights,
            'gemini_embedding': self.embedder.get_embeddings_gemini([query]),
            'voyage_embedding': self.embedder.get_embeddings_voyage([query])
        }
    
    def _calculate_legal_term_density(self, query):
        """Calculate the density of legal terminology in the query."""
        # Normalize and tokenize query
        query_lower = query.lower()
        total_tokens = len(query_lower.split())
        
        # Count legal terms
        legal_term_count = sum(1 for term in self.legal_terms if term.lower() in query_lower)
        
        # Calculate density
        if total_tokens > 0:
            return (legal_term_count / total_tokens) * 100
        return 0
    
    def _count_citations(self, query):
        """Count legal citations in the query."""
        citation_count = 0
        for pattern in self.citation_patterns:
            citation_count += len(re.findall(pattern, query))
        return citation_count
    
    def _assess_complexity(self, doc):
        """
        Assess the structural complexity of the query.
        Returns a score from 0-1 based on:
        - Number of clauses
        - Presence of legal conditionals
        - Sentence structure complexity
        """
        # Count clauses
        clause_markers = ["if", "when", "whether", "notwithstanding", "provided that"]
        clause_count = sum(1 for token in doc if token.text.lower() in clause_markers)
        
        # Check for complex legal conditionals
        has_conditionals = any(cm in doc.text.lower() for cm in clause_markers)
        
        # Assess syntactic complexity (simplified)
        depth = max((token.dep_.count('_') for token in doc), default=0)
        
        # Calculate complexity score (0-1)
        complexity = min(1.0, (clause_count * 0.2) + (0.3 if has_conditionals else 0) + (depth * 0.1))
        
        return complexity
    
    def _detect_jurisdiction(self, doc):
        """
        Detect jurisdictional signals in the query.
        Returns a dictionary of jurisdictional features.
        """
        # Look for jurisdictional entities
        jurisdictions = {
            'federal': 0,
            'state': 0,
            'international': 0,
            'specific_court': None
        }
        
        # Check for federal signals
        federal_terms = ["federal", "U.S.", "United States", "SCOTUS", "Supreme Court"]
        jurisdictions['federal'] = any(term.lower() in doc.text.lower() for term in federal_terms)
        
        # Check for state signals
        state_names = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", 
    "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", 
    "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", 
    "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", 
    "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", 
    "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", 
    "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"
]
  # Add all states
        jurisdictions['state'] = any(state in doc.text for state in state_names)
        
        # Check for international signals
        international_terms = ["international", "foreign", "treaty", "convention"]
        jurisdictions['international'] = any(term.lower() in doc.text.lower() for term in international_terms)
        
        # Look for specific courts
        court_patterns = ["Circuit", "District Court", "Supreme Court"]
        for pattern in court_patterns:
            if pattern in doc.text:
                jurisdictions['specific_court'] = pattern
                break
                
        return jurisdictions
    
    def _determine_weights(self, features):
        """
        Determine the optimal weights for each model based on features.
        Uses a rule-based approach initially, could be replaced with ML model.
        """
        # Default weights slightly favor specialized model
        gemini_weight = 0.4
        voyager_weight = 0.6
        
        # Adjust for legal density and citations
        if features['legal_term_density'] > 5 or features['citation_count'] > 0:
            # Increase weight for legal model
            voyager_weight += 0.15
            gemini_weight -= 0.15
        
        # Adjust for complexity
        if features['structural_complexity'] > 0.7:
            voyager_weight += 0.1
            gemini_weight -= 0.1
        
        # Adjust for jurisdictional specificity
        if features['jurisdiction_signals']['specific_court']:
            voyager_weight += 0.1
            gemini_weight -= 0.1
        
        # Ensure weights are valid
        voyager_weight = min(max(voyager_weight, 0.1), 0.9)
        gemini_weight = 1.0 - voyager_weight
        
        return {
            'gemini': gemini_weight,
            'voyager': voyager_weight
        }


In [11]:
# Testing the LegalQueryAnalyzer class
query_output= LegalQueryAnalyzer().analyze_query("What is the legal definition of mens rea?")
print(query_output.keys())
print(f" Shape of Gemini Query Embedding: {np.array(query_output['gemini_embedding']).shape}")
print(f" Shape of Gemini Query Embedding after Unsqeeze: {torch.tensor(query_output['gemini_embedding']).unsqueeze(0).shape}")
print(f" Shape of Gemini Query Embedding after Sqeeze: {torch.tensor(query_output['gemini_embedding']).squeeze(0).shape}")
print(f" Shape of Voyage Query Embedding: {np.array(query_output['voyage_embedding']).shape}")
print(query_output)


Generating Embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.94it/s]


dict_keys(['features', 'weights', 'gemini_embedding', 'voyage_embedding'])
 Shape of Gemini Query Embedding: (1, 768)
 Shape of Gemini Query Embedding after Unsqeeze: torch.Size([1, 1, 768])
 Shape of Gemini Query Embedding after Sqeeze: torch.Size([768])
 Shape of Voyage Query Embedding: (1, 1024)
{'features': {'legal_term_density': 12.5, 'citation_count': 0, 'structural_complexity': 0.0, 'query_length': 9, 'jurisdiction_signals': {'federal': False, 'state': False, 'international': False, 'specific_court': None}}, 'weights': {'gemini': 0.25, 'voyager': 0.75}, 'gemini_embedding': [[-0.0813518688082695, -0.03374248370528221, 0.025017933920025826, -0.01886814273893833, -0.003584230085834861, 0.04847944900393486, -0.022799337282776833, 0.07440640032291412, 0.022810891270637512, 0.010746952146291733, 0.0031785459723323584, -0.041993673890829086, -0.062119387090206146, -0.08199410885572433, -0.027054833248257637, -0.024165762588381767, 0.010855771601200104, -0.036382801830768585, 0.01995738

## Hierarichial Fusion

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class HierarchicalFusion(nn.Module):
    def __init__(self, embedding_dim=768, hidden_dim=512, top_k=5):
        super(HierarchicalFusion, self).__init__()
        self.top_k = top_k
        
        # Projection layers for each granularity into a common hidden space.
        self.proj_section = nn.Linear(embedding_dim, hidden_dim)
        self.proj_page    = nn.Linear(embedding_dim, hidden_dim)
        self.proj_chapter = nn.Linear(embedding_dim, hidden_dim)
        
        # Projection for the query (optional, here we project to the same hidden space)
        self.proj_query   = nn.Linear(embedding_dim, hidden_dim)
        
        # Fusion layers:
        # First, fuse sections and pages (after top-k selection)
        self.fusion_sp    = nn.Linear(hidden_dim * 2, hidden_dim)
        # Next, fuse the above result with chapters
        self.fusion_all   = nn.Linear(hidden_dim * 2, hidden_dim)
        
        # Decoder layer to project the final fused vector back to original embedding space.
        self.decoder      = nn.Linear(hidden_dim, embedding_dim)
    
    def forward(self, sections, pages, chapters, query):
        """
        Args:
            sections: Tensor of shape (num_sections, embedding_dim)
            pages:    Tensor of shape (num_pages, embedding_dim)
            chapters: Tensor of shape (num_chapters, embedding_dim)
            query:    Tensor of shape (1, embedding_dim)
        
        Returns:
            decoded: Fused representation (1, embedding_dim)
            info: Dictionary containing scores and top-k indices for each level.
        """
        # Project embeddings into a common hidden dimension
        sec_proj   = F.relu(self.proj_section(sections))  # (num_sections, hidden_dim)
        page_proj  = F.relu(self.proj_page(pages))          # (num_pages, hidden_dim)
        chap_proj  = F.relu(self.proj_chapter(chapters))      # (num_chapters, hidden_dim)
        query_proj = F.relu(self.proj_query(query))          # (1, hidden_dim)
        
        print(f" Shape of Section Projection: {sec_proj.shape}")
        print(f" Shape of Page Projection: {page_proj.shape}")
        print(f" Shape of Chapter Projection: {chap_proj.shape}")
        print(f" Shape of Query Projection: {query_proj.shape}")
        # Compute similarity scores using dot product.
        # (Alternatively, you could use cosine similarity.)
        sec_scores   = torch.matmul(sec_proj, query_proj.T)   # (num_sections, 1)
        page_scores  = torch.matmul(page_proj, query_proj.T)    # (num_pages, 1)
        chap_scores  = torch.matmul(chap_proj, query_proj.T)    # (num_chapters, 1)
        
        print(f" Shape of Section Scores: {sec_scores.shape}")
        print(f" Shape of Page Scores: {page_scores.shape}")    
        print(f" Shape of Chapter Scores: {chap_scores.shape}")

        # Select top-k rows based on the similarity scores (squeeze to remove last dim)
        _, sec_top_idx  = torch.topk(sec_scores.squeeze(), self.top_k)
        _, page_top_idx = torch.topk(page_scores.squeeze(), self.top_k)
        _, chap_top_idx = torch.topk(chap_scores.squeeze(), self.top_k)
        

        # Gather the top-k embeddings from each level
        sec_top  = sec_proj[sec_top_idx]   # (top_k, hidden_dim)
        page_top = page_proj[page_top_idx]  # (top_k, hidden_dim)
        chap_top = chap_proj[chap_top_idx]  # (top_k, hidden_dim)
        
        print(f" Shape of Section Top: {sec_top.shape}")
        print(f" Shape of Page Top: {page_top.shape}")  
        print(f" Shape of Chapter Top: {chap_top.shape}")
        
        # Hierarchical fusion:
        # 1. Fuse sections and pages by aggregating (here, we use the mean)
        sec_fused  = sec_top.mean(dim=0, keepdim=True)    # (1, hidden_dim)
        page_fused = page_top.mean(dim=0, keepdim=True)    # (1, hidden_dim)
        sp_concat  = torch.cat([sec_fused, page_fused], dim=-1)  # (1, hidden_dim * 2)
        sp_fused   = F.relu(self.fusion_sp(sp_concat))    # (1, hidden_dim)
        
        # 2. Fuse the above with chapters
        chap_fused = chap_top.mean(dim=0, keepdim=True)    # (1, hidden_dim)
        all_concat = torch.cat([sp_fused, chap_fused], dim=-1)   # (1, hidden_dim * 2)
        all_fused  = F.relu(self.fusion_all(all_concat))   # (1, hidden_dim)
        
        # Decode back to the original embedding dimension.
        decoded = self.decoder(all_fused)   # (1, embedding_dim)
        
        # Pack additional information (like scores and indices) if you need them for debugging or retrieval.
        info = {
            'sec_scores': sec_scores,
            'page_scores': page_scores,
            'chap_scores': chap_scores,
            'sec_top_idx': sec_top_idx,
            'page_top_idx': page_top_idx,
            'chap_top_idx': chap_top_idx
        }
        
        return decoded, info

# Example usage:
if __name__ == "__main__":
    # Dummy embeddings for demonstration
    query= """ Whoever harbors or conceals any person who he knows, or has reasonable grounds to believe or suspect, has committed, or is about to commit, an offense under this section, shall be fined under this title or imprisoned not more than ten years, or both. """
    loader= LegalEmbeddingLoader("New_Embeddings_2025")
    gemini_embeddings, voyager_embeddings, metadata = loader.load_embeddings()
    print(metadata.keys())

    query_embedding= EmbeddingGenerator().get_embeddings_gemini([query])
    print(f" Shape of Gemini Query Embedding: {torch.tensor(query_embedding).shape}")

    sections = gemini_embeddings['sections']  # e.g., 1663 section embeddings
    pages    = gemini_embeddings['pages']  # e.g., 2176 page embeddings
    chapters = gemini_embeddings['chapters']  # e.g., 809 chapter embeddings
    query    = torch.tensor(query_embedding)     # Query embedding
    
    # Initialize the model (you can adjust hidden_dim and top_k as needed)
    model = HierarchicalFusion(embedding_dim=768, hidden_dim=512, top_k=5)
    
    # Forward pass through the network
    fused_embedding, details = model(sections, pages, chapters, query)
    
    print("Fused embedding shape:", fused_embedding.shape)  # Expect (1, 768)
    print("Top k section indices:", details['sec_top_idx'])
    print("Top k page indices:", details['page_top_idx'])
    print("Top k chapter indices:", details['chap_top_idx'])


New_Embeddings_2025
New_Embeddings_2025\chapters\embeddings_gemini_text-005_chapters_semchunk.parquet

Columns in embeddings_gemini_text-005_chapters_semchunk.parquet: ['chunk', 'Embedding']
Loaded embeddings_gemini_text-005_chapters_semchunk.parquet (gemini - chapters)
New_Embeddings_2025
New_Embeddings_2025\chapters\embeddings_voyage_per_chapter_semchunked.parquet

Columns in embeddings_voyage_per_chapter_semchunked.parquet: ['chunk', 'Embedding']
Loaded embeddings_voyage_per_chapter_semchunked.parquet (voyager - chapters)
New_Embeddings_2025
New_Embeddings_2025\pages\embeddings_gemini_text-005_pages_semchunk.parquet

Columns in embeddings_gemini_text-005_pages_semchunk.parquet: ['chunk', 'Embedding']
Loaded embeddings_gemini_text-005_pages_semchunk.parquet (gemini - pages)
New_Embeddings_2025
New_Embeddings_2025\pages\embeddings_voyage_per_pages_semchunked.parquet

Columns in embeddings_voyage_per_pages_semchunked.parquet: ['chunk', 'Embedding']
Loaded embeddings_voyage_per_pages_se

Generating Embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.23it/s]

 Shape of Gemini Query Embedding: torch.Size([1, 768])
 Shape of Section Projection: torch.Size([1647, 512])
 Shape of Page Projection: torch.Size([2176, 512])
 Shape of Chapter Projection: torch.Size([809, 512])
 Shape of Query Projection: torch.Size([1, 512])
 Shape of Section Scores: torch.Size([1647, 1])
 Shape of Page Scores: torch.Size([2176, 1])
 Shape of Chapter Scores: torch.Size([809, 1])
 Shape of Section Top: torch.Size([5, 512])
 Shape of Page Top: torch.Size([5, 512])
 Shape of Chapter Top: torch.Size([5, 512])
Fused embedding shape: torch.Size([1, 768])
Top k section indices: tensor([461, 142, 677, 953, 494])
Top k page indices: tensor([2033, 1385, 1959, 1027, 1904])
Top k chapter indices: tensor([524, 177, 518, 181, 525])





In [113]:
import pandas as pd

# Load the embeddings files
gemini_sections_df = pd.read_parquet(r"New_Embeddings_2025\sections\embeddings_gemini_text-005.parquet")
gemini_pages_df = pd.read_parquet(r"New_Embeddings_2025\pages\embeddings_gemini_text-005_pages_semchunk.parquet")
gemini_chapters_df = pd.read_parquet(r"New_Embeddings_2025\chapters\embeddings_gemini_text-005_chapters_semchunk.parquet")

voyage_sections_df = pd.read_parquet(r"New_Embeddings_2025\sections\embeddings_voyage.parquet")
voyage_pages_df = pd.read_parquet(r"New_Embeddings_2025\pages\embeddings_voyage_per_pages_semchunked.parquet")
voyage_chapters_df = pd.read_parquet(r"New_Embeddings_2025\chapters\embeddings_voyage_per_chapter_semchunked.parquet")

def get_processed_content_by_index(index, source, model):
    """
    Retrieve the Processed_Content or chunk based on the specified index and source.

    Args:
        index (int): The row index to retrieve.
        source (str): The source dataset ("gemini_text", "gemini_pages", or "voyage").

    Returns:
        str: The corresponding Processed_Content or chunk.
    """
    if model== "gemini":
        if source == "sections":
            return gemini_sections_df.loc[index, "Processed_Content"] if index in gemini_sections_df.index else None
        elif source == "pages":
            return gemini_pages_df.loc[index, "chunk"] if index in gemini_pages_df.index else None
        elif source == "chapters":
            return gemini_chapters_df.loc[index, "chunk"] if index in gemini_chapters_df.index else None
        else:
            return "Invalid source specified."
    if model == "voyage":
        if source == "sections":
            return voyage_sections_df.loc[index, "Processed_Content"] if index in voyage_sections_df.index else None
        elif source == "pages":
            return voyage_pages_df.loc[index, "chunk"] if index in voyage_pages_df.index else None
        elif source == "chapters":
            return voyage_chapters_df.loc[index, "chunk"] if index in voyage_chapters_df.index else None
        else:
            return "Invalid source specified."

# Example Usage:
index_to_fetch = 456  # Change this index based on your needs
print(get_processed_content_by_index(index_to_fetch, source="sections", model= "gemini"))  # For Gemini Text
print("-----------------------------------------------")
print(get_processed_content_by_index(index_to_fetch, source="pages", model= "gemini"))  # For Gemini Pages
print("-----------------------------------------------")
print(get_processed_content_by_index(index_to_fetch, source="chapters", model= "gemini"))  # For Voyage


In all cases of murder or manslaughter, the offense shall be deemed to have been committed at the place where the injury was inflicted, or the poison administered or other means employed which caused the death, without regard to the place where the death occurs. (June 25, 1948, ch. 645, 62 Stat. 826.)

Historical and Revision Notes
Based on title 18, U.S.C., 1940 ed., §553 (Mar. 4, 1909, ch. 321, §336, 35 Stat. 1152).

-----------------------------------------------
1996—Subsec. (a). Pub. L. 104–132, §502(1)(A), substituted "nuclear material or nuclear byproduct
material" for "nuclear material" wherever appearing.
Subsec. (a)(1)(A). Pub. L. 104–132, §502(1)(B)(i), inserted "or to the environment" after "damage to
property".
Subsec. (a)(1)(B). Pub. L. 104–132, §502(1)(B)(ii), amended subpar. (B) generally. Prior to amendment,
subpar. (B) read as follows: "knows that circumstances exist which are likely to cause the death of or serious
bodily injury to any person or substantial damage to

## Simple Cosine Similiarity

In [74]:
import torch
import torch.nn.functional as F

class LegalEmbeddingMatcher:
    def __init__(self, gemini_embeddings, voyager_embeddings):
        """
        Initialize with embeddings for Gemini and Voyager models.
        :param gemini_embeddings: Dictionary with keys 'chapters', 'pages', 'sections' -> Tensor embeddings
        :param voyager_embeddings: Dictionary with keys 'chapters', 'pages', 'sections' -> Tensor embeddings
        """
        self.gemini_embeddings = {k: torch.tensor(v) for k, v in gemini_embeddings.items()}
        self.voyager_embeddings = {k: torch.tensor(v) for k, v in voyager_embeddings.items()}
        
        # Ensure embedding dimensions match using a linear transformation
        gemini_dim = self.gemini_embeddings['chapters'].shape[1]
        voyager_dim = self.voyager_embeddings['chapters'].shape[1]
        
        self.transform = torch.nn.Linear(voyager_dim, gemini_dim)

        # Transform all Voyager embeddings at initialization
        self.voyager_embeddings = {
            k: self.transform(v) for k, v in self.voyager_embeddings.items()
        }

    def match_query(self, query_embedding, model_weights):
        """
        Match query embedding against stored embeddings separately for each model.
        :param query_embedding: Dictionary with query embeddings from Gemini and Voyager.
        :param model_weights: Dictionary with weights for each model {'gemini': w1, 'voyager': w2}.
        :return: Top 10 matches across all granularities with cosine similarity scores.
        """
        query_gemini = torch.tensor(query_embedding['gemini'])
        query_voyager = torch.tensor(query_embedding['voyager'])

        # Transform Voyager query embedding to match Gemini's dimension
        query_voyager = self.transform(query_voyager)

        similarities = []  # Store (granularity, model, index, weighted similarity)

        for level in ['chapters', 'pages', 'sections']:
            gemini_sim = F.cosine_similarity(query_gemini, self.gemini_embeddings[level]) * model_weights['gemini']
            voyager_sim = F.cosine_similarity(query_voyager, self.voyager_embeddings[level]) * model_weights['voyager']

            # Collect all similarities with their granularity and model
            similarities.extend([
                (level, 'gemini', idx.item(), gemini_sim[idx].item()) for idx in range(len(gemini_sim))
            ])
            similarities.extend([
                (level, 'voyager', idx.item(), voyager_sim[idx].item()) for idx in range(len(voyager_sim))
            ])

        # Sort by similarity score in descending order and return top 10
        top_matches = sorted(similarities, key=lambda x: x[3], reverse=True)[:10]

        return top_matches


In [75]:
# Example usage:

loader = LegalEmbeddingLoader('New_Embeddings_2025')
gemini_embeddings, voyager_embeddings, metadata = loader.load_embeddings()

# Analyze query and get model weights
query_analyzer=LegalQueryAnalyzer()
query="""
willfully uses the mails for the mailing, carriage in the mails, or delivery of any sexually oriented advertisement in violation of section 3010 of title 39, or willfully violates any regulations of the Board of Governors issued under such section; or

(2) sells, leases, rents, lends, exchanges, or licenses the use of, or, except for the purpose expressly authorized by section 3010 of title 39, uses a mailing list maintained by the Board of Governors under such section;

shall be fined under this title or imprisoned not more than five years, or both, for the first offense, and shall be fined under this title or imprisoned not more than ten years, or both, for any second or subsequent offense.
"""

query_output=query_analyzer.analyze_query(query)


query_embedding = { 'gemini': query_output['gemini_embedding'], 'voyager': query_output['voyage_embedding'] }
model_weights = query_output['weights']
print(gemini_embeddings.keys())
matcher = LegalEmbeddingMatcher(gemini_embeddings, voyager_embeddings)
results = matcher.match_query(query_embedding, model_weights)
print(results)

New_Embeddings_2025
New_Embeddings_2025\chapters\embeddings_gemini_text-005_chapters_semchunk.parquet

Columns in embeddings_gemini_text-005_chapters_semchunk.parquet: ['chunk', 'Embedding']
Loaded embeddings_gemini_text-005_chapters_semchunk.parquet (gemini - chapters)
New_Embeddings_2025
New_Embeddings_2025\chapters\embeddings_voyage_per_chapter_semchunked.parquet

Columns in embeddings_voyage_per_chapter_semchunked.parquet: ['chunk', 'Embedding']
Loaded embeddings_voyage_per_chapter_semchunked.parquet (voyager - chapters)
New_Embeddings_2025
New_Embeddings_2025\pages\embeddings_gemini_text-005_pages_semchunk.parquet

Columns in embeddings_gemini_text-005_pages_semchunk.parquet: ['chunk', 'Embedding']
Loaded embeddings_gemini_text-005_pages_semchunk.parquet (gemini - pages)
New_Embeddings_2025
New_Embeddings_2025\pages\embeddings_voyage_per_pages_semchunked.parquet

Columns in embeddings_voyage_per_pages_semchunked.parquet: ['chunk', 'Embedding']
Loaded embeddings_voyage_per_pages_se

Generating Embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.82it/s]


dict_keys(['chapters', 'pages', 'sections'])


  self.gemini_embeddings = {k: torch.tensor(v) for k, v in gemini_embeddings.items()}
  self.voyager_embeddings = {k: torch.tensor(v) for k, v in voyager_embeddings.items()}


AttributeError: 'int' object has no attribute 'item'

## Multi Level Attention Mechanism

In [None]:
# class MultiLevelAttention(nn.Module):
#     """
#     Enhanced attention mechanism with separate query embeddings for Gemini and Voyager.
#     """
    
#     def __init__(self, output_dim=1024):
#         super(MultiLevelAttention, self).__init__()
        
#         # Model dimensions
#         self.gemini_dim = 768
#         self.voyager_dim = 1024
#         self.output_dim = output_dim
#         self.granularities = ['sections', 'chapters', 'pages']
#         self.cross_attention_weights = {}

#         # Query projectors
#         self.gemini_query_projector = nn.Linear(768, output_dim)
#         self.voyager_query_projector = nn.Linear(1024, output_dim)
        
#         # Document projectors
#         self.gemini_projector = nn.Linear(self.gemini_dim, output_dim)
#         self.voyager_projector = nn.Linear(self.voyager_dim, output_dim)
        
#         # Aggregation
#         self.aggregation_layer = nn.Linear(output_dim, output_dim)
#         self.layer_norm = nn.LayerNorm(output_dim)

#     def forward(self, gemini_embeddings, voyager_embeddings, gemini_query_embedding, voyager_query_embedding, weights):
#         # Project queries
#         gemini_query_projected = self.gemini_query_projector(gemini_query_embedding)
#         voyager_query_projected = self.voyager_query_projector(voyager_query_embedding)
        
#         granularity_results = {}
#         diagnostics = {}
        
#         # Store granularities
#         gemini_granularities = []
#         voyager_granularities = []


#         for granularity in self.granularities:
#             print(f"Processing granularity: {granularity}")
#             gemini_emb = gemini_embeddings[granularity]
#             voyager_emb = voyager_embeddings[granularity]
#             print(f"Shape of Gemini Embedding: {gemini_emb.shape}")
#             print(f"Shape of Voyager Embedding: {voyager_emb.shape}")
#             print("--------------------------------------------------------")

            
#             # Project document embeddings
#             gemini_proj = self.gemini_projector(gemini_emb)
#             voyager_proj = self.voyager_projector(voyager_emb)
#             print(f"Shape of Gemini Projected Embedding: {gemini_proj.shape}")
#             print(f"Shape of Voyager Projected Embedding: {voyager_proj.shape}")
#             print("--------------------------------------------------------")

#             # # Ensure correct size
#             # gemini_proj = gemini_proj[:, :self.output_dim]  # Ensure correct size
#             # voyager_proj = voyager_proj[:, :self.output_dim]
#             # print(f"Shape of Gemini Projected Embedding After Ensuring: {gemini_proj.shape}")
#             # print(f"Shape of Voyager Projected Embedding After Ensuring: {voyager_proj.shape}")
#             # print("--------------------------------------------------------")

#             print(f"Shape of Gemini Query Embedding: {gemini_query_projected.shape}")
#             print(f"Shape of Voyager Query Embedding: {voyager_query_projected.shape}")
#             print("--------------------------------------------------------")
#             # Calculate similarities
#             gemini_similarity = F.cosine_similarity(
#                 gemini_proj,
#                 gemini_query_projected.expand_as(gemini_proj),
#                 dim=1
#             ).unsqueeze(0) # Add batch dimension for softmax
            
#             voyager_similarity = F.cosine_similarity(
#                 voyager_proj,
#                 voyager_query_projected.expand_as(voyager_proj),
#                 dim=1
#             ).unsqueeze(0)
#             print(f"Shape of Gemini Similarity: {gemini_similarity.shape}")
#             print(f"Shape of Voyager Similarity: {voyager_similarity.shape}")
#             print("--------------------------------------------------------")
            
#             # Normalize similarities
#             gemini_prob_weights = F.softmax(gemini_similarity, dim=0)
#             voyager_prob_weights = F.softmax(voyager_similarity, dim=0)
#             print(f"Shape of Gemini Weights After Softmax: {gemini_prob_weights.shape}")
#             print(f"Shape of Voyager Weights After Softmax: {voyager_prob_weights.shape}")
#             print("--------------------------------------------------------")


#             # Weighted sum with proper dimensions
#             # gemini_weighted = torch.matmul(gemini_prob_weights, gemini_proj)
#             # voyager_weighted = torch.matmul(voyager_prob_weights, voyager_proj)
#             gemini_weighted = (gemini_prob_weights.T * gemini_proj).sum(dim=1, keepdim=True).T
#             voyager_weighted = (voyager_prob_weights.T * voyager_proj).sum(dim=1, keepdim=True).T

#             print(f"Shape of Gemini Weighted Embedding: {gemini_weighted.shape}")
#             print(f"Shape of Voyager Weighted Embedding: {voyager_weighted.shape}")
#             print("--------------------------------------------------------")

#             # Dimension alignment
#             max_dim = max(gemini_weighted.shape[-1], voyager_weighted.shape[-1])
#             gemini_weighted = F.pad(gemini_weighted, (0, max_dim - gemini_weighted.shape[-1]))
#             voyager_weighted = F.pad(voyager_weighted, (0, max_dim - voyager_weighted.shape[-1]))

#             # Store processed embeddings for each granularity
#             gemini_granularities.append(gemini_weighted)
#             voyager_granularities.append(voyager_weighted)

#             # Combine model embeddings
#             granularity_combined = (
#                 weights['gemini'] * gemini_weighted +
#                 weights['voyager'] * voyager_weighted
#             )
            
#             granularity_results[granularity] = granularity_combined
#             print(f"Granularity Completed: {granularity}")
        
#         # Aggregate results
#         combined_embedding = torch.stack([granularity_results[gran] for gran in self.granularities], dim=0).mean(dim=0)

#         self.cross_attention_weights = {
#             'gemini': gemini_prob_weights.detach().cpu(),
#             'voyager': voyager_prob_weights.detach().cpu(),
#             'combined': combined_embedding.detach().cpu()
#         }

#         # Final processing
#         fused_embedding = self.layer_norm(self.aggregation_layer(combined_embedding))
        
#         return fused_embedding, {
#             'processed_embeddings': granularity_results,
#             'diagnostics': diagnostics,
#             'cross_attention_weights': self.cross_attention_weights
#         }


In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiLevelAttention(nn.Module):
    def __init__(self, output_dim=1024, top_n=5):
        super(MultiLevelAttention, self).__init__()
        
        self.output_dim = output_dim
        self.top_n = top_n
        self.granularities = ['sections', 'chapters', 'pages']

        # Query projectors
        self.gemini_query_projector = nn.Linear(768, 768)
        self.voyager_query_projector = nn.Linear(1024, output_dim)
        
        # Document projectors
        self.gemini_projector = nn.Linear(768, 768)
        self.voyager_projector = nn.Linear(1024, output_dim)

        # Aggregation
        self.layer_norm = nn.LayerNorm(output_dim)

    def forward(self, gemini_embeddings, voyager_embeddings, gemini_query_embedding, voyager_query_embedding):
        # Project query embeddings
        gemini_query_projected = self.gemini_query_projector(gemini_query_embedding)
        voyager_query_projected = self.voyager_query_projector(voyager_query_embedding)

        # Normalize query embeddings
        gemini_query_norm = F.normalize(gemini_query_projected, p=2, dim=1)
        voyager_query_norm = F.normalize(voyager_query_projected, p=2, dim=1)

        # Process embeddings per granularity
        # gemini_section_proj = self.gemini_projector(gemini_embeddings['sections'])
        # gemini_chapter_proj = self.gemini_projector(gemini_embeddings['chapters'])
        # gemini_page_proj = self.gemini_projector(gemini_embeddings['pages'])

        # voyager_section_proj = self.voyager_projector(voyager_embeddings['sections'])
        # voyager_chapter_proj = self.voyager_projector(voyager_embeddings['chapters'])
        # voyager_page_proj = self.voyager_projector(voyager_embeddings['pages'])

        gemini_section_proj = gemini_embeddings['sections']
        gemini_chapter_proj = gemini_embeddings['chapters']
        gemini_page_proj = gemini_embeddings['pages']

        voyager_section_proj = voyager_embeddings['sections']
        voyager_chapter_proj = voyager_embeddings['chapters']
        voyager_page_proj = voyager_embeddings['pages']

        # Normalize document embeddings
        gemini_section_norm = F.normalize(gemini_section_proj, p=2, dim=1)
        gemini_chapter_norm = F.normalize(gemini_chapter_proj, p=2, dim=1)
        gemini_page_norm = F.normalize(gemini_page_proj, p=2, dim=1)

        voyager_section_norm = F.normalize(voyager_section_proj, p=2, dim=1)
        voyager_chapter_norm = F.normalize(voyager_chapter_proj, p=2, dim=1)
        voyager_page_norm = F.normalize(voyager_page_proj, p=2, dim=1)

        # Compute cosine similarity using matrix multiplication
        gemini_section_similarities = torch.matmul(gemini_section_norm, gemini_query_norm.T).squeeze(1)
        gemini_chapter_similarities = torch.matmul(gemini_chapter_norm, gemini_query_norm.T).squeeze(1)
        gemini_page_similarities = torch.matmul(gemini_page_norm, gemini_query_norm.T).squeeze(1)

        voyager_section_similarities = torch.matmul(voyager_section_norm, voyager_query_norm.T).squeeze(1)
        voyager_chapter_similarities = torch.matmul(voyager_chapter_norm, voyager_query_norm.T).squeeze(1)
        voyager_page_similarities = torch.matmul(voyager_page_norm, voyager_query_norm.T).squeeze(1)

        # Apply softmax
        gemini_section_weights = F.softmax(gemini_section_similarities, dim=0)
        gemini_chapter_weights = F.softmax(gemini_chapter_similarities, dim=0)
        gemini_page_weights = F.softmax(gemini_page_similarities, dim=0)

        voyager_section_weights = F.softmax(voyager_section_similarities, dim=0)
        voyager_chapter_weights = F.softmax(voyager_chapter_similarities, dim=0)
        voyager_page_weights = F.softmax(voyager_page_similarities, dim=0)

        # Get top N indices
        gemini_section_top_values, gemini_section_top_indices = torch.topk(gemini_section_weights, self.top_n)
        gemini_chapter_top_values, gemini_chapter_top_indices = torch.topk(gemini_chapter_weights, self.top_n)
        gemini_page_top_values, gemini_page_top_indices = torch.topk(gemini_page_weights, self.top_n)

        voyager_section_top_values, voyager_section_top_indices = torch.topk(voyager_section_weights, self.top_n)
        voyager_chapter_top_values, voyager_chapter_top_indices = torch.topk(voyager_chapter_weights, self.top_n)
        voyager_page_top_values, voyager_page_top_indices = torch.topk(voyager_page_weights, self.top_n)

        # Store results in dictionary
        top_indices_dict = {
            'sections': {
                "gemini_top_values": gemini_section_top_values,
                "gemini_top_indices": gemini_section_top_indices,
                "voyager_top_values": voyager_section_top_values,
                "voyager_top_indices": voyager_section_top_indices
            },
            'chapters': {
                "gemini_top_values": gemini_chapter_top_values,
                "gemini_top_indices": gemini_chapter_top_indices,
                "voyager_top_values": voyager_chapter_top_values,
                "voyager_top_indices": voyager_chapter_top_indices
            },
            'pages': {
                "gemini_top_values": gemini_page_top_values,
                "gemini_top_indices": gemini_page_top_indices,
                "voyager_top_values": voyager_page_top_values,
                "voyager_top_indices": voyager_page_top_indices
            }
        }
        print("FINAL")
        return {
            'top_indices': top_indices_dict
        }


In [9]:
import torch
import os
from torch import nn
import torch.nn.functional as F
import pyarrow.parquet as pq
import numpy as np

def test_multi_level_attention(base_path, query):
    """
    Test function for MultiLevelAttention model using real embeddings from saved files.
    
    Args:
        base_path (str): Path to the directory containing embedding parquet files.
        query_embedding (torch.Tensor): Query embedding tensor with shape [768].
    
    Returns:
        None
    """
    global output
    # Load embeddings
    loader = LegalEmbeddingLoader(base_path)
    gemini_embeddings, voyager_embeddings, metadata = loader.load_embeddings()

    # Initialize the model
    model = MultiLevelAttention(output_dim=1024)

    # Analyze query
    query_analyzer = LegalQueryAnalyzer()
    query_output = query_analyzer.analyze_query(query)

    # Run the model
    output = model.forward(
        gemini_embeddings, 
        voyager_embeddings, 
        torch.tensor(query_output['gemini_embedding']),
        torch.tensor(query_output['voyage_embedding'])
    )
    
    for granularity, data in output['top_indices'].items():
        print(f"\n--- {granularity.capitalize()} ---")
        print(f"Gemini Top Indices: {data['gemini_top_indices']}")
        print(f"Voyager Top Indices: {data['voyager_top_indices']}")

base_path = "New_Embeddings_2025" 
query = """Whoever, being an officer, employee or agent of the United States or of any department or agency thereof, having received public money which he is not authorized to retain as salary, pay, or emolument, fails to render his accounts for the same as provided by law is guilty of embezzlement, and shall be fined under this title or in a sum equal to the amount of the money embezzled, whichever is greater, or imprisoned not more than ten years, or both; but if the amount embezzled does not exceed $1,000, he shall be fined under this title or imprisoned not more than one year, or both.
"""
test_multi_level_attention(base_path, query)


New_Embeddings_2025
New_Embeddings_2025\chapters\embeddings_gemini_text-005_chapters_semchunk.parquet

Columns in embeddings_gemini_text-005_chapters_semchunk.parquet: ['chunk', 'Embedding']
Loaded embeddings_gemini_text-005_chapters_semchunk.parquet (gemini - chapters)
New_Embeddings_2025
New_Embeddings_2025\chapters\embeddings_voyage_per_chapter_semchunked.parquet

Columns in embeddings_voyage_per_chapter_semchunked.parquet: ['chunk', 'Embedding']
Loaded embeddings_voyage_per_chapter_semchunked.parquet (voyager - chapters)
New_Embeddings_2025
New_Embeddings_2025\pages\embeddings_gemini_text-005_pages_semchunk.parquet

Columns in embeddings_gemini_text-005_pages_semchunk.parquet: ['chunk', 'Embedding']
Loaded embeddings_gemini_text-005_pages_semchunk.parquet (gemini - pages)
New_Embeddings_2025
New_Embeddings_2025\pages\embeddings_voyage_per_pages_semchunked.parquet

Columns in embeddings_voyage_per_pages_semchunked.parquet: ['chunk', 'Embedding']
Loaded embeddings_voyage_per_pages_se

NameError: name 'LegalQueryAnalyzer' is not defined

In [None]:
def check_text_in_indices(indices, source, model, input_text):
    """
    Check if the input_text is present in any of the retrieved texts from given indices.
    """
    results = {}
    for index in indices:
        content = get_processed_content_by_index(index, source, model)
        if content:
            results[index] = str(input_text).lower() in str(content).lower()
        else:
            results[index] = None  # Indicates that no content was found for the index
    return results


def check_text_in_indices_automated(top_indices_dict, input_text, model):
    """
    Automated function to check if the input_text is present in any of the retrieved texts
    from given indices specified in the top_indices_dict for the specified model.
    """
    results = {}
    
    for source, indices_dict in top_indices_dict.items():
        for key, indices in indices_dict.items():
            # Check for the right model and source combination
            if model == "gemini" and "gemini" in key:
                matches = check_text_in_indices(indices, source, "gemini", input_text)
                results[f"{model}_{source}_{key}"] = matches
            elif model == "voyage" and "voyager" in key:
                matches = check_text_in_indices(indices, source, "voyage", input_text)
                results[f"{model}_{source}_{key}"] = matches

    return results


# Call the automated function for "gemini" model
automated_matches = check_text_in_indices_automated(top_indices_dict, input_text, "gemini")
print(automated_matches)

# Call the automated function for "voyage" model
automated_matches_voyage = check_text_in_indices_automated(top_indices_dict, input_text, "voyage")
print(automated_matches_voyage)

In [None]:


import pandas as pd

# Load the embeddings files
gemini_sections_df = pd.read_parquet(r"New_Embeddings_2025\sections\embeddings_gemini_text-005.parquet")
gemini_pages_df = pd.read_parquet(r"New_Embeddings_2025\pages\embeddings_gemini_text-005_pages_semchunk.parquet")
gemini_chapters_df = pd.read_parquet(r"New_Embeddings_2025\chapters\embeddings_gemini_text-005_chapters_semchunk.parquet")

voyage_sections_df = pd.read_parquet(r"New_Embeddings_2025\sections\embeddings_voyage.parquet")
voyage_pages_df = pd.read_parquet(r"New_Embeddings_2025\pages\embeddings_voyage_per_pages_semchunked.parquet")
voyage_chapters_df = pd.read_parquet(r"New_Embeddings_2025\chapters\embeddings_voyage_per_chapter_semchunked.parquet")

def get_processed_content_by_index(index, source, model):
    """
    Retrieve the Processed_Content or chunk based on the specified index and source.

    Args:
        index (int): The row index to retrieve.
        source (str): The source dataset ("gemini_text", "gemini_pages", or "voyage").

    Returns:
        str: The corresponding Processed_Content or chunk.
    """
    if model== "gemini":
        if source == "sections":
            return gemini_sections_df.loc[index, "Processed_Content"] if index in gemini_sections_df.index else None
        elif source == "pages":
            return gemini_pages_df.loc[index, "chunk"] if index in gemini_pages_df.index else None
        elif source == "chapters":
            return gemini_chapters_df.loc[index, "chunk"] if index in gemini_chapters_df.index else None
        else:
            return "Invalid source specified."
    if model == "voyage":
        if source == "sections":
            return voyage_sections_df.loc[index, "Processed_Content"] if index in voyage_sections_df.index else None
        elif source == "pages":
            return voyage_pages_df.loc[index, "chunk"] if index in voyage_pages_df.index else None
        elif source == "chapters":
            return voyage_chapters_df.loc[index, "chunk"] if index in voyage_chapters_df.index else None
        else:
            return "Invalid source specified."

# Example Usage:
index_to_fetch = 456  # Change this index based on your needs
print(get_processed_content_by_index(index_to_fetch, source="sections", model= "gemini"))  # For Gemini Text
print("-----------------------------------------------")
print(get_processed_content_by_index(index_to_fetch, source="pages", model= "gemini"))  # For Gemini Pages
print("-----------------------------------------------")
print(get_processed_content_by_index(index_to_fetch, source="chapters", model= "gemini"))  # For Voyage


In all cases of murder or manslaughter, the offense shall be deemed to have been committed at the place where the injury was inflicted, or the poison administered or other means employed which caused the death, without regard to the place where the death occurs. (June 25, 1948, ch. 645, 62 Stat. 826.)

Historical and Revision Notes
Based on title 18, U.S.C., 1940 ed., §553 (Mar. 4, 1909, ch. 321, §336, 35 Stat. 1152).

-----------------------------------------------
1996—Subsec. (a). Pub. L. 104–132, §502(1)(A), substituted "nuclear material or nuclear byproduct
material" for "nuclear material" wherever appearing.
Subsec. (a)(1)(A). Pub. L. 104–132, §502(1)(B)(i), inserted "or to the environment" after "damage to
property".
Subsec. (a)(1)(B). Pub. L. 104–132, §502(1)(B)(ii), amended subpar. (B) generally. Prior to amendment,
subpar. (B) read as follows: "knows that circumstances exist which are likely to cause the death of or serious
bodily injury to any person or substantial damage to

In [None]:
def check_text_in_indices_automated(top_indices_dict, input_text, model):
    """
    Automated function to check if the input_text is present in any of the retrieved texts
    from given indices specified in the top_indices_dict for the specified model.
    """
    results = {}
    
    for source, indices_dict in top_indices_dict.items():
        for key, indices in indices_dict.items():
            # Check for the right model and source combination
            if model == "gemini" and "gemini" in key:
                matches = check_text_in_indices(indices, source, "gemini", input_text)
                results[f"{model}_{source}_{key}"] = matches
            elif model == "voyage" and "voyager" in key:
                matches = check_text_in_indices(indices, source, "voyage", input_text)
                results[f"{model}_{source}_{key}"] = matches

    return results



# Define the text to check
input_text = """Whoever, being an officer, employee or agent of the United States or of any department or agency thereof, having received public money which he is not authorized to retain as salary, pay, or emolument, fails to render his accounts for the same as provided by law is guilty of embezzlement, and shall be fined under this title or in a sum equal to the amount of the money embezzled, whichever is greater, or imprisoned not more than ten years, or both; but if the amount embezzled does not exceed $1,000, he shall be fined under this title or imprisoned not more than one year, or both.
"""

# Call the automated function for "gemini" model
automated_matches = check_text_in_indices_automated(top_indices_dict, input_text, "gemini")
print(automated_matches)

# Call the automated function for "voyage" model
automated_matches_voyage = check_text_in_indices_automated(top_indices_dict, input_text, "voyage")
print(automated_matches_voyage)


In [None]:
def check_text_in_indices(indices, source, model, input_text):
    """
    Check if the input_text is present in any of the retrieved texts from given indices.
    """
    results = {}
    for index in indices:
        content = get_processed_content_by_index(index, source, model)
        if content:
            results[index] = str(input_text).lower() in str(content).lower()
        else:
            results[index] = None  # Indicates that no content was found for the index
    return results

indices_to_check =[ 456, 1605,  145, 1604, 1741] 
input_text=  """Whoever, being an officer, employee or agent of the United States or of any department or agency thereof, having received public money which he is not authorized to retain as salary, pay, or emolument, fails to render his accounts for the same as provided by law is guilty of embezzlement, and shall be fined under this title or in a sum equal to the amount of the money embezzled, whichever is greater, or imprisoned not more than ten years, or both; but if the amount embezzled does not exceed $1,000, he shall be fined under this title or imprisoned not more than one year, or both.
"""
source = "chapters"  # Change to "pages" or "chapters" as needed
model = "gemini"  # Change to "voyage" if needed

matches = check_text_in_indices(indices_to_check, source, model, input_text)
print(matches)

{456: True, 1605: None, 145: False, 1604: None, 1741: None}


In [None]:
import torch
import os
from torch import nn
import torch.nn.functional as F
import pyarrow.parquet as pq
import numpy as np

def test_multi_level_attention(base_path, query):
    """
    Test function for MultiLevelAttention model using real embeddings from saved files.
    
    Args:
        base_path (str): Path to the directory containing embedding parquet files.
        query_embedding (torch.Tensor): Query embedding tensor with shape [768].
    
    Returns:
        None
    """
    # Load embeddings
    loader = LegalEmbeddingLoader(base_path)
    gemini_embeddings, voyager_embeddings, metadata = loader.load_embeddings()

    # Initialize the model
    model = MultiLevelAttention(output_dim=1024)

    # Define weights for combining 
    query_analyzer=LegalQueryAnalyzer()
    query_output=query_analyzer.analyze_query("What is the legal definition of mens rea?")
    weights = query_output['weights']

    # Run the model
    fused_embedding, details = model.forward(gemini_embeddings, voyager_embeddings, torch.tensor(query_output['gemini_embedding']),torch.tensor(query_output['voyage_embedding']))

    # Print output details
    print("\nFused Embedding Shape:", fused_embedding.shape)
    print("\nProcessed Embeddings per Granularity:")
    for granularity, embedding in details['processed_embeddings'].items():
        print(f"  {granularity}: {embedding.shape}")

    print("\nCross-Attention Weights:")
    for model_name, weight_tensor in details['cross_attention_weights'].items():
        print(f"  {model_name}: {weight_tensor.shape}")


base_path = "New_Embeddings_2025" 
query = "What is the legal definition of mens rea?"
test_multi_level_attention(base_path, query)


New_Embeddings_2025
New_Embeddings_2025\chapters\embeddings_gemini_text-005_chapters_semchunk.parquet

Columns in embeddings_gemini_text-005_chapters_semchunk.parquet: ['chunk', 'Embedding']
Loaded embeddings_gemini_text-005_chapters_semchunk.parquet (gemini - chapters)
New_Embeddings_2025
New_Embeddings_2025\chapters\embeddings_voyage_per_chapter_semchunked.parquet

Columns in embeddings_voyage_per_chapter_semchunked.parquet: ['chunk', 'Embedding']
Loaded embeddings_voyage_per_chapter_semchunked.parquet (voyager - chapters)
New_Embeddings_2025
New_Embeddings_2025\pages\embeddings_gemini_text-005_pages_semchunk.parquet

Columns in embeddings_gemini_text-005_pages_semchunk.parquet: ['chunk', 'Embedding']
Loaded embeddings_gemini_text-005_pages_semchunk.parquet (gemini - pages)
New_Embeddings_2025
New_Embeddings_2025\pages\embeddings_voyage_per_pages_semchunked.parquet

Columns in embeddings_voyage_per_pages_semchunked.parquet: ['chunk', 'Embedding']
Loaded embeddings_voyage_per_pages_se

Generating Embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.84it/s]


Processing granularity: sections
Shape of Gemini Embedding: torch.Size([1647, 768])
Shape of Voyager Embedding: torch.Size([1647, 1024])
--------------------------------------------------------
Shape of Gemini Projected Embedding: torch.Size([1647, 1024])
Shape of Voyager Projected Embedding: torch.Size([1647, 1024])
--------------------------------------------------------
Shape of Gemini Query Embedding: torch.Size([1, 1024])
Shape of Voyager Query Embedding: torch.Size([1, 1024])
--------------------------------------------------------
Shape of Gemini Similarity: torch.Size([1, 1647])
Shape of Voyager Similarity: torch.Size([1, 1647])
--------------------------------------------------------
Shape of Gemini Weights After Softmax: torch.Size([1, 1647])
Shape of Voyager Weights After Softmax: torch.Size([1, 1647])
--------------------------------------------------------
Shape of Gemini Weighted Embedding: torch.Size([1, 1647])
Shape of Voyager Weighted Embedding: torch.Size([1, 1647])
-

RuntimeError: stack expects each tensor to be equal size, but got [1, 1647] at entry 0 and [1, 809] at entry 1

## Testing

In [None]:
import torch

# Define the projection matrix P and weights W
P = torch.tensor([
    [1.0, 2.0],
    [3.0, 4.0],
    [5.0, 6.0]
])  # Shape: [3, 2]

W = torch.tensor([0.1, 0.3, 0.6])  # Shape: [3]

# Reshape W to [3, 1] for broadcasting
W_reshaped = W.reshape(-1, 1)  # Shape: [3, 1]

print("P shape:", P.shape)
print("W_reshaped shape:", W_reshaped.shape)

# Perform element-wise multiplication
W_T_P = W_reshaped * P
print("After element-wise multiplication:")
print(W_T_P)

# Sum along dimension 0 (rows) and keep dimensions
result = W_T_P.sum(dim=1, keepdim=True)
print("\nFinal result:")
print(result)

# If you need to transpose the result (as in the original code)
result_transposed = result.T
print("\nTransposed result:")
print(result_transposed)


P shape: torch.Size([3, 2])
W_reshaped shape: torch.Size([3, 1])
After element-wise multiplication:
tensor([[0.1000, 0.2000],
        [0.9000, 1.2000],
        [3.0000, 3.6000]])

Final result:
tensor([[0.3000],
        [2.1000],
        [6.6000]])

Transposed result:
tensor([[0.3000, 2.1000, 6.6000]])
