In [1]:
import os,sys, re, json, nest_asyncio, asyncio, numpy as np, pandas as pd

SRC_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

from dotenv import load_dotenv

from google import genai
from google.genai import types, Client
from google.genai.types import EmbedContentConfig
from google.cloud import secretmanager, storage,aiplatform

from typing import List, Dict, Any, Union, Tuple
from pydantic import BaseModel, Field

import matplotlib.pyplot as plt
import seaborn as sns

from langchain_core.documents import Document

# Load variables from a local .env file (python-dotenv) before reading them.
load_dotenv()

# Credentials for external services; each is None when the variable is unset.
LLAMA_PARSE_API_KEY = os.getenv("LLAMA_PARSE_API_KEY")
HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Google Cloud project settings.
PROJECT_ID = os.getenv("PROJECT_ID")
LOCATION = os.getenv("LOCATION")

# NOTE(review): the SDK is initialised with the literal "global" location;
# LOCATION is read above but not passed here — confirm this is intended.
aiplatform.init(project=PROJECT_ID, location="global")

In [2]:
from langchain.chains.summarize import load_summarize_chain
from langchain_ollama import ChatOllama
from langchain_core.tools import tool
from pydantic import BaseModel, Field
from langchain_google_vertexai import ChatVertexAI
from langchain_core.documents import Document

# Vertex AI chat model wrapped in a map-reduce summarisation chain.
llm_summary = ChatVertexAI(
    model_name="gemini-2.5-flash",
    temperature=0.1,
)
summary_chain = load_summarize_chain(
    llm_summary,
    chain_type="map_reduce",
)

# Ollama chat model wrapped in the same kind of chain, for comparison.
llm_local = ChatOllama(
    model="qwen2.5:3b",
    temperature=0,
)
summary_chain_local = load_summarize_chain(
    llm_local,
    chain_type="map_reduce",
)

In [None]:
from document_processing.keyword_annotator import BM25KeywordAnnotator, TFIDFKeywordAnnotator, QueryProcessor
from document_processing import TextDirectoryLoader, ChunkedTextDirectoryLoader
from indexing.inverted_index import InvertedIndex 
from schemas.keywords import KeywordExtractionConfig
from document_processing.keyword_annotator import BM25KeywordAnnotator
from text_splitters import CustomTokenSplitter, CustomSemanticChunker, SpectralSegmentationChunker
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
# Load the sentence-embedding model once. The notebook refers to it under two
# names (`embeddings` and `emb`), so both are bound to the same instance
# instead of instantiating the identical model twice.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-l6-v2")
emb = embeddings

# BM25-based keyword annotator used for page-level annotation while loading.
bm25_annotator = BM25KeywordAnnotator({
    "score_threshold": 1.5,
    "extra_stopwords": {"page", "pages", "figure", "copyright", "©"},
    "max_keywords": 15,
})

# `page_splitter` value taken from a KeywordExtractionConfig validated from an
# empty dict (i.e. whatever the schema provides when nothing is overridden).
page_splitter = KeywordExtractionConfig.model_validate({}).page_splitter

# Loader over the parsed text files, annotating with the BM25 annotator.
loader_bm25 = TextDirectoryLoader(
    directory="../Data/parsed",
    annotator=bm25_annotator,
)

# The loaded result is indexed by file name below; keep one paper as the
# running example for the chunking experiments.
all_docs = loader_bm25.load()
sample_file = all_docs['attention-is-all-you-need_cleaned.txt']

# Chunking Example with various chunkers

In [None]:
# --- Token-based splitter ----------------------------------------------------
merging_splitter = CustomTokenSplitter(
    chunk_size=400,
    chunk_overlap=100,
    keyword_annotator=bm25_annotator,
)
chunks_token_splitter = merging_splitter.split_documents(sample_file)

# --- Semantic (embedding-similarity) chunker ---------------------------------
semantic_splitter = CustomSemanticChunker(
    embeddings=emb,
    chunk_size=1_000,
    chunk_overlap=0,
    min_tokens=50,
    keyword_annotator=bm25_annotator,
    similarity_threshold=0.2,
)
chunks_semantic_splitter = semantic_splitter.split_documents(sample_file)  # list[Document]

# --- Spectral segmentation chunker -------------------------------------------
chunker_spectral = SpectralSegmentationChunker(
    embeddings=emb,
    chunk_size=3_000,                  # token budget for the *final* chunks
    window_k=8,                        # neighbourhood size for the similarity graph
    # n_splits=n_splits,               # how many strong-contrast cuts to keep
    min_words=50,                      # enforce a sensible pre-chunk length
    keyword_annotator=bm25_annotator,  # pass None to skip per-chunk keywords
)
chunks_spectral_splitter = chunker_spectral.split_documents(sample_file)

In [None]:
# Compare how many chunks each chunking strategy produced for the sample file.
chunk_count_report = f"""
Number of chunks using Token Splitter: {len(chunks_token_splitter)}
Number of chunks using Semantic Splitter: {len(chunks_semantic_splitter)}
Number of chunks using Spectral Segmentation: {len(chunks_spectral_splitter)}
"""
print(chunk_count_report)

In [None]:
# Re-create the BM25 annotator with a larger keyword budget (25 instead of 15).
# This rebinds `bm25_annotator`, so any cell executed after this one sees the
# new instance.
bm25_annotator = BM25KeywordAnnotator(
    {
        "score_threshold": 1.5,
        "extra_stopwords": {"page", "pages", "figure", "copyright", "©"},
        "max_keywords": 25,
    }
)

# Spectral chunker that annotates each chunk with BM25 keywords.
spectral_chunker_bm25 = SpectralSegmentationChunker(
    embeddings=emb,
    chunk_size=4_000,                  # token budget for the *final* chunks
    window_k=8,                        # neighbourhood size for the similarity graph
    # n_splits=n_splits,               # how many strong-contrast cuts to keep
    min_words=40,                      # enforce a sensible pre-chunk length
    keyword_annotator=bm25_annotator,  # pass None to skip per-chunk keywords
)

# Directory loader that chunks every file under ../Data/parsed with it.
spectral_bm25_chunker = ChunkedTextDirectoryLoader(
    directory="../Data/parsed",
    chunker=spectral_chunker_bm25,
)
all_chunks = spectral_bm25_chunker.load()

# Build the inverted index from the loaded chunks.
inverted_index = InvertedIndex()
inverted_index.build_from_docs(all_chunks)

In [13]:
def search_index(query: str, index: "InvertedIndex") -> List[pd.DataFrame]:
    """Look up every processed term of ``query`` in an inverted index.

    The query is passed through a fresh ``QueryProcessor``; each non-empty
    term it yields is searched in ``index`` and the hits for that term are
    collected into one DataFrame.

    Args:
        query: Raw search string.
        index: Object exposing ``search(term)``. Each hit must be indexable,
            with the document name at position 0 and a value convertible to
            ``int`` (the page) at position 1.

    Returns:
        One DataFrame per non-empty processed term, in processing order, with
        columns ``document_name``, ``document_page`` and ``search_term``.
        The list is empty when the query yields no usable terms, so callers
        that ``pd.concat`` the result should handle that case.
    """
    query_processor = QueryProcessor()
    result_frames: List[pd.DataFrame] = []
    for term in query_processor.process(query=query):
        if not term:
            # Skip empty tokens produced by the query processor.
            continue
        # Materialise the hits once: they are traversed twice below, which
        # would silently break on a one-shot iterator.
        hits = list(index.search(term))
        result_frames.append(
            pd.DataFrame(
                {
                    "document_name": [hit[0] for hit in hits],
                    "document_page": [int(hit[1]) for hit in hits],
                    "search_term": term,  # scalar, broadcast to every row
                }
            )
        )
    return result_frames

In [None]:
# Query the BM25-keyword index and stack the per-term hit tables into one
# frame with a fresh 0..n-1 index.
sr = search_index("google", inverted_index)
pd.concat(sr, ignore_index=True)

In [18]:
from document_processing.keyword_annotator import KeyBertAnnotator,ENGLISH_STOP_WORDS
# Annotate the sample paper with the KeyBERT annotator.
fn = "attention-is-all-you-need_cleaned.txt"
sample_file = all_docs[fn]

kwa = KeyBertAnnotator(
    use_ngrams=True,
    keyphrase_ngram_range=(1, 1),
    top_n=15,
    diversity=0.9,
    STOPWORDS=ENGLISH_STOP_WORDS,
)

# Last expression of the cell: the annotated documents are displayed.
kwa(sample_file, document_name=fn)

[Document(metadata={'source': 'attention-is-all-you-need_cleaned.txt', 'page_number': 1, 'keywords': ['translation', 'cost', 'model', 'representation', 'gomez', 'recurrence', 'fraction', 'german', 'training', 'brain', 'aidan', 'architecture', 'position', 'proper', 'sequence', 'ashish', 'new', 'dominant', 'free', 'vaswani', 'day', 'transduction', 'abstract', 'small', 'visualization', 'attribution', 'google', 'simple', 'task', 'network']}, page_content='arXiv:1706.03762v7 [cs.CL] 2 Aug 2023\n\nProvided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.\n\n# Attention Is All You Need\n\nAshish Vaswani* Google Brain avaswani@google.com\n\nNoam Shazeer* Google Brain noam@google.com\n\nNiki Parmar* Google Research nikip@google.com\n\nJakob Uszkoreit* Google Research usz@google.com\n\nLlion Jones* Google Research llion@google.com\n\nAidan N. Gomez* † University of Toronto aidan@cs

In [10]:
# Dedicated KeyBERT annotator for the directory loader below.
keybert_annotator = KeyBertAnnotator(
    use_ngrams=True,
    keyphrase_ngram_range=(1, 1),
    top_n=15,
    diversity=0.9,
    STOPWORDS=ENGLISH_STOP_WORDS,
)

# Loader over the small test corpus; nothing is loaded in this cell.
loader_keybert = TextDirectoryLoader(
    directory="../temp/test-data",
    annotator=keybert_annotator,
)

In [12]:
# KeyBERT annotator used for per-chunk keywords.
kwa = KeyBertAnnotator(
    use_ngrams=True,
    keyphrase_ngram_range=(1, 1),
    top_n=15,
    diversity=0.9,
    STOPWORDS=ENGLISH_STOP_WORDS,
)

# Spectral chunker that annotates each chunk with KeyBERT keywords.
spectral_chunker_kwa = SpectralSegmentationChunker(
    embeddings=emb,
    chunk_size=4_000,       # token budget for the *final* chunks
    window_k=8,             # neighbourhood size for the similarity graph
    # n_splits=n_splits,    # how many strong-contrast cuts to keep
    min_words=40,           # enforce a sensible pre-chunk length
    keyword_annotator=kwa,  # pass None to skip per-chunk keywords
)

# Directory loader that chunks every file under ../temp/test-data with it.
spectral_kwa_chunker = ChunkedTextDirectoryLoader(
    directory="../temp/test-data",
    chunker=spectral_chunker_kwa,
)
all_chunks_kwa = spectral_kwa_chunker.load()

# Build a separate inverted index from the KeyBERT-annotated chunks.
inverted_index_kwa = InvertedIndex()
inverted_index_kwa.build_from_docs(all_chunks_kwa)


Processing file: attention-is-all-you-need_cleaned.txt
Number of chunks for attention-is-all-you-need_cleaned.txt: 18

Processing file: rental-agreement_cleaned.txt
Number of chunks for rental-agreement_cleaned.txt: 4
Building inverted index from scratch...
Updating index with 2 new document(s)...
Update complete.
Index built successfully. Found 144 unique keywords across 2 files.


In [16]:
# Same query against the KeyBERT-keyword index; per-term hit tables are
# stacked into one frame with a fresh 0..n-1 index.
sr = search_index("google", inverted_index_kwa)
pd.concat(sr, ignore_index=True)

Unnamed: 0,document_name,document_page,search_term
0,attention-is-all-you-need_cleaned.txt,1,google
