In [1]:
import os,sys, re, json, nest_asyncio, asyncio, numpy as np, pandas as pd

# Resolve the parent of the current working directory and put it at the
# front of the import path (once), so packages that live one level above
# the notebook's folder can be imported by the cells below.
SRC_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(SRC_DIR) == 0:
    sys.path.insert(0, SRC_DIR)

from dotenv import load_dotenv

from google import genai
from google.genai import types, Client
from google.genai.types import EmbedContentConfig
from google.cloud import secretmanager, storage,aiplatform

from typing import List, Dict, Any, Union, Tuple
from pydantic import BaseModel, Field

import matplotlib.pyplot as plt
import seaborn as sns

from langchain_core.documents import Document

# Load settings via python-dotenv before any environment lookups below.
load_dotenv()

# --- Third-party API credentials (None when the variable is not set) ---
LLAMA_PARSE_API_KEY = os.getenv("LLAMA_PARSE_API_KEY")
HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# --- Google Cloud / Vertex AI settings ---
PROJECT_ID = os.getenv("PROJECT_ID")
LOCATION = os.getenv("LOCATION")

# Initialise the Vertex AI SDK with the project/location read above.
aiplatform.init(project=PROJECT_ID, location=LOCATION)

# Imports

In [2]:
from document_processing.keyword_annotator import BM25KeywordAnnotator, TFIDFKeywordAnnotator, QueryProcessor
from document_processing import TextDirectoryLoader
from indexing.inverted_index import InvertedIndex

# Document Loaders

In [29]:
# Both loaders read the same directory of parsed text files; they differ
# only in which keyword annotator is attached to them.
parsed_dir = "../Data/parsed"

bm25_annotator = BM25KeywordAnnotator(
    dict(
        score_threshold=1.5,
        extra_stopwords={"page", "pages", "figure"},
        max_keywords=15,
    )
)
loader_bm25 = TextDirectoryLoader(directory=parsed_dir, annotator=bm25_annotator)

tfidf_annotator = TFIDFKeywordAnnotator(
    dict(
        score_threshold=1.5,
        extra_stopwords={"page", "pages", "figure"},
        max_keywords=15,
    )
)
loader_tfidf = TextDirectoryLoader(directory=parsed_dir, annotator=tfidf_annotator)

# Only the BM25-annotated loader is run in this cell; loader_tfidf is
# constructed but not loaded here.
all_docs = loader_bm25.load()

# Create Inverted Indexes

In [30]:
# Create a new InvertedIndex and populate it from `all_docs`, i.e. the
# documents returned by loader_bm25.load() in the loaders cell.
inverted_index = InvertedIndex()
inverted_index.build_from_docs(all_docs)

Building inverted index from scratch...
Updating index with 6 new document(s)...
Update complete.
Index built successfully. Found 2357 unique keywords across 6 files.


# Search the inverted index

In [43]:
# Look up a free-text query in the inverted index.
#
# The query is first passed through QueryProcessor, which yields one or more
# processed terms (the recorded run turns "semi-supervised" into "semi" and
# "supervised"). Each non-empty term is searched on its own and the hits of
# all terms are merged, so the final result is the union over the terms.
search_term = "semi-supervised"
print(f"Searching for keyword: {search_term}")

QP = QueryProcessor()
processed_search_terms = QP.process(query=search_term)

Search_Results = []
for processed_search_term in processed_search_terms:
    # Skip empty terms the processor may emit.
    if processed_search_term:
        print(f"Processed search term: {processed_search_term}")
        search_results = inverted_index.search(processed_search_term)
        Search_Results.extend(search_results)

# Remove duplicates while keeping first-seen order. list(set(...)) would also
# de-duplicate, but its iteration order depends on string hash randomisation
# and therefore changes between kernel restarts, making the output
# non-reproducible. dict.fromkeys has the same hashability requirement as set.
Search_Results = list(dict.fromkeys(Search_Results))
print(f"Search results for '{search_term}': {Search_Results}")

Searching for keyword: semi-supervised
Processed search term: semi
Processed search term: supervised
Search results for 'semi-supervised': [('attention-is-all-you-need_cleaned.txt', 9), ('attention-is-all-you-need_cleaned.txt', 10)]


In [44]:
# Show the de-duplicated hits collected by the search cell as this cell's output.
Search_Results

[('attention-is-all-you-need_cleaned.txt', 9),
 ('attention-is-all-you-need_cleaned.txt', 10)]