# Download Libraries #

In [None]:
! pip install torch
! pip install --upgrade transformers
! pip install accelerate huggingface_hub

! pip install pymupdf
! python3 install spacy
! python3 -m spacy download en_core_web_trf
! pip install transformers
! pip install torch
! pip install rank_bm25

! python3 -m nltk.downloader wordnet
! pip install nltk
! pip install faiss-cpu

! pip install --upgrade pip setuptools wheel
! pip install bertopic --no-cache-dir
! pip uninstall hdbscan -y
! pip install hdbscan --no-cache-dir --no-binary :all: --no-build-isolation


# Hugging Face Login #

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never paste the token into the notebook: it would be saved (and shared) with
# the file. Read it from the HF_TOKEN environment variable, falling back to a
# hidden interactive prompt when the variable is not set.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token)

# Import Libraries #

In [61]:
import fitz  # PyMuPDF
import spacy
from transformers import pipeline, AutoTokenizer, AutoModel
from rank_bm25 import BM25Okapi
import numpy as np
import torch
import re
import json
import nltk
from nltk.corpus import wordnet
import transformers

In [None]:
# Download necessary NLTK resources
# (the WordNet corpus is what expand_query queries for synonyms)
nltk.download('wordnet')

#  Task 1: Text Extraction from PDF #

In [63]:
# Task 1: Text Extraction from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``fitz.open``.

    Returns:
        str: The page texts joined with no separator ("" for a document
        without pages).

    Raises:
        Whatever ``fitz.open`` or the page accessors raise; the document is
        closed before the exception propagates.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        # One join instead of repeated ``+=`` keeps the concatenation linear.
        return "".join(
            pdf_document.load_page(page_num).get_text()
            for page_num in range(len(pdf_document))
        )
    finally:
        # Release the file handle even when a page fails to load or decode.
        pdf_document.close()


#  Task 2: Hierarchical Tree-based Indexing #

In [64]:
# Task 2: Hierarchical Tree-based Indexing
def preprocess_text(text):
    """Normalise whitespace while keeping paragraph boundaries.

    Blank-line runs are the only paragraph signal ``split_text`` looks for
    (it splits on two or more newlines), so they must survive this step:

    * runs of spaces/tabs collapse to a single space;
    * a single newline is treated as a soft line wrap and becomes a space;
    * any run of blank lines becomes exactly one blank line (``"\\n\\n"``);
    * empty paragraphs and leading/trailing whitespace are dropped.

    Args:
        text: Raw text, e.g. as extracted from a PDF.

    Returns:
        str: The cleaned text with paragraphs separated by ``"\\n\\n"``.
    """
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Collapse horizontal whitespace only; "\s+" would also eat the newlines
    # that mark paragraph breaks.
    text = re.sub(r'[^\S\n]+', ' ', text)
    # Drop spaces hugging a line break so whitespace-only lines count as blank.
    text = re.sub(r' ?\n ?', '\n', text)
    paragraphs = (
        chunk.replace('\n', ' ').strip()
        for chunk in re.split(r'\n{2,}', text)
    )
    return '\n\n'.join(chunk for chunk in paragraphs if chunk)

In [65]:
def split_text(text):
    """Cut ``text`` into pieces at every run of two or more newlines."""
    blank_line_run = re.compile(r'\n{2,}')
    return blank_line_run.split(text)

In [66]:
def create_hierarchical_index(text):
    """Build a chapter -> section -> paragraphs tree and return it flattened.

    The text is cleaned with ``preprocess_text`` and cut with ``split_text``.
    Each non-blank paragraph is labelled by a zero-shot classifier and the
    first label it returns decides where the paragraph goes:

    * "Chapter": opens a new chapter (replacing any earlier chapter with the
      same text) and closes the current section;
    * "Section": opens a new section under the current chapter; ignored when
      no chapter has been opened yet;
    * anything else: appended to the current section; dropped when no
      section is open.

    Args:
        text: Raw document text.

    Returns:
        list[dict]: One entry per section, of the form
        ``{"path": ["Root", chapter, section], "content": [paragraph, ...]}``.
        Chapters that never received a section produce no entry.
    """
    # The spaCy model the original loaded here was never used, so it is no
    # longer loaded (saves the load time and a needless failure mode).
    classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')
    candidate_labels = ["Chapter", "Section", "Subsection"]
    tree = {"Root": {}}
    current_chapter = None
    current_section = None

    for para in split_text(preprocess_text(text)):
        if not para.strip():
            continue
        label = classifier(para, candidate_labels)['labels'][0]
        if label == "Chapter":
            current_chapter = para
            tree["Root"][current_chapter] = {}
            current_section = None
        elif label == "Section":
            if current_chapter:
                current_section = para
                tree["Root"][current_chapter][current_section] = []
        elif current_section:
            tree["Root"][current_chapter][current_section].append(para)

    flat_index = []

    def flatten_tree(node, path=()):
        # Immutable default: a shared list default is a classic Python trap.
        for key, value in node.items():
            branch = [*path, key]
            if isinstance(value, dict):
                flatten_tree(value, branch)
            else:
                flat_index.append({"path": branch, "content": value})

    flatten_tree(tree)
    return flat_index

# Task 3: Retrieval Techniques #

In [None]:
# Checkpoint shared by the module-level tokenizer and encoder that
# encode_query and encode_documents both rely on.
DPR_CHECKPOINT = 'facebook/dpr-question_encoder-multiset-base'

tokenizer = AutoTokenizer.from_pretrained(DPR_CHECKPOINT)
model = AutoModel.from_pretrained(DPR_CHECKPOINT)

In [68]:
def encode_query(query, max_length=512):
    """Embed a single query string with the module-level tokenizer and model.

    Args:
        query: The query text.
        max_length: Maximum number of tokens kept; longer input is truncated
            so it cannot exceed the encoder's position limit.

    Returns:
        numpy.ndarray: The model's ``pooler_output`` for the query with the
        batch dimension removed.
    """
    inputs = tokenizer(query, return_tensors='pt', truncation=True, max_length=max_length)
    with torch.no_grad():
        embeddings = model(**inputs).pooler_output
    # Only the batch axis is dropped; squeeze() would also drop any other size-1 axis.
    return embeddings.squeeze(0).numpy()

In [69]:
def encode_documents(documents, max_length=512):
    """Embed each document with the module-level tokenizer and model.

    Args:
        documents: Iterable of document strings.
        max_length: Maximum number of tokens kept per document; longer
            documents are truncated so they cannot exceed the encoder's
            position limit.

    Returns:
        numpy.ndarray: One row per document holding that document's
        ``pooler_output``; an empty array when ``documents`` is empty.

    NOTE(review): documents are embedded with the same module-level ``model``
    as queries. DPR also publishes a dedicated context encoder - confirm that
    sharing the encoder here is intentional.
    """
    encoded_docs = []
    # A single no_grad scope covers the whole loop.
    with torch.no_grad():
        for doc in documents:
            inputs = tokenizer(doc, return_tensors='pt', truncation=True, max_length=max_length)
            embeddings = model(**inputs).pooler_output
            encoded_docs.append(embeddings.squeeze(0).numpy())
    return np.array(encoded_docs)

In [70]:
def expand_query(query):
    """Append WordNet synonyms of each query word to the query.

    Each whitespace-separated word is looked up separately (with surrounding
    punctuation removed). Looking up the whole query as one lemma finds
    nothing for multi-word questions, which left the expansion empty.

    Args:
        query: The user's query text.

    Returns:
        str: The original words followed by the new synonyms, space-separated.
        Synonyms keep their lookup order, contain spaces instead of WordNet's
        underscores, and are de-duplicated case-insensitively. A query with
        no words yields "".
    """
    words = query.split()
    lookup_terms = []
    seen = set()
    for word in words:
        # "photosynthesis?" must be looked up as "photosynthesis".
        cleaned = re.sub(r'^\W+|\W+$', '', word)
        if cleaned:
            lookup_terms.append(cleaned)
            seen.add(cleaned.lower())

    synonyms = []
    for term in lookup_terms:
        for syn in wordnet.synsets(term):
            for lemma in syn.lemmas():
                name = lemma.name().replace('_', ' ')
                if name.lower() not in seen:
                    seen.add(name.lower())
                    synonyms.append(name)
    # The original words stay in front so retrieval never loses them.
    return ' '.join(words + synonyms)

In [71]:
def retrieve_bm25(query, documents):
    """Score every document against the query with BM25 (Okapi).

    Text is lower-cased and split on whitespace, so matching is
    case-insensitive ("Cat" in the query matches "cat" in a document).

    Args:
        query: Query text.
        documents: List of document strings.

    Returns:
        numpy.ndarray: One score per document, in input order. Empty when
        ``documents`` is empty; all zeros when no document contains a token.
    """
    if not documents:
        return np.array([])  # Handle empty documents list
    tokenized_docs = [doc.lower().split() for doc in documents]
    if not any(tokenized_docs):
        # BM25Okapi averages IDF over the vocabulary; with no tokens at all
        # that average divides by zero, so answer with neutral scores instead.
        return np.zeros(len(documents))
    bm25 = BM25Okapi(tokenized_docs)
    return bm25.get_scores(query.lower().split())

In [72]:
def retrieve_dpr(query, documents):
    """Dense retrieval scores: dot product of each document embedding with the query embedding.

    Returns an empty array when ``documents`` is empty; otherwise the result
    of ``np.dot`` between the output of ``encode_documents`` and the output
    of ``encode_query``.
    """
    if documents:
        query_vector = encode_query(query)
        document_matrix = encode_documents(documents)
        return np.dot(document_matrix, query_vector)
    # Nothing to score.
    return np.array([])


In [73]:
def retrieve_documents(query, documents):
    """Rank documents by the sum of their BM25 and DPR scores, best first.

    Returns a list of ``(document, combined_score)`` tuples covering every
    document, or ``[]`` when there are no documents or either scorer returns
    nothing.

    NOTE(review): the two score arrays are added as-is, with no rescaling;
    confirm their ranges are comparable enough for a plain sum.
    """
    if not documents:
        return []

    lexical_scores = retrieve_bm25(query, documents)
    dense_scores = retrieve_dpr(query, documents)
    if not len(lexical_scores) or not len(dense_scores):
        return []

    total_scores = lexical_scores + dense_scores
    best_first = np.argsort(total_scores)[::-1]

    ranked = []
    for position in best_first:
        ranked.append((documents[position], total_scores[position]))
    return ranked

# Task 4 & 5: Retrieval Augmented Generation (RAG) #

In [77]:
# Text-generation pipelines already built in this kernel, keyed by model id,
# so the model is loaded once instead of once per question.
_TEXT_GENERATION_PIPELINES = {}


def _get_text_generation_pipeline(model_id):
    """Return the cached text-generation pipeline for ``model_id``, building it on first use."""
    if model_id not in _TEXT_GENERATION_PIPELINES:
        _TEXT_GENERATION_PIPELINES[model_id] = transformers.pipeline(
            "text-generation",
            model=model_id,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="auto",
        )
    return _TEXT_GENERATION_PIPELINES[model_id]


def generate_response_from_context(
    context,
    query,
    model_id="meta-llama/Meta-Llama-3.1-8B-Instruct",
    max_new_tokens=256,
):
    """Answer ``query`` with a chat model, optionally grounded in ``context``.

    Args:
        context: Retrieved text to put in front of the query. When empty or
            falsy, the prompt contains the query only.
        query: The user's question.
        model_id: Hugging Face model id of the chat model. The default is a
            gated model, so its terms must be accepted on the model card.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        str: The ``content`` of the last message in the generated conversation.
    """
    generator = _get_text_generation_pipeline(model_id)

    # Only the prompt depends on whether context is available.
    if context:
        prompt = f"Context: {context}\nQuery: {query}\nResponse:"
    else:
        prompt = f"Query: {query}\nResponse:"
    messages = [
        {"role": "system", "content": "You are a helpful assistant!"},
        {"role": "user", "content": prompt},
    ]

    outputs = generator(messages, max_new_tokens=max_new_tokens)
    # For chat input, generated_text is the conversation; the reply is its last message.
    return outputs[0]["generated_text"][-1]['content']


#  Main Function to Integrate All Tasks #

In [78]:
def main(pdf_path, user_query, top_k=5):
    """Run the whole pipeline: extract, index, retrieve, generate.

    Args:
        pdf_path: Path of the PDF to answer questions about.
        user_query: The user's question.
        top_k: Number of best-ranked sections passed to the generator as context.

    Returns:
        The value returned by ``generate_response_from_context`` for the
        selected context and the original ``user_query``.
    """
    # Task 1: Extract text from PDF
    extracted_text = extract_text_from_pdf(pdf_path)

    # Task 2: Create hierarchical index
    hierarchical_index = create_hierarchical_index(extracted_text)
    documents = []
    for entry in hierarchical_index:
        content = entry["content"]
        # Index entries hold a *list* of paragraphs, while the retrievers and
        # the final join work on strings, so flatten each section first.
        if isinstance(content, (list, tuple)):
            section_text = ' '.join(content)
        else:
            section_text = str(content)
        if section_text.strip():  # a section with no body has nothing to retrieve
            documents.append(section_text)

    # Task 3: Retrieve relevant documents based on query
    expanded_query = expand_query(user_query)
    retrieved_docs_with_scores = retrieve_documents(expanded_query, documents)
    relevant_texts = ' '.join(doc for doc, _score in retrieved_docs_with_scores[:top_k])

    # Task 4 & 5: Generate a response using RAG
    return generate_response_from_context(relevant_texts, user_query)


In [None]:
# Placeholders: set these to a real PDF path and a real question before running.
pdf_path = 'your_pdf_file_path'
user_query = "your_question_or_query"
# Runs the full pipeline defined in main (extract, index, retrieve, generate).
response = main(pdf_path, user_query)
print(f"Response: {response}")