In [1]:
import os
import fitz  # For PDF text extraction
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from accelerate import Accelerator  

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ["TORCH_USE_CUDA_DSA"] = "1"

In [3]:
accelerator = Accelerator()

In [4]:
def extract_text_from_pdf(pdf_path):
    """Extracts text from a single PDF."""
    with fitz.open(pdf_path) as pdf:
        text = ""
        for page in pdf:
            text += page.get_text()
    return text

In [5]:
def chunk_text(text, max_length=512, overlap=50):
    """Chunks a large text into smaller segments."""
    words = text.split()
    chunks = []
    chunk = []
    for word in words:
        chunk.append(word)
        if len(chunk) >= max_length:
            chunks.append(" ".join(chunk))
            chunk = chunk[-overlap:]  # Retain the overlap for the next chunk
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks

In [6]:
def process_multiple_pdfs(pdf_paths):
    all_chunks = {}
    for pdf_path in pdf_paths:
        print(f"Processing: {pdf_path}")
        pdf_text = extract_text_from_pdf(pdf_path)
        chunks = chunk_text(pdf_text)
        all_chunks[pdf_path] = chunks  # Store chunks for each PDF
    return all_chunks

In [7]:
pdf_paths = [ 
    r"D:\Prabha\NLP\Dataset\le8\le_8.pdf",
    r"D:\Prabha\NLP\Dataset\le9\le9.pdf",
    r"D:\Prabha\NLP\Dataset\le10\le10 (2).pdf",
    r"D:\Prabha\NLP\Dataset\le11\le11.pdf",
    r"D:\Prabha\NLP\Dataset\le12\le12.pdf",
    r"D:\Prabha\NLP\Dataset\le13\le13.pdf",
    r"D:\Prabha\NLP\Dataset\le14\le14.pdf",
    r"D:\Prabha\NLP\Dataset\le15\le15.pdf",
    r"D:\Prabha\NLP\Dataset\le1\Tamilnadu-Board-Class-12-Chemistry-Chapter-1.pdf",
    r"D:\Prabha\NLP\Dataset\le2\Tamilnadu-Board-Class-12-Chemistry-Chapter-2.pdf",
    r"D:\Prabha\NLP\Dataset\le3\Tamilnadu-Board-Class-12-Chemistry-Chapter-3.pdf",
    r"D:\Prabha\NLP\Dataset\le4\Tamilnadu-Board-Class-12-Chemistry-Chapter-4.pdf",
    r"D:\Prabha\NLP\Dataset\le5\12th_Chemistry-Vol-2_English Medium_Text.pdf",
    r"D:\Prabha\NLP\Dataset\le6\Tamilnadu-Board-Class-12-Chemistry-Chapter-6.pdf",
    r"D:\Prabha\NLP\Dataset\le7\Tamilnadu-Board-Class-12-Chemistry-Chapter-7.pdf"
]

In [8]:
chunks = process_multiple_pdfs(pdf_paths)

Processing: D:\Prabha\NLP\Dataset\le8\le_8.pdf
Processing: D:\Prabha\NLP\Dataset\le9\le9.pdf
Processing: D:\Prabha\NLP\Dataset\le10\le10 (2).pdf
Processing: D:\Prabha\NLP\Dataset\le11\le11.pdf
Processing: D:\Prabha\NLP\Dataset\le12\le12.pdf
Processing: D:\Prabha\NLP\Dataset\le13\le13.pdf
Processing: D:\Prabha\NLP\Dataset\le14\le14.pdf
Processing: D:\Prabha\NLP\Dataset\le15\le15.pdf
Processing: D:\Prabha\NLP\Dataset\le1\Tamilnadu-Board-Class-12-Chemistry-Chapter-1.pdf
Processing: D:\Prabha\NLP\Dataset\le2\Tamilnadu-Board-Class-12-Chemistry-Chapter-2.pdf
Processing: D:\Prabha\NLP\Dataset\le3\Tamilnadu-Board-Class-12-Chemistry-Chapter-3.pdf
Processing: D:\Prabha\NLP\Dataset\le4\Tamilnadu-Board-Class-12-Chemistry-Chapter-4.pdf
Processing: D:\Prabha\NLP\Dataset\le5\12th_Chemistry-Vol-2_English Medium_Text.pdf
Processing: D:\Prabha\NLP\Dataset\le6\Tamilnadu-Board-Class-12-Chemistry-Chapter-6.pdf
Processing: D:\Prabha\NLP\Dataset\le7\Tamilnadu-Board-Class-12-Chemistry-Chapter-7.pdf


In [9]:
hf_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vector_store = FAISS.from_texts(texts=chunks, embedding=hf_embeddings)
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})

  hf_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


In [10]:
model_path = r"D:\Prabha\NLP\Model\gen_ai_chatbot\model\output\model_6\save"
tokenizer = AutoTokenizer.from_pretrained(r"D:\Prabha\NLP\Model\gen_ai_chatbot\model\output\model_6\save\tokenizer", pad_token="[PAD]")
model = AutoModelForCausalLM.from_pretrained(model_path) 

In [11]:
tokenizer.pad_token = tokenizer.eos_token

In [12]:
llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,  
    max_new_tokens=256,
    truncation=True,
    return_full_text=False,
    pad_token_id=tokenizer.pad_token_id,
)

Device set to use cuda:0


In [13]:
model, tokenizer = accelerator.prepare(model, tokenizer)
llm = HuggingFacePipeline(pipeline=llm_pipeline)
combine_documents_chain = load_qa_chain(llm=llm, chain_type="stuff")
rag_chain = RetrievalQA(combine_documents_chain=combine_documents_chain, retriever=retriever)

  llm = HuggingFacePipeline(pipeline=llm_pipeline)
stuff: https://python.langchain.com/v0.2/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/v0.2/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/v0.2/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/v0.2/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/v0.2/docs/how_to/#qa-with-rag
  combine_documents_chain = load_qa_chain(llm=llm, chain_type="stuff")
  rag_chain = RetrievalQA(combine_documents_chain=combine_documents_chain, retriever=retriever)


In [17]:
query = input("Enter your query: ")

In [18]:
response = rag_chain.invoke({"query": query})

def clean_response(response):
    output = f"Question: {response['query']}\n\n"
    output += f"Answer:{response['result']}\n\n"
    output += "Feel free to ask more questions or clarify details."
    return output

print(clean_response(response))


Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Question: what is glucose?

Answer:
Glucose is the universal monosaccharide. It is present in blood and various
degrees and types of cells. It is formed from the following five
commonly-Used carbon-carbon bonds i.e. Carbon-carbon-bond, carbon-carbon-double bond, carbon-carbon-single bond, carbon-carbon-bond, carbon-carbon- double bond.
Source: Wikipedia. http://www.wikipedia.gov/

A:

Glucose is the carbon-carbon-bond
    Carbon-carbon-double bond
Carbon-carbon-single bond
    Carbon-carbon-bond
Carbon-carbon-bond
Carbon-carbon-double bond

A:

Glucose is the carbon-carbon-bond
Carbon-carbon-single bond
Carbon-carbon-bond
Carbon-carbon-double bond

A:

Glucose is the monosaccharide
    carbon-carbon-bond
Carbon-carbon-double bond



Feel free to ask more questions or clarify details.
