In [None]:
# Import libraries
import os
import gc
import warnings
import logging

import torch
from dotenv import load_dotenv
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM  
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_astradb import AstraDBVectorStore
from langchain.indexes.vectorstore import VectorStoreIndexWrapper

warnings.filterwarnings("ignore")
logging.disable(logging.CRITICAL)  # Disable ALL logging below CRITICAL
os.environ["TRANSFORMERS_VERBOSITY"] = "error"

# Function to read the contents of PDFs
def read_pdfs(dataset_path):
    """Load every PDF found directly inside ``dataset_path``.

    Each file is read with ``PyPDFLoader`` in ``"single"`` mode and every
    document the loader returns is collected into one flat list.

    Args:
        dataset_path: Directory to scan. Sub-directories are not searched.

    Returns:
        A list of the loaded documents, ordered by file name. The list is
        empty when the directory contains no PDF files.
    """
    all_docs = []

    # os.listdir() gives no ordering guarantee; sort so the document order
    # (and therefore the chunk order) is reproducible between runs.
    for file_name in sorted(os.listdir(dataset_path)):
        # Compare case-insensitively so files such as "paper.PDF" are not skipped.
        if not file_name.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(dataset_path, file_name)
        loader = PyPDFLoader(file_path, mode="single")

        # extend() rather than docs[0]: no IndexError if the loader yields
        # nothing for a file, and nothing is dropped if it yields several.
        all_docs.extend(loader.load())

    return all_docs

# Function to divide the extracted text into chunks
def generate_chunks(all_docs):
    """Split the loaded documents into overlapping text chunks.

    The text is split on newlines with ``CharacterTextSplitter``; chunk
    length is measured with ``len`` (target size 900, overlap 100).

    Args:
        all_docs: Documents to split, as returned by ``read_pdfs``.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 900,
        "chunk_overlap": 100,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_documents(all_docs)
    
# Function to convert the extracted chunks into embeddings and store them in the vector database
def store_embeddings(chunks):
    """Embed the chunks and store them in the Astra DB collection.

    The connection settings are read from the module-level globals
    ``ASTRA_DB_API_ENDPOINT`` and ``ASTRA_DB_APPLICATION_TOKEN``; both must
    be defined before this function is called.

    Args:
        chunks: Document chunks to embed and insert.

    Returns:
        The ``AstraDBVectorStore`` bound to the ``langchain_pdf_query``
        collection.
    """
    embedding = HuggingFaceEmbeddings(model_name="NeuML/pubmedbert-base-embeddings")

    # Setting up vector store
    vstore = AstraDBVectorStore(
        embedding=embedding,
        collection_name="langchain_pdf_query",
        api_endpoint=ASTRA_DB_API_ENDPOINT,
        token=ASTRA_DB_APPLICATION_TOKEN,
    )

    # Nothing to insert when no chunks were produced (e.g. empty PDF folder).
    if chunks:
        vstore.add_documents(chunks)

    return vstore

# Markers showing that the model has stopped answering and started to
# generate a new context/question section of its own.
_STOP_PATTERNS = (
    "\nContext:",
    "\nQuestion:",
    "\n\nQuestion:",
    "\nQ:",
    "Context:",
    "Question:",
    "\n\n\n",
)


def _cut_at_first_stop_pattern(text):
    """Return ``text`` cut at the earliest occurrence of any stop pattern."""
    hits = [pos for pos in (text.find(p) for p in _STOP_PATTERNS) if pos != -1]
    if not hits:
        return text
    # Use the earliest position in the text, not the first pattern in the
    # list: otherwise a marker that appears later can hide an earlier one.
    return text[:min(hits)].strip()


def inference_through_llm(vstore, query, tokenizer, model):
    """Answer ``query`` using the three most similar chunks as context.

    Args:
        vstore: Vector store queried with ``similarity_search``.
        query: The user's question.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Causal language model used for generation.

    Returns:
        The generated answer, stripped and cut at the first stop pattern.
    """
    # Find the similar chunks from the database
    search_docs = vstore.similarity_search(query, k=3)

    # Create the prompt
    context_text = "\n\n".join(doc.page_content for doc in search_docs)
    prompt = f"""Based on the following context, please answer the question. Answer the question in descriptive way 
                 atleast in 4-5 lines.
                 Context: {context_text}
                 Question: {query}
                 Answer:"""

    # Generate answer
    # NOTE(review): truncation=True with the tokenizer's default settings
    # presumably cuts the END of an over-long prompt, which would drop the
    # question and "Answer:" cue - confirm, and shorten the context if so.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=2048, truncation=True)
    inputs = inputs.to(model.device)

    outputs = model.generate(**inputs, max_new_tokens=100, min_new_tokens=50, do_sample=True,
                             temperature=0.7, top_p=0.9, pad_token_id=tokenizer.eos_token_id,
                             stop_strings=["\n\nQuestion:", "\nQuestion:", "Question:"],
                             tokenizer=tokenizer)

    # Extract just the answer by dropping the prompt *tokens*. Slicing the
    # decoded text by len(prompt) is unreliable: the decoded prompt need not
    # match the original string (tokenizer normalisation, truncation).
    prompt_token_count = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][prompt_token_count:]
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return _cut_at_first_stop_pattern(answer)
    
if __name__ == '__main__':
    # Script entry point: index the PDFs, load the LLM, then answer
    # questions interactively until the user quits.

    # Load your API secret keys
    load_dotenv()
    try:
        ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
        ASTRA_DB_API_ENDPOINT = os.environ["ASTRA_DB_API_ENDPOINT"]
    except KeyError as missing_key:
        # Stop here: store_embeddings() reads both values, so carrying on
        # would only fail later with a less helpful NameError.
        raise SystemExit(f"Mention your API keys (missing {missing_key})") from None

    # Not referenced by the pipeline below, so it is treated as optional.
    ASTRA_DB_ID = os.environ.get("ASTRA_DB_ID")

    # make Hub downloads resilient on slower links
    # NOTE(review): these are set after the Hugging Face libraries have been
    # imported; they may be read at import time - confirm they take effect.
    os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "180"
    os.environ["HF_HUB_DOWNLOAD_RETRY"]   = "20"

    # Read PDF's
    dataset_path = r"D:\Intelligent QA AI\research_docs"
    print("Extracting PDF's...")
    all_docs = read_pdfs(dataset_path)

    # Generate chunks
    print("Chunks are being created...")
    chunks = generate_chunks(all_docs)

    # Add chunks into vector database
    print("Chunks being added into vector database...")
    vstore = store_embeddings(chunks)

    try:
        # Load the model and tokenizer into memory
        model_id = "TheBloke/PMC_LLAMA-7B-GPTQ"
        print("Loading Tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
        os.makedirs("./model_offload", exist_ok=True)
        print("Loading Model...")
        model = AutoGPTQForCausalLM.from_quantized(model_id,
                                                   device_map="auto",
                                                   max_memory={0: "5GB", "cpu": "14GB"},  # Adjust based on your system
                                                   offload_folder="./model_offload", use_safetensors=True, trust_remote_code=True)
        # Enter user query
        print("RAG Question Answering System")
        print("Type 'quit', 'exit', or 'stop' to end the session")
        print("-" * 50)

        exit_commands = {'quit', 'exit', 'stop', 'q'}

        while True:
            try:
                query = input("\nAsk your query: ").strip()
            except (EOFError, KeyboardInterrupt):
                # Closed stdin or Ctrl-C ends the session like an exit command.
                print("Thank you for using the RAG system. Goodbye!")
                break

            # Check for exit commands
            if query.lower() in exit_commands:
                print("Thank you for using the RAG system. Goodbye!")
                break

            # Skip empty queries
            if not query:
                print("Please enter a valid query.")
                continue

            # Process the query
            answer = inference_through_llm(vstore, query, tokenizer, model)
            print(f"\nAnswer: {answer}")
    finally:
        # Always empty the collection, even after an error, so the next run
        # does not add duplicate chunks. The synchronous clear() is used
        # because a top-level `await` only works inside IPython/Jupyter.
        vstore.clear()

Extracting PDF's...
Chunks are being created...
Chunks being added into vector database...
Loading Tokenizer...
Loading Model...
RAG Question Answering System
Type 'quit', 'exit', or 'stop' to end the session
--------------------------------------------------

Answer: Hybrid modeling is a promising approach to capturing the


Examples of hybrid modeling

Answer: Hybrid modeling approaches combine the flexibility of machine learning (ML)
with the rigor of process-based models (FPMs). ML models are able to be trained on
a wide range of data, from continuous variables to categorical values, and they can
be used to make predictions even when data are scarce. FPMs, on the other hand, are
limited to systems for which there is a rich body of knowledge and mathematical
formulation, such as
Thank you for using the RAG system. Goodbye!
