### Import the libraries

In [None]:
import os
import gc

import torch
from dotenv import load_dotenv
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM  
from langchain_community.document_loaders import PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

from pymongo import MongoClient
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain_mongodb import MongoDBAtlasVectorSearch

from langchain_community.document_transformers.openai_functions import (
    create_metadata_tagger,
)

### Set the API token and other secret keys

In [None]:
# Load variables from a local .env file into the process environment so the
# secrets read below are available via os.environ.
load_dotenv()   

In [None]:
# MongoDB connection string; raises KeyError if MONGODB_URI is not set.
MONGODB_URI = os.environ["MONGODB_URI"]

In [None]:
# Intended to make Hugging Face Hub downloads more tolerant of slow links
# (longer timeout, more retries).
# NOTE(review): transformers is already imported before this cell runs; if the
# hub client reads these variables at import time they must be set earlier,
# and HF_HUB_DOWNLOAD_RETRY should be checked against the installed
# huggingface_hub version — confirm.
os.environ.update(
    {
        "HF_HUB_DOWNLOAD_TIMEOUT": "180",
        "HF_HUB_DOWNLOAD_RETRY": "20",
    }
)

In [None]:
# Names of the target database, collection and vector-search index.
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"
dbName = "hybridModel_mongodb_chunks"
collectionName = "chunked_data"

# Open the client and resolve the collection that will hold the chunks.
client = MongoClient(MONGODB_URI)
database = client[dbName]
collection = database[collectionName]

### Read the research papers

In [None]:
# Load every PDF in the dataset folder; each file contributes at most one
# Document (the loader is asked for mode="single" and only docs[0] is kept).
dataset_path = r"D:\Intelligent QA AI\research_docs"
all_docs = []

# os.listdir() returns entries in arbitrary order; sorting keeps all_docs
# (and therefore all_docs[0]) reproducible between runs.
for file in sorted(os.listdir(dataset_path)):
    # Match the extension case-insensitively so "paper.PDF" is not skipped.
    if not file.lower().endswith(".pdf"):
        continue

    file_path = os.path.join(dataset_path, file)
    loader = PyPDFLoader(file_path, mode="single")
    docs = loader.load()

    # Skip files that yield no document at all (indexing an empty list would
    # raise IndexError) or that contain 20 or fewer space-separated words.
    if docs and len(docs[0].page_content.split(" ")) > 20:
        all_docs.append(docs[0])

In [None]:
# Number of PDFs that passed the minimum-length filter.
print(len(all_docs))

In [None]:
# Inspect the extracted text of the first loaded document.
doc = all_docs[0]
doc.page_content

In [None]:
# Inspect the metadata the loader attached to that same document.
doc.metadata

### Split the text into chunks

In [None]:
# Split the documents into chunks of up to 900 characters with a
# 100-character overlap; sizes are measured with len(), i.e. in characters.
splitter_settings = {
    "chunk_size": 900,
    "chunk_overlap": 100,
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

chunks = text_splitter.split_documents(all_docs)

In [None]:
# Total number of chunks produced by the splitter.
len(chunks)

In [None]:
# Preview the first chunks. Slicing instead of indexing 0..8 avoids an
# IndexError when fewer than nine chunks were produced.
for chunk in chunks[:9]:
    print(chunk.page_content)
    print("\n")

In [None]:
# Run a garbage-collection pass before loading the embedding model.
gc.collect()

### Creating Vector Embeddings & Storing Embeddings in a Vector Database

In [None]:
# Embedding model handed to the vector store both when the chunks are stored
# and when the store is re-opened for querying.
embedding_model_name = "NeuML/pubmedbert-base-embeddings"
embedding = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [None]:
# Build the Atlas vector store from the chunks, using the embedding model,
# the target collection and the vector-search index name defined earlier.
# NOTE(review): this presumably embeds and inserts every chunk each time the
# cell runs, so re-running it may store duplicates — confirm.
vector_store = MongoDBAtlasVectorSearch.from_documents(
    documents=chunks,
    embedding=embedding,
    collection=collection,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME
)

In [None]:
# Re-open the same collection as a vector store through the connection
# string; the namespace has the form "<database>.<collection>".
namespace = f"{dbName}.{collectionName}"
vector_store = MongoDBAtlasVectorSearch.from_connection_string(
    MONGODB_URI,
    namespace,
    embedding,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)

In [None]:
# Run a garbage-collection pass once the vector store is set up.
gc.collect()

### Find the similar chunks from the database

In [None]:
# Retrieve the three chunks most similar to the question. The query is passed
# as plain text; the vector store embeds it internally with the same embedding
# model that was used to build the store.
question = "What is hybrid modeling approach?"
top_k = 3
searchDocs = vector_store.similarity_search(question, k=top_k)

In [None]:
# Print each retrieved chunk followed by a blank separator; iterate the
# documents directly instead of indexing through range(len(...)).
for retrieved in searchDocs:
    print(retrieved.page_content)
    print("\n")

### Load the tokenizer and count the tokens

In [None]:
# Hub repository id of the checkpoint; reused below when the model is loaded.
# NOTE(review): the id mixes "_" (PMC_LLAMA) and "-" (7B-GPTQ) — copy it exactly.
model_id  = "TheBloke/PMC_LLAMA-7B-GPTQ"

# Load the matching tokenizer, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

In [None]:
# Count the tokens of the retrieved chunks, i.e. the context that will later
# be placed in the prompt (the instruction and question text are not counted).
# Each chunk is tokenized on its own, so any special tokens the tokenizer adds
# are counted once per chunk.
total_tokens = sum(
    len(tokenizer(retrieved.page_content)["input_ids"])
    for retrieved in searchDocs
)
print("Number of tokens in input prompt:", total_tokens)

In [None]:
# Show the maximum sequence length the tokenizer reports for this model.
print(f"Tokenizer model max length: {tokenizer.model_max_length}")

In [None]:
from transformers import AutoConfig

# Load the configuration of the same checkpoint the tokenizer was loaded from;
# reusing model_id keeps the two from drifting apart if the id is changed.
config = AutoConfig.from_pretrained(model_id)

# Report the length-related settings. max_length and n_positions are optional
# attributes, so a placeholder is printed when the config does not define them.
print(f"Max position embeddings: {config.max_position_embeddings}")
print(f"Model max length: {getattr(config, 'max_length', 'Not specified')}")
print(f"N positions: {getattr(config, 'n_positions', 'Not specified')}")


### Load LLM model

In [None]:
# Run a garbage-collection pass before loading the language model.
gc.collect()

In [None]:
# Directory handed to the loader as its offload folder; created up front.
offload_dir = "./model_offload"
os.makedirs(offload_dir, exist_ok=True)

# Per-device memory caps passed to the loader.
memory_limits = {0: "5GB", "cpu": "14GB"}  # Adjust based on your system

# Load the quantized checkpoint with automatic device placement.
model = AutoGPTQForCausalLM.from_quantized(
    model_id,
    device_map="auto",
    max_memory=memory_limits,
    offload_folder=offload_dir,
    use_safetensors=True,
    # NOTE(review): permits code shipped in the model repository to run
    # locally — confirm the repository is trusted.
    trust_remote_code=True,
)


In [None]:
# Display the loaded model object (notebook repr).
model

### Make inference through LLM by providing the context

In [None]:
# Concatenate the retrieved chunks, separated by blank lines, into one context.
context_text = "\n\n".join(retrieved.page_content for retrieved in searchDocs)

# NOTE(review): this wording differs slightly from the query string used for
# retrieval earlier in the notebook — confirm that is intended.
question = "What is hybrid modeling?"

# Plain RAG prompt: instruction, context, question, then the "Answer:" cue
# that the model is expected to continue.
prompt = f"""Based on the following context, please answer the question. Answer the question in descriptive way atleast in 4-5 lines.

Context: {context_text}

Question: {question}

Answer:"""


In [None]:
# Tokenize the prompt into PyTorch tensors. With truncation=True the prompt is
# cut off at max_length tokens instead of being passed through over-long.
# NOTE(review): a prompt that reaches the 2048-token cap leaves no room for
# the tokens generated afterwards if the model's context window is also 2048
# — confirm the limit and consider lowering max_length.
inputs = tokenizer(
    prompt,
    truncation=True,
    max_length=2048,
    return_tensors="pt",
)
inputs['input_ids'].shape

In [None]:
# Show the device the model reports, which the inputs are moved to next.
model.device

In [None]:
# Move inputs to the same device as the model
inputs = inputs.to(model.device)  # or inputs.to("cuda") if you know it's on GPU

# Sample a continuation of 50-200 new tokens; generation also stops as soon as
# the model starts writing a new "Question:" section.
outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    min_new_tokens=50,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id,
    stop_strings=["\n\nQuestion:", "\nQuestion:", "Question:"],
    tokenizer=tokenizer
)

# Extract just the answer. The output sequence starts with the prompt tokens,
# so drop them by token count. Slicing the decoded text by len(prompt) is
# unreliable: the decoded prompt need not match the original string character
# for character (e.g. after truncation or whitespace normalisation).
prompt_token_count = inputs["input_ids"].shape[1]
full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
answer = tokenizer.decode(
    outputs[0][prompt_token_count:], skip_special_tokens=True
).strip()
print("Question:", question)
print("Answer:", answer)


### Post-process the answer

In [None]:
# Markers that indicate the model has started echoing prompt scaffolding or
# drifting into a new section; the answer is cut just before the first one.
stop_patterns = [
    "\nContext:",
    "\nQuestion:", 
    "\n\nQuestion:",
    "\nQ:",
    "Context:",
    "Question:",
    "\n\n\n"
]

# Cut at the earliest match in the text, not at the first pattern in the list:
# otherwise an early "Question:" survives when a later "Context:" happens to
# be checked first.
cut_positions = [
    position
    for position in (answer.find(pattern) for pattern in stop_patterns)
    if position != -1
]
if cut_positions:
    answer = answer[:min(cut_positions)].strip()

print("Question:", question)
print("Answer:", answer)

### Chain of thought prompting

In [None]:
# Chain-of-thought variant of the prompt: same context and question, followed
# by four numbered reasoning steps and a cue for step-by-step output.
prompt = f"""Use the provided context to answer the question step-by-step.

Context: {context_text}
Question: {question}

Think through this step by step:
Step 1: What relevant information does the context provide?
Step 2: What can we infer from this information?
Step 3: What additional reasoning is needed?
Step 4: What is the final answer?

Step-by-step reasoning:"""

In [None]:
# Tokenize the current prompt. Without this step `inputs` would still hold the
# tokens of the previously tokenized prompt and the model would answer that
# one instead.
inputs = tokenizer(prompt, return_tensors="pt", max_length=2048, truncation=True)

# Move inputs to the same device as the model
inputs = inputs.to(model.device)  # or inputs.to("cuda") if you know it's on GPU

outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    min_new_tokens=50,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id,
    stop_strings=["\n\nQuestion:", "\nQuestion:", "Question:"],
    tokenizer=tokenizer
)

# Extract just the answer. The output sequence starts with the prompt tokens,
# so drop them by token count rather than slicing the decoded text by
# len(prompt), which is unreliable when decoding does not reproduce the prompt
# exactly.
prompt_token_count = inputs["input_ids"].shape[1]
full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
answer = tokenizer.decode(
    outputs[0][prompt_token_count:], skip_special_tokens=True
).strip()
print("Question:", question)
print("Answer:", answer)


### Based on guidelines

In [None]:
# Guideline-driven variant of the prompt: frames the model as a medical
# assistant, supplies the retrieved context, and lists explicit response rules.
prompt = f"""As a medical AI assistant, analyze the following research context to provide evidence-based information.

Retrieved Medical Literature:
{context_text}

Clinical Question: {question}

Guidelines:
- Base your response on the provided literature only
- Cite specific studies or papers when available
- If information is limited, state "Based on the available literature..."
- Maintain clinical accuracy and appropriate caution

Evidence-based Response:"""


In [None]:
# Tokenize the current prompt. Without this step `inputs` would still hold the
# tokens of the previously tokenized prompt and the model would answer that
# one instead.
inputs = tokenizer(prompt, return_tensors="pt", max_length=2048, truncation=True)

# Move inputs to the same device as the model
inputs = inputs.to(model.device)  # or inputs.to("cuda") if you know it's on GPU

outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    min_new_tokens=50,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id,
    stop_strings=["\n\nQuestion:", "\nQuestion:", "Question:"],
    tokenizer=tokenizer
)

# Extract just the answer. The output sequence starts with the prompt tokens,
# so drop them by token count rather than slicing the decoded text by
# len(prompt), which is unreliable when decoding does not reproduce the prompt
# exactly.
prompt_token_count = inputs["input_ids"].shape[1]
full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
answer = tokenizer.decode(
    outputs[0][prompt_token_count:], skip_special_tokens=True
).strip()
print("Question:", question)
print("Answer:", answer)


In [None]:
# Structured TASK / ROLE / INSTRUCTIONS / INPUT variant of the prompt.
# Spelling of the instruction text is corrected ("context", "bio-pharma")
# because this text is what the model actually reads.
prompt = f"""
TASK: Analyze literature and provide evidence-based information.

ROLE: You are a literature analyst with expertise in different techniques used in bio-pharma.

INSTRUCTIONS:
1. READ the provided context carefully
2. IDENTIFY relevant evidence that directly addresses the question
3. ANALYZE the strength and quality of the evidence
4. SYNTHESIZE findings into a coherent, useful response
5. DO NOT add external knowledge or assumptions
6. IF evidence is insufficient, state: "The provided literature contains limited evidence for..."

INPUT:
Medical Literature: {context_text}
Clinical Question: {question}

Begin your evidence-based analysis:
"""


In [None]:
# Tokenize the current prompt into PyTorch tensors, cutting it off at 2048
# tokens if it is longer, then show the resulting tensor shape.
# NOTE(review): a prompt that reaches the cap leaves no room for generated
# tokens if the model's context window is also 2048 — confirm the limit.
inputs = tokenizer(
    prompt,
    truncation=True,
    max_length=2048,
    return_tensors="pt",
)
inputs['input_ids'].shape

In [None]:
# Move inputs to the same device as the model
inputs = inputs.to(model.device)  # or inputs.to("cuda") if you know it's on GPU

outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    min_new_tokens=50,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id,
    stop_strings=["\n\nQuestion:", "\nQuestion:", "Question:"],
    tokenizer=tokenizer
)

# Extract just the answer. The output sequence starts with the prompt tokens,
# so drop them by token count rather than slicing the decoded text by
# len(prompt), which is unreliable when decoding does not reproduce the prompt
# exactly (this prompt begins and ends with a newline, for instance).
prompt_token_count = inputs["input_ids"].shape[1]
full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
answer = tokenizer.decode(
    outputs[0][prompt_token_count:], skip_special_tokens=True
).strip()
print("Question:", question)
print("Answer:", answer)


In [None]:
# Display the context that was inserted into the prompts, for manual review.
context_text

In [None]:
# Run a garbage-collection pass after the generation experiments.
gc.collect()

In [None]:
# Clean-up: delete stored documents from the vector store.
# NOTE(review): with ids=None this presumably removes every document in the
# collection — destructive; confirm before running.
vector_store.delete(ids=None)