# **Installing the Necessary Packages**

In [None]:
!pip install pypdf



In [None]:
!pip install -q transformers einops accelerate langchain bitsandbytes

# **Setting up the model**

In [None]:
pip install -qU langchain-groq

In [None]:
import getpass
import os

from langchain_groq import ChatGroq

# SECURITY: never paste the API key into the notebook. A literal Groq key that
# used to sit in this cell has been removed; treat it as compromised and
# revoke/rotate it in the Groq console.
# The key is taken from the environment, or prompted for without echoing it.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter API key for Groq: ")

# Chat model shared by every query cell below.
model = ChatGroq(model="llama3-8b-8192")

In [None]:
# Smoke test: send a single message to the chat model and print the reply text.
response = model.invoke("Hi I am Sayan")
print(response.content)

Hi Sayan! Nice to meet you! Is there something I can help you with or would you like to chat?


In [None]:
## Embedding
!pip install sentence_transformers



In [None]:
!pip install llama-index



In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.core.llms import ChatMessage
from llama_index.core.prompts.prompts import SimpleInputPrompt

In [None]:
# Read the contents of /content/data with llama-index's SimpleDirectoryReader.
documents = SimpleDirectoryReader("/content/data").load_data()
# Bare expression: the notebook renders the full list of loaded documents.
documents

[Document(id_='d5191d32-3280-42f5-96de-14b3e6da1379', embedding=None, metadata={'page_label': '1', 'file_name': 'Synthesis_of_Chitosin_Chlorin_e6_conjugate_2.pdf', 'file_path': '/content/data/Synthesis_of_Chitosin_Chlorin_e6_conjugate_2.pdf', 'file_type': 'application/pdf', 'file_size': 1230494, 'creation_date': '2024-12-18', 'last_modified_date': '2024-12-18'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Indian Institute of Technology Guwahati\nQuick Submit\nQuick Submit\nDey\nSayan Sarkar\nDocument Details\nSubmission ID\ntrn:oid:::1:3078924115\nSubmission Date\nNov 14, 2024, 2:57 PM GMT+5:30\nDownload Date\nNov 14, 2024, 3:05 PM GMT

In [None]:
# NOTE(review): langchain_openai is installed here but not imported anywhere in
# the visible notebook — confirm whether it is still needed.
%pip install langchain_chroma langchain_openai



In [None]:
pip install langchain-huggingface



In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# NOTE(review): `embeddings_model` is rebound in a later cell (via
# create_embeddings_model, which uses all-MiniLM-L6-v2) before the vector
# store is built, so this mpnet instance appears to be unused — confirm.
embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [None]:
!pip install langchain_community



In [None]:
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Split the documents using LangChain's RecursiveCharacterTextSplitter
def process_and_split_documents(documents, chunk_size=1000, chunk_overlap=200, text_splitter=None):
    """Split the text of each document into chunks.

    Args:
        documents: Iterable of objects that expose their content as a ``text``
            attribute.
        chunk_size: Chunk size given to the default splitter.
        chunk_overlap: Overlap between chunks given to the default splitter.
        text_splitter: Optional object with a ``split_text(str)`` method. When
            omitted, a ``RecursiveCharacterTextSplitter`` is created from
            ``chunk_size`` and ``chunk_overlap``.

    Returns:
        list: Chunks of all documents, concatenated in document order.
    """
    if text_splitter is None:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    splits = []
    for doc in documents:
        text = getattr(doc, "text", None)
        if not isinstance(text, str):
            # A missing attribute and a non-string value (e.g. None) are
            # treated alike: report the document and keep going instead of
            # failing inside the splitter.
            print("Document does not have text content.")
            continue
        splits.extend(text_splitter.split_text(text))
    return splits

# Chunk the loaded documents; `split_texts` is the list of chunks that gets embedded below.
split_texts = process_and_split_documents(documents)

# Generate embeddings using a local HuggingFace model
def create_embeddings_model(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the HuggingFace embedding model.

    Args:
        model_name: Name of the model handed to ``HuggingFaceEmbeddings``.
            Defaults to the MiniLM model this notebook was written with.

    Returns:
        HuggingFaceEmbeddings: Embedding model for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

# Initialize the embeddings model. This rebinds `embeddings_model`, replacing
# the instance created in an earlier cell.
embeddings_model = create_embeddings_model()

# Store the document chunks in Chroma
def store_in_chroma(text_chunks, embeddings_model, persist_directory="./chroma_storage"):
    """Embed text chunks and store them in a Chroma vector store.

    Args:
        text_chunks: List of strings to index.
        embeddings_model: Embedding model passed to Chroma as ``embedding``.
        persist_directory: Directory handed to Chroma for persisting the
            store. Defaults to ``"./chroma_storage"``.

    Returns:
        The vector store returned by ``Chroma.from_texts``.
    """
    # NOTE(review): re-running this against an existing persist_directory
    # presumably adds the same chunks again — confirm, and clear the directory
    # or pass stable ids if duplicates show up in retrieval results.
    return Chroma.from_texts(
        texts=text_chunks,
        embedding=embeddings_model,
        persist_directory=persist_directory,
    )

# Embed the chunks and index them in Chroma (persisted by store_in_chroma).
vectorstore = store_in_chroma(split_texts, embeddings_model)

# Retriever view of the vector store; this is what the query functions receive.
retriever = vectorstore.as_retriever()

# # Query the retriever
# query = "What is Chitosan"
# results = retriever.get_relevant_documents(query)

# # Display the results
# for i, result in enumerate(results):
#     print(f"Result {i + 1}:\n{result.page_content}\n")


In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.schema import ChatMessage, HumanMessage, AIMessage

# Prompt for the basic RAG pipeline. The text has two placeholders:
#   {context}  - the retrieved chunks
#   {question} - the user's query
# It tells the model to admit when the context does not contain the answer.
prompt_template = """
You are a helpful assistant. Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say you don't know. DO NOT try to make up an answer.

Context:
{context}

Question:
{question}

Answer:
"""

# Notebook-level template object; query_rag_pipeline reads it as a global.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template
)

In [None]:
# Step 6: Define a function to retrieve documents and query the LLM
def query_rag_pipeline(query, retriever, model):
    """Answer ``query`` with the LLM, using documents from ``retriever`` as context.

    Uses the notebook-level ``prompt`` template, which is formatted with
    ``context`` and ``question``.

    Args:
        query: The user's question.
        retriever: Retriever whose ``invoke(query)`` returns documents that
            expose ``page_content``.
        model: Chat model whose ``invoke(messages)`` returns the reply message.

    Returns:
        The model's reply message; callers read the answer from ``.content``.
    """
    # `invoke` is the Runnable entry point. It replaces the deprecated
    # `retriever.get_relevant_documents(query)` and `model(messages)` calls and
    # matches how the model is already called elsewhere in this notebook.
    docs = retriever.invoke(query)

    # All retrieved chunks become one newline-separated context block.
    context = "\n".join(doc.page_content for doc in docs)
    formatted_prompt = prompt.format(context=context, question=query)

    # The whole formatted prompt is sent as a single human message.
    messages = [HumanMessage(content=formatted_prompt)]
    return model.invoke(messages)

Response: According to the context, Chitosan is a positively charged polymer made up of units of (1-4)-β-linked d-glucosamine (GlcN) and N-acetyl-d-glucosamine (GlcNAc).


In [None]:
# In-domain question: the answer should come from the indexed document.
query = "What is Chitosan?"
response = query_rag_pipeline(query, retriever, model)

# Print only the text of the model's reply.
print("AI Response:", response.content)

AI Response: According to the text, Chitosan is a positively charged polymer made up of units of (1-4)-β-linked d-glucosamine (GlcN) and N-acetyl-d-glucosamine (GlcNAc). It has attracted significant interest in the field of biomaterials and biomedical research due to its excellent compatibility with biological tissues and ability to be easily degraded and absorbed by the body.


In [None]:
# Out-of-domain question: checks that the prompt's "say you don't know"
# instruction holds when the context has no answer.
query = "What is 'Attention is all you need'?"
response = query_rag_pipeline(query, retriever, model)

# Print only the text of the model's reply.
print("AI Response:", response.content)

AI Response: I don't know. The provided context does not mention "Attention is all you need". It seems to be a report or submission related to a photosensitizer and flags, but it does not appear to be related to the phrase "Attention is all you need".


In [None]:
# Broad summary request. Only the retrieved chunks are passed to the model,
# not the whole document, so the summary is limited to those chunks.
query = "Can you give me a summary of the document"
response = query_rag_pipeline(query, retriever, model)

# Print only the text of the model's reply.
print("Response:", response.content)

Response: Based on the provided context, it appears to be a page from a document titled "Integrity Submission" with a submission ID of trn:oid:::1:3078924115. The page contains a schematic representation of the synthesis of a CSO-Ce6 conjugate, which is a chemical compound.

However, I must note that the context provided is limited to a single page, and it does not provide a comprehensive summary of the entire document. If you would like me to provide more information, I would need additional context or access to the entire document.


In [80]:
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field

# Define the structure of the output
class ResponseSchema(BaseModel):
    # Answer text plus the list of sources it was derived from; both fields are
    # required (`...` as the default).
    # NOTE(review): nothing else in the visible notebook references this class
    # (RAGResponse is used by the pipeline) — confirm whether it can be removed.
    answer: str = Field(..., description="The answer to the user's question.")
    sources: list[str] = Field(..., description="A list of sources used to generate the answer.")


In [89]:
import re
import json
from typing import Optional, List, Dict, Any
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.schema import ChatMessage, HumanMessage, AIMessage
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, ValidationError

# Define a structured output model
class RAGResponse(BaseModel):
    # Structured result of the RAG pipeline: the answer text, the documents it
    # was based on, and a confidence value constrained to the range 0..1.
    # NOTE(review): documented with comments rather than a class docstring on
    # purpose — a docstring would presumably be emitted as the schema
    # description and so change the format instructions sent to the LLM; confirm
    # before adding one.
    answer: str = Field(description="Comprehensive answer to the query")
    # Defaults to a fresh empty list per instance.
    source_documents: List[Dict[str, Any]] = Field(
        description="List of source documents used to generate the answer",
        default_factory=list
    )
    # Values outside [0, 1] are rejected by validation (ge/le bounds).
    confidence_score: float = Field(
        description="Confidence of the answer (0-1 scale)",
        ge=0,
        le=1,
        default=0.0
    )

    # Example payload attached to the model's JSON schema.
    # NOTE(review): class-based `Config` is the older pydantic configuration
    # style; newer pydantic favours `model_config` — confirm which pydantic
    # version this notebook targets before migrating.
    class Config:
        json_schema_extra = {
            "example": {
                "answer": "The document discusses key strategies for project management.",
                "source_documents": [
                    {"page_content": "Excerpt from document 1", "metadata": {"source": "doc1.pdf", "page": 5}}
                ],
                "confidence_score": 0.85
            }
        }

def create_enhanced_prompt_template(output_parser: PydanticOutputParser):
    """
    Build the prompt used by the structured-output RAG pipeline.

    The parser's format instructions are bound into the template up front, so
    callers only supply ``context`` and ``question`` when formatting.

    Args:
        output_parser (PydanticOutputParser): Source of the format instructions

    Returns:
        PromptTemplate: Template with ``format_instructions`` pre-filled
    """
    # Resolve the parser-specific part first; it is fixed for this template.
    format_instructions = output_parser.get_format_instructions()

    template_text = """You are a precise and helpful research assistant.
Analyze the provided context carefully and answer the question with the following guidelines:

1. Provide a comprehensive and accurate answer based strictly on the given context.
2. If the answer cannot be definitively found in the context, acknowledge this explicitly.
3. Assess and report your confidence in the answer.
4. Cite the specific sources used for your answer.

{format_instructions}

Context:
{context}

Question: {question}

Provide your response DIRECTLY as a JSON object matching the specified format.
Do NOT nest the response inside another key like "example"."""

    bound_variables = {"format_instructions": format_instructions}
    return PromptTemplate(
        input_variables=["context", "question"],
        partial_variables=bound_variables,
        template=template_text,
    )

def extract_and_clean_json(text: str) -> Dict[str, Any]:
    """
    Extract the first JSON object found in an LLM response.

    The whole text is tried first. If that is not a JSON object, every ``{`` is
    tried in turn as the start of an object, so surrounding prose, Markdown
    fences, or stray braces after the object do not defeat the extraction.
    A top-level ``"example"`` wrapper is unwrapped when its value is itself an
    object.

    Args:
        text (str): Input text potentially containing JSON

    Returns:
        Dict[str, Any]: The extracted object, or ``{}`` when no JSON object can
        be found (including when the JSON is only an array or a scalar).
    """
    def _unwrap(candidate: Any) -> Optional[Dict[str, Any]]:
        # Callers use ``.get`` on the result, so only dictionaries qualify.
        if not isinstance(candidate, dict):
            return None
        inner = candidate.get('example')
        # Unwrap {"example": {...}} but leave a non-object "example" field alone.
        return inner if isinstance(inner, dict) else candidate

    if not isinstance(text, str):
        return {}

    # Fast path: the response is nothing but JSON.
    try:
        whole = _unwrap(json.loads(text))
        if whole is not None:
            return whole
    except json.JSONDecodeError:
        pass

    # Slow path: decode from each opening brace. raw_decode stops at the end of
    # the first complete value, so trailing text (even text containing '}')
    # is ignored, unlike a greedy ``\{.*\}`` match.
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        found = _unwrap(candidate)
        if found is not None:
            return found
        start = text.find('{', start + 1)

    # Fallback
    return {}

def new_query_rag_pipeline(
    query: str,
    retriever,
    model,
    top_k: int = 3
):
    """
    Enhanced RAG pipeline with structured output and source tracking.

    Args:
        query (str): User's query
        retriever: Document retriever
        model: Language model
        top_k (int): Maximum number of retrieved documents to keep. The
            retriever's own result count is not changed here, so this can only
            truncate what the retriever returns.

    Returns:
        RAGResponse: Structured response with answer, sources, and confidence.
        If the model output cannot be turned into a valid ``RAGResponse``, the
        answer carries the error and the raw model output, with confidence 0.0
        and no sources.
    """
    # Create output parser and the prompt that embeds its format instructions
    output_parser = PydanticOutputParser(pydantic_object=RAGResponse)
    prompt = create_enhanced_prompt_template(output_parser)

    # `invoke` is the Runnable entry point; it replaces the deprecated
    # `retriever.get_relevant_documents(query)` call.
    docs = retriever.invoke(query)[:top_k]

    # Combine retrieved documents into a numbered context block.
    # NOTE(review): the vector store in this notebook is built from plain text
    # chunks without metadata, so 'page' presumably always falls back to 'N/A'
    # — confirm.
    context = "\n\n".join(
        f"Document {i+1} (Page {doc.metadata.get('page', 'N/A')}):\n{doc.page_content}"
        for i, doc in enumerate(docs)
    )

    formatted_prompt = prompt.format(
        context=context,
        question=query
    )

    # Query the LLM (`invoke` replaces the deprecated `model(messages)` call)
    response = model.invoke([HumanMessage(content=formatted_prompt)])

    try:
        json_dict = extract_and_clean_json(response.content)
        if not isinstance(json_dict, dict):
            # Valid JSON that is not an object (e.g. a list) has no fields to
            # read; treat it like "nothing extracted".
            json_dict = {}

        return RAGResponse(
            answer=json_dict.get('answer', 'No answer found'),
            confidence_score=json_dict.get('confidence_score', 0.0),
            # Sources are taken from the retrieved documents themselves, not
            # from whatever the model claims to have cited.
            source_documents=[
                {
                    "page_content": doc.page_content,
                    "metadata": doc.metadata
                } for doc in docs
            ]
        )
    except Exception as e:
        # Best-effort fallback: surface the failure (e.g. a validation error on
        # the model's fields) together with the raw output instead of raising.
        return RAGResponse(
            answer=f"Error parsing response: {str(e)}. Original response: {response.content}",
            confidence_score=0.0,
            source_documents=[]
        )

In [90]:
# Same in-domain question, this time through the structured-output pipeline.
query = "What is Chitosan?"
result = new_query_rag_pipeline(query, retriever, model)
# `result` is a RAGResponse: answer text, retrieved source chunks, confidence.
print("AI Response:", result.answer)
print("Sources:", result.source_documents)
print("Confidence Score:", result.confidence_score)

AI Response: Chitosan is a positively charged polymer made up of units of (1-4)-β-linked d-glucosamine (GlcN) and N-acetyl-d-glucosamine (GlcNAc), which has excellent compatibility with biological tissues and can be easily degraded and absorbed by the body.
Sources: [{'page_content': '5 \n \nThe primary characteristic of irreversible chromophores is their photo \ncleavability, which makes them useful for regulated drug administration and \nphotodegradable materials. (“Cui and Del Campo, 2014”).  \nIn recent years, chitosan, a positively charged polymer made up of units of (1-\n4)-β-linked d -glucosamine (GlcN)  and N-acetyl-d-glucosamine (GlcNAc) , has \nattracted significant interest in the field of bio  medical research.  Chitosan has \nexcellent compatibility with biological tissues and can be easily deg raded and \nabsorbed by the body, because of this it is widely used for chemical applications.  \nMinimally invasive techniques used in tissue regeneration, advanced wound care, \na