# RAG with LangChain framework

In [None]:
# Required imports
import getpass
import os
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from langchain_ollama import OllamaEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import NLTKTextSplitter


In [2]:
# Set the Groq API key for accessing the LLM.
# Prompt only when the key is not already present in the environment, so an
# exported key is not overwritten and re-running the cell does not re-prompt.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")


 ········


## Loading and Splitting the data

In [3]:
# Load the PDF document using PyMuPDFLoader; `data` holds the loaded documents
# that the splitting step works on.
file_path = "./data/AI_and_Technology_Knowledge_Base.pdf"  # Path to the source PDF, relative to the working directory
loader = PyMuPDFLoader(file_path)
data = loader.load()

In [4]:
# Download the NLTK 'punkt_tab' resource.
# NOTE(review): presumably the tokenizer data the NLTK-based text splitter relies on — confirm.
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
# Linguistic-based splitting: configure an NLTKTextSplitter that produces small
# chunks with a slight overlap between neighbouring chunks.
text_splitter = NLTKTextSplitter(
    chunk_size=300,  # target maximum chunk size (presumably in characters — TODO confirm)
    chunk_overlap=20  # overlap carried over between consecutive chunks (same units as chunk_size)
)

In [6]:
# Split the loaded documents into chunks and report how many chunks were produced
documents = text_splitter.split_documents(data)
print(len(documents))

26


In [7]:
# Inspect the first chunk (metadata and page_content) as a sanity check on the split
documents[0]


Document(metadata={'source': './data/AI_and_Technology_Knowledge_Base.pdf', 'file_path': './data/AI_and_Technology_Knowledge_Base.pdf', 'page': 0, 'total_pages': 2, 'format': 'PDF 1.4', 'title': 'AI_and_Technology_Knowledge_Base.docx', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'Skia/PDF m131 Google Docs Renderer', 'creationDate': '', 'modDate': '', 'trapped': ''}, page_content='AI and Technology Knowledge Base\nIntroduction to Artificial Intelligence\nArtificial Intelligence (AI) is a branch of computer science that focuses on creating machines\ncapable of performing tasks that would normally require human intelligence.')

In [22]:
# Alternative splitter (RecursiveCharacterTextSplitter): it produced poorer chunk boundaries on this document, so it is kept here disabled for reference
#text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
#documents = text_splitter.split_documents(data)
#print(len(documents))

8


## Embedding & Vector Database

In [8]:
# Set up the embedding model used to vectorise the chunks
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Build a Chroma vector store from the chunks and their embeddings.
# NOTE(review): if ./db already holds data from a previous run, re-running this
# cell presumably adds the same chunks again — confirm, and clear the directory if so.
persist_directory = "./db"  # Directory to store the vector database
vector_store = Chroma.from_documents(documents, embeddings, persist_directory=persist_directory)


## LLM and Retrieval Mechanism

In [9]:

# Define the ChatGroq chat model used to generate answers
llm = ChatGroq(
    model="mixtral-8x7b-32768",  # NOTE(review): confirm this model id is still served by Groq
    temperature=0,  # 0 keeps the output as deterministic as possible
    max_tokens=None,  # No explicit cap on response length is set here
    timeout=None,  # No explicit request timeout is set here
    max_retries=2  # Number of retries on failures
)

# Create a similarity-search retriever over the vector store
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 10})  # k = number of chunks requested per query; adjust as needed


## Building a Chatbot interface

In [11]:
# Custom prompt template for the RAG chatbot.
# Placeholders filled via str.format: {context} (retrieved document text),
# {conversation_history} (prior turns) and {user_input} (the current question).
PROMPT_TEMPLATE = """
You are a helpful assistant. Use the following context to answer the user's question accurately. Do not make up information or include anything not present in the context. If you are not sure about any answer ask the user for clarification.

Context:
{context}

Conversation History:
{conversation_history}

User's Question:
{user_input}

Assistant's Response:
"""

In [12]:
def chat_with_llm(prompt: str, chat_history=None, custom_template: str = PROMPT_TEMPLATE) -> None:
    """Answer ``prompt`` using retrieved context and print the result.

    The function retrieves documents for ``prompt`` with the module-level
    ``retriever``, formats ``custom_template`` with the retrieved text, the
    prior conversation and the question, sends it to the module-level ``llm``
    and prints the response followed by the first three retrieved sources.

    Args:
        prompt: The user's current question.
        chat_history: List of ``{"user": ..., "assistant": ...}`` dicts from
            earlier turns. The new turn is appended to this list in place, so
            a caller that passes its own list sees the update. When omitted,
            a fresh empty history is used for this call only.
        custom_template: Format string with ``{context}``,
            ``{conversation_history}`` and ``{user_input}`` placeholders.

    Returns:
        None. Results and errors are printed; exceptions are not propagated
        so that an interactive session survives a failed turn.
    """
    # A list default would be created once and silently shared by every call
    # that omits the argument; build a new one per call instead.
    if chat_history is None:
        chat_history = []

    try:
        # Combine the chat history with the current prompt for contextual awareness
        conversation_context = "\n\n".join(
            f"User: {entry['user']}\nAssistant: {entry['assistant']}"
            for entry in chat_history
        )

        # Retrieve relevant documents based on the current prompt using the retriever
        docs = retriever.invoke(prompt)

        # Combine the relevant documents into a single numbered context string
        context = "\n\n".join(
            f"Document {number}: {doc.page_content}"
            for number, doc in enumerate(docs, start=1)
        )

        # Fill the template so the model sees context, history and question together
        model_input = custom_template.format(
            context=context,
            conversation_history=conversation_context,
            user_input=prompt,
        )

        # Generate a response using the LLM with temperature set to 0 for factual responses
        response = llm.invoke(model_input, temperature=0)

        # Record this turn so the next call can include it in the conversation history
        chat_history.append({
            "user": prompt,
            "assistant": response.content,
        })

        # Print the model's response
        print(f"LLM Response: {response.content}\n")

        # Print the retrieved sources with their content, limited to the top 3 (can be increased)
        print("Top 3 Sources and Relevant Content:")
        for number, doc in enumerate(docs[:3], start=1):
            source_info = doc.metadata.get('source', 'No Source Information')
            print(f"{number}: {source_info}")
            print(f"Content: {doc.page_content[:300]}...")  # First 300 characters for brevity
            print("-" * 50)

    except Exception as e:
        # Boundary of one chat turn: report the failure and keep the session usable
        print(f"Error: {e}")

In [13]:
# Chatbot in interactive mode with memory and a custom prompt template
def start_interactive_chat():
    """Run a console chat loop that keeps conversation memory across turns.

    Reads questions with ``input()`` and forwards each one, together with the
    accumulated history, to ``chat_with_llm`` using ``PROMPT_TEMPLATE``.
    The loop ends when the user types ``exit`` or ``quit`` (case-insensitive,
    surrounding whitespace ignored), or when input is closed or interrupted.
    Blank lines are ignored rather than sent to the model.
    """
    chat_history = []  # Shared across turns; chat_with_llm appends to it in place
    print("Welcome to the chatbot! Type 'exit' to quit.\n")
    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt ends the session cleanly instead of with a traceback
            print("\nGoodbye!")
            break

        if not user_input:
            # Nothing to ask; avoid a retrieval + LLM round trip on an empty question
            continue

        if user_input.lower() in ("exit", "quit"):
            print("Goodbye!")
            break

        # Call the chat_with_llm function with the current input, chat history, and custom prompt template
        chat_with_llm(user_input, chat_history, custom_template=PROMPT_TEMPLATE)

# Start the interactive chat; this blocks on input() until the user types 'exit' or 'quit'
start_interactive_chat()

Welcome to the chatbot! Type 'exit' to quit.



You:  Can you explain blockchain?


LLM Response: Sure, I'd be happy to explain blockchain based on the provided documents.

Blockchain is a type of technology that underpins cryptocurrencies like Bitcoin and Ethereum, but its applications extend far beyond digital currencies. It's essentially a chain of blocks, where each block contains a set of transactions. This technology is decentralized and distributed, meaning it's not controlled by any single entity and transactions are recorded across multiple computers in a way that can't be altered retroactively.

Each participant (or node) in the blockchain has a copy of the entire ledger, eliminating the need for intermediaries like banks in peer-to-peer transactions. Once a block is completed, it's added to the chain in a linear, chronological order, and the record of the transaction is permanently stored.

One of the key features of blockchain is its security. This is ensured by linking each block in the chain to the previous block using a cryptographic hash, which maintai

You:  what is a block?


LLM Response: A block, in the context of blockchain technology, is a unit of data storage that holds a set of transactions. Once a block is filled with transactions, it is added to the blockchain in a linear, chronological order. The record of these transactions is permanently stored in the blockchain. Each block is linked to the previous block using a cryptographic hash, which helps maintain the integrity and security of the entire chain.

Top 3 Sources and Relevant Content:
1: ./data/AI_and_Technology_Knowledge_Base.pdf
Content: Blockchain technology underpins cryptocurrencies like Bitcoin and
Ethereum, but its applications extend far beyond digital currencies.

In essence, a blockchain
is a chain of blocks, where each block contains a set of transactions....
--------------------------------------------------
2: ./data/AI_and_Technology_Knowledge_Base.pdf
Content: Once a block is
completed, it is added to the chain in a linear, chronological order, and the record of the
transaction i

You:  what was my first question?


LLM Response: Your first question was, "Can you explain blockchain?" I provided an explanation based on the documents provided, describing blockchain as a type of technology that underpins cryptocurrencies, a chain of blocks where each block contains a set of transactions, and its decentralized and distributed nature, among other features.

Top 3 Sources and Relevant Content:
1: ./data/AI_and_Technology_Knowledge_Base.pdf
Content: In the 1950s, computer scientists like Alan Turing and John McCarthy
laid the foundation for AI by exploring questions of machine intelligence.

Turing's 'Turing
Test' remains a benchmark for evaluating a machine's ability to exhibit intelligent behavior
indistinguishable from that of a human....
--------------------------------------------------
2: ./data/AI_and_Technology_Knowledge_Base.pdf
Content: Machine learning models can be trained to recognize patterns in data,
enabling them to make predictions and decisions.

In recent years, advances in neural
netw

You:  exit


Goodbye!
