### For this Retrieval-Augmented Generation (RAG) system, I have used:

- LangChain

- Only open-source Hugging Face models

Firstly we will chat with -

- single text document

- and then with Multiple PDF Files

In [None]:
!pip install langchain huggingface_hub sentence_transformers faiss-cpu unstructured chromadb Cython tiktoken unstructured[local-inference] -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m812.8/812.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m525.5/525.5 kB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m79.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m69.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.6/276.6 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━

In [None]:
import getpass
import os

# Read the Hugging Face Hub token from the environment, prompting for it
# interactively (without echo) only when it is not already set.
# Never hardcode the token in the notebook: a token that used to sit in a
# commented-out line of this cell has been removed and should be revoked in
# the Hugging Face account settings, since it was exposed in plain text.
if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass.getpass("Provide your HUGGINGFACEHUB TOKEN")

# Text File

In [None]:
import requests

# Download the sample transcript used for the single-document demo and save
# it next to the notebook as state_of_the_union.txt.
url = "https://raw.githubusercontent.com/hwchase17/chat-your-data/master/state_of_the_union.txt"

# A timeout keeps the cell from hanging forever on a stalled connection.
res = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently saving an error page
# as the document that later gets embedded and queried.
res.raise_for_status()

with open("state_of_the_union.txt", "w") as f:
    f.write(res.text)

In [None]:
# Document Loader
# Import from langchain_community, the package the rest of this notebook
# already uses for embeddings and LLMs; the old `langchain.document_loaders`
# path is a deprecated re-export.
from langchain_community.document_loaders import TextLoader

# Load the downloaded transcript from the working directory.
loader = TextLoader('./state_of_the_union.txt')

# `documents` is whatever loader.load() returns; later cells index it and
# pass it to the text splitter.
documents = loader.load()

In [None]:
# Inspect what the loader returned (the notebook echoes a cell's last expression).
documents

In [None]:
# this is done to represent text in a better way on the screen i.e. for better readability

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap *text* to at most *width* columns while keeping its line breaks.

    The input is cut on ``'\\n'``; each resulting line is wrapped on its own
    with ``textwrap.fill`` and the pieces are glued back together with
    ``'\\n'``, so existing newlines (including blank lines) survive.

    Args:
        text: The string to re-flow.
        width: Maximum line width handed to ``textwrap.fill``.

    Returns:
        The wrapped string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

In [None]:
# Print the string form of the first loaded document, wrapped for readability.
print(wrap_text_preserve_newlines(str(documents[0])))

In [None]:
# Text Splitter
from langchain.text_splitter import CharacterTextSplitter

# Configure the splitter with chunk_size=1000 and no overlap between chunks.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

# Split the loaded documents; `docs` is what gets embedded and indexed later.
docs = text_splitter.split_documents(documents)

In [None]:
# How many entries `docs` currently holds.
len(docs)

In [None]:
# Peek at one entry (index 3) of `docs`.
docs[3]

# Embeddings

In [None]:
# Embeddings

from langchain_community.embeddings import HuggingFaceEmbeddings
# No arguments are passed, so the class's own default configuration is used.
embeddings = HuggingFaceEmbeddings()

In [None]:
# from langchain.embeddings import HuggingFaceEmbeddings
# embeddings = HuggingFaceEmbeddings()

In [None]:
# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
# Import from langchain_community, the package this notebook already uses for
# embeddings and LLMs; `langchain.vectorstores` is a deprecated re-export.
from langchain_community.vectorstores import FAISS

# Build a FAISS store from `docs` using the `embeddings` object.
db = FAISS.from_documents(docs, embeddings)

In [None]:
# This step is retrieval only, not full RAG: no LLM is involved yet.
# We just fetch the entries in the vector store that are similar to the query.
# NOTE: the result is assigned to `docs`, replacing whatever `docs` held
# before this cell ran; re-create it before rebuilding the store from `docs`.

query = "What did the president say about the Supreme Court"
docs = db.similarity_search(query)

In [None]:
# Print the text (page_content) of the first entry in `docs`, wrapped for readability.
print(wrap_text_preserve_newlines(str(docs[0].page_content)))

# QA Chain

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
from langchain_community.llms import HuggingFaceEndpoint

In [None]:
# Hosted Mistral-7B-Instruct model accessed through HuggingFaceEndpoint.
# The generation cap is passed as `max_new_tokens`: `max_length` is not a
# declared field of HuggingFaceEndpoint, so it would not act as the intended
# output limit.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    temperature=0.1,
    max_new_tokens=512,
)

In [None]:
# Build a question-answering chain around `llm` using the "stuff" chain type.
chain = load_qa_chain(llm, chain_type="stuff")

In [None]:
# Retrieval + generation: look up entries similar to the question in the
# vector store, then hand them and the question to the QA chain.
query = "What did the president say about the Supreme Court"
docs = db.similarity_search(query)
chain.run(input_documents=docs, question=query)

In [None]:
# Same flow with a different question; the requested answer length is part of
# the question text itself.
query = "What did the president say about economy? just give 1 liner answer"
docs = db.similarity_search(query)
chain.run(input_documents=docs, question=query)

# Working with PDF Files

In [None]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.indexes import VectorstoreIndexCreator

In [None]:
# connect your Google Drive
from google.colab import drive
# Mount Drive under /content/gdrive; force_remount=True presumably redoes the
# mount even when one already exists (confirm against the Colab docs).
drive.mount('/content/gdrive', force_remount=True)

# Folder on Drive that holds the PDFs to index; adjust to your own layout.
pdf_folder_path = '/content/gdrive/My Drive/pdfs_for_project_assignment_1/'
# List the folder contents as a quick sanity check.
os.listdir(pdf_folder_path)

In [None]:
# Build one loader per PDF in the folder.
# Only entries ending in ".pdf" (case-insensitive) are used, so stray files or
# subfolders in the Drive directory are not handed to the PDF loader, and the
# names are sorted because os.listdir() returns them in arbitrary order.
loaders = [
    UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn))
    for fn in sorted(os.listdir(pdf_folder_path))
    if fn.lower().endswith(".pdf")
]
loaders

In [None]:
# Build an index over every loader in `loaders`, configured with a
# CharacterTextSplitter (chunk_size=1000, chunk_overlap=0) and a
# HuggingFaceEmbeddings instance created with its default arguments.
index = VectorstoreIndexCreator(
    embedding=HuggingFaceEmbeddings(),
    text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)

In [None]:
# Hosted Mistral-7B-Instruct model used to answer questions over the PDFs.
# The generation cap is passed as `max_new_tokens`: `max_length` is not a
# declared field of HuggingFaceEndpoint, so it would not act as the intended
# output limit.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    temperature=0.1,
    max_new_tokens=512,
)

In [None]:
from langchain.chains import RetrievalQA
# Build a retrieval QA chain: `llm` produces the answer, the index's vector
# store is exposed as the retriever, and the chain's input key is "question".
chain = RetrievalQA.from_chain_type(llm=llm,
                                    chain_type="stuff",
                                    retriever=index.vectorstore.as_retriever(),
                                    input_key="question")

In [None]:
# Ask the retrieval QA chain a question about the indexed PDFs.
chain.run('How was the GPT4all model trained?')

In [None]:
# Ask the retrieval QA chain a question about the indexed PDFs.
chain.run('How was the Mistral model trained?')

In [None]:
# Ask the retrieval QA chain a question about the indexed PDFs.
chain.run('what is the structuer used to build mistral model ')