# RAG

#### This notebook covers the following
1. Document loading
2. Embedding
3. Vector storage
4. Retrieval

In [None]:
# pip install langchain-community

In [None]:
import getpass
import os

# Make the OpenAI API key available through the environment for the OpenAI
# clients created later in this notebook. A key that is already exported is
# kept as-is; otherwise the key is prompted for instead of being hard-coded,
# so it is never saved inside the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

### Document Loading

#### Text

In [None]:
from langchain_community.document_loaders import TextLoader

# Read a single plain-text file; the result of load() is this cell's output.
loader = TextLoader("./files_for_RAG/LangchainRetrieval.txt")
loader.load()

#### PDF

In [None]:
# !pip install pypdf
# !pip install unstructured

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Load the course PDF and split it in one step; `pages` is inspected in the
# next cell.
loader = PyPDFLoader("./files_for_RAG/Excel_Course_Document.pdf")
pages = loader.load_and_split()

In [None]:
# Show the element at index 1 (the second one) returned by load_and_split().
pages[1]

#### Load Directories

In [None]:
from langchain_community.document_loaders import DirectoryLoader

# Load every .txt file under ./files_for_RAG/, including subdirectories ("**").
loader = DirectoryLoader('./files_for_RAG/', glob="**/*.txt")
docs = loader.load()

print(f"Count documents : {len(docs)}")
# NOTE(review): indexing [1] assumes at least two documents were loaded.
docs[1]

In [None]:
# Same directory load as the previous cell, with show_progress=True.
# NOTE(review): the progress display presumably requires `tqdm` to be
# installed — confirm.
loader = DirectoryLoader('./files_for_RAG/', glob="**/*.txt", show_progress=True)
docs = loader.load()

#### Load CSV

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader


# Load the movie dataset CSV using the loader's default parsing options.
loader = CSVLoader(file_path='./files_for_RAG/Movie_collection_dataset.csv')
data = loader.load()

In [None]:
# Print everything that was loaded from the CSV.
print(data)

In [None]:
# Reload the CSV, this time passing explicit parsing options via csv_args.
# NOTE(review): if csv_args is forwarded to csv.DictReader, supplying
# 'fieldnames' causes the file's own header row to be read as a data row, and
# only three column names are listed here — confirm this matches the file.
loader = CSVLoader(file_path='./files_for_RAG/Movie_collection_dataset.csv', csv_args={
    'delimiter': ',',
    'quotechar': '"',
    'fieldnames': ['Genre', 'Budget', 'Actor_rating']
})

data = loader.load()

### Splitting the document - Chunking

#### Recursively split by character

In [None]:
# %pip install -qU langchain-text-splitters

In [None]:
from langchain_community.document_loaders import TextLoader

# Load the text file that the splitting cells below operate on.
loader = TextLoader("./files_for_RAG/LangchainRetrieval.txt")
text = loader.load()

In [None]:
# Display what the loader returned.
text

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# Splitter configured for chunks of size 200 with an overlap of 20;
# length_function=len means sizes are measured in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
)

In [None]:
# Split the loaded document into chunks and preview the first three.
texts = text_splitter.split_documents(text)
# Slicing (rather than indexing texts[0], texts[1], texts[2]) avoids an
# IndexError when the document produces fewer than three chunks.
for chunk in texts[:3]:
    print(chunk)

### Embedding

#### OpenAI embedding

In [None]:
# pip install langchain-openai

In [None]:
from langchain_openai import OpenAIEmbeddings

# Embedding client created with default settings.
# NOTE(review): presumably relies on OPENAI_API_KEY being set in the
# environment (see the setup cell at the top) — confirm.
embeddings_model = OpenAIEmbeddings()

In [None]:
# Embed three short texts in one call, then show how many vectors came back
# and the length of the first vector.
embeddings = embeddings_model.embed_documents(
    [
        "Hi",
        "What's up!",
        "Learning LangChain"
    ]
)
len(embeddings), len(embeddings[0])

In [None]:
# Inspect the first embedding in full.
embeddings[0]

In [None]:
# Embed a single query string and show its first five values.
embedded_query = embeddings_model.embed_query("What was the name mentioned in the conversation?")
embedded_query[:5]

#### Huggingface embeddings

In [None]:
# !pip install -U sentence-transformers
# !pip install -U langchain-huggingface

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Initialize embeddings backed by the Hugging Face model "hkunlp/instructor-xl".
# This rebinds embeddings_model and embedded_query, which previously held the
# OpenAI embedding client and its query embedding.
embeddings_model = HuggingFaceEmbeddings(model_name="hkunlp/instructor-xl")
embedded_query = embeddings_model.embed_query("What was the name mentioned in the conversation?")

### Vector Storage

#### Chroma

In [None]:
# !pip install langchain-chroma


In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma

# Load the document, split it into chunks (chunk_size=500, chunk_overlap=20),
# embed each chunk and load it into the Chroma vector store bound to `db`.
raw_documents = TextLoader("./files_for_RAG/LangchainRetrieval.txt").load()
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=20)
documents = text_splitter.split_documents(raw_documents)
db = Chroma.from_documents(documents, OpenAIEmbeddings())

In [None]:
# Search the vector store for the chunks most similar to the query text.
query = "What is text embedding and how does langchain help in doing it"
docs = db.similarity_search(query)
# NOTE(review): index 1 is the second result, not the first one returned —
# confirm this is intended (the following cell prints index 0).
print(docs[1].page_content)

In [None]:
# Same search, but embed the query explicitly and search by vector.
# Relies on `query` and `db` defined in the preceding cells.
embedding_vector = OpenAIEmbeddings().embed_query(query)
docs = db.similarity_search_by_vector(embedding_vector)
print(docs[0].page_content)

In [None]:
#### FAISS

In [None]:
# pip install faiss-cpu

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS

# Load the document, split it into chunks (chunk_size=1000, no overlap),
# embed each chunk and load it into a FAISS vector store. This rebinds `db`.
raw_documents = TextLoader("./files_for_RAG/LangchainRetrieval.txt").load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)
db = FAISS.from_documents(documents, OpenAIEmbeddings())

## Retrievers

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma

# Rebuild the vector store for the retriever examples: load the document,
# split it into chunks (chunk_size=300, chunk_overlap=20), embed each chunk
# and load it into Chroma. This rebinds `db`.
raw_documents = TextLoader("./files_for_RAG/LangchainRetrieval.txt").load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=20)
documents = text_splitter.split_documents(raw_documents)
# Use a dedicated collection. Chroma's default in-memory client is shared
# within the process, so writing to the default collection again would mix
# these chunks with the 500-character chunks indexed in the earlier Chroma
# cell and pollute retrieval results.
# NOTE(review): re-running this cell still appends to "retriever_demo";
# restart the kernel (or delete the collection) for a clean index.
db = Chroma.from_documents(
    documents,
    OpenAIEmbeddings(),
    collection_name="retriever_demo",
)

In [None]:
# Wrap the vector store as a retriever, using its default search settings.
retriever = db.as_retriever()

In [None]:
# Run the question through the retriever and keep the returned documents.
docs = retriever.invoke("What is text embedding and how does langchain help in doing it")

In [None]:
# Number of documents the retriever returned.
len(docs)

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Prompt instructing the model to answer using only the supplied context;
# {context} and {question} are filled in by the chain defined below.
template = """Answer the question based only on the following context:

{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
# Chat model created with default settings.
model = ChatOpenAI()


def format_docs(docs):
    """Concatenate each document's page_content, separated by blank lines."""
    contents = (doc.page_content for doc in docs)
    return "\n\n".join(contents)


# RAG chain: the input question is piped to the retriever, whose documents
# are joined by format_docs to fill {context}, and is also passed through
# unchanged as {question}. The filled prompt goes to the model and the reply
# is run through StrOutputParser.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

# Ask the question end to end; the answer is this cell's output.
chain.invoke("What is text embedding and how does langchain help in doing it")


In [None]:
# Rebuild the retriever with k=1 passed through search_kwargs.
retriever = db.as_retriever(search_kwargs={"k": 1})

In [None]:
# Run the same question through the k=1 retriever.
docs = retriever.invoke("What is text embedding and how does langchain help in doing it")

In [None]:
# Display what the k=1 retriever returned.
docs

In [None]:
# Rebuild the retriever in "similarity_score_threshold" mode with a
# score_threshold of 0.8.
# NOTE(review): presumably only documents scoring at least 0.8 are returned,
# so the result can be empty — confirm the score scale for this vector store.
retriever = db.as_retriever(
    search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.8}
)

In [None]:
# Run the same question through the score-threshold retriever.
docs = retriever.invoke("What is text embedding and how does langchain help in doing it")

In [None]:
# Display what the score-threshold retriever returned.
docs