In [1]:
%%capture
!pip install  tiktoken faiss-cpu
!pip install -U sentence-transformers
!pip install langchain
!pip install pypdf
!pip install langchain-huggingface

In [3]:
import os
from dotenv import dotenv_values

# Parse the local .env file into a dict; this does not modify os.environ by itself.
config = dotenv_values(".env")

# Prefer the value from .env, but fall back to a key that is already exported so the
# notebook also runs where the secret is supplied by the environment instead of a file.
groq_api_key = config.get("GROQ_API_KEY") or os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    # Fail here with an actionable message instead of a bare KeyError/TypeError.
    raise KeyError(
        "GROQ_API_KEY is not set: add it to .env or export it before running this notebook"
    )

# Expose the key through the environment for the Groq client used later in the notebook.
os.environ["GROQ_API_KEY"] = groq_api_key


# 🔍 **Retrieval in LangChain Explained**

<img src="https://python.langchain.com/assets/images/data_connection-95ff2033a8faa5f3ba41376c0f6dd32a.jpg">
<img src="https://www.researchgate.net/publication/381125820/figure/fig2/AS:11431281249185289@1717499737731/Illustration-of-a-Retrieval-Augmented-Generation-RAG-workflow-Documents-are-loaded-and.ppm">

### 🌐 **Basic Concept**

Retrieval is like gathering resources to enhance an essay, helping language models access up-to-date, relevant information beyond their built-in knowledge.

💡 **Advantages**:
   - Adds new, fresh information.
   - Makes responses more relevant and informed.

📚 **Document Loaders**:
   - Function as "specialized librarians."
   - Organize content from various sources for language models.

📄 **Text Loader Fundamentals**:
   - Simple process: Converts text files into a usable format for language models.


# 🔄 **Document Loaders in LangChain**:

📋 **Wide Selection**: Numerous document loaders available. Check the [documentation](https://github.com/langchain-ai/langchain/tree/master/libs/langchain/langchain/document_loaders) for a full list.

👣 **Usage Steps**:
   1. Choose a Document Loader from LangChain.
   2. Create an instance of the Document Loader.
   3. Employ its `load()` method to convert files into LangChain documents.

### 🛠️ **Role of Document Transformers**

📐 **Customization for Models**: Adjust documents to suit your model's requirements, like trimming lengthy texts.

### ✂️ **Understanding Text Splitters**

🔢 **Function**: Divide long texts into smaller, coherent segments.

🔗 **Goal**: Keep related text together, fitting within the model's capacity.

### 🧩 **Using `RecursiveCharacterTextSplitter`**

🔄 **Methodology**:
   - Intelligently splits texts using multiple separators.

   - Recursively adjusts if segments are too large.

   - Ensures all parts are appropriately sized.

### 🌟 **Key Aspects of Splitting**

   - Chooses optimal separators for division.

   - Continually splits large chunks.

   - Balances chunk size by characters or tokens.

   - Maintains some overlap for context.

   - Tracks chunk starting points if needed.



In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader

# Point the loader at the local PDF folder, then read its contents into `docs`.
# The original author's note says the corpus is about 1 GB, so this step may be slow.
loader = PyPDFDirectoryLoader(
    "./ncs_docs",
)
docs = loader.load()



In [None]:
# Sanity check on what the loader returned: the container type first,
# then the type of a single element.
print(type(docs))
first_doc = docs[0]
print(type(first_doc))

# Final expression of the cell: the notebook displays the number of loaded documents.
len(docs)

In [None]:
# Splitting parameters, named so they can be tuned in one place.
CHUNK_SIZE = 1000      # passed to the splitter as chunk_size
CHUNK_OVERLAP = 200    # passed to the splitter as chunk_overlap
MAX_DOCS = 100         # only the first MAX_DOCS loaded documents are split
PREVIEW_START = 41     # index of the first of the two neighbouring chunks printed below

splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
final_documents = splitter.split_documents(docs[:MAX_DOCS])  # splitting

# Stop early with a clear message; an empty list would otherwise surface as an
# IndexError on the next lines.
if not final_documents:
    raise ValueError("Splitting produced no chunks - check that ./ncs_docs contains readable PDFs")

print(type(final_documents))
print(type(final_documents[0]))
print(len(final_documents))

# Print two neighbouring chunks so the overlap between them can be seen.
# A slice is used instead of fixed indices so a small corpus prints fewer chunks
# rather than raising IndexError.
for position, chunk in enumerate(final_documents[PREVIEW_START:PREVIEW_START + 2]):
    if position:
        print("************************")
    print(chunk)

# 🌐 **Text Embeddings Overview**

🔢 **Functionality**: Converts documents into numerical vectors in LangChain.

🤝 **Similarity Measure**: The closer two vectors are to each other, the more similar the texts they represent.

🔍 **Application**: Quickly identify documents with similar topics or content.



In [None]:
# Import from the dedicated langchain-huggingface package (installed in the first cell)
# instead of the deprecated `langchain.embeddings` path.
from langchain_huggingface import HuggingFaceEmbeddings

# Name of the pre-trained sentence-transformers model to use.
# Alternative kept from the original notebook: "sentence-transformers/all-MiniLM-l6-v2"
modelPath = "sentence-transformers/all-mpnet-base-v2"

# Model configuration options: run the computations on the CPU
model_kwargs = {'device': 'cpu'}

# Encoding options: 'normalize_embeddings' is explicitly set to False
encode_kwargs = {'normalize_embeddings': False}

# Build the embedding model from the settings above.
# The variable name is kept as-is because the vector store cell refers to it.
HuggingFaceembeddings = HuggingFaceEmbeddings(
    model_name=modelPath,        # the pre-trained model's name
    model_kwargs=model_kwargs,   # model configuration options
    encode_kwargs=encode_kwargs  # encoding options
)

# 🛠️ **Creating a Vector Store Retriever**

1. **Load Documents**: Use a document loader to read the source files into LangChain documents.

2. **Split Texts**: Break down documents into smaller sections with a text splitter.

3. **Embedding Conversion**: Apply an embedding model to transform text chunks into vectors.

4. **Vector Store Creation**: Compile these vectors into a vector store.

🔍 **Outcome**: Your vector store is now set up to search and retrieve texts by content.

In [8]:
# Import FAISS from langchain_community (the package this notebook's PDF loader
# already comes from) instead of the deprecated `langchain.vectorstores` path.
from langchain_community.vectorstores import FAISS

# Embed every chunk with the embedding model defined earlier and build the FAISS index from them.
vectorstore = FAISS.from_documents(documents=final_documents, embedding=HuggingFaceembeddings)

# 🔎 **Vector Store as a Retriever**

1. **Search Engine Role**: The vector store functions like a document search engine.

2. **Similarity Searches**: Find documents similar to your provided text.

3. **Customization Options**: Control how many top results are returned (for example with the `k` argument) and how strict a match must be.

✨ **Functionality**: Use `similarity_search` to pinpoint documents closely matching your specified text, with flexibility in refining search parameters.

In [None]:
# Free-text query to look up in the vector store.
query = "NCS command line"

# Run the search with the store's default options and show the matches
# as the cell's output.
top_matches = vectorstore.similarity_search(query)
top_matches

# 💬 **Generate an Answer from the Retrieved Context**

In [None]:
from langchain.chains import RetrievalQA
# PromptTemplate is defined in langchain_core; import it from there rather than
# through the legacy `langchain.prompts` path.
from langchain_core.prompts import PromptTemplate
from langchain_groq import ChatGroq

# Groq model used for answer generation, named so it can be swapped in one place.
# NOTE(review): confirm this model id is still served by Groq before running.
GROQ_MODEL = "llama3-8b-8192"

# Prompt for the QA chain. {context} receives the retrieved chunks and {question}
# the user's query. The quoted fallback phrase now uses matching double quotes
# (the original opened with ' and closed with a backtick).
template = """

Use the following pieces of context to answer the question at the end.

If you don't know the answer, just say "Ah snap homie, I ain't gonna front. I don't know.", don't try to make up an answer.

Use three sentences maximum, relevant analogies, and keep the answer as concise as possible.

Use the active voice, and speak directly to the reader using concise language.
{context}

Question: {question}

Helpful Answer:

"""

QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

llm = ChatGroq(model=GROQ_MODEL, temperature=0.7)

# Wire the LLM to the vector store: the retriever supplies the context that is
# substituted into QA_CHAIN_PROMPT.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

query = "can you give an example for NCS command line"

# The chain takes the question under the "query" key; the generated answer is read
# back from the "result" key and displayed as the cell's output.
result = qa_chain.invoke({"query": query})
result["result"]