1️⃣ Install Dependencies

In [None]:
! pip install langchain faiss-cpu sentence-transformers google-generativeai langchain-community

2️⃣ Data Ingestion from a Text File

In [None]:
from langchain_community.document_loaders import TextLoader

# Load the raw text file into LangChain Document objects.
# The encoding is stated explicitly so decoding does not depend on the platform's
# default locale.
data_loader = TextLoader("data.txt", encoding="utf-8")
documents = data_loader.load()
print("--------documents---------",documents)

3️⃣ Data Chunking

In [37]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping pieces (chunk_size=500,
# chunk_overlap=50) so that each piece can be embedded on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=500)
chunks = text_splitter.split_documents(documents)

# Display the resulting list of chunk Documents
chunks

[Document(metadata={'source': 'data.txt'}, page_content='Natural Language Processing (NLP) is a subfield of Artificial Intelligence that focuses on the interaction between computers and human language. It enables machines to understand, interpret, and generate human language in a way that is both meaningful and useful.\n\nKey components of NLP include tokenization, part-of-speech tagging, named entity recognition, sentiment analysis, and syntactic parsing. These techniques allow NLP models to process large volumes of text efficiently.'),
 Document(metadata={'source': 'data.txt'}, page_content='One of the most widely used applications of NLP is machine translation, where algorithms like Google Translate convert text from one language to another. Another key application is text summarization, which helps extract key information from lengthy documents.'),
 Document(metadata={'source': 'data.txt'}, page_content='Recent advancements in NLP have been driven by deep learning models such as tr

4️⃣ Store Embeddings in Vector Store

In [38]:
! pip install --upgrade langchain langchain-google-genai google-generativeai faiss-cpu sentence-transformers

# FAISS and HuggingFaceEmbeddings are imported from langchain_community, the same
# package the document loader comes from, instead of the deprecated langchain.* paths.
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
import google.generativeai as genai
import pickle

# Wrap the sentence-transformers model in LangChain's embeddings interface so it can
# be passed straight to FAISS.from_documents.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

from langchain.schema import Document

# Embed every chunk produced by the splitter and build the FAISS index from them
vector_store = FAISS.from_documents(chunks, embedding_model)

# Persist the index under "faiss_index" and pickle the documents next to it.
# NOTE(review): `documents` is the unsplit loader output, whereas the index above was
# built from `chunks` — confirm which of the two faiss_docs.pkl is meant to hold.
vector_store.save_local("faiss_index")
with open("faiss_docs.pkl", "wb") as f:
    pickle.dump(documents, f)

print("FAISS index and documents stored successfully.")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting langchain-google-genai
  Using cached langchain_google_genai-2.0.11-py3-none-any.whl.metadata (3.6 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.16 (from langchain-google-genai)
  Using cached google_ai_generativelanguage-0.6.16-py3-none-any.whl.metadata (5.7 kB)
INFO: pip is looking at multiple versions of google-generativeai to determine which version is compatible with other requirements. This could take a while.
Collecting google-generativeai
  Using cached google_generativeai-0.8.4-py3-none-any.whl.metadata (4.2 kB)
  Using cached google_generativeai-0.8.3-py3-none-any.whl.metadata (3.9 kB)
  Using cached google_generativeai-0.8.2-py3-none-any.whl.metadata (3.9 kB)
  Using cached google_generativeai-0.8.1-py3-none-any.whl.metadata (3.9 kB)
INFO: pip is still looking at multiple versions of google-generativeai to determine which version is compatible with other requirements. This could take a while.
  Using cached google_generativeai-0.8.0-py3-none-any.whl.meta

5️⃣ Query Retrieval

In [39]:
import google.generativeai as genai
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os
from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment
load_dotenv()

GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# Reopen the index saved under "faiss_index", using the same embedding model it was
# built with. allow_dangerous_deserialization=True opts in to loading pickled data,
# so only point this at index files you created yourself.
vector_store = FAISS.load_local(
    "faiss_index",
    embedding_model,
    allow_dangerous_deserialization=True,
)

# Expose the vector store through the retriever interface
retriever = vector_store.as_retriever()

# Hand the API key to the Gemini client library
genai.configure(api_key=GEMINI_API_KEY)
# ✅ Load Google Gemini LLM
def query_gemini(query, context, model_name="gemini-1.5-flash"):
    """Answer ``query`` with a Gemini model, based on the supplied ``context``.

    Args:
        query: The question, inserted verbatim into the prompt.
        context: Text the model is asked to base its answer on.
        model_name: Model identifier passed to ``genai.GenerativeModel``.
            Defaults to ``"gemini-1.5-flash"``.

    Returns:
        The ``text`` attribute of the generation response.

    Note:
        NOTE(review): ``response.text`` presumably raises when the API returns no
        text (e.g. a blocked prompt); that case is not handled here — confirm.
    """
    model = genai.GenerativeModel(model_name)
    prompt = f"Answer the following question based on the provided context:\n\nContext: {context}\n\nQuestion: {query}"
    response = model.generate_content(prompt)
    return response.text

# Retrieve the chunks most relevant to the question and join them into one context string
query = "What is NLP?"
# invoke() is the current retriever entry point; get_relevant_documents() is deprecated
retrieved_docs = retriever.invoke(query)
context = "\n".join(doc.page_content for doc in retrieved_docs)

# Ask Gemini to answer from the retrieved context, then show the question/answer pair
response = query_gemini(query, context)
print(f"\n🔹 **Q:** {query}\n🔹 **A:** {response}")



🔹 **Q:** What is NLP?
🔹 **A:** Based on the provided text, Natural Language Processing (NLP) is a subfield of Artificial Intelligence that focuses on the interaction between computers and human language.  It enables machines to understand, interpret, and generate human language in a meaningful and useful way.

