<a href="https://colab.research.google.com/github/Tuhinm2002/rag_llm/blob/main/rag_llm_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install langchain_community langchain-huggingface langchain -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.5/2.5 MB[0m [31m87.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.3/423.3 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m109.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
! pip install faiss-cpu -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m62.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import faiss
from langchain.chains import RetrievalQA
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_huggingface import HuggingFaceEndpoint
from langchain.text_splitter import CharacterTextSplitter
import numpy as np


In [22]:
import os
from getpass import getpass

# Hugging Face access token, exposed through the environment for the hosted
# model client created later in this notebook.
# Never store a real token in the notebook: reuse one that is already set in
# the environment, otherwise prompt for it without echoing it to the output.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

In [5]:
# Load the whole source corpus into memory as a single UTF-8 string.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('text_merged.txt', 'r', encoding='utf-8') as file_reader:
    text = file_reader.read()

In [6]:
# Chunking parameters handed to the splitter; tune them here.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Break the corpus into overlapping pieces; `chunks` is what gets embedded
# and indexed in the cells below.
text_splitter = CharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_text(text)



In [10]:
import torch

# Run the embedding model on the GPU when one is present, otherwise on the CPU.
# Hard-coding 'cuda' makes this cell fail on CPU-only runtimes.
embedding_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Sentence-embedding model used both to index the chunks and to embed queries.
hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={'device': embedding_device},
    # Vectors are left un-normalized; the FAISS index built later compares
    # them with L2 distance.
    encode_kwargs={'normalize_embeddings': False},
)

In [11]:
# Embed every chunk in one call; the result is indexed like `chunks`.
embeddings = hf_embeddings.embed_documents(chunks)

# Take the index dimensionality from the length of the first embedding.
first_vector = embeddings[0]
dimension = len(first_vector)

In [12]:
# Build the float32 matrix FAISS expects once and reuse it for both training
# and insertion (previously the conversion was done twice).
embedding_matrix = np.asarray(embeddings, dtype='float32')

# IVF training clusters the vectors into `n_lists` cells and fails when there
# are fewer vectors than cells, so cap the cell count for small corpora.
n_lists = max(1, min(100, len(embedding_matrix)))

# Inverted-file index over a flat L2 quantizer.
quantizer = faiss.IndexFlatL2(dimension)
index = faiss.IndexIVFFlat(quantizer, dimension, n_lists)
index.train(embedding_matrix)
index.add(embedding_matrix)

In [13]:
from langchain.schema import Document

# Wrap each chunk in a Document and key it by its position, so that FAISS row i
# (vectors were added in chunk order) resolves to docstore id str(i).
documents = [Document(page_content=chunk) for chunk in chunks]
docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(documents)})
index_to_docstore_id = {i: str(i) for i in range(len(documents))}

# Pass the Embeddings object itself rather than its bare `embed_query` method:
# handing the vector store a plain callable is deprecated, and the reload cell
# below already passes the same object to `FAISS.load_local`.
vector_store = FAISS(
    embedding_function=hf_embeddings,
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
)

# Persist the index and docstore to the local 'vector_search' directory.
vector_store.save_local('vector_search')



In [24]:
# Settings for the hosted text-generation model that writes the final answer.
llm_settings = {
    "repo_id": "mistralai/Mistral-7B-Instruct-v0.2",
    "task": "text-generation",
    "temperature": 0.6,
}
llm = HuggingFaceEndpoint(**llm_settings)

In [35]:
# Reload the store from the same relative directory it was saved to, instead of
# an absolute '/content/...' path that only exists when the working directory
# is /content.
# NOTE(review): allow_dangerous_deserialization=True permits unpickling the
# saved docstore; only load index directories you created yourself.
vector_search = FAISS.load_local(
    "vector_search",
    embeddings=hf_embeddings,
    allow_dangerous_deserialization=True,
)

# Question to answer from the indexed corpus.
query = "what is rig veda"

In [36]:
# Retrieval-augmented QA chain: the retriever supplies chunks from the vector
# store and the LLM writes the answer.
qa = RetrievalQA.from_chain_type(llm=llm, retriever=vector_search.as_retriever())

# `invoke` replaces the deprecated direct call `qa({...})`; the result is a
# dict holding the original 'query' and the generated 'result'.
answer = qa.invoke({'query': query})
print(answer)



{'query': 'what is rig veda', 'result': ' The Rigveda is the oldest and most authoritative of the four Vedas in Hinduism. It is a collection of hymns, prayers, and rituals, composed in the ancient Indo-European language of Sanskrit. It is considered the foundational text of Hinduism and contains references to various gods and goddesses, as well as mythological stories and philosophical ideas. The Rigveda is also a rich source of information about ancient Indian society, culture, and history.'}
