In [2]:
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import os

# Read speech.txt into LangChain Document objects. The encoding is pinned so
# that decoding does not depend on the platform's locale default.
loader = TextLoader("speech.txt", encoding="utf-8")
document = loader.load()

# Split the text into overlapping chunks; the overlap keeps a little shared
# context on both sides of every chunk boundary.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,     # upper bound on the size of one chunk
    chunk_overlap=30,   # amount shared between consecutive chunks
)
docs = text_splitter.split_documents(document)
docs

[Document(metadata={'source': 'speech.txt'}, page_content='Artificial intelligence is transforming the modern world in remarkable\nways. From healthcare diagnostics to financial forecasting, machine'),
 Document(metadata={'source': 'speech.txt'}, page_content='learning models help organizations make faster and more accurate\ndecisions. Developers use advanced tools and cloud platforms to build'),
 Document(metadata={'source': 'speech.txt'}, page_content='scalable applications that serve millions of users worldwide. Data plays\na crucial role in this ecosystem, as clean, structured information'),
 Document(metadata={'source': 'speech.txt'}, page_content='allows algorithms to learn meaningful patterns. Continuous learning,\nexperimentation, and innovation are essential for success in'),
 Document(metadata={'source': 'speech.txt'}, page_content='technology-driven industries. As automation expands, professionals must\nadapt by strengthening problem-solving skills, understanding system'),
 

In [7]:
import os

# Embedding model used to turn each text chunk into a vector.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Embed every chunk and build the FAISS vector store from the results.
db = FAISS.from_documents(documents=docs, embedding=embeddings)
db

<langchain_community.vectorstores.faiss.FAISS at 0x7f81ddef6250>

In [9]:
### Query
query = "What is machine learning ?"
# Keep the hits under their own name: rebinding `docs` here would replace the
# chunk list that FAISS.from_documents(docs, ...) is built from, so re-running
# the indexing cell afterwards would index only these few search results.
matched_docs = db.similarity_search(query)
matched_docs[0].page_content

'allows algorithms to learn meaningful patterns. Continuous learning,\nexperimentation, and innovation are essential for success in'

In [10]:
### Retriever
# Wrap the vector store in the retriever interface and send the query through it.
retriever = db.as_retriever()
retriever.invoke(query)

[Document(id='a5437496-8157-447e-9759-a6839a7712ed', metadata={'source': 'speech.txt'}, page_content='allows algorithms to learn meaningful patterns. Continuous learning,\nexperimentation, and innovation are essential for success in'),
 Document(id='3bcfeb96-9d67-4a6c-a729-92f1ac5ec297', metadata={'source': 'speech.txt'}, page_content='Artificial intelligence is transforming the modern world in remarkable\nways. From healthcare diagnostics to financial forecasting, machine'),
 Document(id='24135cd6-3d8a-4d52-bb9a-08089fdb5ece', metadata={'source': 'speech.txt'}, page_content='learning models help organizations make faster and more accurate\ndecisions. Developers use advanced tools and cloud platforms to build'),
 Document(id='f3ffc8d8-04a6-4a93-987a-cbe00ef2a406', metadata={'source': 'speech.txt'}, page_content='technology-driven industries. As automation expands, professionals must\nadapt by strengthening problem-solving skills, understanding system')]

In [None]:
# Run the query and get each hit back as a (Document, score) pair.
# NOTE: the score is a distance, not a similarity. LangChain's FAISS store uses
# L2 (Euclidean) distance by default, not Manhattan distance, so the smallest
# score is the closest match and the hits are listed in ascending score order.
docs_with_score = db.similarity_search_with_score(query)
docs_with_score

[(Document(id='a5437496-8157-447e-9759-a6839a7712ed', metadata={'source': 'speech.txt'}, page_content='allows algorithms to learn meaningful patterns. Continuous learning,\nexperimentation, and innovation are essential for success in'),
  np.float32(1.0037652)),
 (Document(id='3bcfeb96-9d67-4a6c-a729-92f1ac5ec297', metadata={'source': 'speech.txt'}, page_content='Artificial intelligence is transforming the modern world in remarkable\nways. From healthcare diagnostics to financial forecasting, machine'),
  np.float32(1.1002457)),
 (Document(id='24135cd6-3d8a-4d52-bb9a-08089fdb5ece', metadata={'source': 'speech.txt'}, page_content='learning models help organizations make faster and more accurate\ndecisions. Developers use advanced tools and cloud platforms to build'),
  np.float32(1.1399939)),
 (Document(id='f3ffc8d8-04a6-4a93-987a-cbe00ef2a406', metadata={'source': 'speech.txt'}, page_content='technology-driven industries. As automation expands, professionals must\nadapt by strengthenin

In [12]:
### Saving and Loading
# Write the vector store to the local "faiss_index" folder.
db.save_local(folder_path="faiss_index")