In [None]:
## Data Ingestion
# Load a local plain-text file through LangChain's TextLoader.
from langchain_community.document_loaders import TextLoader

# "speech.txt" is a relative path, so it is looked up relative to the kernel's
# current working directory.
loader = TextLoader("speech.txt")
# presumably a list of LangChain Document objects -- confirm against the loader docs
text_documents = loader.load()
# A bare expression on the last line of a cell is shown as the cell's output.
text_documents


In [None]:
## Web-based loader
from langchain_community.document_loaders import WebBaseLoader
import bs4

## Load the content of a single web page (no chunking or indexing happens here).
# How the loader is configured:
#   * web_paths - tuple of URLs to fetch; only one blog post is requested.
#   * bs_kwargs - keyword arguments for the BeautifulSoup parser (per the
#                 WebBaseLoader documentation they are forwarded to it as-is).
#   * parse_only=bs4.SoupStrainer(class_=...) - asks BeautifulSoup to keep only
#                 elements whose CSS class is one of the listed names, so the
#                 post title, header, and body are parsed while the rest of the
#                 page (navigation, footer, ...) is skipped.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-title", "post-content", "post-header")
        ),
    },
)

# NOTE: this rebinds `loader` and `text_documents`; whatever they held before
# this cell ran is replaced.
text_documents = loader.load()

# Why a bare name "prints": a notebook displays the value of the last expression
# in a cell (its repr), so no explicit print() call is needed.
text_documents

In [None]:
## PDF Reader
# Load the PDF from the working directory. The result is stored in `docs`,
# which the text-splitting cell consumes.
from langchain_community.document_loaders import PyPDFLoader
# NOTE: rebinds `loader`, which earlier cells used for other loaders.
loader = PyPDFLoader("attention-is-all-you-need.pdf")
# presumably one Document per PDF page -- confirm against the PyPDFLoader docs
docs = loader.load()
# Last expression of the cell: displayed as the cell output.
docs

In [None]:
## Data ingestion is done; next, split the loaded PDF documents into chunks.
## In LangChain this is the job of a text splitter.

from langchain.text_splitter import RecursiveCharacterTextSplitter

# chunk_size / chunk_overlap are presumably measured in characters (the
# splitter's default length function) -- confirm against the splitter docs.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# Requires `docs` from the PDF cell; `documents` is the input to the vector store.
documents = text_splitter.split_documents(docs)
# Last expression of the cell: displayed as the cell output.
documents

In [13]:
## Now that we have the data in chunks, convert the chunks into vectors.
## Vector Embeddings and Vector Stores

from langchain_community.embeddings import OllamaEmbeddings
# Local Ollama embeddings are used here; OpenAI embeddings are the alternative
# mentioned in the original note.
from langchain_community.vectorstores import Chroma

# Deterministic per-chunk IDs. Without explicit IDs every run of this cell
# stores the chunks under fresh random IDs, so re-running it against the same
# persist directory keeps adding duplicate copies of every chunk. With stable
# IDs a re-run overwrites the existing entries instead.
# Entries written by earlier runs under random IDs are not removed; delete
# ./chroma_db once if the store already contains such duplicates.
chunk_ids = [
    f"{doc.metadata.get('source', 'document')}-chunk-{idx}"
    for idx, doc in enumerate(documents)
]

# NOTE(review): "deepseek-r1:14b" is a chat/reasoning model rather than a
# dedicated embedding model; a purpose-built embedding model would likely give
# better retrieval. Changing the model requires rebuilding ./chroma_db, because
# vectors produced by different models are not comparable.
db = Chroma.from_documents(
    documents,
    OllamaEmbeddings(model="deepseek-r1:14b"),
    ids=chunk_ids,
    persist_directory="./chroma_db",
)
# Last expression of the cell: displays the vector store object's repr.
db

<langchain_community.vectorstores.chroma.Chroma at 0x120b36850>

In [None]:
## Query the vector store
# Ask the store (`db`, built in the previous cell) for the single closest chunk
# to the question and show that chunk's text.
query = "What is attention function?"
result = db.similarity_search(query, k=1)
# Indexing with [0] raises IndexError if the search returns no results.
result[0].page_content