In [None]:
## Data ingestion: plain-text file

from langchain_community.document_loaders import TextLoader

# Source file lives under /content/drive (presumably a Colab Google Drive mount).
speech_path = "/content/drive/My Drive/speech.txt"

loader = TextLoader(file_path=speech_path)
text_documents = loader.load()
text_documents

In [None]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# os.getenv returns None when the variable is missing, and assigning None into
# os.environ raises an opaque "str expected, not NoneType" TypeError. Check first
# so the failure explains what the user has to do.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to a .env file or export it in the "
        "environment before running the OpenAI embedding cells."
    )

os.environ["OPENAI_API_KEY"] = openai_api_key

In [None]:
## Data ingestion: web page
from langchain_community.document_loaders import WebBaseLoader
import bs4

# Restrict HTML parsing to elements carrying one of these CSS classes.
post_sections = bs4.SoupStrainer(
    class_=("post-title", "post-content", "post-header")
)

loader = WebBaseLoader(
    web_path="https://lilianweng.github.io/posts/2023-06-23-agent/",
    bs_kwargs={"parse_only": post_sections},
)

web_base_documents = loader.load()
web_base_documents


In [None]:
## Data ingestion: PDF
from langchain_community.document_loaders import PyPDFLoader

# Source file lives under /content/drive (presumably a Colab Google Drive mount).
pdf_path = "/content/drive/My Drive/aws-seller-faq.pdf"
loader = PyPDFLoader(pdf_path)

pdf_documents = loader.load()
pdf_documents

In [None]:
## Merge the output of the three loaders into one list (text, then PDF, then web)

all_docs = [*text_documents, *pdf_documents, *web_base_documents]


Next, we transform the text: split the input into chunks and create embeddings to store in a vector database.

In [None]:
## Transform: split the combined documents into overlapping chunks

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunk sizing passed to the splitter: 1000-character target with 200 overlap.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}

text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
documents_all = text_splitter.split_documents(documents=all_docs)


In [None]:
## Hugging Face embeddings stored in Chroma

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Index the chunks produced by the text splitter (documents_all), not the raw
# unsplit documents (all_docs); otherwise the splitting step has no effect on
# what gets embedded and retrieved.
#
# Use a dedicated collection name instead of Chroma's default: the in-memory
# Chroma client appears to be shared within a kernel, so stores built with a
# different embedding model (and vector size) would otherwise land in the same
# collection. NOTE(review): confirm against the installed chromadb version.
db = Chroma.from_documents(
    documents_all,
    embedding,
    collection_name="chunks_hf_minilm",
)

In [None]:
## OpenAI embeddings (requires OPENAI_API_KEY) stored in Chroma

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

embedding = OpenAIEmbeddings(model="text-embedding-3-small")

# Index the chunks produced by the text splitter (documents_all), not the raw
# unsplit documents (all_docs); otherwise the splitting step has no effect on
# what gets embedded and retrieved.
#
# Use a dedicated collection name instead of Chroma's default: the in-memory
# Chroma client appears to be shared within a kernel, so stores built with a
# different embedding model (and vector size) would otherwise land in the same
# collection. NOTE(review): confirm against the installed chromadb version.
db = Chroma.from_documents(
    documents_all,
    embedding,
    collection_name="chunks_openai_3_small",
)

In [None]:
## Query 1 - similarity search against the vector store
query = "What does aws means "
result = db.similarity_search(query)

# Show the text of the first returned document.
top_match = result[0]
top_match.page_content

In [None]:
## Query 2 - similarity search against the vector store
query = "What are autonomous agents  "
result = db.similarity_search(query)

# Show the text of the first returned document.
top_match = result[0]
top_match.page_content

In [None]:
## Vector Embedding and Vector Store

# Import from langchain_openai, matching the earlier OpenAI embedding cell; the
# langchain_community.embeddings path for OpenAIEmbeddings is deprecated.
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# `documents` is never defined in this notebook, so the original line raised a
# NameError on a fresh kernel. The split chunks are bound to `documents_all`;
# index the first 10 of them.
# NOTE(review): assumes `documents_all` is what was intended here - confirm.
#
# A dedicated collection name keeps these vectors out of Chroma's default
# collection, which other stores built in this kernel may also be using.
db = Chroma.from_documents(
    documents_all[:10],
    OpenAIEmbeddings(),
    collection_name="chunks_openai_default_first10",
)

In [None]:

## Vector database search against the most recently built store
query = "What does aws means "
result = db.similarity_search(query)

# Show the text of the first returned document.
top_match = result[0]
top_match.page_content