# Document loaders
- Document Loaders are responsible for loading documents from a variety of sources.
- Reference https://python.langchain.com/docs/how_to/#document-loaders

In [19]:
#convert text to vectors
import os
from dotenv import load_dotenv

load_dotenv()

os.environ['OPENAI_API_KEY']=os.getenv('OPENAI_API_KEY')


In [20]:
#How to load PDFs

from langchain_community.document_loaders import PyPDFLoader

file_path = "D://GenAI//kalam.pdf"

loader = PyPDFLoader(file_path)
pages = []
for page in loader.load():
    pages.append(page)

print(pages)

[Document(metadata={'source': 'D://GenAI//kalam.pdf', 'page': 0}, page_content="Avul Pakir Jainulabdeen Abdul Kalam BR (/ˈəbdʊl kəˈlɑːm/ ⓘ; 15 October 1931 – 27 July \n2015) was an Indian aerospace scientist and statesman who served as the 11th president of \nIndia from 2002 to 2007. Born and raised in a Muslim family in Rameswaram, Tamil Nadu, \nhe studied physics and aerospace engineering. He spent the next four decades as a \nscientist and science administrator, mainly at the Defence Research and Development \nOrganisation (DRDO) and Indian Space Research Organisation (ISRO) and was intimately \ninvolved in India's civilian space programme and military missile development efforts.[2] He \nthus came to be known as the Missile Man of India for his work on the development \nof ballistic missile and launch vehicle technology.[3][4][5] He also played a pivotal \norganisational, technical, and political role in India's Pokhran-II nuclear tests in 1998, the first \nsince the original nucle

In [21]:
#How to load txt file

from langchain_community.document_loaders import TextLoader

file_path = "D://GenAI//kalam.txt"

loader = TextLoader(file_path)
pages = []

for page in loader.load():
    pages.append(page)

print(pages)


[Document(metadata={'source': 'D://GenAI//kalam.txt'}, page_content='The Defence Research and Development Organisation.\nDefence Research and Development in Ministry of Defence of the Government of India.The Defence Research and Development Organisation.\nDefence Research and Development in Ministry of Defence of the Government of India.The Defence Research and Development Organisation.\nDefence Research and Development in Ministry of Defence of the Government of India.\nThe Defence Research and Development Organisation.\nDefence Research and Development in Ministry of Defence of the Government of India.\nAPJ was born in india')]


In [22]:
#How to load web pages
from langchain_community.document_loaders import WebBaseLoader
import bs4

page_url = "https://python.langchain.com/docs/introduction/"

loader = WebBaseLoader(web_paths=[page_url],
                       bs_kwargs={
        "parse_only": bs4.SoupStrainer(class_="table-of-contents__link toc-highlight"),
    },
    bs_get_text_kwargs={"separator": " | ", "strip": True},)
docs = []
for doc in loader.load():
    docs.append(doc)

assert len(docs) == 1
doc = docs[0]

print(doc)

page_content='Architecture | Guides | Tutorials | How-to guides | Conceptual guide | Integrations | API reference | Ecosystem | 🦜🛠️ LangSmith | 🦜🕸️ LangGraph | Additional resources | Versions | Security | Contributing' metadata={'source': 'https://python.langchain.com/docs/introduction/'}


In [23]:
#https://python.langchain.com/docs/integrations/providers/arxiv/#installation-and-setup
#ArxivLoader is a tool used to fetch and load research papers from the arXiv database, which is a popular repository for academic papers in fields like physics, computer science, and mathematics. It allows users to retrieve papers in a structured format, enabling them to process and analyze the content programmatically.
#for more data source providers go through this link https://python.langchain.com/docs/integrations/providers/all/
from langchain_community.document_loaders import ArxivLoader

loader = ArxivLoader(
    query="reasoning"
)

docs = loader.get_summaries_as_docs()
print(docs[0].page_content[:100])
print(docs[0].metadata)

Large language models (LLMs) have demonstrated impressive reasoning
abilities, but they still strugg
{'Entry ID': 'http://arxiv.org/abs/2410.13080v1', 'Published': datetime.date(2024, 10, 16), 'Title': 'Graph-constrained Reasoning: Faithful Reasoning on Knowledge Graphs with Large Language Models', 'Authors': 'Linhao Luo, Zicheng Zhao, Chen Gong, Gholamreza Haffari, Shirui Pan'}


In [24]:
#load from wikipedia
from langchain_community.retrievers import WikipediaRetriever

retriever = WikipediaRetriever()
docs = retriever.invoke("Elon Musk")
print(docs[0].page_content[:100])



Elon Reeve Musk (; born June 28, 1971) is a businessman known for his key roles in the space company


# Data Transformation
- How to recursively split by characters

In [25]:
#https://python.langchain.com/docs/how_to/recursive_text_splitter/
#Let's go through the parameters set above for RecursiveCharacterTextSplitter:

#chunk_size: The maximum size of a chunk, where size is determined by the length_function.
#chunk_overlap: Target overlap between chunks. Overlapping chunks helps to mitigate loss of information when context is divided between chunks.
#length_function: Function determining the chunk size.
#is_separator_regex: Whether the separator list (defaulting to ["\n\n", "\n", " ", ""]) should be interpreted as regex.


from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load example document
with open("elon.txt", encoding="utf-8") as f:
    state_of_the_union = f.read()


text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
texts = text_splitter.create_documents([state_of_the_union])


In [26]:
#https://python.langchain.com/docs/how_to/character_text_splitter/
from langchain_text_splitters import CharacterTextSplitter

# Load example document
with open("elon.txt", encoding="utf-8") as f:
    state_of_the_union = f.read()

text_splitter = CharacterTextSplitter(
    separator="\t",
    chunk_size=10,
    chunk_overlap=5,
    length_function=len,
    is_separator_regex=False,
)
texts = text_splitter.create_documents([state_of_the_union])


In [27]:
#https://python.langchain.com/docs/how_to/split_html/#overview-of-the-splitters
#How to: split HTML

#Choosing the Right Splitter
    #Use HTMLHeaderTextSplitter when:
        #You need to split an HTML document based on its header hierarchy and maintain metadata about the headers.
    
    #Use HTMLSectionSplitter when:
        #You need to split the document into larger, more general sections, possibly based on custom tags or font sizes.
    
    #Use HTMLSemanticPreservingSplitter when: 
        #You need to split the document into chunks while preserving semantic elements like tables and lists, ensuring that they are not split and that their context is maintained.

html_string = """
<!DOCTYPE html>
  <html lang='en'>
  <head>
    <meta charset='UTF-8'>
    <meta name='viewport' content='width=device-width, initial-scale=1.0'>
    <title>Fancy Example HTML Page</title>
  </head>
  <body>
    <h1>Main Title</h1>
    
    <h2>Section 1: Introduction</h2>
    <p>This section introduces the topic. Below is a list:</p>
    
    <h3>Subsection 1.1: Details</h3>
    <p>This subsection provides additional details. Here's a table:</p>
    
    <h2>Section 2: Media Content</h2>
    

    <h2>Section 3: Code Example</h2>
  
    <h2>Conclusion</h2>
  </body>
  </html>
 """

from langchain_text_splitters import HTMLHeaderTextSplitter

headers_to_split_on = [
    ("h1", "Header-1"),
    ("h2", "Header-2"),
    ("h3", "Header-3"),
]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on)
html_header_splits = html_splitter.split_text(html_string)


In [28]:
#method 2

from langchain_text_splitters import HTMLHeaderTextSplitter

headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
]

url="https://python.langchain.com/docs/how_to/split_html/#choosing-the-right-splitter"


html_splitter = HTMLHeaderTextSplitter(headers_to_split_on)
html_header_splits = html_splitter.split_text_from_url(url)


In [29]:
#https://python.langchain.com/docs/how_to/recursive_json_splitter/
import json
from langchain_text_splitters import RecursiveJsonSplitter
import requests

# This is a large nested json object and will be loaded as a python dict
json_data = requests.get("https://api.smith.langchain.com/openapi.json").json()


splitter = RecursiveJsonSplitter(max_chunk_size=300)

# Recursively split json data - If you need to access/manipulate the smaller json chunks
json_chunks = splitter.split_json(json_data=json_data)

#printing top 3 chunks
for chunk in json_chunks[:3]:
    print(chunk)

# The splitter can also output documents
docs = splitter.create_documents(texts=[json_data])

for doc in docs[:3]:
    print(doc)

#Or use .split_text to obtain string content directly:
texts = splitter.split_text(json_data=json_data)


{'openapi': '3.1.0', 'info': {'title': 'LangSmith', 'version': '0.1.0'}, 'paths': {'/api/v1/sessions/{session_id}': {'get': {'tags': ['tracer-sessions'], 'summary': 'Read Tracer Session', 'description': 'Get a specific session.'}}}}
{'paths': {'/api/v1/sessions/{session_id}': {'get': {'operationId': 'read_tracer_session_api_v1_sessions__session_id__get', 'security': [{'API Key': []}, {'Tenant ID': []}, {'Bearer Auth': []}]}}}}
{'paths': {'/api/v1/sessions/{session_id}': {'get': {'parameters': [{'name': 'session_id', 'in': 'path', 'required': True, 'schema': {'type': 'string', 'format': 'uuid', 'title': 'Session Id'}}, {'name': 'include_stats', 'in': 'query', 'required': False, 'schema': {'type': 'boolean', 'default': False, 'title': 'Include Stats'}}, {'name': 'accept', 'in': 'header', 'required': False, 'schema': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'title': 'Accept'}}]}}}}
page_content='{"openapi": "3.1.0", "info": {"title": "LangSmith", "version": "0.1.0"}, "paths": {"/

# Data Embeddings

In [30]:
from langchain_openai import OpenAIEmbeddings

#https://platform.openai.com/docs/guides/embeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

text = "Hi this is shanmukh"
result = embeddings.embed_query(text)
print(result[0])

-0.001864203019067645


In [31]:
from langchain_openai import OpenAIEmbeddings
#https://platform.openai.com/docs/guides/embeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small",dimensions=199)
print(embeddings)


text = "Hi this is shanmukh"
result = embeddings.embed_query(text)
print(result)
print(len(result))


client=<openai.resources.embeddings.Embeddings object at 0x000001E0068AC1F0> async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001E00AA1B7F0> model='text-embedding-3-small' dimensions=199 deployment='text-embedding-ada-002' openai_api_version=None openai_api_base=None openai_api_type=None openai_proxy=None embedding_ctx_length=8191 openai_api_key=SecretStr('**********') openai_organization=None allowed_special=None disallowed_special=None chunk_size=1000 max_retries=2 request_timeout=None headers=None tiktoken_enabled=True tiktoken_model_name=None show_progress_bar=False model_kwargs={} skip_empty=False default_headers=None default_query=None retry_min_seconds=4 retry_max_seconds=20 http_client=None http_async_client=None check_embedding_ctx_length=True
[-0.00402143644168973, -0.11767087131738663, 0.051633257418870926, -0.00761921564117074, -0.03791734576225281, -0.1177767887711525, 0.08880920708179474, 0.11724721640348434, -0.08690274506807327, -0.05719376355409

In [32]:
from langchain_openai import OpenAIEmbeddings
from uuid import uuid4
from langchain_core.documents import Document
from langchain_chroma import Chroma



embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
    id=2,
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
    id=3,
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
    id=4,
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
    id=5,
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
    id=6,
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
    id=7,
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
    id=8,
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
    id=9,
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
    id=10,
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
uuids = [str(uuid4()) for _ in range(len(documents))]

vector_store.add_documents(documents=documents, ids=uuids)


results = vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    k=2,
    filter={"source": "tweet"},
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")



#k=1 specifies the number of results that the similarity search should return.
results = vector_store.similarity_search_with_score(
    "Will it be hot tomorrow?", k=1, filter={"source": "news"}
)
for res, score in results:
    print(f"* [SIM={score:3f}] {res.page_content} [{res.metadata}]")

#Search by vector
results = vector_store.similarity_search_by_vector(
    embedding=embeddings.embed_query("I love green eggs and ham!"), k=1
)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")

* Building an exciting new project with LangChain - come check it out! [{'source': 'tweet'}]
* Building an exciting new project with LangChain - come check it out! [{'source': 'tweet'}]
* [SIM=0.893613] The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees. [{'source': 'news'}]
* I had chocolate chip pancakes and scrambled eggs for breakfast this morning. [{'source': 'tweet'}]


In [33]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


file_path = "D://GenAI//kalam.txt"

loader = TextLoader(file_path)

docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

final_docs = text_splitter.split_documents(docs)



embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

vector_store = Chroma(
    collection_name="example_collection1",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db1",  # Where to save data locally, remove if not necessary
)

vector_store.add_documents(final_docs)

results = vector_store.similarity_search(
    "Where is APJ Born",
    k=1,
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")






* APJ was born in india [{'source': 'D://GenAI//kalam.txt'}]
