In [2]:
### Document Structure

from langchain_core.documents import Document

In [3]:
# Build a Document by hand: page_content carries the text and metadata
# carries key/value details describing it.
doc = Document(
    page_content="This is the content of the document.",
    metadata=dict(
        source="example.txt",
        page=1,
        author="Sonu Kumar",
        created_date="2026-02-13",
    ),
)
doc  # last expression, so the notebook displays its repr

Document(metadata={'source': 'example.txt', 'page': 1, 'author': 'Sonu Kumar', 'created_date': '2026-02-13'}, page_content='This is the content of the document.')

In [5]:
# Create the directory that will hold the sample text files.
# The later cells write to and load from data/text_files, so that is the
# directory that has to exist (the previous "data/test_files" was a typo).

import os
os.makedirs("data/text_files", exist_ok=True)

In [7]:
# Mapping of target file path -> text to write into that file.
sample_texts={
    "data/text_files/python_intro.txt": """Python is a high-level, interpreted programming language known for its simplicity and readability. It was created by Guido van Rossum and first released in 1991. Python's design philosophy emphasizes code readability and allows programmers to express concepts in fewer lines of code compared to other languages.
    Python supports multiple programming paradigms, including procedural, object-oriented, and functional programming. It has a large standard library that provides a wide range of modules and functions for various tasks, making it a versatile language for web development, data analysis, artificial intelligence, scientific computing, and more.
    """
}

# Write every sample to disk. The parent directory is created first so the
# cell also works on a fresh checkout where data/text_files does not exist yet.
for filepath, content in sample_texts.items():
    parent_dir = os.path.dirname(filepath)
    if parent_dir:  # a bare filename has no directory component to create
        os.makedirs(parent_dir, exist_ok=True)
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)

print("Sample text files created successfully.")

Sample text files created successfully.


In [10]:
### TextLoader
# TextLoader is imported from langchain_community.document_loaders (not langchain_core.document_loaders).

from langchain_community.document_loaders import TextLoader

# Read one UTF-8 text file; load() gives back a list of Document objects,
# which is printed as plain text here.
loader = TextLoader(
    file_path="data/text_files/python_intro.txt",
    encoding="utf-8",
)
document = loader.load()
print(document)

[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content="Python is a high-level, interpreted programming language known for its simplicity and readability. It was created by Guido van Rossum and first released in 1991. Python's design philosophy emphasizes code readability and allows programmers to express concepts in fewer lines of code compared to other languages.\nPython supports multiple programming paradigms, including procedural, object-oriented, and functional programming. It has a large standard library that provides a wide range of modules and functions for various tasks, making it a versatile language for web development, data analysis, artificial intelligence, scientific computing, and more.\n    ")]


In [12]:
### Directory Loader
from langchain_community.document_loaders import DirectoryLoader

# Load the files in data/text_files that match *.txt, handing each one to
# TextLoader with UTF-8 decoding.
dir_loader = DirectoryLoader(
    path="data/text_files",
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs=dict(encoding="utf-8"),
    show_progress=False,
)
documents = dir_loader.load()
documents  # last expression, so the notebook displays the loaded list

[Document(metadata={'source': 'data\\text_files\\first_file.txt'}, page_content="Python is a high-level, interpreted programming language known for its simplicity and readability. It was created by Guido van Rossum and first released in 1991. Python's design philosophy emphasizes code readability and allows programmers to express concepts in fewer lines of code compared to other languages.\nPython supports multiple programming paradigms, including procedural, object-oriented, and functional programming. It has a large standard library that provides a wide range of modules and functions for various tasks, making it a versatile language for web development, data analysis, artificial intelligence, scientific computing, and more.\n    "),
 Document(metadata={'source': 'data\\text_files\\python_intro.txt'}, page_content="Python is a high-level, interpreted programming language known for its simplicity and readability. It was created by Guido van Rossum and first released in 1991. Python's d

In [14]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

# Load the files in data/pdf that match *.pdf, using PyMuPDFLoader for each.
# Note: this rebinds dir_loader, replacing the text-file loader created in
# the earlier DirectoryLoader cell.
# NOTE(review): the bare `pdf_documents` expression renders the loaded
# documents in full, and the recorded output contains personal details from
# an employment letter -- consider clearing outputs before sharing.
dir_loader = DirectoryLoader(
    path="data/pdf",
    glob="*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=False,
)
pdf_documents = dir_loader.load()
pdf_documents

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2024-04-12T18:39:30+05:30', 'source': 'data\\pdf\\61076471_1.pdf', 'file_path': 'data\\pdf\\61076471_1.pdf', 'total_pages': 17, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-04-12T18:39:30+05:30', 'trapped': '', 'modDate': "D:20240412183930+05'30'", 'creationDate': "D:20240412183930+05'30'", 'page': 0}, page_content='___________________________________________________________________________ \n \nLTIMindtree Limited is a subsidiary of Larsen & Toubro Limited \n \n©LTIMindtree | Confidential 2024 \nSpecialist - Service \nSpecialist - Service Design \nSeptember 12, 2022 \nCustomer Success Team (CST) BU \nDATE: April 12, 2024 \nRef: LTIMindtree/HR/EDGE/2024 \n \nEmployee Name : Sonu . \nPS Number          : 61076471 \nCountry                : India \n \n \nDear Sonu ., \n \n \nEmployment Agreement \n \nWe ar