### Data Ingestion

In [1]:
### Document Structure

from langchain_core.documents import Document

In [2]:
# Hand-built example of a LangChain Document: the text goes in page_content,
# and metadata carries free-form descriptive fields about that text.
doc = Document(
    metadata=dict(
        source="example.txt",
        pages=1,
        author="Rohan",
        date_created="2025-10-05",
    ),
    page_content="This is the content of the document I am using to create RAG.",
)
doc  # bare last expression so the notebook renders the Document repr

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Rohan', 'date_created': '2025-10-05'}, page_content='This is the content of the document I am using to create RAG.')

In [3]:
## Create the folder that will hold the simple txt files
import os

# exist_ok=True makes this cell safe to re-run when the folder already exists.
os.makedirs(name="../data/text_files", exist_ok=True)

In [4]:
# Maps each destination path to the text that should be written to it.
sample_texts = {
    "../data/text_files/python_intro.txt": """Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
It was created by Guido van Rossum and first released in 1991. Python supports multiple programming paradigms, 
including procedural, object-oriented, and functional programming.  

Key Features of Python:
- Easy to Learn and Use: Python's syntax is clear and concise, making it an excellent choice for beginners.
- Extensive Standard Library: Python comes with a vast standard library that provides modules and functions for various tasks, such as file I/O, regular expressions, and web development.
- Cross-Platform Compatibility: Python is available on various operating systems, including Windows, macOS, and Linux.
- Large Community: Python has a vibrant community that contributes to a rich ecosystem of third-party libraries
"""
}

# Write every entry to disk as UTF-8; mode "w" overwrites any existing file.
for filepath in sample_texts:
    content = sample_texts[filepath]
    with open(filepath, mode="w", encoding="utf-8") as out_file:
        out_file.write(content)

print("Sample text files created successfully.")

Sample text files created successfully.


In [5]:
### TextLoader
# Import TextLoader from langchain_community only. The legacy
# `langchain.document_loaders` import that used to precede this line is a
# deprecated path and was immediately shadowed by this import, so it was
# redundant at best and an import failure at worst.
from langchain_community.document_loaders import TextLoader

# Read the sample file written earlier, decoding it as UTF-8.
loader = TextLoader("../data/text_files/python_intro.txt", encoding="utf-8")
document = loader.load()  # load() returns a list of Document objects
print(document)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content="Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nIt was created by Guido van Rossum and first released in 1991. Python supports multiple programming paradigms, \nincluding procedural, object-oriented, and functional programming.  \n\nKey Features of Python:\n- Easy to Learn and Use: Python's syntax is clear and concise, making it an excellent choice for beginners.\n- Extensive Standard Library: Python comes with a vast standard library that provides modules and functions for various tasks, such as file I/O, regular expressions, and web development.\n- Cross-Platform Compatibility: Python is available on various operating systems, including Windows, macOS, and Linux.\n- Large Community: Python has a vibrant community that contributes to a rich ecosystem of third-party libraries\n")]


In [6]:
### Directory Loader
from langchain_community.document_loaders import DirectoryLoader

## Load all the text files from the directory
dir_loader = DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt", ## Pattern to match files
    loader_cls=TextLoader, ## Loader class to use for loading files
    loader_kwargs={"encoding": "utf-8"}, ## Additional arguments for the loader class
    show_progress=True
)
documents = dir_loader.load()
documents

100%|██████████| 1/1 [00:00<?, ?it/s]


[Document(metadata={'source': '..\\data\\text_files\\python_intro.txt'}, page_content="Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nIt was created by Guido van Rossum and first released in 1991. Python supports multiple programming paradigms, \nincluding procedural, object-oriented, and functional programming.  \n\nKey Features of Python:\n- Easy to Learn and Use: Python's syntax is clear and concise, making it an excellent choice for beginners.\n- Extensive Standard Library: Python comes with a vast standard library that provides modules and functions for various tasks, such as file I/O, regular expressions, and web development.\n- Cross-Platform Compatibility: Python is available on various operating systems, including Windows, macOS, and Linux.\n- Large Community: Python has a vibrant community that contributes to a rich ecosystem of third-party libraries\n")]

In [7]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader