### Data Ingestion

In [3]:
### Document structure
from langchain_core.documents import Document

In [5]:
# Hand-built Document: the text goes in page_content, descriptive fields in metadata.
doc_metadata={
    "source":"example.txt",
    "pages":1,
    "author":"Palak Makhija",
    "date_created":"16-10-2025",
}
doc=Document(
    page_content="This is the main content I am using to create RAG.",
    metadata=doc_metadata,
)
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Palak Makhija', 'date_created': '16-10-2025'}, page_content='This is the main content I am using to create RAG.')

In [6]:
## Make sure the folder that will hold the sample text files exists
import os
text_files_dir="../data/text_files"
# exist_ok=True keeps this cell safe to re-run when the folder is already there.
os.makedirs(text_files_dir,exist_ok=True)

In [8]:
# Sample corpus for the loaders below. Each text is written to disk verbatim,
# including the leading indentation carried by the triple-quoted literals.
python_intro_text=""" Python programming Introduction.
    Python is an interpreted, object-oriented, high-level programming language with dynamic semantics.
    Its high-level built in data structures, combined with dynamic typing and dynamic binding, make it very attractive for Rapid Application Development, 
    as well as for use as a scripting or glue language to connect existing components together.
    Python's simple, easy to learn syntax emphasizes readability and therefore reduces the cost of program maintenance.
    Python supports modules and packages, which encourages program modularity and code reuse. 
    The Python interpreter and the extensive standard library are available in source or binary form without charge 
    for all major platforms, and can be freely distributed.

    Often, programmers fall in love with Python because of the increased productivity it provides.
    Since there is no compilation step, the edit-test-debug cycle is incredibly fast. 
    Python programs is easy: a bug or bad input will never cause a segmentation fault.
    Instead, when the interpreter discovers an error, it raises an exception.
    When the program doesn't catch the exception, the interpreter prints a stack trace.
    A source level debugger allows inspection of local and global variables, evaluation of arbitrary expressions,
    setting breakpoints, stepping through the code a line at a time, and so on.
    The debugger is written in Python itself, testifying to Python's introspective power.
    On the other hand, often the quickest way to debug a program is to add a few print 
    statements to the source: the fast edit-test-debug cycle makes this simple approach very effective."""

machine_learning_text=""" Machine Learning Basics
    Machine Learning is mainly divided into three core types: Supervised, Unsupervised and Reinforcement 
    Learning along with two additional types, Semi-Supervised and Self-Supervised Learning.

    Supervised Learning: Trains models on labeled data to predict or classify new, unseen data.
    Unsupervised Learning: Finds patterns or groups in unlabeled data, like clustering or dimensionality reduction.
    Reinforcement Learning: Learns through trial and error to maximize rewards, ideal for decision-making tasks."""

# Destination path -> file content. The file name spelling "machine_learnig" is kept as-is.
sample_texts={
    "../data/text_files/python_intro.txt":python_intro_text,
    "../data/text_files/machine_learnig.txt":machine_learning_text,
}

# Write every entry as UTF-8; mode 'w' replaces a file that already exists.
for target_path in sample_texts:
    with open(target_path,mode='w',encoding='utf-8') as out_file:
        out_file.write(sample_texts[target_path])
print("✅ Sample text files created.")

✅ Sample text files created.


In [11]:
## TextLoader: read one text file into a list of Document objects
# Import from langchain_community, the same package the DirectoryLoader import
# in this notebook uses. The older `langchain.document_loaders` path is a
# deprecated re-export that emits warnings and is not available in newer
# LangChain releases.
from langchain_community.document_loaders import TextLoader
loader=TextLoader("../data/text_files/python_intro.txt",encoding="utf-8")
# load() returns a list of Documents; metadata['source'] holds the file path.
document=loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content=" Python programming Introduction.\n    Python is an interpreted, object-oriented, high-level programming language with dynamic semantics.\n    Its high-level built in data structures, combined with dynamic typing and dynamic binding, make it very attractive for Rapid Application Development, \n    as well as for use as a scripting or glue language to connect existing components together.\n    Python's simple, easy to learn syntax emphasizes readability and therefore reduces the cost of program maintenance.\n    Python supports modules and packages, which encourages program modularity and code reuse. \n    The Python interpreter and the extensive standard library are available in source or binary form without charge \n    for all major platforms, and can be freely distributed.\n\n    Often, programmers fall in love with Python because of the increased productivity it provides.\n    Since there is no comp

In [12]:
## Directory loader: pick up every .txt file under the text folder
from langchain_community.document_loaders import DirectoryLoader
# Keyword arguments handed to the loader class for each matched file.
text_loader_kwargs={'encoding':'utf-8'}
dir_loader=DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt",  # pattern to match files
    loader_cls=TextLoader,  # loader class to use
    loader_kwargs=text_loader_kwargs,
    show_progress=False,
)
documents=dir_loader.load()
documents

[Document(metadata={'source': '..\\data\\text_files\\machine_learnig.txt'}, page_content=' Machine Learning Basics\n    Machine Learning is mainly divided into three core types: Supervised, Unsupervised and Reinforcement \n    Learning along with two additional types, Semi-Supervised and Self-Supervised Learning.\n\n    Supervised Learning: Trains models on labeled data to predict or classify new, unseen data.\n    Unsupervised Learning: Finds patterns or groups in unlabeled data, like clustering or dimensionality reduction.\n    Reinforcement Learning: Learns through trial and error to maximize rewards, ideal for decision-making tasks.'),
 Document(metadata={'source': '..\\data\\text_files\\python_intro.txt'}, page_content=" Python programming Introduction.\n    Python is an interpreted, object-oriented, high-level programming language with dynamic semantics.\n    Its high-level built in data structures, combined with dynamic typing and dynamic binding, make it very attractive for Rap

In [None]:
## Load every PDF file found under the pdf folder
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader
pdf_loader_options=dict(
    glob="**/*.pdf",  # pattern to match files
    loader_cls=PyMuPDFLoader,  # loader class to use
    show_progress=False,
)
# Rebinds dir_loader, which previously pointed at the text-file loader.
dir_loader=DirectoryLoader("../data/pdf",**pdf_loader_options)
# NOTE(review): the recorded output shows empty page_content for the scanned
# PDF pages - presumably they have no text layer and need OCR; confirm.
pdf_documents=dir_loader.load()
pdf_documents

[Document(metadata={'producer': 'Adobe Scan for Android 25.05.22-google-dynamic', 'creator': 'Adobe Scan for Android 25.05.22-google-dynamic', 'creationdate': '', 'source': '..\\data\\pdf\\compiler_design(notes) (1).pdf', 'file_path': '..\\data\\pdf\\compiler_design(notes) (1).pdf', 'total_pages': 69, 'format': 'PDF 1.3', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content=''),
 Document(metadata={'producer': 'Adobe Scan for Android 25.05.22-google-dynamic', 'creator': 'Adobe Scan for Android 25.05.22-google-dynamic', 'creationdate': '', 'source': '..\\data\\pdf\\compiler_design(notes) (1).pdf', 'file_path': '..\\data\\pdf\\compiler_design(notes) (1).pdf', 'total_pages': 69, 'format': 'PDF 1.3', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 1}, page_content=''),
 Document(metadata={'producer': 'Adobe Scan fo

: 