DATA INGESTION

In [1]:
###Document Structure

from langchain_core.documents import Document

In [2]:
# Build one LangChain Document by hand: a text payload plus free-form metadata.
doc = Document(
    metadata=dict(
        source="example.txt",
        pages=1,
        author="Bhatia Tanuj",
        date_created="24/11/2025",
    ),
    page_content="This is the main content I am using to create RAG",
)

# Bare last expression so the notebook displays the Document's repr.
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Bhatia Tanuj', 'date_created': '24/11/2025'}, page_content='This is the main content I am using to create RAG')

In [3]:
## Make sure the folder that will hold the sample .txt files exists
import os

# exist_ok=True keeps this cell re-runnable when the folder is already there.
os.makedirs(name="../data/text_files", exist_ok=True)

In [4]:
# Maps each output path to the exact text that should be written there.
sample_text = {
    "../data/text_files/python_intro.txt": (
        """Python Programming Introduction
    Python is a high-level, interpreted programming language known for its simplicity and readability. It was created by Guido van 
    Rossum and first released in 1991, emphasizing code readability through clear syntax and indentation.

Key Features of Python
Easy to Learn and Use: Python's syntax is simple and resembles natural language, making it accessible for beginners.

Versatile and Flexible: Used for web development, data analysis, machine learning, automation, scripting, and more.

Rich Libraries and Frameworks: Has a vast standard library and numerous third-party packages for various tasks.

Cross-platform: Runs seamlessly on Windows, macOS, Linux, and other operating systems.

Interpreted Language: Executes code line-by-line, which simplifies debugging and development.

Why Python Is Popular
Its syntax reduces the complexity of writing code.

Large community support and extensive documentation.

Suitable for both small scripts and large-scale enterprise applications.

Widely used in emerging technologies like AI, data science, and automation.

In essence, Python’s combination of simplicity, power, and community support makes it one of the most popular programming languages today.

"""
    ),
}

# Write every entry to disk as UTF-8; mode "w" overwrites a file that already exists.
for filepath, content in sample_text.items():
    with open(filepath, mode="w", encoding="utf-8") as out_file:
        out_file.write(content)

print("Sample text files created!!")

Sample text files created!!


In [5]:
### TextLoader: read a single text file
from langchain_community.document_loaders import TextLoader

loader = TextLoader(
    file_path="../data/text_files/python_intro.txt",
    encoding="utf-8",
)
# load() hands back a list (the printed output below starts with "[Document(...").
document = loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content="Python Programming Introduction\n    Python is a high-level, interpreted programming language known for its simplicity and readability. It was created by Guido van \n    Rossum and first released in 1991, emphasizing code readability through clear syntax and indentation.\n\nKey Features of Python\nEasy to Learn and Use: Python's syntax is simple and resembles natural language, making it accessible for beginners.\n\nVersatile and Flexible: Used for web development, data analysis, machine learning, automation, scripting, and more.\n\nRich Libraries and Frameworks: Has a vast standard library and numerous third-party packages for various tasks.\n\nCross-platform: Runs seamlessly on Windows, macOS, Linux, and other operating systems.\n\nInterpreted Language: Executes code line-by-line, which simplifies debugging and development.\n\nWhy Python Is Popular\nIts syntax reduces the complexity of writing code.\n\

In [8]:
### DirectoryLoader

from langchain_community.document_loaders import DirectoryLoader

# Load the .txt files found in the directory, handing each one to TextLoader.
# TextLoader itself is imported in the TextLoader cell, which must run first.
loader = DirectoryLoader(
    path="../data/text_files",
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs=dict(encoding="utf-8"),
)
documents = loader.load()
print(documents)

[Document(metadata={'source': '..\\data\\text_files\\python_intro.txt'}, page_content="Python Programming Introduction\n    Python is a high-level, interpreted programming language known for its simplicity and readability. It was created by Guido van \n    Rossum and first released in 1991, emphasizing code readability through clear syntax and indentation.\n\nKey Features of Python\nEasy to Learn and Use: Python's syntax is simple and resembles natural language, making it accessible for beginners.\n\nVersatile and Flexible: Used for web development, data analysis, machine learning, automation, scripting, and more.\n\nRich Libraries and Frameworks: Has a vast standard library and numerous third-party packages for various tasks.\n\nCross-platform: Runs seamlessly on Windows, macOS, Linux, and other operating systems.\n\nInterpreted Language: Executes code line-by-line, which simplifies debugging and development.\n\nWhy Python Is Popular\nIts syntax reduces the complexity of writing code.

In [16]:
from langchain_community.document_loaders import PyMuPDFLoader

### PDF loader
# Load the .pdf files found in the directory, parsing each one with PyMuPDFLoader.
# NOTE: this rebinds `loader` and `documents`, so the text-file results from the
# DirectoryLoader cell are no longer reachable under those names.
loader = DirectoryLoader(
    path="../data/pdf",
    glob="*.pdf",
    loader_cls=PyMuPDFLoader,
)
documents = loader.load()
documents

[Document(metadata={'producer': 'Microsoft® PowerPoint® 2019', 'creator': 'Microsoft® PowerPoint® 2019', 'creationdate': '2025-11-24T18:05:36+05:30', 'source': '..\\data\\pdf\\Computer_vision_narrated.pdf', 'file_path': '..\\data\\pdf\\Computer_vision_narrated.pdf', 'total_pages': 4, 'format': 'PDF 1.7', 'title': 'Computer Vision Presentation', 'author': 'Käch, Timon', 'subject': '', 'keywords': '', 'moddate': '2025-11-24T18:05:36+05:30', 'trapped': '', 'modDate': "D:20251124180536+05'30'", 'creationDate': "D:20251124180536+05'30'", 'page': 0}, page_content='COMPUTER \nVISION \nPRESENTATION'),
 Document(metadata={'producer': 'Microsoft® PowerPoint® 2019', 'creator': 'Microsoft® PowerPoint® 2019', 'creationdate': '2025-11-24T18:05:36+05:30', 'source': '..\\data\\pdf\\Computer_vision_narrated.pdf', 'file_path': '..\\data\\pdf\\Computer_vision_narrated.pdf', 'total_pages': 4, 'format': 'PDF 1.7', 'title': 'Computer Vision Presentation', 'author': 'Käch, Timon', 'subject': '', 'keywords': 