### Data Ingestion

In [21]:
### Document Structure 

from langchain_core.documents import Document

In [22]:
doc = Document(
    page_content= "this is the main text content I am using to crestr RAG",
    meta_data = {
        "source":"example.txt",
        "pages":1,
        "author":"M Saad",
        "date_created":"2025-11-7"
    } 
)

doc

Document(metadata={}, page_content='this is the main text content I am using to crestr RAG')

In [23]:
## create a simple text file

import os
os.makedirs("../data/text_files", exist_ok=True)

In [24]:
sample_texts = {
    "../data/text_files/python_intro.txt":"""Python Programing Introduction
    
    Python is a high-level, interpreted programming language known for its readability and versatility. 
    It supports multiple programming paradigms, including procedural, object-oriented, and functional programming.
    Python's extensive standard library and vibrant ecosystem of third-party packages make it suitable for a wide range of applications,
    from web development to data science and artificial intelligence. Created by Guido van Rossum and first released in 1991, 
    Python has become one of the most popular programming languages in the world.

    Key Features:
    -Easy to learn and use syntax
    -Extensive standard library
    -Cross-platform compatibility
    -Strong community support

    Python is widely used in various domains, including web development (Django, Flask), data scicence (Pandas, NumPy), artificial intelligence (TensorFlow, PyTorch), and automation.""",
       "../data/text_files/machine_learning_basics.txt":"""Machine Learning Basics

    
    Machine learning is a subset of artificial intelligence that focuses on the development of algorithms and statistical models that
    enable computers to perform specific tasks without explicit instructions. Instead, these systems learn from data and improve their p
    erformance over time.

    Types of Machine Learning:
    1. Supervised Learning: The model is trained on labeled data, where the input-output pairs are provided.
    2. Unsupervised Learning: The model is trained on unlabeled data and must find patterns or structures within the data.
    3. Reinforcement Learning: The model learns by interacting with an environment and receiving feedback in the form of rewards or penalties.

    Applications include image recognition, natural language processing, recommendation systems, and autonomous vehicles.

       """
}


for filepath,content in sample_texts.items():
    with open (filepath,"w",encoding = "utf-8") as f:
        f.write(content)

print("Sample text files created!!.")

Sample text files created!!.


Note: Above cell check weather Machine_learning and python_intro file exists or not. If not then it creates inside data/text_files folder

In [25]:
### TextLoader
from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/text_files/python_intro.txt",encoding="utf-8")
document = loader.load()


In [26]:
print(document)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content="Python Programing Introduction\n\n    Python is a high-level, interpreted programming language known for its readability and versatility. \n    It supports multiple programming paradigms, including procedural, object-oriented, and functional programming.\n    Python's extensive standard library and vibrant ecosystem of third-party packages make it suitable for a wide range of applications,\n    from web development to data science and artificial intelligence. Created by Guido van Rossum and first released in 1991, \n    Python has become one of the most popular programming languages in the world.\n\n    Key Features:\n    -Easy to learn and use syntax\n    -Extensive standard library\n    -Cross-platform compatibility\n    -Strong community support\n\n    Python is widely used in various domains, including web development (Django, Flask), data scicence (Pandas, NumPy), artificial intelligence (TensorFlo

In [27]:
### Directory Loader
from langchain_community.document_loaders import DirectoryLoader

## load all the text files from the directory
dir_loader = DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt", ##pattern to match files
    loader_cls=TextLoader,
    loader_kwargs= {'encoding':'utf-8'},
    show_progress= False

)

documents = dir_loader.load()
documents

[Document(metadata={'source': '..\\data\\text_files\\machine_learning_basics.txt'}, page_content='Machine Learning Basics\n\n\n    Machine learning is a subset of artificial intelligence that focuses on the development of algorithms and statistical models that\n    enable computers to perform specific tasks without explicit instructions. Instead, these systems learn from data and improve their p\n    erformance over time.\n\n    Types of Machine Learning:\n    1. Supervised Learning: The model is trained on labeled data, where the input-output pairs are provided.\n    2. Unsupervised Learning: The model is trained on unlabeled data and must find patterns or structures within the data.\n    3. Reinforcement Learning: The model learns by interacting with an environment and receiving feedback in the form of rewards or penalties.\n\n    Applications include image recognition, natural language processing, recommendation systems, and autonomous vehicles.\n\n       '),
 Document(metadata={'so

## Loading PDF Files

In [29]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

## load all the text files from the directory
dir_loader = DirectoryLoader(
    "../data/pdf_files",
    glob="**/*.pdf", ##pattern to match files
    loader_cls=PyMuPDFLoader,  ##loader class to use
    show_progress= False

)

pdf_documents = dir_loader.load()
pdf_documents

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-10-31T16:05:18+05:30', 'source': '..\\data\\pdf_files\\Al Rouf.pdf', 'file_path': '..\\data\\pdf_files\\Al Rouf.pdf', 'total_pages': 3, 'format': 'PDF 1.7', 'title': '', 'author': 'Saad Fahim', 'subject': '', 'keywords': '', 'moddate': '2025-10-31T16:05:18+05:30', 'trapped': '', 'modDate': "D:20251031160518+05'30'", 'creationDate': "D:20251031160518+05'30'", 'page': 0}, page_content="Al Rouf Lighting Technology Co. Ltd. \nOfficial Company Profile (Internal Document) \nPage 1: About Al Rouf & Our Identity \nOur Company \nEstablished in 2014, Al Rouf Lighting Technology Co. Ltd. is a leading, ISO 9001-2015 QMS \nand ISO 45001:2018 HSE certified company based in the Kingdom of Saudi Arabia. We are a \npremier provider of comprehensive, turnkey lighting solutions. \nOur commitment to quality and innovation has culminated in the establishment of the Alro

In [30]:
type(pdf_documents[0])

langchain_core.documents.base.Document

## Loading CSV Files

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from pathlib import Path

csv_path = Path("../data/csv_files/steamcharts.csv")

loader = CSVLoader(
    file_path=str(csv_path),
    csv_args={"delimiter": ",", "quotechar": '"'},  # change if ; or \t
    encoding="utf-8", 
)

data = loader.load()
data

[Document(metadata={'source': '..\\data\\csv_files\\steamcharts.csv', 'row': 0}, page_content='month: Sep-25\navg_players: 7805.25\ngain: 883.12\ngain_percent: 0.1276\npeak_players: 13254\nname: Counter-Strike\nsteam_appid: 10'),
 Document(metadata={'source': '..\\data\\csv_files\\steamcharts.csv', 'row': 1}, page_content='month: Aug-25\navg_players: 6922.13\ngain: -449.35\ngain_percent: -0.061\npeak_players: 12168\nname: Counter-Strike\nsteam_appid: 10'),
 Document(metadata={'source': '..\\data\\csv_files\\steamcharts.csv', 'row': 2}, page_content='month: Jul-25\navg_players: 7371.48\ngain: -833.5\ngain_percent: -0.1016\npeak_players: 13951\nname: Counter-Strike\nsteam_appid: 10'),
 Document(metadata={'source': '..\\data\\csv_files\\steamcharts.csv', 'row': 3}, page_content='month: Jun-25\navg_players: 8204.98\ngain: -847.53\ngain_percent: -0.09359999999999999\npeak_players: 15798\nname: Counter-Strike\nsteam_appid: 10'),
 Document(metadata={'source': '..\\data\\csv_files\\steamcharts