In [None]:
from langchain_core.documents import Document

### Understanding document structures

In [None]:
# Build a Document by hand: the text goes in page_content and the descriptive
# fields go in a plain metadata dict.
doc_metadata = {
    "source": "example_source.pdf",
    "page": 1,
    "author": "John Doe",
    "date_created": "2024-06-15",
}
doc = Document(
    page_content="This is the content of the document.",
    metadata=doc_metadata,
)
doc

Document(metadata={'source': 'example_source.pdf', 'page': 1, 'author': 'John Doe', 'date_created': '2024-06-15'}, page_content='This is the content of the document.')

In [None]:
### creating a simple text document
import os

# Make sure the target folder exists before any file is written into it.
os.makedirs("../data/text_files", exist_ok=True)

# Maps each output path to the exact text that will be written to that file.
# The closing triple quotes sit directly after the last sentence of each text
# so that no trailing blank lines or indentation end up in the files (and,
# after loading, in the documents' page_content).
sample_texts = {
    "../data/text_files/python_intro.txt": """Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems""",
}

In [None]:
# Write every sample text to disk at the path it is keyed by.
# encoding="utf-8" is passed explicitly because the loaders further down read
# these files back as UTF-8; without it open() falls back to the platform's
# locale encoding, which can differ (e.g. on Windows) and would corrupt any
# non-ASCII text.
for filepath, content in sample_texts.items():
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)

print("Sample text documents created.")

Sample text documents created.


In [None]:
### using text loaders - for single text files
from langchain_community.document_loaders import TextLoader

In [None]:
# Load one text file, decoded as UTF-8. The displayed result is a list
# holding a single Document for that file.
python_intro_path = "../data/text_files/python_intro.txt"
loader = TextLoader(python_intro_path, encoding="utf-8")
document = loader.load()
document

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]

In [None]:
## using directory loader - for multiple text files
from langchain_community.document_loaders import DirectoryLoader

# Load every file in the folder that matches *.txt. Each matching file is
# handed to TextLoader, which receives the UTF-8 encoding option below.
text_loader_options = {"encoding": "utf-8"}
directory_loader = DirectoryLoader(
    "../data/text_files",
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs=text_loader_options,
)
documents = directory_loader.load()
documents

[Document(metadata={'source': '..\\data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems\n\n\n    '),
 Document(metadata={'source': '..\\data\\text_files\\python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popu

In [None]:
## using PDF loaders
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader

# Parse the example PDF with PyPDFLoader and display the loaded documents.
example_pdf_path = "../data/pdf_files/example1.pdf"
pypdf = PyPDFLoader(example_pdf_path)
documents_pypdf = pypdf.load()
documents_pypdf

[Document(metadata={'producer': 'Prince 16 (www.princexml.com)', 'creator': 'PyPDF', 'creationdate': '', 'source': '../data/pdf_files/example1.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='USENIX Example Paper\nPekka Nikander\nAalto University\nJane-Ellen Long\nUSENIX Association\nAbstract\nThis is an example for a USENIX paper, in the form\nof an HTML/CSS template. Being heavily self -ref-\nerential, this template illustrates the f eatures in-\ncluded in this template. It is e xpected that the\nprospective authors using HTML/CSS would create\na ne w document based on this template, remo ve\nthe content, and start writing their paper.\nNote that in this template, you may have a mul-\nti-paragraph abstract. However, that it is no t nec-\nessarily a good prac tice. Try to k eep your abstract\nin one paragraph, and remember that the op timal\nlength for an abstract is 200-300 words.\n1 Introduction\nFor the purposes of USENIX c onference publica-\ntions, the authors

In [None]:
# Parse the same PDF with PyMuPDFLoader (image extraction enabled).
# The loader and its result get their own names so they no longer overwrite
# the PyPDFLoader objects from the previous cell; both extractions stay
# available for side-by-side comparison.
pymupdf_loader = PyMuPDFLoader("../data/pdf_files/example1.pdf", extract_images=True)
documents_pymupdf = pymupdf_loader.load()
documents_pymupdf

[Document(metadata={'producer': 'Prince 16 (www.princexml.com)', 'creator': '', 'creationdate': '', 'source': '../data/pdf_files/example1.pdf', 'file_path': '../data/pdf_files/example1.pdf', 'total_pages': 3, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content='USENIX Example Paper\nPekka Nikander\nAalto University\nJane-Ellen Long\nUSENIX Association\nAbstract\nThis is an example for a USENIX paper, in the form\nof an HTML/CSS template. Being heavily self-ref-\nerential, this template illustrates the features in-\ncluded in this template. It is expected that the\nprospective authors using HTML/CSS would create\na new document based on this template, remove\nthe content, and start writing their paper.\nNote that in this template, you may have a mul-\nti-paragraph abstract. However, that it is not nec-\nessarily a good practice. Try to keep your abstract\nin one paragraph

In [None]:
from langchain_community.document_loaders import CSVLoader

# Load the CSV file. In the displayed result each Document carries a `row`
# index in its metadata and "column: value" lines as its content.
# The variables are named after what they hold (a CSV loader and its
# documents) instead of reusing the PDF-related names.
# NOTE(review): the displayed rows contain personal job-application details;
# consider clearing this cell's output before sharing the notebook.
csv_loader = CSVLoader("../data/applications.csv")
documents_csv = csv_loader.load()
documents_csv

[Document(metadata={'source': '../data/applications.csv', 'row': 0}, page_content='company: revolute\ncountry: \njob posted date: \nclosing date: \napplication date: \nlink to description: \ncv: \ncoverletter: \napplication status: \napplied through: \nfound through: \ncomments: \nfeedback: '),
 Document(metadata={'source': '../data/applications.csv', 'row': 1}, page_content='company: aberdeen 1\ncountry: scotland\njob posted date: 26/10/2025\nclosing date: \napplication date: 28/10/2025\nlink to description: https://uk.whatjobs.com/jobs/graduate-data-scientist/aberdeen?id=346632208\ncv: CV_tharushi_suwaris_aberdeen1\ncoverletter: \napplication status: \napplied through: whatjob\nfound through: whatjob\ncomments: \nfeedback: '),
 Document(metadata={'source': '../data/applications.csv', 'row': 2}, page_content='company: aberdeen 2\ncountry: scotland\njob posted date: 26/10/2025\nclosing date: \napplication date: 28/10/2025\nlink to description: https://uk.whatjobs.com/jobs/graduate-data

In [None]:
from langchain_community.document_loaders import WebBaseLoader
import os

# NOTE(review): the USER_AGENT override is left disabled, as in the original.
# It sits after the WebBaseLoader import; if it is re-enabled, confirm whether
# the library needs the variable set before that import for it to take effect.
#os.environ["USER_AGENT"] = "my-scraper/1.0"

# Fetch the web page and load it as documents, under names that describe
# what they hold.
web_loader = WebBaseLoader("https://pypdf.readthedocs.io/en/stable/user/extract-images.html")
documents_web = web_loader.load()

# This cell originally bound its results to `pypdf` / `documents_pypdf`.
# Keep those names bound to the same objects so any later cell that still
# reads them keeps working; remove once confirmed unused.
pypdf = web_loader
documents_pypdf = documents_web
documents_web

