# Data Ingestion

In [4]:
from langchain_core.documents import Document

In [5]:
# Hand-build a single Document: `page_content` holds the text itself and
# `metadata` holds free-form key/value details describing that text.
doc_metadata = {
    "source": "example.txt",
    "pages": 1,
    "author": "John Doe",
    "date_created": "2024-06-15",
}
doc = Document(page_content="This is the content of the document.", metadata=doc_metadata)

# Bare last expression so the notebook shows the Document's repr.
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'John Doe', 'date_created': '2024-06-15'}, page_content='This is the content of the document.')

In [6]:
## Prepare the folder that the sample .txt files will be written into.
# `exist_ok=True` means re-running this cell does not fail when the
# directory is already there.
import os

os.makedirs(name="../data/text_files", exist_ok=True)

In [13]:
# Maps each destination path to the exact text that is written to it.
sample_texts = {
    "../data/text_files/doc1.txt": """LangChain is a framework for developing applications powered by language models.
It enables developers to build applications that can understand and generate human-like text.
LangChain provides tools for prompt management, memory, and integration with various data sources.
    """,
    "../data/text_files/doc2.txt": """Python is a versatile programming language known for its readability and ease of use.
It is widely used in web development, data analysis, artificial intelligence, and scientific computing.
Python has a large standard library and a vibrant ecosystem of third-party packages.
    """,
}

# Write every sample out as UTF-8. Mode "w" replaces any file that is
# already present at the same path.
for file_path in sample_texts:
    content = sample_texts[file_path]
    with open(file_path, mode="w", encoding="utf-8") as f:
        f.write(content)

print("Sample text files created.")

Sample text files created.


In [14]:
### TextLoader
# Read one text file from disk. `load()` hands back a list of Document
# objects whose metadata records the source path.
from langchain_community.document_loaders import TextLoader

loader = TextLoader(file_path="../data/text_files/doc1.txt", encoding="utf-8")
document = loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/doc1.txt'}, page_content='LangChain is a framework for developing applications powered by language models.\nIt enables developers to build applications that can understand and generate human-like text.\nLangChain provides tools for prompt management, memory, and integration with various data sources.\n    ')]


In [16]:
### Directory Loader
from langchain_community.document_loaders import DirectoryLoader

dir_loader = DirectoryLoader(
    "../data/text_files", 
    glob="**/*.txt", 
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True
)

documents = dir_loader.load()
print(documents)

100%|██████████| 2/2 [00:00<00:00, 1570.31it/s]

[Document(metadata={'source': '..\\data\\text_files\\doc1.txt'}, page_content='LangChain is a framework for developing applications powered by language models.\nIt enables developers to build applications that can understand and generate human-like text.\nLangChain provides tools for prompt management, memory, and integration with various data sources.\n    '), Document(metadata={'source': '..\\data\\text_files\\doc2.txt'}, page_content='Python is a versatile programming language known for its readability and ease of use.\nIt is widely used in web development, data analysis, artificial intelligence, and scientific computing.\nPython has a large standard library and a vibrant ecosystem of third-party packages.\n    ')]





In [22]:
# Load every PDF under ../data/pdf, parsing each file with PyMuPDFLoader.
# NOTE(review): `PyPDFLoader` is imported but not used in this cell, and
# `dir_loader` is rebound here (it previously held the text-file loader).
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

dir_loader = DirectoryLoader(
    path="../data/pdf",
    glob="**/*.pdf",
    show_progress=True,  # progress bar while files are being read
    loader_cls=PyMuPDFLoader,
)

pdf_documents = dir_loader.load()

# Bare last expression: the notebook renders the full list of loaded Documents.
pdf_documents


100%|██████████| 4/4 [00:00<00:00, 16.65it/s]


[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2022-11-08T01:54:42+00:00', 'source': '..\\data\\pdf\\2211.03533v1.pdf', 'file_path': '..\\data\\pdf\\2211.03533v1.pdf', 'total_pages': 12, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2022-11-08T01:54:42+00:00', 'trapped': '', 'modDate': 'D:20221108015442Z', 'creationDate': 'D:20221108015442Z', 'page': 0}, page_content='A Multi-task Model for Sentiment Aided Stance Detection of\nClimate Change Tweets\nApoorva Upadhyaya, Marco Fisichella, Wolfgang Nejdl\nL3S Research Center, Leibniz Universit¨at Hannover, Hannover, Germany\nupadhyaya@l3s.de, mﬁsichella@l3s.de, nejdl@l3s.de\nAbstract\nClimate change has become one of the biggest challenges of\nour time. Social media platforms such as Twitter play an im-\nportant role in raising public awareness and spreading knowl-\nedge about the dangers of the current climate crisis. With the\nincreasing nu

In [23]:
# Inspect the class of the first loaded PDF entry (raises IndexError if the list is empty).
type(pdf_documents[0])

langchain_core.documents.base.Document