# TextLoader

In [1]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("./examples/sql.md")
loader.load()

[Document(page_content="## 创建表\n\n```sql\n# 分区表\ncreate table test_t2(words string,frequency string) partitioned by (partdate string) row format delimited fields terminated by ',';\n\n# orc表\nCREATE TABLE IF NOT EXISTS bank.account_orc (\n  `id_card` int,\n  `tran_time` string,\n  `name` string,\n  `cash` int\n  )\nstored as orc;\n```\n\n# 插入数据\n\n```sql\ninsert into tablename values('col1', 'col2');\n\n\nINSERT INTO table_name (column1, column2, column3)\nVALUES\n(value1, value2, value3),\n(value4, value5, value6),\n(value7, value8, value9);\n\n\nINSERT OVERWRITE TABLE tb\nselect * from tb2\n;\n```", metadata={'source': './examples/sql.md'})]

# CSVLoader

In [3]:
from langchain_community.document_loaders.csv_loader import CSVLoader


loader = CSVLoader(file_path='./examples/test.csv')
loader.load()

[Document(page_content='id: 1\nname: 张三\ndegree: 本科', metadata={'source': './examples/test.csv', 'row': 0}),
 Document(page_content='id: 2\nname: 李四\ndegree: 硕士', metadata={'source': './examples/test.csv', 'row': 1})]

In [7]:
loader = CSVLoader(file_path='./examples/no_fields_name.csv', csv_args={
    'delimiter': ',',
    'quotechar': '"',
    'fieldnames': ['id', 'name', 'degree']
    }, 
    source_column='id'
)

loader.load()

[Document(page_content='id: 1\nname: 张三\ndegree: 本科', metadata={'source': '1', 'row': 0}),
 Document(page_content='id: 2\nname: 李四\ndegree: 硕士', metadata={'source': '2', 'row': 1})]

# PyPDFLoader

In [8]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("examples/sql.pdf")
pages = loader.load()
pages

Previous trailer can not be read ("invalid literal for int() with base 10: b'/Root'",)
Object 14 0 found
Object 3 0 found
Object 2 0 found
Object 5 0 found
Object 7 0 found
Object 21 0 found
Object 20 0 found
Object 22 0 found
Object 8 0 found
Object 9 0 found
Object 10 0 found
Object 30 0 found
Object 29 0 found
Object 31 0 found
Object 12 0 found
Object 35 0 found
Object 34 0 found
Object 4 0 found


[Document(page_content="创建表 \n插⼊数据 # 分区表create table test_t2(words string,frequency string) partitioned by (partdate string) row format delimited fields terminated by ',';# orc表CREATE TABLE IF NOT EXISTS bank.account_orc ( \xa0`id_card` int, \xa0`tran_time` string, \xa0`name` string, \xa0`cash` int \xa0)stored as orc;insert into tablename values('col1', 'col2');INSERT INTO table_name (column1, column2, column3)VALUES(value1, value2, value3),(value4, value5, value6),(value7, value8, value9);INSERT OVERWRITE TABLE tbselect * from tb2;", metadata={'source': 'examples/sql.pdf', 'page': 0})]

# 自定义文档加载器

In [16]:
from typing import AsyncIterator, Iterator

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document


class CustomDocumentLoader(BaseLoader):
    """An example document loader that reads a file line by line."""

    def __init__(self, file_path: str) -> None:
        """Initialize the loader with a file path.

        Args:
            file_path: The path to the file to load.
        """
        self.file_path = file_path

    def lazy_load(self) -> Iterator[Document]:  # <-- Does not take any arguments
        """A lazy loader that reads a file line by line.

        When you're implementing lazy load methods, you should use a generator
        to yield documents one by one.
        """
        with open(self.file_path, encoding="utf-8") as f:
            line_number = 0
            for line in f:
                if not line.strip():
                    continue
                
                yield Document(
                    page_content=line,
                    metadata={"line_number": line_number, "source": self.file_path},
                )
                line_number += 1

    # alazy_load is OPTIONAL.
    # If you leave out the implementation, a default implementation which delegates to lazy_load will be used!
    async def alazy_load(
        self,
    ) -> AsyncIterator[Document]:  # <-- Does not take any arguments
        """An async lazy loader that reads a file line by line."""
        # Requires aiofiles
        # Install with `pip install aiofiles`
        # https://github.com/Tinche/aiofiles
        import aiofiles

        async with aiofiles.open(self.file_path, encoding="utf-8") as f:
            line_number = 0
            async for line in f:
                yield Document(
                    page_content=line,
                    metadata={"line_number": line_number, "source": self.file_path},
                )
                line_number += 1

In [17]:
loader = CustomDocumentLoader('./examples/sql.md')
for doc in loader.lazy_load():
    print(type(doc), ' | ', doc)

<class 'langchain_core.documents.base.Document'>  |  page_content='## 创建表\n' metadata={'line_number': 0, 'source': './examples/sql.md'}
<class 'langchain_core.documents.base.Document'>  |  page_content='```sql\n' metadata={'line_number': 1, 'source': './examples/sql.md'}
<class 'langchain_core.documents.base.Document'>  |  page_content='# 分区表\n' metadata={'line_number': 2, 'source': './examples/sql.md'}
<class 'langchain_core.documents.base.Document'>  |  page_content="create table test_t2(words string,frequency string) partitioned by (partdate string) row format delimited fields terminated by ',';\n" metadata={'line_number': 3, 'source': './examples/sql.md'}
<class 'langchain_core.documents.base.Document'>  |  page_content='# orc表\n' metadata={'line_number': 4, 'source': './examples/sql.md'}
<class 'langchain_core.documents.base.Document'>  |  page_content='CREATE TABLE IF NOT EXISTS bank.account_orc (\n' metadata={'line_number': 5, 'source': './examples/sql.md'}
<class 'langchain_cor

In [18]:
async for doc in loader.alazy_load():
    print(type(doc), ' | ', doc)

<class 'langchain_core.documents.base.Document'>  |  page_content='## 创建表\n' metadata={'line_number': 0, 'source': './examples/sql.md'}
<class 'langchain_core.documents.base.Document'>  |  page_content='\n' metadata={'line_number': 1, 'source': './examples/sql.md'}
<class 'langchain_core.documents.base.Document'>  |  page_content='```sql\n' metadata={'line_number': 2, 'source': './examples/sql.md'}
<class 'langchain_core.documents.base.Document'>  |  page_content='# 分区表\n' metadata={'line_number': 3, 'source': './examples/sql.md'}
<class 'langchain_core.documents.base.Document'>  |  page_content="create table test_t2(words string,frequency string) partitioned by (partdate string) row format delimited fields terminated by ',';\n" metadata={'line_number': 4, 'source': './examples/sql.md'}
<class 'langchain_core.documents.base.Document'>  |  page_content='\n' metadata={'line_number': 5, 'source': './examples/sql.md'}
<class 'langchain_core.documents.base.Document'>  |  page_content='# orc