# LangChain Document Loaders

LangChain has many document loaders, some of them are listed below 


<p align="center">
<img src="image1.png" />
</p>

## PDF

In [8]:
# %pip install pypdf
# %pip install langchain
# %pip install langchain_community

In [9]:
from langchain.document_loaders import PyPDFLoader

In [None]:
loader = PyPDFLoader("path/to/pdf.pdf")
pages = loader.load()
page = pages[0]
print(page.page_content[0:500])
page.metadata

## YouTube

In [None]:
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

In [10]:
# ! pip install yt_dlp
# ! pip install pydub

In [None]:
url="https://www.youtube.com/watch?v=jGwO_UgTS7I"
save_dir="docs/youtube/"
loader = GenericLoader(
    YoutubeAudioLoader([url],save_dir),
    OpenAIWhisperParser()
)
docs = loader.load()

In [None]:
docs[0].page_content[0:500]

## URLs

In [None]:
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://github.com/basecamp/handbook/blob/master/37signals-is-you.md")
docs = loader.load()
print(docs[0].page_content[:500])

## Notion

For this you must first export as Markdown and CSV

In [None]:
from langchain.document_loaders import NotionDirectoryLoader
loader = NotionDirectoryLoader("docs/Notion_DB")
docs = loader.load()

In [None]:
print(docs[0].page_content[0:200])

In [None]:
docs[0].metadata