In [None]:
from langchain.docstore.document import Document
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document

## Document Loaders: how to load documents from a variety of sources

Combining language models with your own text data is a powerful way to `differentiate` them. The first step is to load that data into `Documents` - a fancy way of saying "pieces of text".

### `Copy Paste`

In [None]:
# Simplest case: wrap a raw string in a Document by passing it as `page_content`.
text = "this is a text I just copy pasted."

doc = Document(page_content=text)
doc  # bare last expression so the notebook displays the Document

In [None]:
# A Document can also carry a metadata dict alongside its text.
metadata = {
    "source": "internet",
    "date": "Friday",
}
doc = Document(metadata=metadata, page_content=text)
doc

### `CSV`

In [None]:
# Read the CSV (decoded as latin1) with the loader's default settings and
# keep whatever load() returns in `data`.
loader = CSVLoader(
    file_path="sms_spam.csv",
    encoding="latin1",
)
data = loader.load()

# Show the type of the returned container and of its first element.
(type(data), type(data[0]))

In [None]:
# Display the first element returned by loader.load().
data[0]

In [None]:
# Same file, but take each record's source from the "v1" column
# (the `spam/ham` category) rather than from the file name.
loader = CSVLoader(file_path="sms_spam.csv", encoding="latin1", source_column="v1")
data = loader.load()

data[0]

In [None]:
# The parsing itself can be customized through `csv_args` (delimiter, quote
# character, column names).
# NOTE(review): assumes the first two columns of sms_spam.csv are the
# spam/ham label ("v1" in the header) and the message text -- confirm
# against the file and extend `fieldnames` if it has more columns.
# Because `fieldnames` is given explicitly, the file's own header line is
# presumably read as an ordinary data row (csv.DictReader behaviour).
loader = CSVLoader(
    file_path="sms_spam.csv",
    encoding="latin1",
    csv_args={
        "delimiter": ",",
        "quotechar": '"',
        "fieldnames": ["label", "message"],
    },
)

### `PDF`

In [None]:
# Load the PDF and keep the result of load_and_split() in `pages`.
loader = PyPDFLoader(file_path="/content/Waves-Coordination.pdf")
pages = loader.load_and_split()

# Show the type of the returned container and of its first element.
(type(pages), type(pages[0]))

In [None]:
# Display the first element of `pages`.
pages[0]

### `More`

You can find more Document Loaders here: https://python.langchain.com/docs/modules/data_connection/document_loaders/