In [1]:
from langchain_core.documents import Document

In [2]:
# A LangChain Document pairs the raw text (page_content) with a free-form
# metadata dictionary describing where that text came from.
doc_metadata = {
    "source": "file1.txt",
    "author": "John Doe",
    "date": "2023-10-01",
}
doc = Document(
    page_content="This is the main content of the document that will be embedded and searched",
    metadata=doc_metadata,
)

# Show both halves of the document so the structure is visible in the output.
print("This a basic LangChain Document format")
print(f"Content : {doc.page_content}")
print(f"Metadata : {doc.metadata}")

This a basic LangChain Document format
Content : This is the main content of the document that will be embedded and searched
Metadata : {'source': 'file1.txt', 'author': 'John Doe', 'date': '2023-10-01'}


### Text Ingestion and Text Parsing

In [3]:
import os

# Make sure the folder for the sample text files exists; exist_ok=True turns
# this into a no-op when the folder is already there, so the cell can be re-run.
os.makedirs(os.path.join("data", "text_files"), exist_ok=True)

In [4]:
from pathlib import Path

# Maps each sample file path to the text that is written into that file.
sample_text = {
    "data/text_files/python.txt": "Python is a high-level, interpreted programming language known for its readability and versatility.",
    "data/text_files/java.txt": "Java is a class-based, object-oriented programming language designed to have as few implementation dependencies as possible.",
}

for file_path, content in sample_text.items():
    # Create the parent folder on demand so writing does not fail with
    # FileNotFoundError when the target directory is missing (e.g. on a
    # fresh checkout or after the data folder has been deleted).
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
    # Mode "w" overwrites any existing file, which keeps the cell idempotent.
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(content)

In [5]:
from langchain.document_loaders import TextLoader

# Load a single text file. load() returns a list of Document objects, so the
# loaded file is accessed through index 0 below.
java_file = "data/text_files/java.txt"
loader = TextLoader(java_file, encoding="utf-8")
document = loader.load()

# Inspect the loaded document: its metadata first, then the text itself.
first_loaded = document[0]
print(first_loaded.metadata)
print(first_loaded.page_content)

{'source': 'data/text_files/java.txt'}
Java is a class-based, object-oriented programming language designed to have as few implementation dependencies as possible.


In [6]:
from langchain.document_loaders import DirectoryLoader, TextLoader

# Load every .txt file under data/text_files/ (searched recursively via the
# "**" glob), reading each one with TextLoader as UTF-8.
document_loader = DirectoryLoader(
    "data/text_files/",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)

# Sort by source path so the order of `documents` is deterministic. The order
# in which files are discovered on disk is not guaranteed, and later cells
# index documents[0] directly, so an unstable order would change their output.
documents = sorted(document_loader.load(), key=lambda d: d.metadata["source"])
print(documents)


[Document(metadata={'source': 'data\\text_files\\java.txt'}, page_content='Java is a class-based, object-oriented programming language designed to have as few implementation dependencies as possible.'), Document(metadata={'source': 'data\\text_files\\python.txt'}, page_content='Python is a high-level, interpreted programming language known for its readability and versatility.')]


### Text Splitting

In [7]:
from langchain.text_splitter import CharacterTextSplitter

print("Character Text Splitter :")

# Split on single spaces, measuring chunk length with len (i.e. characters).
# With chunk_size=10 any word longer than ten characters cannot fit, which is
# why the splitter reports chunks that are longer than the specified size.
splitter_options = {
    "separator": " ",
    "chunk_size": 10,
    "chunk_overlap": 4,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**splitter_options)

# Split the text of the first loaded document and show a summary.
first_page_content = documents[0].page_content
texts = text_splitter.split_text(first_page_content)
print(f"Splitted into {len(texts)} chunks")
print(texts[0])

Created a chunk of size 12, which is longer than the specified 10
Created a chunk of size 15, which is longer than the specified 10
Created a chunk of size 11, which is longer than the specified 10
Created a chunk of size 14, which is longer than the specified 10
Created a chunk of size 12, which is longer than the specified 10


Character Text Splitter :
Splitted into 12 chunks
Java is a


### Recursive Character Text Splitter

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

print("Recursive Character Text Splitter :")

# Only a single-space separator is supplied, and chunk length is measured with
# len (characters): chunks of up to 30 characters with an overlap setting of 8.
splitter_options = {
    "separators": [" "],
    "chunk_size": 30,
    "chunk_overlap": 8,
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the text of the first loaded document and report the chunk count.
first_page_content = documents[0].page_content
texts = text_splitter.split_text(first_page_content)
print(f"Splitted into {len(texts)} chunks")

# Print the first three chunks, one per line.
for chunk_index in range(3):
    print(texts[chunk_index])

Recursive Character Text Splitter :
Splitted into 5 chunks
Java is a class-based,
object-oriented programming
language designed to have as


### Token-Based Splitting

In [None]:
from langchain.text_splitter import TokenTextSplitter

# NOTE(review): chunk_size / chunk_overlap are presumably counted in tokens
# rather than characters for this splitter — confirm against the LangChain docs.
splitter_options = {"chunk_size": 10, "chunk_overlap": 4}
text_splitter = TokenTextSplitter(**splitter_options)

# Split the text of the first loaded document and show a summary.
first_page_content = documents[0].page_content
texts = text_splitter.split_text(first_page_content)
print(f"Splitted into {len(texts)} chunks")
print(texts[0])