# 4.2 文本切块

In [2]:
from langchain.text_splitter import CharacterTextSplitter

# Sample input: three 4-character segments joined by "#" (replace with your own text).
text = "你好你好#你好你好#你好你好"

# Split on "#" with a 9-character chunk budget and no overlap between chunks.
# In the recorded output the first two segments end up in one 9-character
# chunk and the third segment forms its own chunk.
text_splitter = CharacterTextSplitter(separator="#", chunk_size=9, chunk_overlap=0)
docs = text_splitter.create_documents(texts=[text])
print(docs)

[Document(page_content='你好你好#你好你好'), Document(page_content='你好你好')]


In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Short sample sentence containing both punctuation separators used below.
text = "枣。瓜。香蕉，菠萝蜜。梨"

# The chunk size is deliberately tiny (3, measured with len) purely for
# demonstration. The separator list is: Chinese full stop, Chinese comma,
# and finally the empty string.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["。", "，", ""],
    length_function=len,
    chunk_size=3,
    chunk_overlap=0,
)
texts = text_splitter.create_documents(texts=[text])
print(texts)


[Document(page_content='枣。瓜'), Document(page_content='。香蕉'), Document(page_content='，菠萝'), Document(page_content='蜜'), Document(page_content='。梨')]


In [4]:
from langchain.text_splitter import NLTKTextSplitter

# One Chinese sentence repeated twice (replace with your own text).
text = (
    "表示两个切分文本之间的重合度，文本块之间的最大重叠量，保留一些重叠可以保持文本块之间的连续性，可以使用滑动窗口进行构造，这个比较重要。"
    * 2
)

# NOTE(review): the recorded output is a single unsplit chunk even though
# chunk_size is 2 — presumably the NLTK sentence tokenizer does not treat
# "。" as a sentence boundary; confirm before relying on it for Chinese text.
text_splitter = NLTKTextSplitter(chunk_size=2, chunk_overlap=0)
docs = text_splitter.split_text(text=text)
print(docs)

['表示两个切分文本之间的重合度，文本块之间的最大重叠量，保留一些重叠可以保持文本块之间的连续性，可以使用滑动窗口进行构造，这个比较重要。表示两个切分文本之间的重合度，文本块之间的最大重叠量，保留一些重叠可以保持文本块之间的连续性，可以使用滑动窗口进行构造，这个比较重要。']


In [None]:
from langchain.text_splitter import SpacyTextSplitter

# Placeholder input — replace with your own text.
text = "..."

# Split with the splitter's default settings; the resulting chunks are kept
# in `docs` and not printed in this cell.
text_splitter = SpacyTextSplitter()
docs = text_splitter.split_text(text=text)

In [None]:
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Build a ModelScope pipeline for the document-segmentation task using the
# Chinese BERT-base segmentation model.
p = pipeline(
    model="damo/nlp_bert_document-segmentation_chinese-base",
    task=Tasks.document_segmentation,
)

# Placeholder document — replace "......" with the text to segment.
result = p(documents="......")

# The segmented text is stored under the OutputKeys.TEXT key of the result.
print(result[OutputKeys.TEXT])

# 4.3 向量数据库

In [7]:
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Placeholder endpoint and credential: replace "xxx" with real values before
# running, and never commit a real key to version control.
# The names are upper-case on purpose: environment-variable names are
# case-sensitive on POSIX systems, so lower-case "openai_api_key" /
# "openai_api_base" would not be seen by code that reads OPENAI_API_KEY /
# OPENAI_API_BASE. (On Windows os.environ upper-cases keys, so this is
# equivalent there.)
os.environ["OPENAI_API_BASE"] = "xxx"
os.environ["OPENAI_API_KEY"] = "xxx"

# Embed two short texts and index them in an in-memory FAISS store.
embeddings = OpenAIEmbeddings()
texts = ["FAISS is an important library", "LangChain supports FAISS"]
faiss = FAISS.from_texts(texts, embeddings)

# Retrieve the 2 nearest entries for the query together with their scores.
# NOTE(review): in the recorded output the text containing "library" has the
# smaller score, so the score is presumably a distance (lower = closer) —
# confirm against the vector-store documentation.
docs_and_scores = faiss.similarity_search_with_score("library", k=2)
print(docs_and_scores)

[(Document(page_content='FAISS is an important library'), 0.38126546), (Document(page_content='LangChain supports FAISS'), 0.5099826)]
