In [24]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
import numpy as np

# Embedding model shared by the whole notebook: the same object is handed to
# the FAISS store when it is built and again when it is reloaded from disk.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [2]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

# Size the flat L2 index from the length of one embedded probe string, so the
# index dimension always matches whatever the embedding model produces.
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

# Start from an empty store: a fresh in-memory docstore and an empty
# index-position -> docstore-id mapping.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    index_to_docstore_id={},
    docstore=InMemoryDocstore(),
)

In [10]:
# Chunking configuration: target 200 characters per chunk (measured with
# len) and carry 20 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
    # Must be the boolean False. The string 'False' is non-empty and therefore
    # truthy, which would ask the splitter to treat separators as regexes.
    is_separator_regex=False,
)

# Read the whole knowledge base into memory as UTF-8 text.
with open('knowledgeBase1.txt', mode='r', encoding='utf-8') as file:
    text = file.read()

# create_documents takes a list of strings; here a single string goes in and
# a list of chunk documents comes out.
texts = text_splitter.create_documents([text])

len(texts)

248

In [22]:

from uuid import uuid4

# One random UUID string per chunk, used as that chunk's id in the store.
uuids = [str(uuid4()) for _chunk in texts]

# NOTE(review): re-running this cell appears to add the same chunks again
# under fresh ids -- confirm before re-executing on a populated store.
vector_store.add_documents(ids=uuids, documents=texts)


['1bdb10f8-ae91-4827-9898-fc51650ba3fc',
 '741a472e-9f0d-49c9-9355-3189ea7ac6e8',
 '32332cbf-8d80-41d4-aa29-eddaa830bbb6',
 'a8caf696-8db1-47db-96dd-1b726ce9b4c9',
 '161e475c-5534-4639-96df-238e1aadc62d',
 'c9233699-3657-45d7-895d-2a0a08f44576',
 '70a9c52f-73ca-4b4f-9a2c-034bbbb04c3a',
 '720be2f0-f475-4925-94c1-521f95db680f',
 '2140d54b-8119-4777-8d29-f4969e5cdd68',
 'd5d785f5-a4f0-42bb-9037-6cb4fbae1fed',
 'a7e6e567-0094-452e-872e-3b1d576f724f',
 '5a185fe7-5f39-4674-8c55-20d986470e41',
 '9bebb361-d357-4a71-8d97-69916bce51eb',
 '2098621e-727d-4f66-be35-773757cbfb4f',
 'dae24770-5324-46be-938a-d9e179e90ded',
 '30be720d-75e6-47c0-8d3a-9300e470987f',
 '73d9c284-bf18-40ca-b2b1-e341a5f0e141',
 '4342c4e6-d822-411e-be2a-9f55898afcfe',
 '6b3173bd-eed3-43bd-9b20-9df20c1c2918',
 '043c6a2d-e3dd-42e7-83b0-c4301746ec95',
 'd60bdf33-4dc0-4c1b-a077-eca064c80147',
 'b6351f18-6d9c-403a-a39a-5ca60e2f7001',
 '4e903e36-ddc9-46d4-a652-e1f793aedd3e',
 '940eebe3-fa75-4826-a001-53f6078742b8',
 '304767ba-895e-

In [30]:
# Round-trip the store through disk, then query the reloaded copy.
INDEX_DIR = "faiss_index"
vector_store.save_local(INDEX_DIR)

# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# serialized data that is presumably pickle-based -- acceptable only because
# this notebook wrote the files itself; never point it at untrusted data.
new_vector_store = FAISS.load_local(
    INDEX_DIR,
    embeddings,
    allow_dangerous_deserialization=True,
)

# NOTE(review): "qux" looks like a placeholder query -- replace with a real one.
docs = new_vector_store.similarity_search("qux")
docs[0]

Document(id='d39bb45c-5b29-4e46-97f9-7bc0bdd732fc', metadata={}, page_content='upon 2014 seq2seq technology,[10] and was based mainly on the attention mechanism developed by Bahdanau et al. in 2014.[11] The following year in 2018, BERT was introduced and quickly became')