In [36]:
# %pip install langchain==0.0.333
# %pip install sentence-transformers==2.2.2
# %pip install faiss-cpu==1.7.4

# Import libraries

In [180]:
import faiss
import numpy as np
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Load, split, embed and store data

In [181]:
# Read the transcript text file from disk into a list of LangChain documents.
transcript_path = "data/Transcript Otter - A1.txt"
loader = TextLoader(transcript_path)
documents = loader.load()

In [182]:
# Split the transcript into small overlapping chunks. Lengths are measured with
# len, i.e. in characters, so each chunk is at most 100 characters long.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    add_start_index=True,  # chunks carry a 'start_index' metadata entry (see output)
)

split_documents = text_splitter.split_documents(documents)

# Tag every chunk with its owning user so searches can later be filtered by user_id.
for chunk in split_documents:
    chunk.metadata["user_id"] = 1

# Quick sanity check: number of chunks and a sample of the first two.
print(len(split_documents))
print(split_documents[0:2])

672
[Document(page_content='Unknown Speaker  1:19  \nHi Can you hear me', metadata={'source': 'data/Transcript Otter - A1.txt', 'start_index': 0, 'user_id': 1}), Document(page_content="Unknown Speaker  1:37  \nI can't hear you could be me. Hang on\n\nUnknown Speaker  1:56  \nknow let me", metadata={'source': 'data/Transcript Otter - A1.txt', 'start_index': 44, 'user_id': 1})]


In [184]:
# Embedding model used to vectorise the chunks (384-dimensional output, see repr below).
# NOTE(review): the message logged under this cell says no sentence-transformers
# model was found under this name and one was created with MEAN pooling — confirm
# this checkpoint is actually intended for sentence embeddings.
embedding_model_name = "nreimers/MiniLM-L6-H384-uncased"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
embedding_model

No sentence-transformers model found with name /Users/PareshSharma/.cache/torch/sentence_transformers/nreimers_MiniLM-L6-H384-uncased. Creating a new one with MEAN pooling.


HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
), model_name='nreimers/MiniLM-L6-H384-uncased', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False)

In [185]:
# Embed every chunk, build a FAISS vector store from the result, and write the
# store to disk so it can be reloaded later without re-embedding.
vector_store_dir = "data/vector_store"
db = FAISS.from_documents(split_documents, embedding_model)
db.save_local(vector_store_dir)

# Load vector store

In [170]:
# Reload the persisted store; the same embedding model is passed so that queries
# are embedded the same way as the stored chunks.
# NOTE(review): presumably the store is persisted with pickle alongside the FAISS
# index — only load directories this notebook wrote itself; confirm.
new_db = FAISS.load_local(
    folder_path="data/vector_store",
    embeddings=embedding_model,
)

# Find the k nearest-neighbour documents for a given query and user id

In [189]:
# Return the single closest chunk (k=1) to the query, restricted to chunks whose
# metadata has user_id == 1. Each result is a (Document, score) pair.
query = "how generous I'm feeling at"
new_db.similarity_search_with_score(query, k=1, filter={"user_id": 1})

[(Document(page_content="how generous I'm feeling at 90", metadata={'source': 'data/Transcript Otter - A1.txt', 'start_index': 31884, 'user_id': 1}),
  2.940438)]

In [198]:
# Wrap the store as a retriever that applies the same user_id filter and k=1 on
# every search.
retriever_search_kwargs = {"filter": {"user_id": 1}, "k": 1}
new_db.as_retriever(search_kwargs=retriever_search_kwargs)

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain.vectorstores.faiss.FAISS object at 0x2bb21d720>, search_kwargs={'filter': {'user_id': 1}, 'k': 1})

# Other queries

In [186]:
# Inspect the documents held by the vector store. `_dict` is a private attribute
# of the docstore, so this is for exploration only.
# (presumably maps docstore id -> Document; verify against the LangChain version in use)
docstore_entries = new_db.docstore._dict

# Number of chunked documents currently in the vector store. A bare expression in
# the middle of a cell is not displayed, so print it explicitly.
print(len(docstore_entries))

# Add the same transcript again for a second user. A separate name is used so the
# user-1 chunks in `split_documents` are not overwritten; other cells index into
# that list and expect it to still hold the user-1 chunks.
split_documents_user2 = text_splitter.split_documents(documents)

for chunk in split_documents_user2:
    chunk.metadata["user_id"] = 2
print(len(split_documents_user2))
print(split_documents_user2[0:2])

# NOTE: this mutates `new_db` in memory only (the saved store on disk is untouched)
# and is not idempotent: re-running the cell adds the user-2 chunks again.
new_db.add_documents(split_documents_user2)

[(Document(page_content="how generous I'm feeling at 90", metadata={'source': 'data/Transcript Otter - A1.txt', 'start_index': 31884, 'user_id': 1}),
  2.940438)]

In [191]:
# get the document given the vector id
index = faiss.read_index("data/vector_store/index.faiss")
vector_id = 42 # Get the vector ID to lookup 
vector = index.reconstruct(vector_id) # Reconstruct just the single vector for that ID
vector_np = np.array([vector]) # Convert to numpy array
# Search the index for nearest neighbor of the vector
# This will return the original row for that vector ID
_, I = index.search(vector_np, 1) 
row_id = I[0][0] # Get the row ID from the search results
row_data = split_documents[row_id] # Now lookup the row data using the row ID
row_data

Document(page_content='it. Yeah, I will say it in my behalf a bit like my dogs fight just like the dogs. They just need to', metadata={'source': 'data/Transcript Otter - A1.txt', 'start_index': 2565, 'user_id': 1})

In [193]:
# Number of vectors held by the FAISS index that was read from disk.
index.ntotal

672

In [143]:
# get all vectors back
index = faiss.read_index("data/vector_store/index.faiss")
vectors = index.reconstruct_n(0, index.ntotal)
vectors.shape

(672, 384)

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain.vectorstores.faiss.FAISS object at 0x2bb21d720>, search_kwargs={'filter': {'user_id': 1}})

In [None]:
# TODO
# To get the original document back, look into retrievers in LangChain.
# Could a Hugging Face tokenizer be used for the splitter? Increase the chunk size and measure length in tokens.
# What are models like Mistral and Llama used for? Can I get embeddings from them?
# Need to check whether retrieval can filter documents by user id; if not, may need to use the logic above to read in the index directly (use the LangChain search feature to work this out).

In [44]:
# https://www.sbert.net/docs/pretrained_models.html
# popular sentence transformer and high performing: sentence-transformers/all-mpnet-base-v2
# High performing and trained on QA dataset: sentence-transformers/multi-qa-mpnet-base-dot-v1
# Smaller sentence embedding model 80MB: nreimers/MiniLM-L6-H384-uncased
# Smaller sentence embedding model 290MB: sentence-transformers/all-distilroberta-v1
# Popular QA model: deepset/roberta-base-squad2



# Vector store: chose FAISS because it is open source. For a production use case, Weaviate or Pinecone could also be considered.

In [None]:
# look at previous langchain approach
# look at example RAG systems online


# load data into vector db with user id

# Loader for data or textloader

# splitting
# embed data
# add document to vector db