In [36]:
# !pip install langchain # 0.0.333
# !pip install sentence-transformers # 2.2.2
# !pip install faiss-cpu # 1.7.4

# Import libraries

In [3]:
import faiss
import numpy as np
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Load, split, embed and store data

In [4]:
loader = TextLoader("data/Transcript Otter - A1.txt")
documents = loader.load()

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 600,
    chunk_overlap  = 200,
    length_function = len,
    add_start_index = True,
)

split_documents = text_splitter.split_documents(documents)

for doc in split_documents:
    doc.metadata["user_id"] = 1
print(len(split_documents))
print(split_documents[0:2])

151
[Document(page_content="Unknown Speaker  1:19\nHi Can you hear me\n\nUnknown Speaker  1:37\nI can't hear you could be me. Hang on\n\nUnknown Speaker  1:56\nknow let me\n\nSpeaker 1  2:07\nknow Hello.\n\nSpeaker 2  2:13\nI can hear you now that's fine I think it was my yes my speakers weren't working for some reason I think it's just been plugged in upstairs and I unplug it from my computer upstairs it kind of doesn't like it and messes around a little bit.\n\nSpeaker 1  2:25\nThat's right, Mike, how are you? Yeah, good. Thank\n\nUnknown Speaker  2:28\nyou. Oh by itself.\n\nSpeaker 1  2:30\nYes. Not bad at all about all good stuff.", metadata={'source': 'data/Transcript Otter - A1.txt', 'start_index': 0, 'user_id': 1}), Document(page_content="Speaker 1  2:25\nThat's right, Mike, how are you? Yeah, good. Thank\n\nUnknown Speaker  2:28\nyou. Oh by itself.\n\nSpeaker 1  2:30\nYes. Not bad at all about all good stuff.\n\nUnknown Speaker  2:33\nHave you been up to my show?\n\nSpeaker 1  

In [6]:
embedding_model = HuggingFaceEmbeddings(model_name="nreimers/MiniLM-L6-H384-uncased")
embedding_model

No sentence-transformers model found with name /Users/PareshSharma/.cache/torch/sentence_transformers/nreimers_MiniLM-L6-H384-uncased. Creating a new one with MEAN pooling.


HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
), model_name='nreimers/MiniLM-L6-H384-uncased', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False)

In [7]:
db = FAISS.from_documents(split_documents, embedding_model)
db.save_local("data/vector_store")

# Load vector store

In [8]:
new_db = FAISS.load_local("data/vector_store", embedding_model)

# Find k neighbout documents given query and user id

In [12]:
new_db.similarity_search_with_score("how generous I'm feeling at", k=1, filter=dict(user_id=1))

[(Document(page_content="Speaker 1  36:06\nYes. So I might leave a little bit to my niece and nephews I don't know see how generous I'm feeling at 90", metadata={'source': 'data/Transcript Otter - A1.txt', 'start_index': 31591, 'user_id': 1}),
  15.647223)]

In [13]:
new_db.as_retriever(search_kwargs={'filter': {'user_id':1}, 'k': 1})

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain.vectorstores.faiss.FAISS object at 0x12a1f7100>, search_kwargs={'filter': {'user_id': 1}, 'k': 1})

# Other queries

In [15]:
# see all documents in vectors store
new_db.docstore._dict

# get length of chunked documents in vector store 
len(new_db.docstore._dict)

# add documents for a new user
split_documents = text_splitter.split_documents(documents)

for doc in split_documents:
    doc.metadata["user_id"] = 2
print(len(split_documents))
print(split_documents[0:2])


# new_db.add_documents(split_documents)

151
[Document(page_content="Unknown Speaker  1:19\nHi Can you hear me\n\nUnknown Speaker  1:37\nI can't hear you could be me. Hang on\n\nUnknown Speaker  1:56\nknow let me\n\nSpeaker 1  2:07\nknow Hello.\n\nSpeaker 2  2:13\nI can hear you now that's fine I think it was my yes my speakers weren't working for some reason I think it's just been plugged in upstairs and I unplug it from my computer upstairs it kind of doesn't like it and messes around a little bit.\n\nSpeaker 1  2:25\nThat's right, Mike, how are you? Yeah, good. Thank\n\nUnknown Speaker  2:28\nyou. Oh by itself.\n\nSpeaker 1  2:30\nYes. Not bad at all about all good stuff.", metadata={'source': 'data/Transcript Otter - A1.txt', 'start_index': 0, 'user_id': 2}), Document(page_content="Speaker 1  2:25\nThat's right, Mike, how are you? Yeah, good. Thank\n\nUnknown Speaker  2:28\nyou. Oh by itself.\n\nSpeaker 1  2:30\nYes. Not bad at all about all good stuff.\n\nUnknown Speaker  2:33\nHave you been up to my show?\n\nSpeaker 1  

In [16]:
# get the document given the vector id
index = faiss.read_index("data/vector_store/index.faiss")
vector_id = 42 # Get the vector ID to lookup 
vector = index.reconstruct(vector_id) # Reconstruct just the single vector for that ID
vector_np = np.array([vector]) # Convert to numpy array
# Search the index for nearest neighbor of the vector
# This will return the original row for that vector ID
_, I = index.search(vector_np, 1) 
row_id = I[0][0] # Get the row ID from the search results
row_data = split_documents[row_id] # Now lookup the row data using the row ID
row_data

Document(page_content="And you basically need to get another 22,000 pounds from some from somewhere. So effectively that monies then will then be subject to income tax. But you've still got your black, you know, Blackrock pension as well. And you've not taken you haven't taken your tax free cash from there yet have you know, so I think what was the is the real value of that plot about 100k Is it", metadata={'source': 'data/Transcript Otter - A1.txt', 'start_index': 13147, 'user_id': 2})

In [17]:
index.ntotal

151

In [18]:
# get all vectors back
index = faiss.read_index("data/vector_store/index.faiss")
vectors = index.reconstruct_n(0, index.ntotal)
vectors.shape

(151, 384)

# Model

In [9]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import pipeline
from langchain.prompts import PromptTemplate
from langchain.document_transformers.long_context_reorder import LongContextReorder
from langchain.retrievers.document_compressors.base import DocumentCompressorPipeline
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever


In [10]:
model_id = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device_map="auto", model_kwargs={"max_new_tokens": 50})
llm = HuggingFacePipeline(pipeline=pipe)

In [11]:
retriever = new_db.as_retriever(search_kwargs={'filter': {'user_id': 1}, 'k': 5})
memory = ConversationBufferMemory(
    llm=llm, memory_key="chat_history", return_messages=True, output_key='answer'
)


template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say "This wasn't discussed in the call.", don't try to make up an answer. 
Use three sentences maximum and keep the answer as concise as possible. 
Chat History: {chat_history}
{context}
Question: {question}
Helpful Answer:"""
prompt = PromptTemplate(template=template, input_variables=['chat_history', 'context', 'question'])


reordering = LongContextReorder()

pipeline_compressor = DocumentCompressorPipeline(
    transformers=[
        reordering
    ]
)

compression_retriever = ContextualCompressionRetriever(base_compressor=pipeline_compressor, base_retriever=retriever)

qa = ConversationalRetrievalChain.from_llm(llm=llm, retriever=compression_retriever, memory=memory, return_source_documents=True, combine_docs_chain_kwargs={"prompt": prompt})

In [12]:

query = "What age might they leave something to their nephews and nieces?"
results = qa({"question": query})
results



{'question': 'What age might they leave something to their nephews and nieces?',
 'chat_history': [HumanMessage(content='What age might they leave something to their nephews and nieces?'),
  AIMessage(content='90.')],
 'answer': '90.',
 'source_documents': [Document(page_content="Speaker 1  36:06\nYes. So I might leave a little bit to my niece and nephews I don't know see how generous I'm feeling at 90", metadata={'source': 'data/Transcript Otter - A1.txt', 'start_index': 31591, 'user_id': 1}),
  Document(page_content="guys and see what kind of situation would be around that. And I think obviously for you know, for your kind of kind of stage at the moment, he probably wouldn't if you can do it without him or it's obviously the best case scenario.", metadata={'source': 'data/Transcript Otter - A1.txt', 'start_index': 18049, 'user_id': 1}),
  Document(page_content="Speaker 1  28:52\nI put, yeah, purchase it for 60. But the site fee was 2000 for this year, but I don't know if actual carav

In [None]:
# text2text generation - google model - xl 
# table of other models tried - question-answering, text-generation

# QA retrival
# conversational retrival with history from memory, long context retrieval: https://github.com/langchain-ai/langchain/issues/10834
# feed in prompt to conversational retriver
# prompt template - message if error, answer, question, context, chat_history

# save model and tokenizer and embeddings
# retriever, 
# while loop for other code, 
# know how to reset memory
# evaluation with ragas

tokenizer.max_length



# Optimisation
# Chunk size 
# model
# embeddings
# tokenizer
# chain type
# search type
# splitting function
# vector db



In [None]:
# TO DO
# To get original document back look into retrievers in langchain
# can use hugging face tokenizer for splitter? Increase chunk size, use tokens
# what are models like misteral and llma used for? Can I get embeddings?
# Will have to see if when we retrieve we can filter on the documents by user id, if not may need to use logic above to read in index (use langchain search feature to work this out)

In [44]:
# https://www.sbert.net/docs/pretrained_models.html
# popular sentence transformer and high performing: sentence-transformers/all-mpnet-base-v2
# High performing and trained on QA dataset: sentence-transformers/multi-qa-mpnet-base-dot-v1
# Smaller sentence embedding model 80MB: nreimers/MiniLM-L6-H384-uncased
# Smaller sentence embedding model 290MB: sentence-transformers/all-distilroberta-v1
# Popular QA model: deepset/roberta-base-squad2



# vector store - chose FAISS as it is open source. For production use case weaviate or pinecone could also be considered.

In [None]:
# look at previous langchain approach
# look at example RAG systems online


# load data into vector db with user id

# Loader for data or textloader

# splitting
# embed data
# add document to vector db