This is the plan:
Input files: transcript.txt, transcript.json
1. Choose chunk size / overlap for the transcript and create a vectorstore
2. Create a function that accepts a query and returns the most relevant section(s) of text
3. Use transcript.json to also return the associated timestamps for the relevant sections
4. Modify the interface to be like chat.py, where the transcript can be "read" and queries typed/pasted, with the outputs being the matching document excerpts

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_together.embeddings import TogetherEmbeddings


from langchain.storage import InMemoryByteStore
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_community.vectorstores import Chroma
from chromadb.config import Settings
from langchain_core.documents import Document

import uuid
from os.path import exists
# import json

from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain.schema import SystemMessage#, AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser

In [None]:
from config import EMBEDDING_CONTEXT_SIZE, CHUNK_SIZE, CHUNK_OVERLAP
from helpers import get_vectorstore, save_vector, save_response_to_markdown_file

In [None]:
from models import get_together_fn_mix, get_claude_opus
# Model used as the generation step of the RAG chain built at the end of this notebook.
ACTIVE_LLM = get_claude_opus()

In [None]:
# Source files to index; add further TextLoader entries to index several files at once.
loaders = [
    TextLoader("example-simple.txt"),
    # TextLoader("example.txt"),
]
# Load every source into one flat list, then cut it into overlapping chunks
# using the chunk size / overlap imported from config.
raw_documents = [document for loader in loaders for document in loader.load()]
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
docs = text_splitter.split_documents(raw_documents)
print(len(docs))

In [None]:
# filename = "paper-embeddings"
# Embedding function handed to the Chroma collections created in the cells below.
embedder = TogetherEmbeddings(model="BAAI/bge-large-en-v1.5")
# if exists(f"vector-dbs/{filename}"):
#     print("Embeddings already exist!")
#     vectorstore = get_vectorstore(embedder=embedder, local_vector_path=filename)
# else:
#     vectorstore = get_vectorstore(embedder=embedder, documents=docs)
#     save_vector(vectorstore, filename)
#     print(f"Embeddings saved to vector-dbs/{filename}")


In [None]:
def create_chroma_vectorstore(embedder, collection_name="test_collection", docs=None):
    """
    Return a persistent Chroma vectorstore kept under chroma-vector-dbs/<collection_name>.

    `docs` is only checked for presence: the documents are NOT inserted into the
    collection by this function. Call `add_documents` on the returned store to
    populate a new collection.

    Args:
        embedder: embedding function passed to Chroma as `embedding_function`.
        collection_name: collection name; also the last component of the persist path.
        docs: documents intended for a new collection. Must not be None when the
            persist directory does not exist yet.

    Returns:
        The Chroma vectorstore bound to the persist directory.

    Raises:
        ValueError: if the persist directory does not exist and `docs` is None.
    """
    persist_dir = f"chroma-vector-dbs/{collection_name}"
    if exists(persist_dir):
        print("Note: Collection seems to already exist! Not adding documents to the collection.")
    elif docs is None:
        raise ValueError("Collection not found. Provide documents to create a new collection")

    client_settings = Settings(anonymized_telemetry=False, is_persistent=True)
    return Chroma(
        collection_name=collection_name,
        embedding_function=embedder,
        persist_directory=persist_dir,
        client_settings=client_settings,
    )

In [None]:
# Build the Chroma handle for "test_collection". Passing `docs` only satisfies the
# existence check; create_chroma_vectorstore does not add them to the collection.
vectorstore = create_chroma_vectorstore(embedder, collection_name="test_collection", docs=docs)

In [None]:
# Give every chunk a fresh random id.
doc_ids = [str(uuid.uuid4()) for _ in docs]
# Pair chunks and ids by position with zip rather than doc_ids.pop(0): popping from the
# front is O(n) per call and would leave doc_ids empty once the loop has finished.
for doc, doc_id in zip(docs, doc_ids):
    # NOTE: this replaces the whole metadata dict, discarding any keys already present.
    # Use doc.metadata["doc_id"] = doc_id instead to keep the existing metadata.
    doc.metadata = {"doc_id": doc_id}
    print(doc.metadata)


In [None]:
# Notebook display: show the chunk list.
docs

In [None]:
# vectorstore.similarity_search("What is the massed repetition?", k=2)
# vectorstore.similarity_search_with_relevance_scores("What is the spacing effect?", k=2)

In [None]:
# # read the file iceAge.json
# import json
# with open('iceAge.json', 'r') as myfile:
#     data=myfile.read()
# # parse file
# obj = json.loads(data)
# transcript = obj["output"]["text"]

# print(transcript)
# # save to transcript.txt
# with open('transcript.txt', 'w') as file:
#     file.write(transcript)

Specific steps:
1. Add document to appropriate folder in repo
2. Set config.py with values for: 
    - document name
    - chunk size
    - overlap
3. Assert that document is present and vectorstore doesn't already exist
4. Create vectorstore using config values
5. Save initial vectorstore
6. Add Q&A or questions as metadata for each page

In [None]:
# # load vectorstore
# # The vectorstore to use to index the child chunks
# vectorstore = Chroma(collection_name="full_documents", embedding_function=embedder,
#                      client_settings= Settings( anonymized_telemetry=False, is_persistent=True, )
# )
# # The storage layer for the parent documents
# store = InMemoryByteStore()
# id_key = "doc_id"
# # The retriever (empty to start)
# retriever = MultiVectorRetriever(
#     vectorstore=vectorstore,
#     byte_store=store,
#     id_key=id_key,
# )
# import uuid

# doc_ids = [str(uuid.uuid4()) for _ in docs]

In [None]:
# # The splitter to use to create smaller chunks
# child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)

In [None]:
# sub_docs = []
# for i, doc in enumerate(docs):
#     _id = doc_ids[i]
#     _sub_docs = child_text_splitter.split_documents([doc])
#     for _doc in _sub_docs:
#         _doc.metadata[id_key] = _id
#     sub_docs.extend(_sub_docs)

In [None]:
# retriever.vectorstore.add_documents(sub_docs)
# retriever.docstore.mset(list(zip(doc_ids, docs)))

In [None]:
# retriever.vectorstore.similarity_search("spaced memory")

In [None]:
# retriever.get_relevant_documents("What is the role of spaced memory?")[0].page_content

In [None]:
# Placeholder data: two synthetic question strings per chunk, numbered by chunk position.
hypothetical_questions: list[list[str]] = []
for i, _chunk in enumerate(docs):
    questions = [
        f"What is example question {i}?",
        f"Is this example question {i}?",
    ]
    hypothetical_questions.append(questions)

In [None]:
# Chroma collection used to index the child entries (here: the hypothetical questions).
chroma_settings = Settings(anonymized_telemetry=False, is_persistent=True)
vectorstore = Chroma(
    collection_name="hypo-questions",
    embedding_function=embedder,
    client_settings=chroma_settings,
)
# Storage layer for the parent documents; nothing is written to it in this cell.
store = InMemoryByteStore()
id_key = "doc_id"
# Retriever over both stores (empty to start); `id_key` names the metadata field
# that carries the parent id.
retriever = MultiVectorRetriever(vectorstore=vectorstore, byte_store=store, id_key=id_key)
# One fresh random id per chunk.
doc_ids = [str(uuid.uuid4()) for _ in docs]

assert len(hypothetical_questions) == len(docs)
# Wrap each question in a Document tagged with the id at the same position as its chunk.
question_docs = []
for i, question_list in enumerate(hypothetical_questions):
    parent_id = doc_ids[i]
    for question in question_list:
        question_docs.append(Document(page_content=question, metadata={id_key: parent_id}))

In [None]:
# Print every chunk, skipping any None entries.
for i in docs:
    if i is None:
        continue
    print(i)


In [None]:
# Print every question document, skipping any None entries.
for i in question_docs:
    if i is None:
        continue
    print(i)

In [None]:
# Sanity check: show which container type `docs` currently is.
type(docs)

In [None]:
# Generate a new random id per chunk.
# NOTE(review): this rebinds `doc_ids`, so the ids that `question_docs` were tagged with
# in the earlier cell are no longer available under this name — confirm that is intended.
doc_ids = [str(uuid.uuid4()) for _ in docs]

In [None]:
# Rebuild each chunk as a new Document that carries only the id metadata.
# Chunks and ids are paired by position with distinct loop names, so `docs` is never
# rebound to a single Document and no leftover `doc` / `i` globals from earlier cells
# are read.
new_docs = [
    Document(page_content=doc.page_content, metadata={id_key: doc_id})
    for doc, doc_id in zip(docs, doc_ids)
]


In [None]:
# Notebook display: show the rebuilt documents.
new_docs

In [None]:
# NOTE(review): only the vectorstore is populated here; the docstore write below is
# disabled, so `store` stays empty in the visible code. Presumably MultiVectorRetriever
# resolves hits to parent documents through the docstore via `id_key` — if so, `retriever`
# will return nothing until the parents are stored. Confirm.
# NOTE(review): `question_docs` and `new_docs` built above are never added to any store —
# verify that `docs` is really the intended input.
retriever.vectorstore.add_documents(docs)
# retriever.docstore.mset(list(zip(doc_ids, docs)))

In [None]:
# Query the underlying vectorstore directly (not through `retriever`), asking for k=2 hits.
retriever.vectorstore.similarity_search("What is massed repetition?", k=2)

In [None]:
def print_docs(_dict):
    """
    Collapse the retrieved documents into one context string, print it, and pass it on.

    Args:
        _dict: mapping whose "context" entry is an iterable of objects exposing
            `page_content`. It is modified in place.

    Returns:
        The same mapping, with "context" replaced by the page contents joined
        with blank lines.
    """
    joined_context = "\n\n".join(item.page_content for item in _dict["context"])
    _dict["context"] = joined_context
    print(joined_context)
    return _dict
# Prompt: a fixed system message followed by the context/question template.
rag_template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
system_message = SystemMessage(
    content="You are an expert AI. Answer to the best of your ability, following the instructions."
)
prompt = ChatPromptTemplate.from_template(rag_template)
prompt.messages.insert(0, system_message)

# Input mapping: the retriever supplies "context", the raw input is passed through as "question".
retrieved_docs = {"context": retriever, "question": RunnablePassthrough()}
# Pipeline: retrieve -> join and print the context -> fill the prompt -> LLM -> plain string.
chain = (
    retrieved_docs
    | RunnableLambda(print_docs)
    | prompt
    | ACTIVE_LLM
    | StrOutputParser()
)

In [None]:
# Run the whole chain on a sample question; print_docs prints the joined context on the way.
chain.invoke("What is spaced practice and massed practice?")