In [None]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.llms import Ollama
from langchain_community.llms import Together
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_together.embeddings import TogetherEmbeddings

In [None]:
# Embedding model; also bound as the default `embeddings` argument of get_vector.
together_embeddings = TogetherEmbeddings(
    model="togethercomputer/m2-bert-80M-2k-retrieval",
)

# Mistral served through Ollama, with a stdout streaming callback handler attached.
mistral = Ollama(
    model="mistral",
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
)

# Prompt with two placeholders: {context} (filled with the stuffed documents)
# and {input} (the user's question).
prompt = ChatPromptTemplate.from_template(
    """Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}"""
)

# Chain that feeds the documents and the question to the model via the prompt.
document_chain = create_stuff_documents_chain(mistral, prompt)

In [None]:
def get_pdf_pages(filepath):
    """Load a PDF and return its pages as split documents.

    Args:
        filepath: Path handed to ``PyPDFLoader``.

    Returns:
        The result of ``load_and_split()``, or ``None`` (after printing a
        message) when that result is empty.
    """
    pdf_pages = PyPDFLoader(filepath).load_and_split()
    if pdf_pages:
        return pdf_pages
    print("Failed to load PDF.")
    return None

def get_documents(doc_type, input):
    """Load source material as a list of LangChain documents.

    Args:
        doc_type: One of ``"text"``, ``"url"`` or ``"file"``.
        input: The raw text, the page URL, or the PDF path, depending on
            ``doc_type``. The name shadows the builtin but is kept so that
            existing keyword callers continue to work.

    Returns:
        A list of documents. For ``"file"`` this is whatever
        ``get_pdf_pages`` returns, which is ``None`` when no pages load.

    Raises:
        AssertionError: If ``doc_type`` is not one of the supported values.
    """
    if doc_type == "text":
        # Wrap the raw string: split_documents / FAISS.from_documents read
        # `.page_content`, which a plain str does not have.
        return [Document(page_content=input)]
    if doc_type == "url":
        # The loader has to be told which URL to fetch, and its loading
        # method is `load()`.
        return WebBaseLoader(input).load()
    if doc_type == "file":
        return get_pdf_pages(input)
    # Raised explicitly rather than via `assert` so the check is not stripped
    # under `python -O`; the exception type is the same as before.
    raise AssertionError(
        f"doc_type must be 'text', 'url' or 'file', got {doc_type!r}"
    )

def set_text_splitter(chunk_size=None, chunk_overlap=None):
    """Build a ``RecursiveCharacterTextSplitter`` with default sizes filled in.

    Args:
        chunk_size: Chunk size passed to the splitter. Any falsy value
            (``None`` or ``0``) falls back to 4000, as before.
        chunk_overlap: Overlap passed to the splitter. Only ``None`` falls
            back to 200, so an explicit ``0`` (no overlap) is honoured.

    Returns:
        The configured ``RecursiveCharacterTextSplitter``.
    """
    # Falsy fallback kept for chunk_size to stay backward-compatible.
    chunk_size = chunk_size or 4000
    # `or` would turn a deliberate 0 into 200, which can exceed a small
    # chunk_size; compare against None instead.
    if chunk_overlap is None:
        chunk_overlap = 200
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

def split_docs(docs, text_splitter):
    """Split documents into chunks and report how many were produced.

    Args:
        docs: Documents passed to ``text_splitter.split_documents``.
        text_splitter: Any object exposing ``split_documents(docs)``.

    Returns:
        The chunks, or ``None`` (after printing a message) when the splitter
        produced nothing.
    """
    chunks = text_splitter.split_documents(docs)
    if chunks:
        print(f'Found {len(chunks)} documents')
        return chunks
    print('No documents found')
    return None

def get_vector(documents, embeddings=together_embeddings):
    """Build a FAISS vector store from documents.

    Args:
        documents: Documents passed to ``FAISS.from_documents``.
        embeddings: Embedding model; defaults to the ``together_embeddings``
            object that exists when this function is defined.

    Returns:
        The object returned by ``FAISS.from_documents``.
    """
    return FAISS.from_documents(documents, embeddings)

def get_retrieval_chain(vector, document_chain):
    """Combine a vector store's retriever with a document chain.

    Args:
        vector: Object exposing ``as_retriever()``.
        document_chain: Chain passed as the second argument to
            ``create_retrieval_chain``.

    Returns:
        The chain returned by ``create_retrieval_chain``.
    """
    return create_retrieval_chain(vector.as_retriever(), document_chain)

def get_response(retrieval_chain, query):
    """Invoke the retrieval chain with a question and return its answer.

    Args:
        retrieval_chain: Object exposing ``invoke(dict)`` that returns a
            mapping with an ``"answer"`` key.
        query: The question, sent under the ``"input"`` key.

    Returns:
        The value stored under ``"answer"`` in the chain's result.
    """
    payload = {"input": query}
    result = retrieval_chain.invoke(payload)
    return result["answer"]

In [None]:
# Scratch experiment: a tiny splitter (3-character chunks, no overlap) to observe
# how short strings get split. The import repeats one already made in the first cell.
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3, chunk_overlap=0)

In [None]:
# Display the documents the current `text_splitter` creates from the single string 'hello'.
text_splitter.create_documents(['hello'])

In [None]:
# NOTE(review): `helpers` is not shown in this notebook. Importing split_docs from it
# rebinds the name and shadows the split_docs defined in an earlier cell — confirm intended.
from helpers import documents_from_text, split_docs

# presumably builds documents from the raw string using `text_splitter`; verify against helpers.
documents = documents_from_text('hello there', text_splitter)

In [None]:
# Display the current value of `documents`.
documents

In [None]:
# Display the documents the current `text_splitter` creates from several input strings.
text_splitter.create_documents(['hello', 'b', 'there', 'a'])

In [None]:
# Second scratch splitter: 1-character chunks, no overlap.
new_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1, chunk_overlap=0)

In [None]:
# Re-split `documents` with the 1-character splitter and display the result.
# NOTE(review): `split_docs` is whichever binding is current; the `from helpers import`
# cell rebinds it over the notebook's own definition — confirm which one is intended.
split_docs(documents, new_text_splitter)