In [None]:
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryByteStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_together.embeddings import TogetherEmbeddings

from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain_core.output_parsers import JsonOutputParser

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
import uuid

from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.schema import SystemMessage, AIMessage, HumanMessage

In [None]:
# MODELS
# ACTIVE_LLM is the chat model that both function-calling chains below bind to.
# It is built by a helper from the `models` module (presumably project-local).
from models import get_together_fn_mix
# Alternative OpenAI backend, currently disabled:
# GPT_4_LLM = ChatOpenAI(max_retries=0, model="gpt-4")
ACTIVE_LLM = get_together_fn_mix()

In [None]:
# Text sources to index; append further TextLoader entries to widen the corpus.
loaders = [TextLoader("sample.txt")]

# Read every source into one flat list of documents.
raw_docs = [document for source in loaders for document in source.load()]

# Split the loaded documents with chunk_size=10000 and report the chunk count.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
docs = text_splitter.split_documents(raw_docs)
print(len(docs))

In [None]:
# Display the chunks produced by the splitter.
# NOTE(review): this renders every chunk in full; slice it if the output gets long.
docs

In [None]:
# Function-calling schema for the hypothetical-question chain: a single
# function whose only (required) argument, "questions", is an array of strings.
_question_list_schema = {"type": "array", "items": {"type": "string"}}

functions = [
    dict(
        name="hypothetical_questions",
        description="Generate hypothetical questions",
        parameters=dict(
            type="object",
            properties=dict(questions=_question_list_schema),
            required=["questions"],
        ),
    )
]

In [None]:

# Prompt asking for three questions the document could answer; the count lives
# in the prompt text and can be adjusted there.
_hypothetical_prompt = ChatPromptTemplate.from_template(
    "Generate a list of exactly 3 hypothetical questions that the below document could be used to answer:\n\n{doc}"
)

# Bind the schema and force the model to call `hypothetical_questions`.
_question_llm = ACTIVE_LLM.bind(
    functions=functions, function_call={"name": "hypothetical_questions"}
)

# Pipeline: document -> {"doc": text} -> prompt -> model -> parsed JSON.
chain = (
    {"doc": lambda document: document.page_content}
    | _hypothetical_prompt
    | _question_llm
    | JsonOutputParser()
)

In [None]:
# Run the hypothetical-question chain on the first chunk only.
response = chain.invoke(docs[0])

In [None]:
# The generated questions sit under the first entry's "arguments"; list them
# one per line.
first_call = response[0]
questions = first_call["arguments"]["questions"]
for question in questions:
    print(question)

In [None]:
# Tavily web-search tool, constructed with its default settings.
# NOTE(review): presumably needs Tavily API credentials in the environment — confirm.
search = TavilySearchResults()

In [None]:
# Try the search tool with a sample query; the `site:niche.com` suffix is part
# of the query text (presumably to restrict hits to niche.com).
search.invoke("What is the current ranking of Harvard University? site:niche.com")

In [None]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS

# Embedding model used through TogetherEmbeddings.
# Candidates noted while choosing it:
#   togethercomputer/m2-bert-80M-2k-retrieval
#   WhereIsAI/UAE-Large-V1    326M  1024  512
#   BAAI/bge-large-en-v1.5    326M  1024  512
# (columns are presumably parameter count, embedding size, max tokens — unverified)
model = "BAAI/bge-large-en-v1.5"
embedder = TogetherEmbeddings(model=model)

# Load the LCEL "multiple chains" cookbook page. The result gets its own name
# so it does not overwrite `docs`, which still holds the sample.txt chunks that
# the hypothetical-question cells read; reusing the name made re-running those
# cells silently operate on the web page instead.
loader = WebBaseLoader("https://python.langchain.com/docs/expression_language/cookbook/multiple_chains")
web_docs = loader.load()

# Split the page into overlapping chunks for retrieval.
documents = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200
).split_documents(web_docs)

# Index the chunks in FAISS and expose the index as a retriever.
vector = FAISS.from_documents(documents, embedder)
retriever = vector.as_retriever()

In [None]:
# Query the vector store behind the retriever directly with a sample question.
retriever.vectorstore.similarity_search("How do I make an LCEL chain?")

In [None]:
# Fetch the documents most relevant to the query. `invoke` is the Runnable
# entry point for retrievers and replaces the deprecated
# `get_relevant_documents`; it matches how the tools and chains are called.
res_docs = retriever.invoke("How do I string together multiple chains?")

In [None]:
# Show the fourth retrieved document (index 3).
# NOTE(review): raises IndexError if fewer than four documents came back.
res_docs[3]

In [None]:
from langchain.tools.retriever import create_retriever_tool

# Expose the retriever as an agent tool. The retriever indexes the LCEL
# "multiple chains" cookbook page, so the tool's name and description must
# advertise LCEL content; the earlier LangSmith wording would have steered an
# agent to this index for questions it cannot answer.
retriever_tool = create_retriever_tool(
    retriever,
    "lcel_docs_search",
    "Search for information about the LangChain Expression Language (LCEL), such as composing multiple chains. For any questions about LCEL, you must use this tool!",
)

# Tools available to an agent: live web search plus the LCEL docs retriever.
tools = [search, retriever_tool]

In [None]:
# Function-calling schema for the college search chain. The generated search
# strings are returned under the required "questions" array.
_search_query_parameters = {
    "type": "object",
    "properties": {"questions": {"type": "array", "items": {"type": "string"}}},
    "required": ["questions"],
}

_search_query_function = {
    "name": "search_query",
    "description": "Generate 3 searches for the given college with the suffix site:niche.com",
    "parameters": _search_query_parameters,
}

college_functions = [_search_query_function]

In [None]:

# Prompt that turns a college name into three niche.com-suffixed queries.
_college_prompt = ChatPromptTemplate.from_template(
    "Generate a list of exactly 3 queries for {college} suffixed by site:niche.com"
)

# Bind the schema and force the model to call `search_query`.
_college_llm = ACTIVE_LLM.bind(
    functions=college_functions, function_call={"name": "search_query"}
)

# Pipeline: {"college": name} -> prompt -> model -> parsed JSON.
college_chain = _college_prompt | _college_llm | JsonOutputParser()

In [None]:
# Generate the niche.com search queries for one example college.
# NOTE: this rebinds `response`, replacing the hypothetical-question result
# stored under the same name earlier in the notebook.
response = college_chain.invoke({"college": "Carnegie Mellon University"})

In [None]:
# Run only the first generated query through the search tool.
questions = response[0]['arguments']['questions']

# Fail early, with an accurate message, when the model produced no queries.
# Previously this case surfaced later as "No results found" even though no
# search had been attempted.
assert questions, "The model returned no search queries"

# Direct indexing replaces a counter loop that always stopped after one pass.
res = search.invoke(questions[0])
assert res is not None, "No results found"
res

In [None]:
# Print the text of the fifth search hit (index 4).
# NOTE(review): assumes the search returned at least five results, each with
# a 'content' key — confirm.
print(res[4]['content'])

In [None]:
# Run a hand-written niche.com query through the same search tool.
results = search.invoke("Carnegie Mellon admissions site:niche.com")

In [None]:
# Show the text of the fifth hit (index 4) from the hand-written query.
# NOTE(review): assumes at least five results were returned — confirm.
results[4]['content']

In [None]:
# Query the retriever through `invoke`, the Runnable entry point that replaces
# the deprecated `get_relevant_documents`.
retriever.invoke("How would I create an agent in LangChain?")