This is an 8-step process to display a goal value
1. Get prompt as string
2. Chain1: LCEL that guarantees a schema that fits step 3, or fails
3. Perform Tavily query
4. Chain2: LCEL that extracts most relevant url
5. Load documents into a docs object
6. Embed docs as a vectorstore and create a retriever
7. Chain3: LCEL that guarantees a schema that fits step 8
8. Display the goal value

For this task, the prompt is a specific question about a given college. The goal value is taken from the highest-quality publicly available source: the Common Data Set officially released by the university.

In [None]:
"""
This is an 8-step process to display a goal value
1. Get prompt as string
2. Chain1: LCEL that guarantees a schema that fits step 3, or fails
3. Perform Tavily query
4. Chain2: LCEL that extracts most relevant url
5. Load documents into a docs object
6. Embed docs as a vectorstore and create a retriever
7. Chain3: LCEL that guarantees a schema that fits step 8
8. Display the goal value
"""

In [None]:
# Import dependencies
from langchain_community.utils.openai_functions import (
    convert_pydantic_to_openai_function,
)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser
from langchain_community.tools.tavily_search import TavilySearchResults
# from langchain.tools import DuckDuckGoSearchRun
# from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader

In [None]:
# Import model
# get_together_fn_mix comes from the project-local `models` module. The object it
# returns is later bound with function-calling arguments via `.bind(functions=...)`
# when the chains are built.
from models import get_together_fn_mix
ACTIVE_LLM = get_together_fn_mix()

# Step 1:

In [None]:
# Step 1:
# Get prompt as string
# Raw natural-language question; it is passed to chain1 as the "input" variable.
USER_QUERY = "What are admissions rates for Santa Clara University?"

In [None]:
# Chat prompt shared by the chains: a fixed system message plus the caller's text
# substituted into the "{input}" placeholder.
PROMPT = ChatPromptTemplate.from_messages(
    [("system", "You are a helpful assistant"), ("user", "{input}")]
)
# Applied to `response.content` after each chain call to turn the model's text
# output into Python objects.
PARSER = JsonOutputParser()

# Step 2:

In [None]:
# Step 2:
# Chain1: LCEL that guarantees a schema that fits step 3, or fails
# Function-calling schema for chain1.
# NOTE(review): the class docstring and the Field descriptions are presumably
# forwarded to the model as the function/parameter descriptions by
# convert_pydantic_to_openai_function, so treat them as runtime text rather than
# ordinary documentation -- confirm before rewording them.
class SearchCollege(BaseModel):
    """Construct a web search for a college"""
    # Search-engine query text proposed by the model; the visible code never reads it.
    query: str = Field(description="Optimized query for search engine, including college name")
    # College name; an empty string is the sentinel that is_chain1_valid rejects.
    college: str = Field(description="Name of the college. Empty string if not specified")
def is_chain1_valid(response_object):
    """Return True when chain1's parsed output names a college.

    Args:
        response_object: Parsed model output. The expected shape is a
            non-empty sequence whose first item is a mapping holding an
            ``"arguments"`` mapping with a ``"college"`` string.

    Returns:
        bool: ``True`` only if the college value is a non-blank string.
        Any other shape (``None``, empty list, missing keys, non-string
        college) is reported as invalid instead of raising.
    """
    try:
        college = response_object[0]['arguments']['college']
    except (IndexError, KeyError, TypeError):
        # Malformed model output is a validation failure, not a crash.
        return False
    # A non-string or whitespace-only name cannot produce a usable search query.
    return isinstance(college, str) and college.strip() != ''

# Convert the SearchCollege schema into a function spec and bind it to the model.
# function_call names "SearchCollege" explicitly, requesting that specific function
# rather than leaving the choice to the model.
college_functions = [convert_pydantic_to_openai_function(SearchCollege)]
chain1 = PROMPT | ACTIVE_LLM.bind(functions=college_functions, function_call={"name": "SearchCollege"})
print("Successfully constructed chain1")

In [None]:
# Call chain1
chain1_prompt = USER_QUERY
response = chain1.invoke({"input": chain1_prompt})
# NOTE(review): this assumes the active model returns the function call as JSON
# in the message's text content -- confirm for the configured model.
response_object = PARSER.parse(response.content)
# Stop here if the model did not name a college; later steps index into this object.
if not is_chain1_valid(response_object):
    raise ValueError("Chain1 failed")
# print(response_object)

# Step 3:

In [None]:
# Step 3:
# Perform Tavily query
# Search tool instance, configured with max_results=5; invoked later with the final query.
search = TavilySearchResults(max_results=5)
def get_tavily_query(response_object, year=2023):
    """Build the search query for a college's Common Data Set PDF.

    Args:
        response_object: Parsed chain1 output; must pass ``is_chain1_valid``.
        year: Common Data Set year to search for. Defaults to 2023, the
            value that was previously hard-coded.

    Returns:
        str: ``"<college> Common Data Set <year> filetype:pdf"``.

    Raises:
        ValueError: If ``response_object`` does not pass ``is_chain1_valid``.
    """
    # Raise explicitly instead of using assert: asserts are removed under
    # ``python -O``, which would let an invalid object reach the lookup below.
    if not is_chain1_valid(response_object):
        raise ValueError("Chain1 output is not valid")
    college = response_object[0]['arguments']['college']
    return f"{college} Common Data Set {year} filetype:pdf"

In [None]:
# Finalize Tavily query
print(response_object)
final_query = get_tavily_query(response_object)
print(final_query)
# Keep the college name in its own variable: `response_object` is reassigned
# when chain2's output is parsed, so the name would otherwise be lost.
COLLEGE_NAME = response_object[0]['arguments']['college']

In [None]:
# Call Tavily search
results = search.invoke(final_query)
# The Tavily tool can report a failed request by returning the error text as a
# string instead of raising. Surface that here rather than letting the loop
# below fail with an unrelated "string indices" TypeError.
if isinstance(results, str):
    raise RuntimeError(f"Tavily search failed: {results}")
# Report the actual number of hits; fewer than max_results may come back.
print(f"Top {len(results)} results from Tavily:")
for res in results:
    print(res["url"])

# Step 4:

In [None]:
# Step 4:
# Extract relevant url(s)
def format_url_choice_string(urls, college):
    """Render the numbered URL menu that chain2 is asked to choose from.

    The first line names the college; every following line has the form
    ``"<index>: <url>"``, using zero-based indices in the order of ``urls``.
    """
    header = f"Choose the most relevant url index for {college}: \n"
    numbered = (f"{position}: {link}" for position, link in enumerate(urls))
    return header + "\n".join(numbered)
# Defaults are set before any check so both names exist even if a check raises.
chain2 = None
chain2_needed = False
if not results:
    raise ValueError("No results found")
urls = [hit['url'] for hit in results]
assert len(urls) > 0, "No urls found"

# A single candidate needs no model call; otherwise build the selector chain.
chain2_needed = len(urls) != 1
if chain2_needed:
    print('Selecting the best url using chain2')

    class selectUrl(BaseModel):
        """Choose the most relevant url index for the given college."""
        index: int = Field(description="The index of the url to select")

    select_url_functions = [convert_pydantic_to_openai_function(selectUrl)]
    chain2 = PROMPT | ACTIVE_LLM.bind(
        functions=select_url_functions,
        function_call={"name": "selectUrl"},
    )
else:
    print('Skipping chain2')


In [None]:
# Call chain2 if needed to select the most relevant url
relevant_url = None
if chain2 is not None:
    chain2_prompt = format_url_choice_string(urls, COLLEGE_NAME)
    response = chain2.invoke({"input": chain2_prompt})
    response_object = PARSER.parse(response.content)
    selected_index = response_object[0]['arguments']['index']
    # Check both bounds and the type: a negative index would otherwise silently
    # pick a URL from the end of the list, and a non-int would fail with an
    # unrelated TypeError. Raise explicitly because asserts vanish under -O.
    if not isinstance(selected_index, int) or not 0 <= selected_index < len(urls):
        raise ValueError(f"Index out of range: {selected_index!r}")
    relevant_url = urls[selected_index]
else:
    # Only one candidate was found, so there is nothing to choose between.
    relevant_url = urls[0]
print(relevant_url)

# Step 5:

In [None]:
# Step 5:
# Load documents into a docs object
# NOTE(review): relevant_url is passed straight to PyPDFLoader. This presumes the
# loader accepts a remote URL and that the URL really is a PDF; the query asked for
# filetype:pdf, but nothing here verifies it -- confirm.
loader = PyPDFLoader(relevant_url)
# Presumably one document per PDF page -- confirm against the loader documentation.
pages = loader.load()