This is an 8-step process to display a goal value
1. Get prompt as string
2. Chain1: LCEL that guarantees a schema that fits step 3, or fails
3. Perform Tavily query
4. Chain2: LCEL that extracts most relevant url
5. Load documents into a docs object
6. Embed docs as a vectorstore and create a retriever
7. Chain3: LCEL that guarantees a schema that fits step 8
8. Display the goal value

For this task, the prompt is a specific question about a given college. The goal value comes from the highest-quality publicly available source, typically the Common Data Set officially released by the university.

In [None]:
# Import dependencies
from langchain_core.utils.function_calling import convert_to_openai_tool
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser
from langchain_community.tools.tavily_search import TavilySearchResults
# from langchain.tools import DuckDuckGoSearchRun
# from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader

In [None]:
# Import model
# The factory helpers come from the project-local `models` module (not shown here).
# NOTE(review): the names suggest Together-hosted Mixtral / Mistral
# function-calling models - confirm in models.py.
from models import get_together_fn_mix, get_together_fn_mistral
# ACTIVE_LLM is the model bound into every chain built in this notebook.
ACTIVE_LLM = get_together_fn_mix()
# Alternative model, kept for quick switching.
# ACTIVE_LLM = get_together_fn_mistral()

# Step 1:

In [None]:
# Step 1:
# Get prompt as string
# Free-form user question; it is passed to chain1 as the "input" variable.
USER_QUERY = "Is West Valley a good school?"

In [None]:
# Few-shot examples for chain1. Each "input" is a user question and each
# "output" is the tool-call text the model should imitate (one SearchCollege call).
# Questions that do not name a college are marked valid=False with college 'None'.
# NOTE(review): the outputs use Python-style quoting and True/False rather than
# strict JSON, while the model response is later parsed with JsonOutputParser -
# confirm the model still emits parseable JSON when shown these examples.
chain1_examples = [
    {"input": "What are admissions rates for CMU?", "output": "[{'name': 'SearchCollege', 'arguments': {'valid': True, 'query': 'admissions rates for Carnegie Mellon University', 'college': 'Carnegie Mellon University'}}]"},
    {"input": "How many rings does Saturn have?", "output": "[{'name': 'SearchCollege', 'arguments': {'valid': False, 'query': 'number of Saturn rings', 'college': 'None'}}]"},
    {"input": "Who created the super soaker?", "output": "[{'name': 'SearchCollege', 'arguments': {'valid': False, 'query': 'Who created the super soaker', 'college': 'None'}}]"},
    {"input": "What is De Anza College like?", "output": "[{'name': 'SearchCollege', 'arguments': {'valid': True, 'query': 'De Anza College', 'college': 'De Anza College'}}]"},
    {"input": "What is the acceptance rate for Stanford?", "output": "[{'name': 'SearchCollege', 'arguments': {'valid': True, 'query': 'acceptance rate for Stanford University', 'college': 'Stanford University'}}]"},
]
def get_few_shot_prompt(examples):
    """Build a few-shot chat prompt from ``examples``.

    Every example is rendered as a human message filled from its ``input``
    value followed by an AI message filled from its ``output`` value.

    Args:
        examples: List of dicts providing the ``input`` and ``output`` values.

    Returns:
        A ``FewShotChatMessagePromptTemplate`` wrapping the examples.
    """
    return FewShotChatMessagePromptTemplate(
        examples=examples,
        # Template applied to each individual example.
        example_prompt=ChatPromptTemplate.from_messages(
            [("human", "{input}"), ("ai", "{output}")]
        ),
    )
# print(get_few_shot_prompt(chain1_examples).format())

In [None]:
# Assemble the chain1 prompt: system instruction first, then the few-shot
# examples, then the user's question substituted for {input}.
chain1_few_shot_prompt = get_few_shot_prompt(chain1_examples)
CHAIN1_PROMPT = ChatPromptTemplate.from_messages(
    [("system", "You are a helpful assistant who always calls an external function. If a valid college is not found in the query, mark the valid field as False."), 
     chain1_few_shot_prompt,
     ("human", "{input}")]
)
# Parser used to decode the text content of the model responses.
PARSER = JsonOutputParser()

# Step 2:

In [None]:
# Step 2:
# Chain1: LCEL that guarantees a schema that fits step 3, or fails
# Argument schema for the SearchCollege tool; it is passed through
# convert_to_openai_tool before being bound to the model.
# NOTE: the docstring and Field descriptions are handed to that converter, so
# they are left untouched - presumably they reach the model as the tool and
# parameter descriptions (confirm against the converter's output).
class SearchCollege(BaseModel):
    """Construct a web search for a college. Only mark valid field as true if a valid college name is found."""
    # Gate checked by is_chain1_valid before any search is performed.
    valid: bool = Field(description="Mark as false if a college name is not found in the query.")
    # Search-engine phrasing of the user's question.
    query: str = Field(description="Optimized query for search engine (include college name if present)")
    # College name; read later to build the Tavily query.
    college: str = Field(description="Name of the college.")
def is_chain1_valid(response_object):
    """Return True only when chain1 explicitly flagged the query as valid.

    Args:
        response_object: Parsed chain1 output, expected to be a list whose
            first item is a dict with an ``'arguments'`` dict containing a
            boolean ``'valid'`` entry.

    Returns:
        True if ``response_object[0]['arguments']['valid']`` is exactly
        ``True``. False for any other value, and also when the object is
        empty or does not have the expected shape.
    """
    try:
        # Identity check on purpose: truthy stand-ins such as "true" or 1 are rejected.
        return response_object[0]['arguments']['valid'] is True
    except (IndexError, KeyError, TypeError):
        # Empty or malformed output is treated as "not valid" so callers
        # report a schema failure instead of an unrelated lookup error.
        return False

# SearchCollege is the only tool offered, and tool_choice names it so the
# model is asked to call that function.
tools = [convert_to_openai_tool(SearchCollege)]
tool_choice={"type": "function", "function": {"name": "SearchCollege"}}
# LCEL pipeline: the rendered prompt is piped into the tool-bound model.
chain1 = CHAIN1_PROMPT | ACTIVE_LLM.bind(tools=tools, tool_choice=tool_choice)
print("Successfully constructed chain1")

In [None]:
# tools

In [None]:
# Call chain1
chain1_prompt = USER_QUERY
response1 = chain1.invoke({"input": chain1_prompt})
# The tool call is read from the message content and must parse as JSON.
try:
    response_object1 = PARSER.parse(response1.content)
except Exception as err:
    # `except Exception` (not a bare except) so KeyboardInterrupt/SystemExit
    # are not turned into a ValueError; the parser error is chained for debugging.
    raise ValueError("Chain1 did not return valid JSON") from err
# Stop here when the model did not find a college in the question.
if not is_chain1_valid(response_object1):
    raise ValueError("Chain1 failed schema conditions")
# print(response_object)

# Step 3:

In [None]:
# Step 3:
# Perform Tavily query
# Maximum number of search results requested from Tavily.
TAVILY_MAX_RESULTS = 15
search = TavilySearchResults(max_results=TAVILY_MAX_RESULTS)
def get_tavily_query(response_object, year=2023):
    """Build the Tavily search string for a college's Common Data Set PDF.

    Args:
        response_object: Parsed chain1 output; a list whose first item holds
            the SearchCollege arguments under the ``'arguments'`` key.
        year: Year inserted into the query. Defaults to 2023, the value that
            was previously hard-coded.

    Returns:
        A query of the form ``"<college> Common Data Set <year> filetype:pdf"``.

    Raises:
        AssertionError: If ``is_chain1_valid`` rejects ``response_object``.
    """
    # Raised explicitly instead of via `assert` so the check still runs under
    # `python -O`; the exception type is kept for existing callers.
    if not is_chain1_valid(response_object):
        raise AssertionError("Chain1 output is not valid")
    college = response_object[0]['arguments']['college']
    return college + f" Common Data Set {year} filetype:pdf"

In [None]:
# Finalize Tavily query
# Echo the parsed chain1 output and the derived search string for inspection.
print(response_object1)
final_query = get_tavily_query(response_object1)
print(final_query)
# College name as extracted by chain1; reused when building the chain2 prompt.
COLLEGE_NAME = response_object1[0]['arguments']['college']

In [None]:
# Invoke Tavily search
results = search.invoke(final_query)
# NOTE(review): the heading prints the requested maximum; the search may
# return fewer results than that.
print(f"Top {TAVILY_MAX_RESULTS} results from Tavily:")
# Each result is indexed by its "url" key.
for res in results:
    print(res["url"])

# Step 4:

In [None]:
# Few-shot example for chain2: a numbered list of candidate URLs as "input"
# and the expected selectUrl tool call as "output". The expected answer is
# index 3, the 2022-2023 file on the Lehigh domain, rather than the older
# Lehigh file (index 0) or the other colleges' files.
chain2_examples = [
    {
        "input": """Choose the most relevant url index for the latest released data by Lehigh University: 
0: https://data.lehigh.edu/sites/oirsa.lehigh.edu/files/CDS_2021-2022.pdf
1: https://www.williams.edu/institutional-research/files/2023/04/CDS_2022_2023_Williams_March2023.pdf
2: https://my.wlu.edu/document/2022-common-data-set
3: https://data.lehigh.edu/sites/oirsa.lehigh.edu/files/CDS_2022-2023.pdf
4: https://www.haverford.edu/sites/default/files/Office/President/CDS_2022-2023.pdf""", 
        "output": "[{'name': 'selectUrl', 'arguments': {'index': 3}}]"
    },
]

In [None]:
# Assemble the chain2 prompt: system instruction, the few-shot URL-selection
# example, then the numbered URL list substituted for {input}.
chain2_few_shot_prompt = get_few_shot_prompt(chain2_examples)
CHAIN2_PROMPT = ChatPromptTemplate.from_messages(
    [("system", "You are a helpful assistant who always calls an external function. Choose the correct url for the given college."),
     chain2_few_shot_prompt,
     ("human", "{input}")]
)

In [None]:
# Step 4:
# Extract relevant url(s)
def format_url_choice_string(urls, college):
    """Render the chain2 question: a header line plus one ``index: url`` line per URL.

    Args:
        urls: Candidate URLs, numbered from 0 in the order given.
        college: College name inserted into the header line.

    Returns:
        The header (ending in a newline) followed by the numbered URLs joined
        with newlines; only the header when ``urls`` is empty.
    """
    header = "Choose the most relevant url index for the latest released data by {}: \n".format(college)
    numbered_urls = ("{}: {}".format(position, link) for position, link in enumerate(urls))
    return header + "\n".join(numbered_urls)
# Defaults first, so both names exist even when a check below aborts the cell.
chain2 = None
chain2_needed = False
if not results:
    raise ValueError("No results found")
urls = [hit['url'] for hit in results]
assert len(urls) > 0, "No urls found"
# With a single candidate there is nothing to choose between, so chain2 is skipped.
chain2_needed = len(urls) != 1
print('Selecting the best url using chain2' if chain2_needed else 'Skipping chain2')

# Build chain2 only when there is more than one candidate url to choose from.
if chain2_needed:
    # Argument schema for the selectUrl tool. The class name doubles as the
    # tool name referenced by tool_choice and by the few-shot example.
    class selectUrl(BaseModel):
        """Choose the most relevant url index for the given college."""
        index: int = Field(description="The index of the url to select")
    select_url_tools = [convert_to_openai_tool(selectUrl)]
    tool_choice = {"type": "function", "function": {"name": "selectUrl"}}
    # Bind the selectUrl schema. Previously the chain1 `tools` list
    # (SearchCollege) was bound here, so tool_choice named a function that
    # was not among the tools offered to the model.
    chain2 = CHAIN2_PROMPT | ACTIVE_LLM.bind(tools=select_url_tools, tool_choice=tool_choice)

In [None]:
# Call chain2 if needed to select the most relevant url
# This chain only performs well with the MIXTRAL model, and not the MISTRAL model
relevant_url = None
if chain2 is not None:
    chain2_prompt = format_url_choice_string(urls, COLLEGE_NAME)
    print(chain2_prompt)
    response2 = chain2.invoke({"input": chain2_prompt})
    try:
        response_object2 = PARSER.parse(response2.content)
    except Exception as err:
        # `except Exception` (not a bare except) so KeyboardInterrupt/SystemExit
        # propagate unchanged; the parser error is chained for debugging.
        raise ValueError("Chain2 did not return valid JSON") from err
    try:
        chain_2_response_index = response_object2[0]['arguments']['index']
    except (IndexError, KeyError, TypeError) as err:
        # Parsed JSON that lacks the expected tool-call shape.
        raise ValueError("Chain2 failed schema conditions") from err
    # Explicit check instead of `assert` (which is stripped under -O). It also
    # rejects negative indices, which would otherwise silently select a url
    # counted from the end of the list, and non-integer values.
    if not isinstance(chain_2_response_index, int) or not 0 <= chain_2_response_index < len(urls):
        raise ValueError("Index out of range")
    relevant_url = urls[chain_2_response_index]
else:
    # chain2 was skipped because there is exactly one candidate.
    relevant_url = urls[0]
print('\n' + relevant_url)

# Step 5:

In [None]:
# Step 5:
# Load documents into a docs object
# NOTE(review): relevant_url is passed straight to PyPDFLoader; this presumes
# the selected link is a directly loadable PDF - confirm for results whose
# url does not point at a .pdf file.
loader = PyPDFLoader(relevant_url)
# The loaded result is treated as a sequence of pages by the next cell.
pages = loader.load()

In [None]:
# Sanity check: report the page count and preview the first 500 characters
# of each page's text.
print(len(pages), "pages loaded")
for page in pages:
    print(page.page_content[:500])
    print('\n')