# Query Construction

In [1]:
from langchain_yt_dlp.youtube_loader import YoutubeLoaderDL

url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

docs = YoutubeLoaderDL.from_youtube_url(
    url,
    add_video_info=True
).load()



In [2]:
docs[0].metadata

{'source': 'dQw4w9WgXcQ',
 'title': 'Rick Astley - Never Gonna Give You Up (Official Video) (4K Remaster)',
 'description': 'The official video for “Never Gonna Give You Up” by Rick Astley. \n\nNever: The Autobiography 📚 OUT NOW! \nFollow this link to get your copy and listen to Rick’s ‘Never’ playlist ❤️ #RickAstleyNever\nhttps://linktr.ee/rickastleynever\n\n“Never Gonna Give You Up” was a global smash on its release in July 1987, topping the charts in 25 countries including Rick’s native UK and the US Billboard Hot 100.  It also won the Brit Award for Best single in 1988. Stock Aitken and Waterman wrote and produced the track which was the lead-off single and lead track from Rick’s debut LP “Whenever You Need Somebody”.  The album was itself a UK number one and would go on to sell over 15 million copies worldwide.\n\nThe legendary video was directed by Simon West – who later went on to make Hollywood blockbusters such as Con Air, Lara Croft – Tomb Raider and The Expendables 2.  The v

Let's assume, we have build an index that:
1. Allows us to perform ustructured search over the contents and title of each document.
2. And to use range filtering on view count , publications date, and length

We want to convert natural language into structured search queries
We can define a schema for unstructured search query   

In [21]:
import datetime
from typing import List, Optional, Tuple
from pydantic import BaseModel, Field

class TutorialSearch(BaseModel):
    """Search over a database of tutorial videos about a software library"""
    
    content_search: str = Field(..., description="Similary search query applied to video transcription.")
    title_search: str = Field(..., description=("Alternate version of the content search query to apply to video title. Should be succinct and only include key words that could be in a video title."))
    min_view_count: Optional[int] = Field(None, description="Minimum view count filter, inclsive. Only use if explicitly specified.") 
    max_view_count: Optional[int] = Field(None, description="Minimum view count filter, exclusive. Only use if explicitly specified.") 
    earliest_publish_date: Optional[datetime.date] = Field(None, description="Earliest publish date filter, inclusive. Only use if explicitly specified.")
    latest_publish_date: Optional[datetime.date] = Field(None, description="Latest publish date filter, exclusive. Only use if explicitly specified.")
    min_length_sec: Optional[int] = Field(None, description="Minimum video length in seconds, inclusive. Only use if explicitly specified.")
    max_length_sec: Optional[int] = Field(None, description="Maximum video length in seconds, exclusive. Only use if explicitly specified." )
    
    def pretty_print(self) -> None:
        for field_name, field_info in self.__class__.model_fields.items():
            value = getattr(self, field_name)
            default = field_info.default
            if value is not None and value != default:
                print(f"{field_name}: {value}")

    

Now we prompt the LLM to produce queries

In [10]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

system="""You are an expert an user question to database queries. \n
    You have access to database of tutorial videos about a software library for building LLM-powered applications. \n
    Given a question, return the database query optimized to retrieve the most relevant results.   
    If there are acronyms or words you are not familiar with, do try to rephrase them.    
"""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system), 
        ("human", "{question}")
    ]
) 

llm = ChatOpenAI()
structured_llm = llm.with_structured_output(TutorialSearch)
query_analyzer = prompt | structured_llm



In [22]:
query_analyzer.invoke({"question": "rag from scratch"}).pretty_print()
 

content_search: LLM-powered applications
title_search: build LLM-powered applications from scratch


In [24]:
query_analyzer.invoke({"question": "video on chat language published in 2013"}).pretty_print()

content_search: chat language
title_search: 2013
earliest_publish_date: 2013-01-01
latest_publish_date: 2014-01-01


In [27]:
query_analyzer.invoke({"question": "video focusing on topic of chat language that are published before 2024"}).pretty_print()

content_search: chat language
title_search: chat language
latest_publish_date: 2024-01-01


In [25]:
query_analyzer.invoke({"question": "How to use multimodel models in agent, only videos under 5 minutes" }).pretty_print()

content_search: multimodel models in agent
title_search: multimodel models agent
max_length_sec: 300


To connect it to various vectorstores, follow https://python.langchain.com/docs/how_to/self_query/ 