# YouTube Agent with OpenAI Agents SDK
## ABB #4 - Session 4

Code authored by: Shaw Talebi

### imports

In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
import re
from agents import Agent, function_tool, Runner, ItemHelpers, RunContextWrapper
from openai.types.responses import ResponseTextDeltaEvent
from dotenv import load_dotenv
import asyncio

In [2]:
# Load environment variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key used by the agent comes from — confirm.
load_dotenv()

True

### define instructions

In [3]:
# System-level guidance passed to the Agent constructor below via `instructions=`.
instructions = "You provide help with tasks related to YouTube videos."

### define tool

In [4]:
@function_tool
def fetch_youtube_transcript(url: str) -> str:
    """
    Extract transcript with timestamps from a YouTube video URL and format it for LLM consumption
    
    Args:
        url (str): YouTube video URL
        
    Returns:
        str: Formatted transcript with timestamps, where each entry is on a new line
             in the format: "[MM:SS] Text"

    Raises:
        ValueError: If no 11-character video ID can be found in the URL
        Exception: If the transcript cannot be fetched (the original error is chained as the cause)
    """
    # Extract video ID from URL. The ID is the 11 ID characters that follow
    # either "v=" or a "/". The negative lookahead rejects longer runs of ID
    # characters (e.g. "/attribution_link", "/channel/UC...") so that their
    # first 11 characters are not mistaken for a video ID.
    video_id_pattern = r'(?:v=|/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])'
    video_id_match = re.search(video_id_pattern, url)

    if not video_id_match:
        raise ValueError("Invalid YouTube URL")

    video_id = video_id_match.group(1)

    # Keep the try body to the one call that talks to the network, so that a
    # formatting bug below is never reported as a "fetching" failure.
    # NOTE(review): `get_transcript` is the static pre-1.0 API of
    # youtube-transcript-api; newer releases appear to expose
    # `YouTubeTranscriptApi().fetch(video_id)` instead — confirm the pinned version.
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        # Chain explicitly so the underlying library error stays in the traceback.
        raise Exception(f"Error fetching transcript: {e}") from e

    # Format each entry as "[MM:SS] text". Minutes are not wrapped at 60, so
    # an entry 75 minutes in is rendered as "[75:00]".
    formatted_entries = []
    for entry in transcript:
        minutes, seconds = divmod(int(entry['start']), 60)
        formatted_entries.append(f"[{minutes:02d}:{seconds:02d}] {entry['text']}")

    # Join all entries with newlines
    return "\n".join(formatted_entries)

In [5]:
# print(fetch_youtube_transcript("https://youtu.be/ZaY5_ScmiFE"))

### create agent

In [6]:
# Single agent for the whole notebook: it carries the instruction string defined
# above and is given exactly one tool, the transcript fetcher.
agent = Agent(
    name="YouTube Transcript Agent",
    instructions=instructions,
    tools=[fetch_youtube_transcript],  # the only tool exposed to the agent
)

### main() function

In [7]:
async def main():
    """
    Run an interactive, streaming chat loop with the YouTube agent.

    Reads prompts from stdin until the user types 'exit', 'quit' or 'bye'.
    On every turn the accumulated conversation (``input_items``) is sent to
    the agent through ``Runner.run_streamed``; text deltas are printed as they
    arrive, and any tool output plus the assistant's final reply are appended
    to the history so follow-up questions keep their context.
    """
    input_items = []

    print("=== YouTube Transcript Agent ===")
    print("Type 'exit' to end the conversation")
    print("Ask me anything about YouTube videos!")

    while True:
        # Get user input
        user_input = input("\nYou: ").strip()

        # Check for exit command
        if user_input.lower() in ['exit', 'quit', 'bye']:
            print("\nGoodbye!")
            break

        if not user_input:
            continue

        # Record the message only after the checks above, so blank lines and
        # exit commands never end up in the history sent to the model.
        input_items.append({"content": user_input, "role": "user"})

        print("\nAgent: ", end="", flush=True)
        result = Runner.run_streamed(
            agent,
            input=input_items,
        )

        async for event in result.stream_events(): # not all events are available at outset, hence the async
            # Print the response text incrementally as raw delta events arrive
            if event.type == "raw_response_event" and isinstance(event.data, ResponseTextDeltaEvent):
                print(event.data.delta, end="", flush=True)
            elif event.type == "agent_updated_stream_event":
                continue
            elif event.type == "run_item_stream_event":
                if event.item.type == "tool_call_item":
                    print("\n-- Fetching transcript...")
                elif event.item.type == "tool_call_output_item":
                    # Keep the tool output in the history so later turns can refer to it
                    input_items.append({"content": f"Transcript:\n{event.item.output}", "role": "system"})
                    print("-- Transcript fetched.")
                elif event.item.type == "message_output_item":
                    # Store the reply text itself, not the repr of the raw response object
                    input_items.append({"content": ItemHelpers.text_message_output(event.item), "role": "assistant"})
                else:
                    pass  # Ignore other event types

        print("\n")  # Add a newline after each response

In [8]:
# Top-level `await` works here because this is a notebook cell; in a plain
# Python script, use asyncio.run(main()) instead.
await main()
# try this video: https://youtu.be/ZaY5_ScmiFE

=== YouTube Transcript Agent ===
Type 'exit' to end the conversation
Ask me anything about YouTube videos!



You:  Can you summarize this video? https://youtu.be/ZaY5_ScmiFE



Agent: 
-- Fetching transcript...
-- Transcript fetched.
The video by Shaw discusses the concept of AI agents, a topic anticipated to become significant by 2025. Shaw begins by examining various definitions from leading organizations, highlighting that there's no consensus on what AI agents are. Key elements involve large language models (LLMs), tool usage, and autonomy. Shaw emphasizes the importance of tools in enabling LLMs to perform complex tasks and interact with reality.

The video outlines three levels of agentic systems:
1. **Level 1**: LLM plus tools – Expands LLM capabilities beyond text completion using tools like web search and code interpreters.
2. **Level 2**: LLM workflows – Multiple LLMs working together, often used in frameworks like LangGraph and Llama Index for more complex tasks.
3. **Level 3**: LLM in a loop – Continuous feedback and improvement using reinforcement learning, allowing tasks to be more autonomous and adaptable.

Shaw contrasts these agentic systems


You:  When specifically does he talk about LLM in a Loop?



Agent: Shaw discusses "LLM in a Loop" starting at [19:06] in the transcript. He explains how this concept involves giving an LLM real-world feedback until satisfactory responses are generated. He gives an example of writing LinkedIn posts with iterative feedback and adjustments, highlighting its capability for more open-ended tasks and the potential of using reinforcement learning for continuous improvement.




You:  Can you generate a link to this part of the video?



Agent: Here's a [link to the specific part](https://youtu.be/ZaY5_ScmiFE?t=1146) of the video where Shaw discusses "LLM in a Loop."




You:  quit



Goodbye!
