# YouTube RAG

In [1]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by the
# langchain_openai clients imported below — confirm against the .env file.
from dotenv import load_dotenv
load_dotenv()

True

## Installing Dependencies 

In [2]:
!pip install -q youtube-transcript-api langchain-community langchain-openai faiss-cpu tiktoken python-dotenv

## Importing Libraries

In [26]:
import re
from urllib.parse import urlparse, parse_qs

from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate

## Utility Functions 

In [27]:
# Url parser to get video_id 
def get_video_id(url):
    """
    Extract the video ID from a YouTube URL.

    Supported formats:
    - https://www.youtube.com/watch?v=VIDEO_ID
    - https://youtu.be/VIDEO_ID
    - https://m.youtube.com/watch?v=VIDEO_ID
    - https://www.youtube.com/embed/VIDEO_ID (also /v/, /e/, /shorts/, /live/)

    Args:
        url: The URL string to inspect.

    Returns:
        The video ID as a string, or None when `url` is empty, is not a
        string, or no ID can be found in it.
    """
    # Guard against None / empty / non-string input instead of raising.
    if not url or not isinstance(url, str):
        return None
    url = url.strip()

    # Handle youtu.be short URLs: the ID is the first path segment, so cut at
    # any query, extra parameter, fragment or trailing-slash delimiter.
    if 'youtu.be/' in url:
        tail = url.split('youtu.be/')[-1]
        short_id = re.split(r'[?&#/]', tail, maxsplit=1)[0]
        return short_id or None

    # Handle standard YouTube URLs that carry the ID in the `v` query parameter.
    parsed_url = urlparse(url)
    if parsed_url.hostname in ('www.youtube.com', 'youtube.com', 'm.youtube.com'):
        video_id = parse_qs(parsed_url.query).get("v", [None])[0]
        if video_id:
            return video_id
        # No `v` parameter: fall through so path-based URLs on the same hosts
        # (e.g. /embed/VIDEO_ID) are still handled by the regex below.

    # Regex fallback for embedded URLs and other path-based formats; the ID is
    # matched as exactly 11 characters.
    pattern = r'(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?|shorts|live)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})'
    match = re.search(pattern, url)
    return match.group(1) if match else None

## Steps

### Indexing (Document Ingestion)

In [28]:
def get_transcript(url):
    """
    Fetch the English transcript of a YouTube video as one plain-text string.

    Args:
        url: A YouTube video URL; the video ID is extracted with get_video_id.

    Returns:
        The transcript text with all caption chunks joined by single spaces.
        On failure no exception is raised; a human-readable message string is
        returned instead (missing video ID, captions disabled, no transcript
        found, or "Error: ..." for anything else). Callers must therefore
        check the result before treating it as transcript text.
    """
    video_id = get_video_id(url)
    # Python negation is `not`; the original `!video_id` is a SyntaxError.
    if not video_id:
        return "Cannot fetch video ID from the URL"
    print(video_id)
    try:
        # NOTE(review): newer youtube-transcript-api releases expose an
        # instance-level `fetch` API instead of this static call — confirm
        # against the installed version (the pip install is unpinned).
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id=video_id, languages=['en'])

        # Flatten the list of caption chunks into plain text.
        return ' '.join(chunk['text'] for chunk in transcript_list)
    except TranscriptsDisabled:
        return "Caption disabled."
    except NoTranscriptFound:
        return "No transcript found for this video."
    except Exception as e:
        # Best-effort: report any other failure as text rather than raising.
        return f"Error: {str(e)}"

In [30]:
# Try the transcript fetch on a sample video. get_transcript returns a message
# string on failure instead of raising, so the printed value may be an error
# message rather than transcript text.
transcript = get_transcript("https://www.youtube.com/watch?v=aircAruvnKk")
print(transcript)

aircAruvnKk
Error: no element found: line 1, column 0
