In [1]:
# Import libraries
from dotenv import load_dotenv
from langchain_community.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParserLocal
import torch
from transformers import WhisperProcessor
from tqdm import tqdm
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
load_dotenv()

True

In [3]:
save_dir = "Downloads/YouTube"

In [4]:
# Dr. Huberman dopamine videos
urls = ["https://www.youtube.com/watch?v=lIo9FcrljDk", "https://www.youtube.com/watch?v=Se151brgGSM"]

In [5]:
# Initialize WhisperProcessor for forced decoder IDs
processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
forced_decoder_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")

# Initialize the parser
parser = OpenAIWhisperParserLocal(
    device="cuda" if torch.cuda.is_available() else "cpu",
    lang_model="openai/whisper-medium",
    forced_decoder_ids=forced_decoder_ids,
    batch_size=8,
    chunk_length=30
)

# Initialize the audio loader
audio_loader = YoutubeAudioLoader(urls, save_dir)

Using the following model:  openai/whisper-medium


In [None]:
# Load and parse the documents 
docs = []

for url in tqdm(urls, desc="Loading YouTube Audio", unit="video"):
    audio_loader.urls = [url]
    loader = GenericLoader(blob_loader=audio_loader, blob_parser=parser)
    docs.extend(loader.load())

Loading YouTube Audio:   0%|                                                                                                | 0/2 [00:00<?, ?video/s]

[youtube] Extracting URL: https://www.youtube.com/watch?v=lIo9FcrljDk
[youtube] lIo9FcrljDk: Downloading webpage
[youtube] lIo9FcrljDk: Downloading ios player API JSON
[youtube] lIo9FcrljDk: Downloading mweb player API JSON
[youtube] lIo9FcrljDk: Downloading m3u8 information
[info] lIo9FcrljDk: Downloading 1 format(s): 140
[download] Destination: Downloads\YouTube\Essentials： Master Your Sleep & Be More Alert When Awake.m4a

[FixupM4a] Correcting container of "Downloads\YouTube\Essentials： Master Your Sleep & Be More Alert When Awake.m4a"
[ExtractAudio] Not converting audio Downloads\YouTube\Essentials： Master Your Sleep & Be More Alert When Awake.m4a; file is already in target format m4a
Transcribing part Downloads\YouTube\Controlling Your Dopamine For Motivation, Focus & Satisfaction.m4a!


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


In [None]:
docs[0].page_content[0:100]

In [None]:
# Combine documents
combined_docs = [doc.page_content for doc in docs]
text = " ".join(combined_docs)

In [None]:
# Split them
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=50)
splits = text_splitter.split_text(text)

In [None]:
# Build an index
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectordb = Chroma.from_texts(splits, embeddings)

In [None]:
# Build a QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model="gpt-4o-mini", temperature=0),
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
)