In [54]:
# %pip install youtube-transcript-api faiss-cpu

In [1]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from dotenv import load_dotenv

load_dotenv()  # Load environment variables from .env file

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


True

In [2]:
# embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

In [3]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [4]:
yt_video_id = "YyCXp_E4sMU"

In [5]:
try:
    transcript_obj = YouTubeTranscriptApi()
    transcript_list = transcript_obj.list(video_id=yt_video_id)
    transcript = transcript_list.find_transcript(['en'])
    transcript_data = transcript.fetch()

    data = " ".join([item.text for item in transcript_data])
except Exception as e:
    print(f"Error fetching transcript: {e}")
    transcript_data = []

In [6]:
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
documents = splitter.create_documents([data])

In [8]:
len(documents)

81

In [9]:
vector_store = FAISS.from_documents(documents, embeddings)

In [10]:
# vector_store.save_local("faiss_yt_rag")

In [11]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 10})

In [12]:
retriever.invoke("Sleep is important for health.")

[Document(id='e38cd540-7da6-4298-8555-5ad2bbb53480', metadata={}, page_content="learn. There's a humongous iceberg underneath that surface of the water that's responsible for the tip of the iceberg to remain there. Remember always, it takes a lot of sleepless nights to become an"),
 Document(id='7cd7408b-c941-4646-bbcf-2273770decdd', metadata={}, page_content='mask on your face and nose and breathe normally. Make sure your mask is in place before you help children, infants or others. It is very important to put our mask in place before we can put the mask'),
 Document(id='9bcefc00-6a56-4cdd-b3c5-c764735d54c9', metadata={}, page_content='we can put the mask on somebody else. Mark Twain said something very interesting. He said there are two most important days in life. One is the day when we were born and second is the day we find out'),
 Document(id='e98ba032-b4fb-407c-8a42-59be30700db5', metadata={}, page_content="nights to become an overnight success. And therefore, and therefore, her

In [13]:
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="Answer the question based on the context provided.\n\nContext: {context}\n\nQuestion: {question}\n\nAnswer:",
)

In [14]:
question = "Benefits of sleep"
retrieved_docs = retriever.invoke(question)

In [15]:
context = "\n\n".join([doc.page_content for doc in retrieved_docs])

In [16]:
final_prompt = prompt.invoke({"context": context, "question": question})
chat = ChatOpenAI(model="gpt-5-nano", temperature=0)

In [17]:
answer = chat.invoke(final_prompt)
print(answer)

content='- Restores energy and mental clarity\n- Keeps you ready to help others (self-care first, like putting your own mask on)\n- Prevents burnout during a long journey\n- Improves mood, motivation, and happiness\n- Supports sustained focus and compassion to make a difference\n\nIn short, sleep fuels you to perform well, stay resilient, and be more effective for yourself and others.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 1048, 'prompt_tokens': 457, 'total_tokens': 1505, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 960, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-5-nano-2025-08-07', 'system_fingerprint': None, 'id': 'chatcmpl-D2aLPAnujcU1OR27TdvbSLOYHJvpW', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--84128d29-e180-46eb-bab0-80e5e7928865-0' usage_metadata={'input_toke