In [46]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
import re

##### Step 1: Indexing -- Document Ingestion

In [35]:
id = "SfOaZIGJ_gs"  # Video id from youtube, extract from URL for future works
YTapi = YouTubeTranscriptApi()
try:
    transcript_list = YTapi.fetch(id, languages = ["en"])
    #
    # The API provides results in a FetchecTranscriptSnippet object, in the following format:
    #snippets=[
        # FetchedTranscriptSnippet(
        #     text="Hey there",
        #     start=0.0,
        #     duration=1.54,
        # ),
        # FetchedTranscriptSnippet(
        #     text="how are you",
        #     start=1.54,
        #     duration=4.16,)
    #
    # The resulting transcript will be provided on the basis of timestamps, (when the text starts and till what duration it is maintained.)
    # The API gets the transcript sentence by sentence in a broken format, so we need to join it into a single plain text.
    #
    # Join text including the verbal cues
    raw_transcript = " ".join(chunk.text for chunk in transcript_list)
    #
    # Clean the text by removing these non-verbal cues using regex functions
    clean_transcript = re.sub(r"\[.*?\]", "", raw_transcript)
    clean_transcript = re.sub(r"\s+", " ", clean_transcript).strip()
    print(clean_transcript)

except TranscriptsDisabled:
    print("No captions available for this video.")

uh where Nik you told him 5 minutes like he has 2 minutes not yeah 2 minutes Everything looks good. Just the monitor, the main monitor uh went off. Everyone who's done can leave. Hi, fam. Hey, Nicole. How are you? I'm good. Sorry I'm late. I got caught up in getting ready for the launch tomorrow and lost track of time and excitement with the final results. But no worries. I'm guessing it must be really hectic, right? It is a very hectic day. I have the model and I've been playing with it a little bit. How is it different, Sam? I'm not an expert at this. So yeah, there's all these ways we can talk about, you know, it's better at this metric or it's, you know, can do this amazing coding demo that the, you know, GPT4 couldn't. But the thing that has been most striking for me is in ways that are both big and small, going back from GPT5 to our previous generation model is just so painful. It's just like worse at everything. And I've taken for granted that there is a fluency and a depth of i

##### Text Splitting

In [36]:
splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
chunks = splitter.create_documents([clean_transcript])

In [37]:
len(chunks)

49

In [38]:
chunks[0]

Document(metadata={}, page_content="uh where Nik you told him 5 minutes like he has 2 minutes not yeah 2 minutes Everything looks good. Just the monitor, the main monitor uh went off. Everyone who's done can leave. Hi, fam. Hey, Nicole. How are you? I'm good. Sorry I'm late. I got caught up in getting ready for the launch tomorrow and lost track of time and excitement with the final results. But no worries. I'm guessing it must be really hectic, right? It is a very hectic day. I have the model and I've been playing with it a little bit. How is it different, Sam? I'm not an expert at this. So yeah, there's all these ways we can talk about, you know, it's better at this metric or it's, you know, can do this amazing coding demo that the, you know, GPT4 couldn't. But the thing that has been most striking for me is in ways that are both big and small, going back from GPT5 to our previous generation model is just so painful. It's just like worse at everything. And I've taken for granted that

##### Embedding Generation and Storage in a Vector Store

In [None]:
#using the following embedding model
embeddings = OpenAIEmbeddings(model = "text-embedding-3-small")
#we provide our chunks along with our embedding model to the vector store to create an embedding for each chunk
vector_store = FAISS.from_documents(chunks, embeddings)

In [None]:
#we get an embedding id against all our chunks, which can be used to call an individual chunk
vector_store.index_to_docstore_id

{0: 'c9f4bfd7-699e-4126-ab71-28a4651b3aad',
 1: '852cbb1f-bf84-4640-86e1-12a40590a47c',
 2: '36d1fd07-7dbe-49a1-acd0-f7661dca5098',
 3: '55886031-1a68-46ab-8225-122503942837',
 4: 'd4dea40a-c463-4194-b052-e1458f8c5cc3',
 5: 'dce88219-534e-4e6e-8c5e-69c297f5d6ad',
 6: 'e7b8387b-47f6-4af6-965b-82a3794767c3',
 7: '3f2e54af-d157-4816-b3ae-617660a8cb96',
 8: '2aacb296-a4bc-4fe6-8052-5b1661f510a3',
 9: '180347ac-ff31-4860-81c7-88f027cb3713',
 10: '9786dc87-69cd-45da-b9a7-254f72b9adde',
 11: '82e483a0-868d-409f-9903-0a46146d3f47',
 12: '149f98ba-5208-4e5a-9339-66c4f0b9badd',
 13: '4975148f-a3a7-47f0-96f0-a983e89ddc58',
 14: '98c0ae78-2262-46ee-8e8e-9d415edb63f1',
 15: 'ec0278e8-bd44-4af8-962c-5f426dbe908a',
 16: '5185da90-a8db-46c0-a59d-ccab0255fecd',
 17: '2b4a7edc-999a-4539-9ceb-ffa2e774c1bf',
 18: '34bc0535-b49f-424c-8fdf-8d22131ee5d7',
 19: '4db99a90-1f6a-4805-89c9-3f251957a074',
 20: '1c2d752e-be94-4e4d-a5eb-eb4405ab1cdd',
 21: 'ac38c37a-45c6-4417-a1fb-5b321e9a07dc',
 22: '776def7b-f50a-

In [41]:
vector_store.get_by_ids(['44448d11-e485-47f5-a0f5-0875921efb55'])

[Document(id='44448d11-e485-47f5-a0f5-0875921efb55', metadata={}, page_content='not necessarily what other people have. If we all had more, do you think we would still want more if we all had enough? I I do sort of think that human demand, desire, ability to play status games, whatever is is like it seems pretty limitless. Um I don\'t think that\'s necessarily bad. Uh or not all bad. Um but yeah, I think we will figure out new things to want and new things to compete over. Do you think the world retains largely the world retains the current model of capitalism and democracy in a way? Let me give you a scenario. What happens if a company X let\'s say open AI gets to the point where it is 50% of world GDP. Does society allow for that or I would bet not. I don\'t think that will happen. I think this will be a much more distributed thing. But if for some reason that did happen, I think society would say, "We don\'t think so." Like, "Let\'s figure out let\'s figure out something to do here.

##### Step 2: Retrievel

In [43]:
# Forming a retriever using our vector store, that will use a simple similarity search and get 4 most similar results in the output.
#
retriever = vector_store.as_retriever(search_type = "similarity", search_kwargs = {"k": 4})
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x125999e80>, search_kwargs={'k': 4})

In [None]:
# Providing a query to the retriever to find the most similar documents. This query is converted into an embedding and then semantic search is performed.
#
retriever.invoke("What is the future of Robotics?")

[Document(id='05d7de7e-a440-4d0b-8c71-107442a75d7e', metadata={}, page_content="a thousand hour problem but certainly in the world today we cannot at all and so that that's like another dimension where AI can't do it. So I was in the US between SF between San Francisco and New York the last couple of months and I met a whole bunch of AI founders. Uh the one thing everybody seemed to agree on is like for AI the US is a few years ahead of most others. They also thought that for robotics China seems to be ahead. Uh do you have a view on robotics and what happens there like humanoid or other form of robotics? I think it will be incredibly important in a couple of years. I think uh one of the things that is going to feel most AGI like is seeing robots just walk by you on the street doing kind of normal day-to-day tasks. Um is there a reason why they need to have humanlike form? Well, you can certainly have non-humanoid forms, but the world is really built for humans, you know, like door han

##### Step 3: Augmentation

In [None]:
# Forming an LLM
#
llm = ChatOpenAI(model = "gpt-4o-mini", temperature=0.2)

In [47]:
# Creating a prompt template with user query and retrieved context(documents) to be provided to the LLM
#
prompt = PromptTemplate(
    template = """
        You are a helpful assistant.
        Answer ONLY from the provided transcript context.
        If the context is insufficient, just say you do not know.

        {context}
        Question: {question}
    """,
    input_variables= ['context', 'question']
)

In [48]:
question = "Is the topic of Parenthood discussed in this video? If yes what was discussed?"
retrieved_docs = retriever.invoke(question)
retrieved_docs

[Document(id='1c2d752e-be94-4e4d-a5eb-eb4405ab1cdd', metadata={}, page_content="this than most because of your role at Y Combinator. I have a lot of data points on it at least. Yeah. Yeah. Uh when we met in Washington a couple of days a couple of years ago at the White House, I remember when we were speaking and you went somewhere, I was speaking to your partner and you guys had a kid. We did. And how is that? Uh it is my favorite thing ever. But I mean I I know that like I have nothing that is not a cliche to say here. Um, but it is the coolest, most amazing, most like emotionally overwhelming in the best ways and hard ways to uh experience every everything said about how great it is, how intense it is, how it's like a kind of love you didn't know you could feel. It's all it's all true. I have nothing to add other than I strongly recommend it and I think uh it's been really wonderful. It's amazing. So I ponder on this a lot Sam uh kids, why people have kids and also questions like wha

In [50]:
#creating context from the page_content of retrieved docs to be sent to the LLM for generation
#
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
context_text

"this than most because of your role at Y Combinator. I have a lot of data points on it at least. Yeah. Yeah. Uh when we met in Washington a couple of days a couple of years ago at the White House, I remember when we were speaking and you went somewhere, I was speaking to your partner and you guys had a kid. We did. And how is that? Uh it is my favorite thing ever. But I mean I I know that like I have nothing that is not a cliche to say here. Um, but it is the coolest, most amazing, most like emotionally overwhelming in the best ways and hard ways to uh experience every everything said about how great it is, how intense it is, how it's like a kind of love you didn't know you could feel. It's all it's all true. I have nothing to add other than I strongly recommend it and I think uh it's been really wonderful. It's amazing. So I ponder on this a lot Sam uh kids, why people have kids and also questions like what happens to religion and marriage tomorrow? Can I ask you why you had a kid?\n

In [None]:
# Combining context and question into the prompt
#
final_prompt = prompt.invoke({"context": context_text, "question": question})