In [37]:
from dotenv import load_dotenv

load_dotenv()

True

In [38]:
import yt_dlp
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate

In [None]:
video_url = "https://www.youtube.com/watch?v=wjZofJX0v4M&list=PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi&index=6"

ydl_opts = {
    'skip_download': True,       
    'writesubtitles': True,      
    'subtitleslangs': ['en'],    
    'subtitlesformat': 'vtt',    
    'outtmpl': 'subtitle.%(ext)s'
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download([video_url])

In [49]:
vtt_file = "subtitle.en.vtt"
txt_file = "transcript.txt"

transcript_lines = []

with open(vtt_file,"r",encoding="utf-8") as f:
  for line in f:
    line = line.strip()

    if not line or line.startswith("WEBVTT") or "-->" in line:
      continue
    transcript_lines.append(line)

transcript = " ".join(transcript_lines)

with open(txt_file,"w",encoding="utf-8") as f:
  f.write(transcript)

print("Transcript saved to ",txt_file)
print(transcript[:500],"...")

Transcript saved to  transcript.txt
Over the last few years, AI systems have become astonishingly good at turning text props into videos. At the core of how these models operate is a deep connection to physics. This generation of image and video models works using a process known as diffusion, which is remarkably equivalent to the Brownian motion we see as particles diffuse, but with time run backwards, and in high-dimensional space. As we'll see, this connection to physics is much more than a curiosity. We get real algorithms out ...


In [50]:
splitter = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=100)
chunks = splitter.create_documents([transcript])

In [51]:
len(chunks)

94

In [52]:
chunks[0]

Document(metadata={}, page_content="Over the last few years, AI systems have become astonishingly good at turning text props into videos. At the core of how these models operate is a deep connection to physics. This generation of image and video models works using a process known as diffusion, which is remarkably equivalent to the Brownian motion we see as particles diffuse, but with time run backwards, and in high-dimensional space. As we'll see, this connection to physics is much more than a curiosity. We get real algorithms out")

In [53]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = FAISS.from_documents(chunks,embeddings)

In [54]:
vector_store.index_to_docstore_id

{0: 'b29ff23c-0783-4f65-bef4-28aa8c8c27ad',
 1: 'ee71796f-46b5-4c05-a072-b41f87856e6e',
 2: 'cfac9cbf-4164-41f2-b616-f9eeaccee6b1',
 3: 'c766d092-8735-439b-b2a3-18c54d459803',
 4: 'e1f1b1ab-a7f8-4db3-8552-e70dfe43c99b',
 5: '87f8a4ff-a0f9-4304-bc4f-d29ce2ad51b5',
 6: '2cd65284-a213-476f-ac98-1f546e99dabb',
 7: 'b47c963e-a691-4498-9d97-5508e1f7d564',
 8: '5730a9b7-dada-4b61-aabf-e9d6df140978',
 9: '73633705-d1d2-45d8-93f4-232b7ef1f828',
 10: '00027290-cdf3-42a6-bfae-83833e45ba68',
 11: 'd7e5b8fd-03eb-431a-ae52-e1c491667102',
 12: '30345453-ebe5-4ad3-8b72-91185f128e47',
 13: '90baea94-08e3-4b20-a873-9f921d474798',
 14: '12fe665c-b356-4a20-a65f-0a32e45f6a6a',
 15: '98b7550c-b725-4792-a708-f4829737f8f3',
 16: '235b5589-beae-4f1a-8d76-f556f21ccdfd',
 17: 'ea19144e-2878-4bb3-a264-a99942279f63',
 18: '0d02c0fb-0bfe-4d6e-a4b1-9372dff9fde5',
 19: 'ed5991ca-c360-4d2a-b47b-7bc823346ea1',
 20: 'd0175a0e-fbf3-4f0a-bdb5-cf743135ece4',
 21: '0a6d6d2d-a412-4928-8ea2-848801cd4a54',
 22: '3ea01d27-7273-

In [55]:
vector_store.get_by_ids(['b29ff23c-0783-4f65-bef4-28aa8c8c27ad'])

[Document(id='b29ff23c-0783-4f65-bef4-28aa8c8c27ad', metadata={}, page_content="Over the last few years, AI systems have become astonishingly good at turning text props into videos. At the core of how these models operate is a deep connection to physics. This generation of image and video models works using a process known as diffusion, which is remarkably equivalent to the Brownian motion we see as particles diffuse, but with time run backwards, and in high-dimensional space. As we'll see, this connection to physics is much more than a curiosity. We get real algorithms out")]

In [56]:
retriever = vector_store.as_retriever(search_type="similarity",search_kwargs={"k":4})

In [57]:
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x00000273A1BFB0D0>, search_kwargs={'k': 4})

In [58]:
retriever.invoke("What are transformers")

[Document(id='c766d092-8735-439b-b2a3-18c54d459803', metadata={}, page_content="model's source code, we'll find that the video generation process begins with this call to a random number generator. Creating a video where the pixel intensity values are chosen randomly. Here's what it looks like. From here, this pure noise video is passed into a transformer. This is the same type of AI model used by large language models, like ChatGPT. But instead of outputting text, this transformer outputs another video that now looks like this. Still mostly noise, but with some hints of"),
 Document(id='e1f1b1ab-a7f8-4db3-8552-e70dfe43c99b', metadata={}, page_content="outputs another video that now looks like this. Still mostly noise, but with some hints of structure. This new video is added to our pure noise video, and then passed back into the model again, producing a third video that looks like this. This process is repeated again and again. Here's what the video looks like after 5 iterations, 10, 

In [69]:
llm = ChatOpenAI(model="gpt-4",temperature=0.2)

In [60]:
prompt = PromptTemplate(
    template="""
      You are a helpful assistant.
      Answer ONLY from the provided transcript context.
      If the context is insufficient, just say you don't know.

      {context}
      Question: {question}
    """,
    input_variables = ['context', 'question']
)

In [71]:
question          = "is the topic of transformers discussed in this video? if yes then what was discussed"
retrieved_docs    = retriever.invoke(question)

In [72]:
retrieved_docs

[Document(id='c766d092-8735-439b-b2a3-18c54d459803', metadata={}, page_content="model's source code, we'll find that the video generation process begins with this call to a random number generator. Creating a video where the pixel intensity values are chosen randomly. Here's what it looks like. From here, this pure noise video is passed into a transformer. This is the same type of AI model used by large language models, like ChatGPT. But instead of outputting text, this transformer outputs another video that now looks like this. Still mostly noise, but with some hints of"),
 Document(id='e1f1b1ab-a7f8-4db3-8552-e70dfe43c99b', metadata={}, page_content="outputs another video that now looks like this. Still mostly noise, but with some hints of structure. This new video is added to our pure noise video, and then passed back into the model again, producing a third video that looks like this. This process is repeated again and again. Here's what the video looks like after 5 iterations, 10, 

In [73]:
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
context_text

"model's source code, we'll find that the video generation process begins with this call to a random number generator. Creating a video where the pixel intensity values are chosen randomly. Here's what it looks like. From here, this pure noise video is passed into a transformer. This is the same type of AI model used by large language models, like ChatGPT. But instead of outputting text, this transformer outputs another video that now looks like this. Still mostly noise, but with some hints of\n\noutputs another video that now looks like this. Still mostly noise, but with some hints of structure. This new video is added to our pure noise video, and then passed back into the model again, producing a third video that looks like this. This process is repeated again and again. Here's what the video looks like after 5 iterations, 10, 20, 30, 40, and finally 50. Step by step, our transformer shapes pure noise into incredibly realistic video. But what exactly is the connection to Brownian\n\n

In [74]:
final_prompt = prompt.invoke({"context":context_text,"question":question})

In [75]:
final_prompt

StringPromptValue(text="\n      You are a helpful assistant.\n      Answer ONLY from the provided transcript context.\n      If the context is insufficient, just say you don't know.\n\n      model's source code, we'll find that the video generation process begins with this call to a random number generator. Creating a video where the pixel intensity values are chosen randomly. Here's what it looks like. From here, this pure noise video is passed into a transformer. This is the same type of AI model used by large language models, like ChatGPT. But instead of outputting text, this transformer outputs another video that now looks like this. Still mostly noise, but with some hints of\n\noutputs another video that now looks like this. Still mostly noise, but with some hints of structure. This new video is added to our pure noise video, and then passed back into the model again, producing a third video that looks like this. This process is repeated again and again. Here's what the video look

In [76]:
answer = llm.invoke(final_prompt)
print(answer)

content='Yes, transformers are discussed in this video. The video generation process involves passing a pure noise video into a transformer, which is the same type of AI model used by large language models like ChatGPT. However, instead of outputting text, this transformer outputs another video. This new video is added to the original pure noise video and then passed back into the model again. This process is repeated multiple times, with the transformer gradually shaping the pure noise into a realistic video.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 95, 'prompt_tokens': 466, 'total_tokens': 561, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4-0613', 'system_fingerprint': None, 'id': 'chatcmpl-CRjCMlkLDHDdA1TWB9XBa1BMn39wn', 'service_tier': 'default', 'finish

### Chaining

In [77]:
from langchain_core.runnables import RunnableParallel,RunnablePassthrough,RunnableLambda 
from langchain_core.output_parsers import StrOutputParser

In [78]:
def format_docs(retrieved_docs):
  context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
  return context_text

In [79]:
parallel_chain = RunnableParallel({
  'context' : retriever | RunnableLambda(format_docs),
  'question' : RunnablePassthrough() 
})

In [80]:
parallel_chain.invoke('What are embeddings')

{'context': "text or images into embedding vectors. These two problems potentially fit together in a really interesting way. Diffusion models are able to potentially reverse the CLIP image encoder, generating high quality images, and the output vector of the CLIP text encoder could be used to guide our diffusion models toward the images or videos that we want. So the high level idea here is that we could pass in a prompt into the CLIP text encoder to generate an embedding vector, and use this embedding\n\nspace allows us to operate mathematically on the pure ideas or concepts in our images and text, translating the differences in the content of our images, like if there's a hat or not, into a literal distance between vectors in our embedding space. The OpenAI team showed that CLIP could produce very impressive image classification results by simply passing an image into our image encoder, and then comparing the resulting vector to a set of possible captions, one for each label that cou

In [81]:
parser = StrOutputParser()

In [82]:
main_chain = parallel_chain | prompt | llm | parser

In [83]:
main_chain.invoke('Can you summarize this video')

'The video discusses the process of creating realistic video from pure noise using a transformer model. This process involves multiple iterations of adding a new video to the noise video and passing it back into the model. The video also discusses the use of negative prompts to steer the diffusion process away from unwanted features. The model uses a text input to shape the noise into what the prompt describes. The video also mentions a 2021 OpenAI paper and model called CLIP, which is a combination of a language model and a vision model.'