In [None]:
import os
from getpass import getpass

# Do not hard-code the key in the notebook: a literal assignment both leaks the
# secret when the notebook is shared and overwrites a valid key that is already
# exported in the environment. Only prompt when the variable is missing/empty.
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"]= getpass("Enter your OpenAI API key: ")

## Install Libraries

In [None]:
!pip install -q youtube-transcript-api langchain-community langchain_openai

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate

## Step 1a- Indexing (Document Ingestion)

In [None]:
# Only the video id is required, not the full url.
# NOTE(review): YouTube video ids are normally 11 characters long and this one has 13 - confirm the id.
video_id= "Lpzn98BDOjwQs"

try:
  # 'YouTubeTranscriptApi' talks to YouTube itself, so we don't need to create (or use) our own API key for this.
  # 'en' is the language of the transcript we want; if no transcript is available in that language the call raises an error.
  transcript_list= YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])

except TranscriptsDisabled:
  # Catch only the failure this message describes. Any other error (bad id, missing language,
  # network problem, ...) propagates with its real traceback instead of being mislabelled.
  print("Transcript disabled for this video")
  # Re-raise: the following cells need `transcript`, so continuing would only fail later with a confusing NameError.
  raise

else:
  # Flatten the list of caption entries into plain text
  transcript= " ".join(d['text'] for d in transcript_list)
  # Show a short preview rather than dumping the whole transcript into the cell output
  print(f"{len(transcript)} characters fetched. Preview:")
  print(transcript[:500])

## Step 1b - Indexing (Text Splitting)

In [None]:
# Break the transcript into overlapping pieces (size 1000, overlap 200) so that
# each piece can be embedded and retrieved on its own.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = splitter.create_documents([transcript])

In [None]:
# Number of chunks produced by the splitter
len(chunks)

In [None]:
# Inspect the first chunk to sanity-check the split
chunks[0]

## Step 1c & 1d- Indexing (Embedding Generation and storing in Vector Store)

In [None]:
# Embed every chunk with OpenAI's "text-embedding-3-small" model and index the
# resulting vectors in a FAISS store.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = FAISS.from_documents(
    documents=chunks,
    embedding=embeddings,
)

## Step 2- Retrieval

In [None]:
# Build a similarity-search retriever on top of the FAISS index; it returns the 4 closest chunks per query.
# (The store is named `vector_store`; the previous `vectore_store` did not exist and raised a NameError.)
retriever= vector_store.as_retriever(
    search_type= "similarity",
    search_kwargs= {"k": 4}
)

# Alias under the misspelled name, because a later cell in this notebook calls `retriver.invoke(...)`.
retriver= retriever

retriever

## Step 3- Augmentation

In [None]:
# Chat model that writes the final answer; a low temperature keeps the output close to the supplied context.
llm = ChatOpenAI(
    temperature=0.2,
    model="gpt-3.5-turbo",
)

In [None]:
# Prompt with two placeholders: {context} (the retrieved transcript text) and {question} (the user query).
# The keyword is `input_variables`; the earlier spelling `input_vriables` is not a PromptTemplate parameter.
prompt= PromptTemplate(
    template= """
      You are a helpful assistant.
      Answer ONLY from the provided context.
      If the context is insufficient, just say DON'T KNOW.
      Context: {context}
      Question: {question}
    """,
    input_variables= ["context", "question"]
)

In [None]:
# The question text is embedded for the similarity search and also sent to the LLM, so spell it correctly.
question= "Is the topic of aliens discussed in this video? If yes, then what was discussed?"
# `retriver` (sic) is the name the retrieval cell binds the retriever to.
retrieved_docs= retriver.invoke(question)

In [None]:
# The prompt has a single {context} slot, but the retriever hands back several documents,
# so merge the text of those documents into one string (blank line between documents).
context_text = "\n\n".join(
    hit.page_content for hit in retrieved_docs
)

In [None]:
# Fill the template's two placeholders to obtain the prompt that will be sent to the model.
final_prompt = prompt.invoke(dict(context=context_text, question=question))

## Step 4- Generation

In [None]:
# Send the filled-in prompt to the chat model and show only the text of its reply
answer= llm.invoke(final_prompt)
print(answer.content)

## Done till here!!

## Now, instead of running the whole workflow manually, we are going to build a pipeline chain for it.

In [None]:
from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableParallel
from langchain_core.output_parsers import StrOutputParser

In [None]:
def format_docs(retrieved_docs):
  """Merge retrieved documents into a single context string.

  Args:
    retrieved_docs: iterable of objects exposing a ``page_content`` string.

  Returns:
    The ``page_content`` values joined in order, separated by a blank line
    (an empty string when there are no documents).
  """
  separator = "\n\n"
  return separator.join(item.page_content for item in retrieved_docs)

In [None]:
# Fan the incoming question out into the two inputs the prompt needs:
#   context  -> retrieve matching chunks, then merge them with format_docs
#   question -> forwarded unchanged
parallel_chain = RunnableParallel(
    context=retriever | RunnableLambda(format_docs),
    question=RunnablePassthrough(),
)

In [None]:
# Demo of how the parallel chain works: the result holds both the merged context and the original question
parallel_chain.invoke("Who is Denis")

In [None]:
# Output parser placed at the end of the chain so the final result is a plain string
parser= StrOutputParser()

In [None]:
# End-to-end RAG pipeline, executed left to right for every query.
main_chain = (
    parallel_chain   # question -> {"context", "question"}
    | prompt         # fill the prompt template
    | llm            # generate the answer
    | parser         # reduce the model output to a string
)

In [None]:
# The query is embedded for retrieval and passed to the LLM, so keep it free of typos.
# NOTE(review): the retriever only supplies the top-k chunks, so a "summary" is based on
# those chunks rather than the full transcript - confirm that this is acceptable.
query= "Can you summarize the video?"

result= main_chain.invoke(query)
print(result)