In [2]:
!pip install -q youtube-transcript-api langchain-community langchain-openai \ faiss-cpu tiktoken python-dotenv langchain_groq

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/131.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/131.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/131.4 kB[0m [31m879.5 kB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m112.6/131.4 kB[0m [31m1.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.4/131.4 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings

## Step 1a - Indexing (Document Ingestion)

In [65]:
video_id = "3dhcmeOTZ_Q" # only the ID, not full URL
try:
    # If you don’t care which language, this returns the “best” one
    transcript_list = YouTubeTranscriptApi().fetch(video_id, languages=["en"])

    # Flatten it to plain text
    transcript = " ".join(chunk.text for chunk in transcript_list)

except TranscriptsDisabled:
    print("No captions available for this video.")

In [66]:
transcript

"[Music] linear regression is a statistical technique for modeling the relationship between an output variable and one or more input variables in layman's terms think of it as fitting a line through some data points as shown here so you can make predictions on unknown data assuming there is a linear relationship between the variables you might be familiar with the linear function y equals mx plus b where y is the output variable also called the dependent variable you may also see expressed as f x the function of the input variable x on the other hand would serve as the input variable also called the independent variable it's likely you'll see the coefficients m and b expressed as beta 1 and beta0 respectively so what do the m and b coefficients do the m or beta 1 coefficient controls the slope of the line the b or the beta 0 controls the intercept of the line in machine learning we also know it as the bias these two coefficients are what we are solving for in linear regression we can a

## Step 1b - Indexing (Text Splitting)

In [73]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.create_documents([transcript])

## Step 1c & 1d - Indexing (Embedding Generation and Storing in Vector Store)

In [74]:
vectorstore = FAISS.from_documents(
    documents=chunks,
    embedding=HuggingFaceEmbeddings(),
)

  embedding=HuggingFaceEmbeddings(),


In [75]:
vectorstore.index_to_docstore_id

{0: '7e4d1d70-ee4d-42ec-a614-77e5297aec47',
 1: 'd392421a-212d-414f-9442-c66af4fd27f9',
 2: '5b0d9e45-32bb-4a51-a61f-dbac85592441',
 3: '3fec63e1-917f-4886-b159-1aaf6e20b071'}

## Step 2 - Retrieval

In [76]:
retriever = vectorstore.as_retriever(search_type = 'similarity', search_kwargs={"k": 4})

In [77]:
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f9bb92c0090>, search_kwargs={'k': 4})

## Step 3 - Augmentation

In [None]:
llm = ChatGroq(model="llama3-8b-8192", temperature=0.2)

In [33]:
prompt = PromptTemplate(
    template="""
      You are a helpful assistant.
      Answer ONLY from the provided transcript context.
      If the context is insufficient, just say you don't know.

      {context}
      Question: {question}
    """,
    input_variables = ['context', 'question']
)

In [79]:
question          = "what is linear regression"
retrieved_docs    = retriever.invoke(question)

In [80]:
retrieved_docs

[Document(id='7e4d1d70-ee4d-42ec-a614-77e5297aec47', metadata={}, page_content="[Music] linear regression is a statistical technique for modeling the relationship between an output variable and one or more input variables in layman's terms think of it as fitting a line through some data points as shown here so you can make predictions on unknown data assuming there is a linear relationship between the variables you might be familiar with the linear function y equals mx plus b where y is the output variable also called the dependent variable you may also see expressed as f x the function of the input variable x on the other hand would serve as the input variable also called the independent variable it's likely you'll see the coefficients m and b expressed as beta 1 and beta0 respectively so what do the m and b coefficients do the m or beta 1 coefficient controls the slope of the line the b or the beta 0 controls the intercept of the line in machine learning we also know it as the bias t

In [81]:
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)

In [83]:
context_text

"[Music] linear regression is a statistical technique for modeling the relationship between an output variable and one or more input variables in layman's terms think of it as fitting a line through some data points as shown here so you can make predictions on unknown data assuming there is a linear relationship between the variables you might be familiar with the linear function y equals mx plus b where y is the output variable also called the dependent variable you may also see expressed as f x the function of the input variable x on the other hand would serve as the input variable also called the independent variable it's likely you'll see the coefficients m and b expressed as beta 1 and beta0 respectively so what do the m and b coefficients do the m or beta 1 coefficient controls the slope of the line the b or the beta 0 controls the intercept of the line in machine learning we also know it as the bias these two coefficients are what we are solving for in linear regression we can\n

In [82]:
final_prompt = prompt.invoke({'context':context_text,'question':question})

## Step 4 - Generation

In [84]:
result = llm.invoke(final_prompt)

In [85]:
result.content

"Linear regression is a statistical technique for modeling the relationship between an output variable and one or more input variables. It's like fitting a line through some data points to make predictions on unknown data, assuming there is a linear relationship between the variables."

## Building a Chain

In [86]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [87]:
def format_docs(retrieved_docs):
  context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
  return context_text

In [88]:
parallel_chain = RunnableParallel({
    'context': retriever | RunnableLambda(format_docs),
    'question': RunnablePassthrough()
})

In [91]:
parser = StrOutputParser()

In [92]:
main_chain = parallel_chain | prompt | llm | parser

In [94]:
main_chain.invoke('explain multiple inputs in linear regression')

'In linear regression, we can extend multiple input variables so x1, x2, x3, with beta1, beta2, and beta3, and so on, acting as slopes for each of those variables in these higher dimensions.'