# **Chat with any YouTube video using RAG and Claude**

In [None]:
import os
from getpass import getpass

# Ask for the key interactively instead of hard-coding it in the notebook, so
# the secret is never saved alongside the code. A key that is already present
# in the environment is reused and not overwritten.
if not os.getenv("ANTHROPIC_API_KEY"):
    os.environ["ANTHROPIC_API_KEY"] = getpass("Enter your Anthropic API key: ")

CLAUDE_API_KEY = os.getenv("ANTHROPIC_API_KEY")

In [2]:
# Installing langchain anthropic (for Claude)
!pip install langchain-anthropic -q
!pip install langchain -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m871.1/871.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.9/302.9 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.2/121.2 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.5/142.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━

In [3]:
from langchain_anthropic import ChatAnthropic

# Chat model handle reused by the chains in the following cells (Claude 3 Haiku).
# No API key is passed explicitly here.
model = ChatAnthropic(model="claude-3-haiku-20240307")

In [4]:
# Smoke test: send one question and display the raw reply object
reply = model.invoke("What is the capital of Brazil?")
reply

AIMessage(content='The capital of Brazil is Brasília.', response_metadata={'id': 'msg_01GP4iVEzznK3epf5TfaTUae', 'model': 'claude-3-haiku-20240307', 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'input_tokens': 14, 'output_tokens': 12}}, id='run-9779ae8f-c8c5-44af-a4d1-20d5e5c00275-0')

In [5]:
from langchain_core.output_parsers import StrOutputParser

# The model replies with a full AIMessage; the parser keeps only the text
parser = StrOutputParser()

# `|` chains the steps: whatever the model returns is fed into the parser
chain = model | parser

chain.invoke("What is the capital of Brazil? Answer in 1 word.")

'Brasília.'

### **Adding a prompt template to modify its behaviour**

In [24]:
# Import from langchain_core (already used for the output parser) rather than
# the legacy `langchain.prompts` re-export path.
from langchain_core.prompts import ChatPromptTemplate

# Prompt skeleton for question answering: {context} and {question} are the
# placeholders filled in when the prompt is formatted or the chain is invoked.
template = """
Answer the question based on the context given below. Only answer the question if you're confident. If you do not know the answer, then reply with "I'm sorry, I don't know the answer to this". Your job is to act like a helpful humanoid assistant who provides short but crisp answers.

Context: {context}

Question: {question}
"""

In [25]:
# Build the chat prompt from the template string
prompt = ChatPromptTemplate.from_template(template)

# Preview the rendered prompt with an example context and question
# (same example that the chain is invoked with in the next cells)
prompt.format(
    context="John's favourite colour is red, John married Jane, she hates red but loves yellow!",
    question="Who is Jane?",
)

'Human: \nAnswer the question based on the context given below. Only answer the question if you\'re confident. If you do not know the answer, then reply with "I\'m sorry, I don\'t know the answer to this". Your job is to act like a helpful humanoid assistant who provides short but crisp answers. No need to mention \'context-provided\'\n\nContext: John\'s favourite colour is red, Jogn married Jane, she hates red but loves yellow!\n\nQuestion: Who is Jane?\n'

In [26]:
# Put the prompt in front: the chain now takes a dict with "context" and "question"
chain = prompt | model | parser

example_inputs = {
    "context": "John's favourite colour is red, John married Jane, she hates red but loves yellow!",
    "question": "Who is Jane?",
}
chain.invoke(example_inputs)

"Jane is John's wife."

### **Combining Multiple Chains**

In [27]:
# Second prompt: restate an earlier answer ({answer}) in the requested {language}
translation_prompt = ChatPromptTemplate.from_template(
    template="Translate {answer} to {language}, act like the translated output is the only output."
)

In [28]:
from operator import itemgetter

# First stage: "answer" is produced by running the question-answering chain on
# the input dict, while "language" is simply picked out of that same dict.
translation_inputs = {
    "answer": chain,
    "language": itemgetter("language"),
}

# The mapping feeds the translation prompt, then the model, then the parser
translation_chain = translation_inputs | translation_prompt | model | parser

translation_request = {
    "context": "John's favourite colour is red, John married Jane, she hates red but loves yellow!",
    "question": "Who is Jane?",
    "language": "French",
}
translation_chain.invoke(translation_request)

'Jane est la femme de John.'

## **Transcribing YouTube Videos**

In [29]:
!pip install pytube -q
!pip install openai-whisper -q

In [12]:
import os
import whisper
from pytube import YouTube

YOUTUBE_VIDEO = "https://www.youtube.com/watch?v=P6FORpg0KVo"

# Only download and transcribe when no transcript has been saved by an earlier run
if not os.path.exists("transcription.txt"):
    # Create a YouTube object
    youtube = YouTube(YOUTUBE_VIDEO)

    # Take the first audio-only stream; fail with a clear message if none exists
    # instead of crashing later with an AttributeError on None
    audio = youtube.streams.filter(only_audio=True).first()
    if audio is None:
        raise RuntimeError(f"No audio-only stream available for {YOUTUBE_VIDEO}")

    # Load the base Whisper model
    whisper_model = whisper.load_model("base")

    # The audio is stored temporarily in the current working directory
    current_folder = os.getcwd()
    audio_file_path = os.path.join(current_folder, "downloaded_audio.mp4")

    try:
        # Download the audio file
        print("Downloading audio...")
        audio.download(output_path=current_folder, filename="downloaded_audio.mp4")
        print(f"Audio downloaded to {audio_file_path}")

        # Transcribe the downloaded audio file and keep only the stripped text
        print("Transcribing audio...")
        transcription = whisper_model.transcribe(audio_file_path, fp16=False)["text"].strip()

        # Write the transcription to a text file
        with open("transcription.txt", "w") as file:
            file.write(transcription)
        print("Transcription completed and saved to transcription.txt")
    finally:
        # Remove the temporary audio even when the download or the
        # transcription fails, so no partial file is left behind
        if os.path.exists(audio_file_path):
            os.remove(audio_file_path)
            print("Temporary audio file removed")
else:
    print("Transcription file already exists.")

100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 128MiB/s]


Downloading audio...
Audio downloaded to /content/downloaded_audio.mp4
Transcribing audio...
Transcription completed and saved to transcription.txt
Temporary audio file removed


### **Using the Entire Transcript as Context**

We need to split the transcript into smaller chunks, because the model cannot take the entire transcript as context directly.

In [30]:
from langchain_community.document_loaders import TextLoader

# Load the saved transcript; text_documents holds the entire text
loader = TextLoader(file_path="transcription.txt")
text_documents = loader.load()

In [31]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured for chunks of 1000 with an overlap of 50
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=1000)

# Split the transcript and display only the first 2 chunks as a sanity check
preview_chunks = text_splitter.split_documents(text_documents)
preview_chunks[:2]

[Document(page_content="So, I'm from Guatemala. This is a public service announcement that is word Guatemala is. Also, that is not where they keep the prisoners that is called Guantanamo. Not the same place. So, Guatemala is right below Mexico and for the Americans in the audience and let the sink in because it really applies in most ways. For the Americans in the audience, you can think of it as Mexico's Mexico. Just like the US doesn't want illegal immigration from Mexico, Mexico doesn't want illegal immigration from Guatemala. It's a smaller country. It's a poorer country and well, what can I tell you? It has much better Mexican food. Guatemala is a very poor country and a lot of people talk about education as something that brings equality to different social classes. But I always saw it as the opposite, as something that brings inequality. Because what happens in practice is that people who have a lot of money and by themselves be really good education and therefore continue havin

## **Finding the relevant chunks with embeddings**

But how do we find the right chunks of data that contain the relevant information when a user asks a question?

We convert the chunks and the user question to embeddings & find the k-nearest chunk embeddings to the user question's embeddings. Nearby chunks have the highest similarity, which is why we use this approach to find the relevant context for our model.

In [32]:
!pip install sentence-transformers -q

In [33]:
!pip install faiss-cpu
!pip install faiss-gpu



In [34]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Community integrations are imported from langchain_community (already used
# for TextLoader) instead of the legacy `langchain.*` re-export paths.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Split the already-loaded transcript into chunks of 1000 with an overlap of 50
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
documents = text_splitter.split_documents(text_documents)

# Load the HuggingFace embeddings
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Build the FAISS vector store from the chunks computed above, rather than
# re-reading the transcript file and splitting it a second time
vector_store = FAISS.from_documents(documents, hf_embeddings)

# Semantic search: retrieve the 2 chunks closest to the query
query = "Where is the speaker from?"
results = vector_store.similarity_search(query, k=2)

print(f"Results for query '{query}':")
for result in results:
    print(result.page_content)

Results for query 'Where is the speaker from?':
a mobile phone or a smartphone in particular. See, building schools all over the world is simple to expensive. On the other hand, most of the world's population already has access to a smartphone and the trend is that that fraction is only going to increase. So we decided at the time that we would make a way to learn foreign languages on a mobile phone that was accessible to everyone. And then we called it Duolingo. Now, in order to truly be accessible to everyone, rich and poor, Duolingo uses a freemium model to support itself. What that means is that you can learn as much as you want without ever having to pay. But if you don't pay, you may have to see an ad at the end of a lesson. Now, if you don't like ads, you can also pay to subscribe to turn off the ads. And it turns out that the vast majority of the revenue for Duolingo comes from people to pay to subscribe to turn off the ads. Now, who are these people who pay to subscribe to tur

See how the chunks related to Guatemala (the place the speaker is from) are retrieved!

## **Connecting to Pinecone vector database (Optional)**

In [35]:
!pip install langchain_pinecone -q
!pip install pinecone



In [None]:
import os
from getpass import getpass

# Ask for the Pinecone key interactively instead of hard-coding it in the
# notebook; a key that is already present in the environment is reused.
if not os.getenv("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass("Enter your Pinecone API key: ")

# Non-secret setting, kept as a plain assignment
os.environ["PINECONE_ENV"] = "youtube_rag"

In [37]:
import pinecone

# Pinecone client configured from the PINECONE_API_KEY / PINECONE_ENV
# environment variables (either may be None if the variable is unset)
pinecone_client = pinecone.Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENV"),
)

In [38]:
from langchain_pinecone import PineconeVectorStore

index_name = "yt-rag"

# Embed the transcript chunks and store them in the Pinecone index named above.
# NOTE(review): this rebinds the name `pinecone`, shadowing the module imported
# in the previous cell; later cells rely on this name being the vector store.
pinecone = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=hf_embeddings,
    index_name=index_name,
)

In [39]:
# Quick check: run a similarity search and display only the top match
matches = pinecone.similarity_search("Tell me about the ideology behind Duolingo")
matches[:1]

[Document(page_content="a mobile phone or a smartphone in particular. See, building schools all over the world is simple to expensive. On the other hand, most of the world's population already has access to a smartphone and the trend is that that fraction is only going to increase. So we decided at the time that we would make a way to learn foreign languages on a mobile phone that was accessible to everyone. And then we called it Duolingo. Now, in order to truly be accessible to everyone, rich and poor, Duolingo uses a freemium model to support itself. What that means is that you can learn as much as you want without ever having to pay. But if you don't pay, you may have to see an ad at the end of a lesson. Now, if you don't like ads, you can also pay to subscribe to turn off the ads. And it turns out that the vast majority of the revenue for Duolingo comes from people to pay to subscribe to turn off the ads. Now, who are these people who pay to subscribe to turn off the ads? Well, the

## **Creating a chain with everything connected**

In [40]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

# Inputs for the prompt: "context" comes from the Pinecone retriever, while
# "question" is the text given to invoke(), passed through as-is
rag_inputs = {
    "context": pinecone.as_retriever(),
    "question": RunnablePassthrough(),
}

# Full pipeline: gather inputs -> fill the prompt -> ask the model -> plain string
chain = rag_inputs | prompt | model | parser

chain.invoke("What is the philosophy behind Duolingo?")

"Based on the given context, the philosophy behind Duolingo seems to be making high-quality education accessible to everyone, both rich and poor, through the use of mobile phones and a freemium business model.\n\nThe key points are:\n\n1. Duolingo was created with the goal of making it possible to learn foreign languages on a mobile phone, as most of the world's population already has access to smartphones.\n\n2. Duolingo uses a freemium model, where users can learn for free but can optionally pay to remove ads. The majority of Duolingo's revenue comes from these paid subscriptions.\n\n3. The aim is to make education accessible to everyone, regardless of their financial status, by leveraging the widespread availability of smartphones."