# Build a QA RAG chain using LangChain

## Install necessary dependencies/libraries

In [None]:
!pip install -qU langchain langchain-openai langchain-community langchain-text-splitters youtube_transcript_api faiss-cpu google-api-python-client

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter
import json
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

## Load the OpenAI API key and the Google API key

In [None]:
import getpass
import os

# Prompt explicitly so it is clear which secret is being requested
# (the default prompt is just "Password: ").
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")


In [None]:
# Google API key, later passed as developerKey to the YouTube client.
# Prompt explicitly so it is not confused with the OpenAI key prompt.
os.environ["api_key"] = getpass.getpass("Google API key: ")


In [None]:
from googleapiclient.discovery import build
import os

# YouTube Data API v3 client, authenticated with the key stored in the
# "api_key" environment variable.
youtube = build("youtube", "v3", developerKey=os.getenv("api_key"))

def get_video_ids_from_playlist(playlist_id):
    """Collect the video IDs of every item in a YouTube playlist.

    Pages through ``playlistItems().list`` on the module-level ``youtube``
    client, requesting up to 50 items per page, until a response carries no
    ``nextPageToken``.

    Args:
        playlist_id: ID of the playlist to enumerate.

    Returns:
        A list of the ``videoId`` values, in the order the API returned them.
    """
    collected = []
    page_token = None
    has_more = True

    while has_more:
        page = youtube.playlistItems().list(
            part="snippet",
            playlistId=playlist_id,
            maxResults=50,
            pageToken=page_token,
        ).execute()

        collected.extend(
            entry["snippet"]["resourceId"]["videoId"] for entry in page["items"]
        )

        # A missing/empty token means this was the last page.
        page_token = page.get("nextPageToken")
        has_more = bool(page_token)

    return collected

## Provide one or more playlist IDs to retrieve their video IDs

In [None]:
# Playlists to index; add more IDs here to pull videos from several playlists.
playlist_ids = ["PL2yQDdvlhXf_aPLMfxECsw-UIbEg6uy42"]

# Flat list of video IDs gathered across all playlists, in playlist order.
video_ids = []
for playlist in playlist_ids:
    video_ids += get_video_ids_from_playlist(playlist)


In [None]:
video_ids

## Process video transcripts:

```json
[
    {
        "video_id": "aim5x73crbM",
        "transcript": [
            {
                "text": "[MUSIC]",
                "start": 0.3,
                "duration": 1.968
            },
            {
                "text": "TAKE ONE AND ACTION WERNER.\nTODAY WE ARE EXPLORING THE",
                "start": 2.268,
                "duration": 6.24
            }
        ]
    }
]
```



In [None]:
# Formatter used to serialise fetched transcripts to JSON text.
formatter = JSONFormatter()
# Accumulates one {"video_id": ..., "transcript": ...} entry per video whose
# transcript was fetched successfully.
aws_reinvent_genai_transcripts = []

def process_video_transcripts(video_id):
    """Fetch one video's transcript and record it in the shared list.

    On success a ``{"video_id": ..., "transcript": ...}`` entry is appended to
    the module-level ``aws_reinvent_genai_transcripts``. Any exception is
    reported with ``print`` and the video is skipped (best effort).

    Args:
        video_id: ID of the video whose transcript is requested.

    Returns:
        The module-level ``aws_reinvent_genai_transcripts`` list, whether or
        not this call added an entry to it.
    """
    try:
        raw_transcript = YouTubeTranscriptApi.get_transcript(video_id)
        # Serialise with the JSON formatter, then parse back, so the stored
        # transcript is built from plain JSON values.
        transcript_json = formatter.format_transcript(raw_transcript)
        record = {
            "video_id": video_id,
            "transcript": json.loads(transcript_json),
        }
        aws_reinvent_genai_transcripts.append(record)
    except Exception as e:
        print(f"Failed to retrieve transcript for {video_id}: {e}")

    return aws_reinvent_genai_transcripts

In [None]:
# Fetch every transcript; failures are printed inside the helper and skipped.
for video in video_ids:
    process_video_transcripts(video)
    
# Serialise all collected transcripts into a single JSON string and print it.
formatted_aws_reinvent_genai_transcripts = json.dumps(aws_reinvent_genai_transcripts)
print(formatted_aws_reinvent_genai_transcripts)

## Split the documents into chunks

In [None]:
def process_documents(transcript_data, chunk_size=1000, chunk_overlap=200):
    """Split one video's transcript into overlapping text chunks.

    The ``"text"`` fields of all transcript entries are joined with single
    spaces and the result is split on spaces by ``CharacterTextSplitter``.

    Args:
        transcript_data: Mapping with a ``"video_id"`` key and a
            ``"transcript"`` key holding a list of entries that each have a
            ``"text"`` field.
        chunk_size: Target maximum chunk length, measured with ``len``.
            Defaults to 1000, the value previously hard-coded.
        chunk_overlap: Overlap between neighbouring chunks, measured with
            ``len``. Defaults to 200, the value previously hard-coded.

    Returns:
        The documents produced by ``create_documents``, each carrying
        ``{"video_id": ...}`` as metadata.
    """
    full_transcript = " ".join(
        entry["text"] for entry in transcript_data["transcript"]
    )

    text_splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.create_documents(
        [full_transcript],
        metadatas=[{"video_id": transcript_data["video_id"]}],
    )

In [None]:
# Parse the serialised transcripts back into Python objects, then chunk each
# video; the running chunk total is printed after every video.
transcripts = json.loads(formatted_aws_reinvent_genai_transcripts)
docs = []
for transcript_data in transcripts:
    docs += process_documents(transcript_data)
    print(len(docs))

## Create embeddings and an index for the chunked documents

In [None]:
# Embed the chunks with OpenAI embeddings and build a FAISS index from them.
embeddings = OpenAIEmbeddings()
faiss_index = FAISS.from_documents(docs, embeddings)

In [None]:
# Expose the FAISS index through the retriever interface used by the chain.
retriever = faiss_index.as_retriever()

In [None]:
# Sanity-check retrieval on its own with a sample question.
results = retriever.invoke("Who are the featured keynote speakers for AWS re:Invent 2024?")

In [None]:
# Print each retrieved document, followed by a blank line.
for result in results:
    print(f"{result}\n")

## Generate prompt template

In [None]:
# Prompt text sent to the model; {question} and {context} are filled in by
# the PromptTemplate below when the chain runs.
template = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to 
    answer the question. If you don't know the answer, just say that you don't know. Give 4 or 5 points  maximum 
    and keep the answer concise.
    Question: {question} 
    Context: {context} 
    Answer:
    """
prompt = PromptTemplate(
    input_variables=["question", "context"],
    template=template
)

## Define the LLM and the QA chain

In [None]:
# Chat model that generates the final answer from the filled-in prompt.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.7)

In [None]:
def format_docs(docs):
    """Return the ``page_content`` of each document, joined by blank lines.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        A single string with the contents separated by ``"\\n\\n"``; an empty
        string when ``docs`` is empty.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# Inspect the two stages of the context pipeline. Nothing is invoked here, so
# these prints show the runnable objects themselves, not retrieved text.
context = retriever
print(f"Context after retriever:\n {context}\n")

# Piping the retriever into format_docs composes them into one runnable.
context = retriever | format_docs
print(f"Context after retriever and format docs:\n {context}")



In [None]:
# RAG pipeline: the input question is fed to the retriever (whose documents
# are joined by format_docs) to fill "context", and is passed through
# unchanged as "question"; the filled prompt goes to the LLM and the reply is
# parsed into a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
from IPython.display import display, HTML


## Start querying

In [None]:
from html import escape

from IPython.display import display, HTML

# Interactive question loop: type "exit" (any letter case) to stop.
while True:
    query = input("Please enter your query: ")
    # Check for the exit keyword BEFORE calling the chain, so leaving the loop
    # does not send "exit" to the model as a question.
    if query.strip().lower() == 'exit':
        break
    # Escape user and model text so characters such as "<" or "&" are shown
    # literally instead of being interpreted as HTML markup.
    display(HTML(f'<span style="color: yellow;font-size: 18px; font-family: Arial, sans-serif;">\n {escape(query)} \n</span>'))
    response = rag_chain.invoke(input=query)
    display(HTML(f'<span style="color: green;font-size: 18px; font-family: Arial, sans-serif; font-weight: bold;">\n {escape(response)} \n</span>'))
