### Import Libraries

In [1]:
import os
import re
import time
from pprint import pprint
from langchain import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled

### Setup API & Model

In [None]:
# Set up your API key
os.environ["GOOGLE_API_KEY"] = ""

# Set up Google Gemini LLM
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.5, max_tokens=500)
response = llm.invoke("Hi, How are you?")
print(response.content)
print(response.usage_metadata)

I am doing well, thank you for asking! How are you today?
{'input_tokens': 6, 'output_tokens': 16, 'total_tokens': 22, 'input_token_details': {'cache_read': 0}}


In [5]:
def extract_youtube_video_id(url: str) -> str:
    pattern = r"(?:v=|\/)([0-9A-Za-z_-]{11})(?:[\?&\/]|$)"
    match = re.search(pattern, url)
    if match:
        return match.group(1)
    else:
        return None
    
# Example usage
yt_url = "https://www.youtube.com/watch?v=z-moiQlcC6c&list=PLv8Cp2NvcY8AzNCATbDWMr8vqbJBYbxFW&index=8"
video_id = extract_youtube_video_id(yt_url)
print("üéØ Extracted Video ID:", video_id)

üéØ Extracted Video ID: z-moiQlcC6c


In [6]:
video_id = video_id
priority_lang = "en"
max_retries = 5


def retry_get_transcript(video_id, lang_code, retries=5):
    for attempt in range(retries):
        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[lang_code])
            return transcript
        except Exception as e:
            print(f"üîÅ Retrying {lang_code} ({attempt + 1}/{retries}) due to error: {e}")
            time.sleep(2)
    return None


def get_transcript_with_priority(video_id, priority_lang="en", retries=5):
    try:
        # Step 1: Get transcript list
        transcripts = YouTubeTranscriptApi.list_transcripts(video_id)
        all_langs = [t.language_code for t in transcripts]
        print(f"üéØ Available languages: {all_langs}")

        # Step 3: Try preferred language with retries
        if priority_lang in all_langs:
            transcript_chunks = retry_get_transcript(video_id, priority_lang, retries)
            if transcript_chunks:
                print(f"‚úÖ Found transcript in preferred language: {priority_lang}")
                transcript = " ".join(chunk["text"] for chunk in transcript_chunks)

            return transcript_chunks, transcript

        # Step 4: Fallback to first available language with retries
        if all_langs:
            fallback_lang = all_langs[0]
            transcript_chunks = retry_get_transcript(video_id, fallback_lang, retries)
            if transcript_chunks:
                print(f"‚ö†Ô∏è Preferred language not found. Falling back to: {fallback_lang}")
                transcript = " ".join(chunk["text"] for chunk in transcript_chunks)

            return transcript_chunks, transcript


    except TranscriptsDisabled:
        print("‚ùå Transcripts are disabled for this video.")
    except Exception as e:
        print(f"‚ùó Unexpected error: {e}")

    return None

# üîΩ Run the logic
transcript_list, transcript = get_transcript_with_priority(video_id, retries=50)
if transcript:
    print("\nüìÑ Transcript Preview:\n", transcript[:500])
    print("\nüìÑ Transcript Chunks Preview:\n", transcript_list[:500])
else:
    print("‚ùó Could not retrieve transcript after multiple attempts.")

üéØ Available languages: ['en', 'en']
‚úÖ Found transcript in preferred language: en

üìÑ Transcript Preview:
 Hello Everyone, my name is Aarohi and welcome¬†
to my channel. So guys in my today's video,¬†¬† I'll show you how to create your own CH GPT clone¬†
using Langchain, streamlet and OpenAI API. First¬†¬† let me show you the demo of the app which we¬†
are going to build and then we'll dive into¬†¬† the details of how to create that app. This is¬†
my demo app and let's ask a question to this app¬†¬† which LLM are you using to generate responses?¬†
So here you can see it is utilizing GPT-3. So¬†¬† GPT3 is a large la

üìÑ Transcript Chunks Preview:
 [{'text': "Hello Everyone, my name is Aarohi and welcome\xa0\nto my channel. So guys in my today's video,\xa0\xa0", 'start': 0.84, 'duration': 4.08}, {'text': "I'll show you how to create your own CH GPT clone\xa0\nusing Langchain, streamlet and OpenAI API. First\xa0\xa0", 'start': 4.92, 'duration': 8.56}, {'text': "let me show you the

In [7]:
# Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=500)
chunks = text_splitter.create_documents([transcript])

# Check number of chunks
pprint(len(chunks))
pprint(chunks)

4
[Document(metadata={}, page_content="Hello Everyone, my name is Aarohi and welcome\xa0\nto my channel. So guys in my today's video,\xa0\xa0 I'll show you how to create your own CH GPT clone\xa0\nusing Langchain, streamlet and OpenAI API. First\xa0\xa0 let me show you the demo of the app which we\xa0\nare going to build and then we'll dive into\xa0\xa0 the details of how to create that app. This is\xa0\nmy demo app and let's ask a question to this app\xa0\xa0 which LLM are you using to generate responses?\xa0\nSo here you can see it is utilizing GPT-3. So\xa0\xa0 GPT3 is a large language model by OpenAI. The app\xa0\nwhich we are going to build today, In that app,\xa0\xa0 I have specified that I want to use GPT-3 LLM\xa0\nfor generating responses. Now let's ask the same\xa0\xa0 question to the ChatGPT. okay let's open ChatGPT.\xa0\nSo let's ask the same question to the ChatGPT. So the chatGPT app is using GPT-4 to\xa0\ngenerate responses and this is again\xa0\xa0 another LLM by OpenAI

In [8]:
final_combine_prompt = '''You are an expert summarizer reviewing segment-level summaries of a YouTube video transcript. 
                          Your task is to create a **final, cohesive summary** that represents the overall content of the video.
                          Please ensure the final summary is:
                          - Concise and logically structured
                          - Faithful to the main ideas discussed across the video
                          - Written in clear and professional language

                          Use bullet points to highlight key themes, and end with a brief concluding remark if appropriate.
                          Below are the partial summaries:
                          {text}
                          Final Consolidated Summary:
                       '''
final_combine_prompt_template = PromptTemplate(input_variables=['text'], template=final_combine_prompt)


chunks_prompt = '''You are analyzing a portion of a YouTube video transcript. Identify and summarize the **main ideas, insights, or recurring themes** in this chunk.
                   Requirements:
                   - Focus on important arguments, events, or opinions.
                   - Avoid redundancy.
                   - Use bullet points for clarity if needed.
                    Transcript Chunk: {text}
                    Summary:
                '''
map_prompt_template = PromptTemplate(input_variables=['text'], template=chunks_prompt)


In [10]:
# Load summarize chain with map-reduce strategy
summary_chain = load_summarize_chain(
    llm=llm,
    chain_type='map_reduce',
    map_prompt=map_prompt_template,
    combine_prompt=final_combine_prompt_template,
    verbose=False
)

# Run summarization on text chunks
output = summary_chain.invoke(chunks)

# Print the final summary
pprint(output['output_text'])

('This video tutorial demonstrates how to build a ChatGPT clone using '
 'Langchain, Streamlit, and the OpenAI API, focusing on practical '
 'implementation and cost-effectiveness. Key aspects include:\n'
 '\n'
 '*   **Project Setup:** Creating a dedicated Anaconda environment with Python '
 '3.10, upgrading pip, and installing necessary packages from a '
 '`requirements.txt` file.\n'
 '*   **API Key Configuration:** Obtaining and securely storing an OpenAI API '
 "key in a `.env` file for authentication and access to OpenAI's language "
 'models.\n'
 '*   **Streamlit Application:** Building the user interface using Streamlit, '
 'including setting the title and displaying the chat history.\n'
 '*   **Language Model Integration:** Utilizing the `langchain` OpenAI module '
 'to interact with language models, specifically GPT-3.5 Turbo for cost '
 'efficiency.\n'
 '*   **Chat History Management:** Implementing a "messages" list within '
 "Streamlit's session state to store and display th

In [11]:
pprint(output['output_text'])

('This video tutorial demonstrates how to build a ChatGPT clone using '
 'Langchain, Streamlit, and the OpenAI API, focusing on practical '
 'implementation and cost-effectiveness. Key aspects include:\n'
 '\n'
 '*   **Project Setup:** Creating a dedicated Anaconda environment with Python '
 '3.10, upgrading pip, and installing necessary packages from a '
 '`requirements.txt` file.\n'
 '*   **API Key Configuration:** Obtaining and securely storing an OpenAI API '
 "key in a `.env` file for authentication and access to OpenAI's language "
 'models.\n'
 '*   **Streamlit Application:** Building the user interface using Streamlit, '
 'including setting the title and displaying the chat history.\n'
 '*   **Language Model Integration:** Utilizing the `langchain` OpenAI module '
 'to interact with language models, specifically GPT-3.5 Turbo for cost '
 'efficiency.\n'
 '*   **Chat History Management:** Implementing a "messages" list within '
 "Streamlit's session state to store and display th