<a href="https://colab.research.google.com/github/NormLorenz/ai-llm-youtube-transcription-utility/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Convert the YouTube Transcript Utility to Colab
## Run in Google Colab in anticipation of using AI to:
* correct noun casing
* correct sentence endings - a period, a question mark or an exclamation mark
* correct word spelling

In [None]:
# install and run a code formatter
# !pip install jupyter-black
# !pip uninstall jupyter-black
# %load_ext jupyter_black

In [None]:
# installs

!pip install openai python-dotenv google-generativeai anthropic youtube_transcript_api

In [None]:
# imports

import os
import io
import sys
import json
import requests
from openai import OpenAI
import google.generativeai
import anthropic
from IPython.display import Markdown, display, update_display
import gradio as gr
import subprocess
from google.colab import userdata
from youtube_transcript_api import YouTubeTranscriptApi
import re
import pprint
from typing import List, Dict, Union, Tuple

In [None]:
# keys
# Read each provider credential through `google.colab.userdata` (imported above),
# so no secret is hard-coded in the notebook.

openai_api_key = userdata.get("OPENAI_API_KEY")
claude_api_key = userdata.get("ANTHROPIC_API_KEY")
google_api_key = userdata.get("GOOGLE_API_KEY")
# NOTE(review): hugging_face_token is not referenced anywhere else in the visible code.
hugging_face_token = userdata.get("HF_TOKEN")

In [None]:
# initialize
# Create/configure one client per provider using the keys loaded above.

# NOTE: from here on the global name `openai` is this client instance; the
# fetch_openai() helper relies on that binding.
openai = OpenAI(api_key=openai_api_key)
claude = anthropic.Anthropic(api_key=claude_api_key)
google.generativeai.configure(api_key=google_api_key)

# Dropdown choices in the form "PROVIDER (model-name)". build_model() splits
# each entry on the first space to recover the provider and the model name,
# so entries must keep exactly this shape.
MODELS = [
    "OPENAI (gpt-4)",
    "OPENAI (gpt-4o-mini)",
    "ANTHROPIC (claude-3-5-haiku-latest)",
    "GOOGLE (gemini-2.5-flash-lite)",
]

In [None]:
def _extract_video_id(url: str) -> str:
    """Return the video id found in a YouTube URL, or "" when none is present."""
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(url.strip())

    # Regular watch URLs: the id is the value of the "v" query parameter.
    # Parsing the query (rather than splitting on "v=") ignores fragments and
    # other parameters whose names merely end in "v".
    candidates = parse_qs(parsed.query).get("v")
    if candidates:
        return candidates[0]

    # Short links and embed/shorts/live URLs carry the id in the path instead.
    segments = [segment for segment in parsed.path.split("/") if segment]
    if parsed.netloc.lower().endswith("youtu.be") and segments:
        return segments[0]
    if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live"):
        return segments[1]

    # Fallback: keep accepting anything the previous split-based logic accepted
    # (for example a bare "v=<id>" fragment without a "?").
    if "v=" in url:
        return url.split("v=")[1].split("&")[0]

    return ""


def get_transcript(url: str) -> List[Dict[str, str]]:
    """Get the German transcript of a YouTube video as display-ready rows.

    Args:
        url: A YouTube URL (watch, youtu.be, embed, shorts or live form).

    Returns:
        A list of ``{'text': ..., 'start': ...}`` dictionaries, one per
        sentence, de-duplicated by text, with ``start`` formatted as a time
        string.

    Raises:
        ValueError: If no video id can be found in ``url``.
    """

    # Get the video id from the url
    video_id = _extract_video_id(url)
    if not video_id:
        raise ValueError(f"Could not find a YouTube video id in URL: {url!r}")

    # Fetch the transcript (German only)
    fetched = YouTubeTranscriptApi().fetch(video_id, languages=['de'])

    # Convert it to raw data
    raw_rows = fetched.to_raw_data()

    # Split each caption row into sentences (this also drops the duration field)
    sentence_rows = build_dictionary_expanding_duplicates(raw_rows)

    # Drop rows whose text was already seen
    unique_rows = build_dictionary_without_duplicates(sentence_rows)

    # Replace the numeric start offset with a time string
    return build_dictionary_with_time_string(unique_rows)

In [None]:
def split_strings(input: str, separators: List[str]) -> List[str]:
    """Split a string into parts, keeping each separator attached to its part.

    Args:
        input: The text to split.
        separators: Literal separator strings (they are regex-escaped).

    Returns:
        The pieces of ``input`` in order. Every piece except possibly the last
        ends with the separator that terminated it. Text after the final
        separator is returned as a last piece unless it is empty or only
        whitespace. An empty/blank ``input`` yields ``[]``.
    """
    # Empty separators would produce a pattern that matches at every position.
    active_separators = [separator for separator in separators if separator]
    if not active_separators:
        return [input] if input.strip() else []

    # The capturing group makes re.split keep the separators, so the result
    # alternates text, separator, text, ..., text and has odd length.
    pattern: str = f"({'|'.join(map(re.escape, active_separators))})"
    pieces: List[str] = re.split(pattern, input)

    # Re-attach each separator to the text that precedes it.
    combined_array: List[str] = [
        text + separator for text, separator in zip(pieces[0::2], pieces[1::2])
    ]

    # Keep trailing text that has no terminating separator instead of dropping it.
    remainder: str = pieces[-1]
    if remainder.strip():
        combined_array.append(remainder)

    return combined_array

In [None]:
def build_dictionary_expanding_duplicates(dictionary: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Expand caption rows so that each sentence gets its own row.

    Each row's text is split after '.', '?' and '!'. Every resulting sentence
    becomes a new row that keeps the original row's ``start`` value; any other
    keys of the input row are not copied. A row whose text yields no pieces is
    kept as a single row.

    Args:
        dictionary: Rows with at least ``'text'`` and ``'start'`` keys.

    Returns:
        A new list of ``{'text': ..., 'start': ...}`` rows.
    """
    result: List[Dict[str, str]] = []

    for item in dictionary:
        sentences: List[str] = split_strings(item['text'], ['.', '?', '!'])
        if not sentences:
            sentences = [item['text']]

        for sentence in sentences:
            # Strip first: pieces after the first sentence begin with the space
            # that followed the previous separator, and str.capitalize() only
            # affects the very first character.
            # NOTE: capitalize() also lower-cases the rest of the sentence.
            result.append({'text': sentence.strip().capitalize(), 'start': item['start']})

    return result

In [None]:
def build_dictionary_without_duplicates(dictionary):
    """Build a list of rows without duplicated text.

    Rows are compared by their ``'text'`` value only; the first row with a
    given text is kept and later rows with the same text are dropped. Input
    order is preserved and the kept rows are the original objects.

    Args:
        dictionary: Rows with a hashable ``'text'`` value (strings).

    Returns:
        A new list containing the first occurrence of each text.
    """
    seen_texts = set()
    result = []

    for item in dictionary:
        text = item['text']
        # Set membership keeps this linear instead of rescanning `result`.
        if text not in seen_texts:
            seen_texts.add(text)
            result.append(item)

    return result

In [None]:
def seconds_to_time_string(time_string):
    """Format a number of seconds as ``M:SS`` or ``H:MM:SS``.

    Despite its name, ``time_string`` is the numeric offset in seconds.
    The leading field is right-aligned to width 2 with a space; the other
    fields are zero-padded. The hour field is omitted when it is zero.
    """
    whole_hours = int(time_string // 3600)
    whole_minutes = int((time_string % 3600) // 60)
    whole_seconds = int(time_string % 60)

    if whole_hours != 0:
        return "{:2}:{:02}:{:02}".format(whole_hours, whole_minutes, whole_seconds)

    return "{:2}:{:02}".format(whole_minutes, whole_seconds)

In [None]:
def build_dictionary_with_time_string(dictionary):
    """Return new rows whose ``'start'`` value is a formatted time string.

    Each output row has only the ``'text'`` and ``'start'`` keys; the input
    rows are not modified.
    """

    def with_formatted_start(row):
        # Format the offset first, then copy the text into a fresh row.
        stamp = seconds_to_time_string(row['start'])
        return {'text': row['text'], 'start': stamp}

    return [with_formatted_start(row) for row in dictionary]

In [None]:
def system_message() -> str:
    """Return the system prompt that instructs the model how to correct rows."""
    fragments = (
        "You are a German language tutor. Your task is to review and correct a Python list of dictionaries that contain ",
        "a time stamp and a German sentence. The sentence may have incorrect casing for German nouns and may have incorrect ",
        "or missing period, question mark or exclamation point. Also please correct any casing at the start of the sentence and ",
        "any misspelled words. The expected output format should be a pure Python list of dictionaries with no extra explanations something like: ",
        '```python[{"start": "00:01", "text": "Ich habe ein Buch gelesen."}, {"start": "00:05", "text": "Das ist ein Haus."}]```',
    )
    # Each fragment already ends with the space that separates it from the next.
    return "".join(fragments)

In [None]:
def user_prompt(items: List[Dict[str, str]]):
    """Return the user prompt: a fixed request line followed by ``str(items)``."""
    return f"Please review and correct the following: \n{items!s}"

In [None]:
def fetch_openai(model: str, items: List[Dict[str, str]])-> List[Dict[str, str]]:
    """Ask an OpenAI chat model to correct the transcript rows.

    Args:
        model: OpenAI model name, e.g. ``"gpt-4o-mini"``.
        items: Rows with ``'start'`` and ``'text'`` keys to be corrected.

    Returns:
        The list of rows parsed from the model's reply.

    Raises:
        json.JSONDecodeError: If the reply cannot be parsed as a list.
    """
    import ast

    messages = [
        {"role": "system", "content": system_message()},
        {"role": "user", "content": user_prompt(items)},
    ]
    raw = openai.chat.completions.create(
        model=model, messages=messages
    )

    # The `OpenAI` client returns typed objects, so the message is read via
    # attribute access; subscripting it (message["content"]) raises TypeError.
    content: str = raw.choices[0].message.content or ""

    # Keep only the outermost [...] so code fences, a language tag directly
    # attached to the fence, or surrounding explanations are all discarded.
    first, last = content.find("["), content.rfind("]")
    if first != -1 and last > first:
        content = content[first:last + 1]
    stripped: str = content.replace("\n", "")
    display(stripped)

    try:
        return json.loads(stripped)
    except json.JSONDecodeError as json_error:
        # The prompt asks for a *Python* list, so the reply may use single
        # quotes. literal_eval only evaluates literals (it is not eval()).
        try:
            return ast.literal_eval(stripped)
        except (ValueError, SyntaxError):
            raise json_error from None

In [None]:
def fetch_anthropic(model: str, items: List[Dict[str, str]])-> List[Dict[str, str]]:
    """Ask an Anthropic model to correct the transcript rows.

    Args:
        model: Anthropic model name, e.g. ``"claude-3-5-haiku-latest"``.
        items: Rows with ``'start'`` and ``'text'`` keys to be corrected.

    Returns:
        The list of rows parsed from the model's reply.

    Raises:
        json.JSONDecodeError: If the reply cannot be parsed as a list.
    """
    import ast

    # NOTE(review): max_tokens caps the reply length; a long transcript may be
    # cut off before the closing bracket and then fail to parse - confirm the
    # limit is large enough for the videos you process.
    raw = claude.messages.create(
        model=model,
        max_tokens=2000,
        system=system_message(),
        messages=[{"role": "user", "content": user_prompt(items)}],
    )
    content: str = raw.content[0].text or ""

    # Keep only the outermost [...] so code fences, a language tag directly
    # attached to the fence, or surrounding explanations are all discarded.
    first, last = content.find("["), content.rfind("]")
    if first != -1 and last > first:
        content = content[first:last + 1]
    stripped: str = content.replace("\n", "")
    display(stripped)

    try:
        return json.loads(stripped)
    except json.JSONDecodeError as json_error:
        # The prompt asks for a *Python* list, so the reply may use single
        # quotes. literal_eval only evaluates literals (it is not eval()).
        try:
            return ast.literal_eval(stripped)
        except (ValueError, SyntaxError):
            raise json_error from None

In [None]:
def fetch_google(model: str, items: List[Dict[str, str]])-> List[Dict[str, str]]:
    """Ask a Google Gemini model to correct the transcript rows.

    Args:
        model: Gemini model name, e.g. ``"gemini-2.5-flash-lite"``.
        items: Rows with ``'start'`` and ``'text'`` keys to be corrected.

    Returns:
        The list of rows parsed from the model's reply.

    Raises:
        json.JSONDecodeError: If the reply cannot be parsed as a list.
    """
    import ast

    raw = google.generativeai.GenerativeModel(
        model_name=model, system_instruction=system_message()
    )
    content: str = raw.generate_content(user_prompt(items), stream=False).text or ""

    # Keep only the outermost [...] so code fences, a language tag directly
    # attached to the fence, or surrounding explanations are all discarded.
    first, last = content.find("["), content.rfind("]")
    if first != -1 and last > first:
        content = content[first:last + 1]
    stripped: str = content.replace("\n", "")
    display(stripped)

    try:
        return json.loads(stripped)
    except json.JSONDecodeError as json_error:
        # The prompt asks for a *Python* list, so the reply may use single
        # quotes. literal_eval only evaluates literals (it is not eval()).
        try:
            return ast.literal_eval(stripped)
        except (ValueError, SyntaxError):
            raise json_error from None

In [None]:
def fetch_model(provider: str, model: str, items: List[Dict[str, str]])-> List[Dict[str, str]]:
    """Route the correction request to the provider-specific fetch function.

    Returns an empty list when ``provider`` is not one of "OPENAI",
    "ANTHROPIC" or "GOOGLE".
    """
    # Lambdas defer the call so that only the selected provider is touched.
    handlers = {
        "OPENAI": lambda: fetch_openai(model, items),
        "ANTHROPIC": lambda: fetch_anthropic(model, items),
        "GOOGLE": lambda: fetch_google(model, items),
    }
    handler = handlers.get(provider)
    if handler is None:
        return []
    return handler()

In [None]:
"""Gradio UI for the YouTube Transcript Utility."""

# Provider and model name currently selected in the dropdown.
# Both are module-level state written by build_model().
selected_model: str = ""
selected_provider: str = ""

# Styles the HTML output box (the component created with elem_id="my_html_box").
css: str = """
  #my_html_box { border: 2px solid blue; padding: 10px; }
"""

# Client-side handler for the "Copy" button: selects a <table> element, copies
# the selection to the clipboard and shows an alert.
# NOTE(review): querySelector('table') returns the first <table> in the whole
# document and null if there is none; this snippet does not check for null.
js: str = """
function() {
    const table = document.querySelector('table');
    const range = document.createRange();
    range.selectNode(table);
    window.getSelection().removeAllRanges();
    window.getSelection().addRange(range);
    document.execCommand('copy');
    alert('Table copied to clipboard!');
    window.getSelection().removeAllRanges();
    return [];
    }
"""

# Default value shown in the URL textbox.
sample_url: str = (
    "https://www.youtube.com/watch?v=L6HnBjnkKmM&list=PLCCi8icw2DAO1qZGH7heUyTB8bJP1ICZ5&index=28"
)

def clear_fields() -> Tuple[None, str, str]:
    """Return the reset values for the URL, transcript and HTML outputs."""
    cleared_url, cleared_text, cleared_html = None, "", ""
    return cleared_url, cleared_text, cleared_html

def mock_original_data(url: str) -> List[Dict[str, str]]:
    """Return a fixed, uncorrected sample transcript; ``url`` is ignored."""
    stamps = ("00:01", "00:05", "00:10", "00:10")
    # The sentences are deliberately uncorrected (casing, punctuation, spelling).
    sentences = (
        "ich habe ein buch gelesen",
        "dass ist ein haus",
        "wie geht es dir",
        "seien sie vorsicht",
    )
    return [{"start": stamp, "text": sentence} for stamp, sentence in zip(stamps, sentences)]

def mock_corrected_data(url: str) -> List[Dict[str, str]]:
    """Return the corrected counterpart of the sample transcript; ``url`` is ignored."""
    stamps = ("00:01", "00:05", "00:10", "00:10")
    sentences = (
        "Ich habe ein Buch gelesen.",
        "Das ist ein Haus.",
        "Wie geht es dir?",
        "Seien Sie vorsichtig!",
    )
    return [{"start": stamp, "text": sentence} for stamp, sentence in zip(stamps, sentences)]

def build_table(items: List[Dict[str, str]]) -> str:
    """Build an HTML table from the list of dictionaries.

    Each item becomes one ``<tr>`` with two cells: ``item['start']`` and
    ``item['text']``. Cell values are HTML-escaped, because the text comes
    from video captions / model output and may contain ``<``, ``>`` or ``&``.

    Args:
        items: Rows with ``'start'`` and ``'text'`` keys.

    Returns:
        The table markup as a single string.
    """
    # Imported locally under its own name: this notebook rebinds the global
    # name `html` to a Gradio component when the UI is built.
    from html import escape

    rows: str = "".join(
        "    <tr>"
        f"<td>{escape(str(item['start']), quote=False)}</td>"
        f"<td>{escape(str(item['text']), quote=False)}</td>"
        "</tr>\n"
        for item in items
    )
    return f"<table width=100%>\n  <tbody>\n{rows}  </tbody>\n</table>"

def build_text(items: List[Dict[str, str]]) -> str:
    """Return a pretty-printed text rendering of the rows (indent of 4)."""
    printer = pprint.PrettyPrinter(indent=4)
    return printer.pformat(items)

def build_model(model: str) -> None:
    """Store the provider and model name parsed from a dropdown entry.

    ``model`` is expected in the form ``"PROVIDER (model-name)"``. The first
    space-separated token is stored in ``selected_provider``; the second, with
    parentheses removed, is stored in ``selected_model``.
    """
    global selected_provider, selected_model

    tokens = model.split(" ")
    selected_provider = tokens[0]
    # Delete both parenthesis characters in a single pass.
    selected_model = tokens[1].translate(str.maketrans("", "", "()"))

def fetch_transcript(url: str, mock: bool, model: str) -> Tuple[str, str]:
    """Fetch the transcript and return it as (original text, corrected HTML).

    Args:
        url: YouTube video URL taken from the textbox.
        mock: When true, use the built-in sample data instead of calling
            YouTube and the AI provider.
        model: Dropdown entry in the form ``"PROVIDER (model-name)"``.

    Returns:
        A pair of strings: the pretty-printed original rows and an HTML table
        of the corrected rows. When ``url`` is empty or blank, the first
        element is ``""`` and the second is a warning paragraph.
    """
    if not url or not url.strip():
        return ("", "<p style='color: orange;'>The YouTube Video URL field can't be empty!</p>")

    if mock:
        transcript_original: List[Dict[str, str]] = mock_original_data(url)
        transcript_corrected: List[Dict[str, str]] = mock_corrected_data(url)
    else:
        if model:
            # Use the value submitted with this request rather than module
            # globals, which are only updated by separate UI events.
            provider, _, remainder = model.partition(" ")
            model_name = remainder.split(" ")[0].replace("(", "").replace(")", "")
        else:
            # No dropdown value supplied: fall back to the last stored selection.
            provider, model_name = selected_provider, selected_model

        transcript_original = get_transcript(url)
        transcript_corrected = fetch_model(provider, model_name, transcript_original)

    list_original: str = build_text(transcript_original)
    html_corrected: str = build_table(transcript_corrected)
    return list_original, html_corrected

# Build the page: inputs (URL, model, mock switch), action buttons, and the two
# output areas (original transcript text, corrected transcript as HTML).
with gr.Blocks(css=css) as ui:

    gr.Markdown("## YouTube Transcript Utility")
    gr.Markdown("### Creates a HTML table that can be copied and pasted into a Windows OneNote application")

    with gr.Row():
        url = gr.Textbox(label="YouTube Video URL:", value=sample_url)
    with gr.Row():
        # Defaults to the last MODELS entry (index 3).
        model = gr.Dropdown(MODELS, label="AI Model Name:", value=MODELS[3])
    with gr.Row():
        # Mock mode is on by default, so "Fetch" returns the sample data
        # until the box is unticked.
        mock = gr.Checkbox(label="Use Mock Data", value=True)
    with gr.Row():
        fetch = gr.Button("Fetch", variant="primary")
        copy = gr.Button("Copy")
        clear = gr.Button("Clear")
    with gr.Row():
        text = gr.TextArea(label="Original Transcript")
        # elem_id matches the "#my_html_box" rule in `css`.
        # NOTE: this rebinds the module-level name `html` to the component.
        html = gr.HTML(elem_id="my_html_box")

    # Wire the buttons and the dropdown to their handlers.
    fetch.click(fetch_transcript, inputs=[url, mock, model], outputs=[text, html])
    # No Python callback here (fn=None); only the `js` snippet is attached.
    copy.click(fn=None, inputs=[], outputs=[], js=js)
    clear.click(clear_fields, inputs=[], outputs=[url, text, html])
    # Keep selected_provider / selected_model in sync with the dropdown.
    model.change(build_model, inputs=[model], outputs=[])

    # trigger manually on launch
    ui.load(build_model, [model], [])


ui.launch(inbrowser=True, debug=True)