In [None]:
# Install required packages
#!pip install -q yt-dlp openai-whisper ffmpeg-python
!pip install -q -U yt-dlp openai-whisper ffmpeg-python google-generativeai
# Install ffmpeg (needed for audio extraction)
!apt-get -y install ffmpeg

# Check GPU
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
Using device: cpu


In [None]:
import os
import whisper
import google.generativeai as gen
from google.colab import userdata

# ----------------- Load Gemini API key from Secrets -----------------
# The key is read from the Colab secret named GEMINI_API_KEY; a falsy value
# aborts the cell before any model is configured.
# NOTE(review): userdata.get may itself raise when the secret does not exist or
# notebook access is not granted — confirm against the google.colab docs.
api_key = userdata.get("GEMINI_API_KEY")
if not api_key:
    raise RuntimeError("Missing GEMINI_API_KEY in Colab Secrets. Add it in Colab Settings.")

# Configure the google.generativeai client once for the whole notebook.
gen.configure(api_key=api_key)

# ----------------- Whisper + Gemini Models -----------------
# Whisper checkpoint name passed to whisper.load_model below.
WHISPER_MODEL_NAME = "small"
# Model name passed to gen.GenerativeModel in summarize_with_gemini.
GEMINI_MODEL = "gemini-flash-latest"  # this IS correct
# GEMINI_MODEL = "google/gemini-2.0-flash-exp:free"
# `device` ("cuda" or "cpu") comes from the setup cell at the top of the notebook.
print(f"Loading Whisper model '{WHISPER_MODEL_NAME}' on {device}...")
whisper_model = whisper.load_model(WHISPER_MODEL_NAME, device=device)
print("Whisper model loaded.")


Loading Whisper model 'small' on cpu...
Whisper model loaded.


In [None]:
import os
import subprocess
import tempfile
import yt_dlp

# ----------------- Download YouTube Video -----------------
def download_youtube(url, workdir):
    """Download `url` with yt-dlp into `workdir` and return the saved file's path.

    The file is written as ``video.<ext>`` inside `workdir`; the first directory
    entry whose name starts with ``video.`` is returned.

    Raises:
        RuntimeError: if no such file exists after the download call.
    """
    options = {
        "outtmpl": os.path.join(workdir, "video.%(ext)s"),
        "format": "mp4/bestvideo[ext=mp4]+bestaudio/best/best",
        "quiet": True,
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.extract_info(url, download=True)

    # The extension is chosen by yt-dlp, so locate the result by its name prefix.
    for entry in os.listdir(workdir):
        if entry.startswith("video."):
            return os.path.join(workdir, entry)
    raise RuntimeError("Video download failed.")


# ----------------- Get YouTube video duration -----------------
def get_video_duration(url):
    """Return the ``duration`` field (seconds) yt-dlp reports for `url`.

    Only metadata is fetched (``download=False``). The result is ``None`` when
    the info dict has no ``duration`` key.
    """
    with yt_dlp.YoutubeDL({"quiet": True}) as probe:
        metadata = probe.extract_info(url, download=False)
    return metadata.get("duration")


# ----------------- Convert to MP3 using FFmpeg -----------------
def convert_to_audio(video_path, audio_path):
    """Extract the audio of `video_path` into an MP3 file at `audio_path` via ffmpeg.

    Args:
        video_path: Path of the input media file.
        audio_path: Destination path; an existing file is overwritten (``-y``).

    Raises:
        RuntimeError: if ffmpeg exits with a non-zero status. The message starts
            with "FFmpeg conversion failed." and is followed by the last lines
            of ffmpeg's stderr so the cause is visible to the user.
    """
    cmd = [
        "ffmpeg", "-hide_banner", "-loglevel", "error",  # keep stderr to real errors
        "-y", "-i", video_path, "-vn", "-acodec", "mp3", audio_path,
    ]
    # stderr is captured (not discarded) so a failure can be explained.
    res = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
    if res.returncode != 0:
        stderr_text = (res.stderr or b"").decode("utf-8", errors="replace").strip()
        tail = "\n".join(stderr_text.splitlines()[-5:])
        message = "FFmpeg conversion failed."
        if tail:
            message += "\n" + tail
        raise RuntimeError(message)


# ----------------- Whisper Transcription -----------------
def transcribe(audio_path):
    """Transcribe `audio_path` with the module-level Whisper model.

    Half precision (fp16) is requested only when the notebook's `device` is
    "cuda". Returns the ``"text"`` entry of the result with surrounding
    whitespace removed, or an empty string when that entry is missing.
    """
    use_fp16 = device == "cuda"
    output = whisper_model.transcribe(audio_path, fp16=use_fp16)
    text = output.get("text", "")
    return text.strip()


# ----------------- High level pipelines -----------------
def process_youtube(url, max_minutes=20):
    """Download a YouTube video, extract its audio, and return the transcript.

    Args:
        url: Video URL handed to yt-dlp.
        max_minutes: Longest accepted video length in minutes. Pass ``None`` to
            disable the length check.

    Returns:
        The transcript text produced by `transcribe`.

    Raises:
        ValueError: if the reported duration exceeds `max_minutes`.
        RuntimeError: propagated from the download or ffmpeg conversion steps.
    """
    duration = get_video_duration(url)
    # A missing duration (None/0) skips the check, as before.
    if duration and max_minutes is not None and duration > max_minutes * 60:
        # Report minutes AND seconds: flooring to minutes made a 20:30 video
        # read "20 min" against a 20 min limit, and floats printed as "20.0".
        minutes, seconds = divmod(int(duration), 60)
        raise ValueError(
            f"Video too long ({minutes}:{seconds:02d} min). Limit = {max_minutes} min."
        )

    # All intermediate files live in a temp dir that is removed on exit.
    with tempfile.TemporaryDirectory() as tmp:
        print("Downloading video...")
        video = download_youtube(url, tmp)
        audio = os.path.join(tmp, "audio.mp3")
        convert_to_audio(video, audio)
        return transcribe(audio)


def process_uploaded_video(path):
    """Convert the local video at `path` to MP3 and return its transcript.

    The MP3 is written into a temporary directory that is deleted once the
    transcript has been produced.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        print("Converting video to audio...")
        mp3_path = os.path.join(scratch_dir, "audio.mp3")
        convert_to_audio(path, mp3_path)
        text = transcribe(mp3_path)
    return text


# ----------------- Gemini Summary -----------------
# def summarize_with_gemini(transcript):
#     instruction = (
#         "You will receive a transcript from a YouTube video.\n"
#         "Tasks:\n"
#         "1. Detect if the content is mainly music or song lyrics or any meeting or new etc....\n"
#         "   - Repeated chorus, rhyme patterns, no explanations.\n"
#         "2. If music/lyrics, reply EXACTLY and format it :\n"
#         "   'This content appears to be music or song lyrics, so I will not summarize it.'\n"
#         "3. Otherwise, summarize in points.\n"
#         "4. No direct transcript copying.\n"
#         "5. if its a meeting then MOM(moments of meeting)."
#     )

#     model = gen.GenerativeModel(
#         GEMINI_MODEL,
#         system_instruction=instruction
#     )

#     # Prevents overloading model with huge text
#     transcript = transcript[:9000]

#     response = model.generate_content(transcript)
#     return response.text.strip()
#--------------------------------------------------------------------
def summarize_with_gemini(transcript, max_chars=9000):
    """
    Send a transcript to the Gemini model named by GEMINI_MODEL and return its reply.

    The system instruction asks the model to:
    - answer with a fixed message when the content is music/lyrics,
    - produce MoM (Minutes of Meeting) when the content is a meeting,
    - otherwise give a point-wise summary.

    Args:
        transcript: Transcript text to summarize.
        max_chars: Maximum number of transcript characters sent to the model.
            Longer transcripts are cut to this length (a notice is printed).
            Pass ``None`` to send the full transcript.

    Returns:
        The model's reply text with surrounding whitespace removed.

    Raises:
        ValueError: if `transcript` is empty or whitespace-only.
        RuntimeError: if the response carries no text.
    """
    if not transcript or not transcript.strip():
        raise ValueError("Transcript is empty; nothing to summarize.")

    # NOTE(review): rule 2 mixes "format the lyrics / give artist and song name"
    # with "respond EXACTLY", and rule 5 ends with "Keep response under." without
    # a limit. The prompt text is left unchanged here — confirm the intent.
    system_instruction = (
        "You will receive a transcript from a YouTube video or an uploaded video.\n"
        "Your tasks:\n"
        "1. Identify the type of content: music/lyrics, meeting, vlog, lecture, tutorial, etc.\n"
        "2. If the content is mainly music or song lyrics and just format the lyrics and also give the artist and the song name, respond EXACTLY:\n"
        "   'This content appears to be music or song lyrics, so I will not summarize it.'\n"
        "3. If it is a meeting, produce a structured MoM (Minutes of Meeting) including:\n"
        "   - Agenda\n"
        "   - Key discussion points\n"
        "   - Decisions taken\n"
        "   - Action items with responsible persons (if available)\n"
        "4. For any other non-music content, provide point-wise summary.\n"
        "5. Do NOT copy long transcript sections. Keep response under.\n"
        "imp: give the the ouput in clear form which looks good dont include # *  use - for points and main the clear structure use bold for important points,and text formaters."
    )

    # Limit huge transcripts, but tell the user when part of the text is dropped
    # so a summary of a partial transcript is not mistaken for the whole video.
    if max_chars is not None and len(transcript) > max_chars:
        print(
            f"Note: transcript is {len(transcript)} characters; "
            f"only the first {max_chars} are sent to Gemini."
        )
        transcript = transcript[:max_chars]

    # Create model with system instruction
    model = gen.GenerativeModel(
        GEMINI_MODEL,
        system_instruction=system_instruction
    )

    # Send only the user content – NO role messages (avoids 400 error)
    response = model.generate_content(transcript)

    # Accessing .text raises ValueError when the response holds no text part;
    # surface that with whatever feedback the response carries.
    try:
        text = response.text
    except ValueError as exc:
        feedback = getattr(response, "prompt_feedback", None)
        raise RuntimeError(
            f"Gemini returned no text (prompt feedback: {feedback})."
        ) from exc

    return text.strip()

In [None]:
from google.colab import files
import os

# ----------------- WRITE OUTPUT TO .MD FILE -----------------
def write_output_md(transcript, summary=None, output_path="output.md"):
    """
    Write the transcript (and optional summary) to a Markdown file.

    Args:
        transcript: Transcript text; ``None`` is treated as an empty string.
        summary: Optional summary text. The "Summary" section is written only
            when this is truthy.
        output_path: Destination file path. Defaults to "output.md" in the
            current working directory; an existing file is overwritten.
    """
    sections = [
        "# Transcript Output\n",
        "\n## Transcript\n\n",
        (transcript or "").strip(),
        "\n\n---\n",
    ]

    if summary:
        sections += ["## Summary\n\n", summary.strip(), "\n\n---\n"]

    # Pieces already carry their own newlines, so join without a separator.
    final_text = "".join(sections)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(final_text)

    print(f"\nSaved output to {output_path}\n")


# ----------------- USER MODE SELECTION -----------------

def _show_summarize_and_save(transcript):
    """Print the transcript, optionally summarize it, and write output.md.

    Shared by the YouTube and upload modes. A failed summary is reported but
    does not stop the transcript from being saved.

    Returns:
        The summary text, or ``None`` when none was requested or it failed.
    """
    print("\n========== TRANSCRIPT ==========\n")
    print(transcript)
    print("\n================================\n")

    summary = None
    ask = input("Summarize with Gemini? (y/n): ").strip().lower()
    if ask == "y":
        try:
            summary = summarize_with_gemini(transcript)
        except Exception as e:
            # Keep going: the transcript is still worth writing to disk.
            print("Error:", e)
        else:
            print("\n======== GEMINI SUMMARY ========\n")
            print(summary)
            print("\n================================\n")

    # WRITE TO MD
    write_output_md(transcript, summary)
    return summary


print("Choose an option:")
print("1 - Transcribe YouTube URL")
print("2 - Upload a video file")

choice = input("Enter 1 or 2: ").strip()

# ----------------- YOUTUBE MODE -----------------
if choice == "1":
    url = input("Enter YouTube URL: ").strip()
    if url:
        try:
            transcript = process_youtube(url)
            summary = _show_summarize_and_save(transcript)
        except Exception as e:
            print("Error:", e)
    else:
        print("No URL entered.")


# ----------------- UPLOAD MODE -----------------
elif choice == "2":
    print("Upload a video file...")
    uploaded = files.upload()

    if uploaded:
        # Only the first uploaded file is processed.
        filename = list(uploaded.keys())[0]
        print("Received:", filename)
        try:
            transcript = process_uploaded_video(filename)
            summary = _show_summarize_and_save(transcript)
        except Exception as e:
            print("Error:", e)
    else:
        print("No file uploaded.")

else:
    print("Invalid option.")
# https://youtu.be/B3Z4XGAxJB0?si=o0MIpMKmU5LEsw7j

Choose an option:
1 - Transcribe YouTube URL
2 - Upload a video file
Enter 1 or 2: 1
Enter YouTube URL: https://youtu.be/B3Z4XGAxJB0?si=o0MIpMKmU5LEsw7j




Downloading video...






Just one more to describe, one tear drop from my eye You better save the fall, the middle of the night When things aren't back in white, and it's true but all Remember 24 And when I'm back in Chicago, I see it Another vision of me, I was in it I wave goodbye to the end of beginning This song has started now, and you're just finding out Now isn't there the laugh, a major sacrifice But clueless at the time, into Caroline Just trust me, you'll be fine And when I'm back in Chicago, I feel it Another vision of me, I was in it I wave goodbye to the end of beginning You take the man out of the city, not the city of the man And when I'm back in Chicago, I feel it Another vision of me, I was in it I wave goodbye to the end of beginning


Summarize with Gemini? (y/n): y


This content appears to be music or song lyrics, so I will not summarize it.

**Song:** 24
**Artist:** Noah Kahan

- Just one more to describe, one tear drop from my eye
- You better save the fall, the middle of the night
- W

In [None]:
# Hand output.md (the file written by write_output_md in the previous cell) to
# google.colab's files.download so it can be saved locally.
files.download("output.md")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>