# 1. Setup

In [1]:
%%capture
!pip install -q yt-dlp pydub ffmpeg-python openai-whisper tqdm

In [2]:
import os
import csv
import logging

logging.basicConfig(
    filename="pipeline_errors.log",
    level=logging.ERROR,
    format="%(asctime)s\t%(levelname)s\t%(message)s"
)

csv_path = "/kaggle/input/small-andrew-tate/fetched.csv"

entries = []
with open(csv_path, newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        vid = row.get("videoId")
        url = row.get("url")
        if vid and url:
            entries.append({"id": vid, "url": url})


some_entries = entries[:]

some_entries

[{'id': 'gEe4XhIiUDI', 'url': 'https://www.youtube.com/watch?v=gEe4XhIiUDI'},
 {'id': 'TgeE6Cr02nc', 'url': 'https://www.youtube.com/watch?v=TgeE6Cr02nc'},
 {'id': 'sjw-lw-ZT4g', 'url': 'https://www.youtube.com/watch?v=sjw-lw-ZT4g'},
 {'id': 'YsL9FdQwOsU', 'url': 'https://www.youtube.com/watch?v=YsL9FdQwOsU'},
 {'id': '3ifgUkVOKaE', 'url': 'https://www.youtube.com/watch?v=3ifgUkVOKaE'},
 {'id': 'c0ztBJKmxKI', 'url': 'https://www.youtube.com/watch?v=c0ztBJKmxKI'},
 {'id': 'kCyAZ6wYg8w', 'url': 'https://www.youtube.com/watch?v=kCyAZ6wYg8w'},
 {'id': 'oT2WEG72iFY', 'url': 'https://www.youtube.com/watch?v=oT2WEG72iFY'},
 {'id': '5Y5d3EvwRvM', 'url': 'https://www.youtube.com/watch?v=5Y5d3EvwRvM'},
 {'id': '_M_9tkThiPU', 'url': 'https://www.youtube.com/watch?v=_M_9tkThiPU'}]

In [3]:
from yt_dlp import YoutubeDL
from pydub import AudioSegment
from pydub.utils import make_chunks
import whisper
from tqdm import tqdm

def download_m4a(url, output_dir="audio"):
    os.makedirs(output_dir, exist_ok=True)
    opts = {
        "format": "bestaudio/best",
        "outtmpl": os.path.join(output_dir, "audio.%(ext)s"),
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": "m4a"
        }],
        "quiet": False,
        "no_warnings": True
    }
    with YoutubeDL(opts) as ydl:
        ydl.download([url])
    return os.path.join(output_dir, "audio.m4a")

def convert_to_wav(m4a_path):
    wav_path = os.path.splitext(m4a_path)[0] + ".wav"
    AudioSegment.from_file(m4a_path).export(wav_path, format="wav")
    return wav_path

def split_audio(wav_path, chunk_ms=60000):
    audio = AudioSegment.from_wav(wav_path)
    chunks = make_chunks(audio, chunk_ms)
    base = os.path.dirname(wav_path)
    paths = []
    for i, c in enumerate(chunks):
        p = os.path.join(base, f"chunk_{i}.wav")
        c.export(p, format="wav")
        paths.append(p)
    return paths

def cleanup_files(*paths):
    for p in paths:
        try:
            if os.path.exists(p):
                os.remove(p)
        except Exception as e:
            logging.error(f"CLEANUP_ERROR\t{p}\t{e}")


# 3. Pipeline

In [4]:
model = whisper.load_model("base")

100%|███████████████████████████████████████| 139M/139M [00:04<00:00, 30.1MiB/s]


In [5]:
for entry in tqdm(some_entries, desc="Videos"):
    vid, url = entry["id"], entry["url"]
    m4a_path = None
    wav_path = None
    chunks = []

    try:
        m4a_path = download_m4a(url)
        wav_path = convert_to_wav(m4a_path)
        chunks = split_audio(wav_path, chunk_ms=60000)
        
        # Transcribe
        texts = []
        for chunk in tqdm(chunks, desc=f"Transcribing {vid}", leave=False):
            res = model.transcribe(chunk)
            texts.append(res["text"])
        
        # Save transcript
        full_text = " ".join(texts)
        out_txt = f"{vid}.txt"
        with open(out_txt, "w", encoding="utf-8") as f:
            f.write(full_text)
        
        print(f"✓ Completed {vid}")
        
    except Exception as e:
        logging.error(f"{vid}\t{url}\tERROR\t{e}")
    
    finally:
        # Clean up all files regardless of success or failure
        if m4a_path and os.path.exists(m4a_path):
            os.remove(m4a_path)
        if wav_path and os.path.exists(wav_path):
            os.remove(wav_path)
        for chunk in chunks:
            if os.path.exists(chunk):
                os.remove(chunk)

print("✅ Done! Check pipeline_errors.log for any errors.")

Videos:   0%|          | 0/10 [00:00<?, ?it/s]

[youtube] Extracting URL: https://www.youtube.com/watch?v=gEe4XhIiUDI
[youtube] gEe4XhIiUDI: Downloading webpage
[youtube] gEe4XhIiUDI: Downloading tv client config
[youtube] gEe4XhIiUDI: Downloading player 461f4c95-main
[youtube] gEe4XhIiUDI: Downloading tv player API JSON
[youtube] gEe4XhIiUDI: Downloading ios player API JSON
[youtube] gEe4XhIiUDI: Downloading m3u8 information
[info] gEe4XhIiUDI: Downloading 1 format(s): 251
[download] Destination: audio/audio.webm
[download] 100% of   70.11MiB in 00:00:05 at 12.88MiB/s  
[ExtractAudio] Destination: audio/audio.m4a
Deleting original file audio/audio.webm (pass -k to keep)











































































Transcribing gEe4XhIiUDI: 100%|██████████| 72/72 [17:08<00:00,  9.06s/it][A
                                                                         [A

✓ Completed gEe4XhIiUDI


Videos:  10%|█         | 1/10 [18:55<2:50:23, 1135.96s/it]

[youtube] Extracting URL: https://www.youtube.com/watch?v=TgeE6Cr02nc
[youtube] TgeE6Cr02nc: Downloading webpage
[youtube] TgeE6Cr02nc: Downloading tv client config
[youtube] TgeE6Cr02nc: Downloading tv player API JSON
[youtube] TgeE6Cr02nc: Downloading ios player API JSON
[youtube] TgeE6Cr02nc: Downloading m3u8 information
[info] TgeE6Cr02nc: Downloading 1 format(s): 251
[download] Destination: audio/audio.webm
[download] 100% of  386.83MiB in 00:00:27 at 14.30MiB/s  
[ExtractAudio] Destination: audio/audio.m4a
Deleting original file audio/audio.webm (pass -k to keep)


Videos:  20%|██        | 2/10 [28:38<1:48:04, 810.57s/it] 

[youtube] Extracting URL: https://www.youtube.com/watch?v=sjw-lw-ZT4g
[youtube] sjw-lw-ZT4g: Downloading webpage
[youtube] sjw-lw-ZT4g: Downloading tv client config
[youtube] sjw-lw-ZT4g: Downloading tv player API JSON
[youtube] sjw-lw-ZT4g: Downloading ios player API JSON
[youtube] sjw-lw-ZT4g: Downloading m3u8 information
[info] sjw-lw-ZT4g: Downloading 1 format(s): 251
[download] Destination: audio/audio.webm
[download] 100% of   45.91MiB in 00:00:02 at 19.56MiB/s  
[ExtractAudio] Destination: audio/audio.m4a
Deleting original file audio/audio.webm (pass -k to keep)





















































Transcribing sjw-lw-ZT4g: 100%|██████████| 50/50 [11:22<00:00,  9.82s/it][A
                                                                         [A

✓ Completed sjw-lw-ZT4g


Videos:  30%|███       | 3/10 [41:34<1:32:41, 794.52s/it]

[youtube] Extracting URL: https://www.youtube.com/watch?v=YsL9FdQwOsU
[youtube] YsL9FdQwOsU: Downloading webpage
[youtube] YsL9FdQwOsU: Downloading tv client config
[youtube] YsL9FdQwOsU: Downloading tv player API JSON
[youtube] YsL9FdQwOsU: Downloading ios player API JSON
[youtube] YsL9FdQwOsU: Downloading m3u8 information
[info] Testing format 234
[info] YsL9FdQwOsU: Downloading 1 format(s): 234
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 659
[download] Destination: audio/audio.mp4
[download] 100% of   51.32MiB in 00:00:23 at 2.17MiB/s                   
[ExtractAudio] Destination: audio/audio.m4a
Deleting original file audio/audio.mp4 (pass -k to keep)


























































Transcribing YsL9FdQwOsU: 100%|██████████| 55/55 [13:45<00:00, 15.56s/it][A
                                                                         [A

✓ Completed YsL9FdQwOsU


Videos:  40%|████      | 4/10 [56:00<1:22:17, 822.93s/it]

[youtube] Extracting URL: https://www.youtube.com/watch?v=3ifgUkVOKaE
[youtube] 3ifgUkVOKaE: Downloading webpage
[youtube] 3ifgUkVOKaE: Downloading tv client config
[youtube] 3ifgUkVOKaE: Downloading tv player API JSON
[youtube] 3ifgUkVOKaE: Downloading ios player API JSON
[youtube] 3ifgUkVOKaE: Downloading m3u8 information
[info] 3ifgUkVOKaE: Downloading 1 format(s): 251
[download] Destination: audio/audio.webm
[download] 100% of   17.53MiB in 00:00:01 at 13.70MiB/s  
[ExtractAudio] Destination: audio/audio.m4a
Deleting original file audio/audio.webm (pass -k to keep)
























Transcribing 3ifgUkVOKaE: 100%|██████████| 21/21 [04:24<00:00, 11.19s/it][A
Videos:  50%|█████     | 5/10 [1:01:03<52:56, 635.28s/it]

✓ Completed 3ifgUkVOKaE
[youtube] Extracting URL: https://www.youtube.com/watch?v=c0ztBJKmxKI
[youtube] c0ztBJKmxKI: Downloading webpage
[youtube] c0ztBJKmxKI: Downloading tv client config
[youtube] c0ztBJKmxKI: Downloading tv player API JSON
[youtube] c0ztBJKmxKI: Downloading ios player API JSON
[youtube] c0ztBJKmxKI: Downloading m3u8 information
[info] c0ztBJKmxKI: Downloading 1 format(s): 251
[download] Destination: audio/audio.webm
[download] 100% of   28.40MiB in 00:00:01 at 16.01MiB/s  
[ExtractAudio] Destination: audio/audio.m4a
Deleting original file audio/audio.webm (pass -k to keep)

































Transcribing c0ztBJKmxKI: 100%|██████████| 30/30 [07:30<00:00, 13.02s/it][A
Videos:  60%|██████    | 6/10 [1:09:21<39:15, 588.85s/it]

✓ Completed c0ztBJKmxKI
[youtube] Extracting URL: https://www.youtube.com/watch?v=kCyAZ6wYg8w
[youtube] kCyAZ6wYg8w: Downloading webpage
[youtube] kCyAZ6wYg8w: Downloading tv client config
[youtube] kCyAZ6wYg8w: Downloading tv player API JSON
[youtube] kCyAZ6wYg8w: Downloading ios player API JSON
[youtube] kCyAZ6wYg8w: Downloading m3u8 information
[info] kCyAZ6wYg8w: Downloading 1 format(s): 251
[download] Destination: audio/audio.webm
[download] 100% of   70.41MiB in 00:00:00 at 107.39MiB/s 
[ExtractAudio] Destination: audio/audio.m4a
Deleting original file audio/audio.webm (pass -k to keep)











































































Transcribing kCyAZ6wYg8w: 100%|██████████| 72/72 [16:51<00:00, 10.03s/it][A
                                                                         [A

✓ Completed kCyAZ6wYg8w


Videos:  70%|███████   | 7/10 [1:28:00<38:06, 762.15s/it]

[youtube] Extracting URL: https://www.youtube.com/watch?v=oT2WEG72iFY
[youtube] oT2WEG72iFY: Downloading webpage
[youtube] oT2WEG72iFY: Downloading tv client config
[youtube] oT2WEG72iFY: Downloading tv player API JSON
[youtube] oT2WEG72iFY: Downloading ios player API JSON
[youtube] oT2WEG72iFY: Downloading m3u8 information
[info] oT2WEG72iFY: Downloading 1 format(s): 251-4
[download] Destination: audio/audio.webm
[download] 100% of   23.13MiB in 00:00:02 at 8.98MiB/s   
[ExtractAudio] Destination: audio/audio.m4a
Deleting original file audio/audio.webm (pass -k to keep)


























Transcribing oT2WEG72iFY: 100%|██████████| 23/23 [05:35<00:00, 14.81s/it][A
Videos:  80%|████████  | 8/10 [1:34:22<21:21, 640.95s/it]

✓ Completed oT2WEG72iFY
[youtube] Extracting URL: https://www.youtube.com/watch?v=5Y5d3EvwRvM
[youtube] 5Y5d3EvwRvM: Downloading webpage
[youtube] 5Y5d3EvwRvM: Downloading tv client config
[youtube] 5Y5d3EvwRvM: Downloading tv player API JSON
[youtube] 5Y5d3EvwRvM: Downloading ios player API JSON
[youtube] 5Y5d3EvwRvM: Downloading m3u8 information
[info] 5Y5d3EvwRvM: Downloading 1 format(s): 251
[download] Destination: audio/audio.webm
[download] 100% of   50.67MiB in 00:00:03 at 16.43MiB/s  
[ExtractAudio] Destination: audio/audio.m4a
Deleting original file audio/audio.webm (pass -k to keep)






























































Transcribing 5Y5d3EvwRvM: 100%|██████████| 59/59 [14:50<00:00, 12.92s/it][A
                                                                         [A

✓ Completed 5Y5d3EvwRvM


Videos:  90%|█████████ | 9/10 [1:50:44<12:27, 747.52s/it]

[youtube] Extracting URL: https://www.youtube.com/watch?v=_M_9tkThiPU
[youtube] _M_9tkThiPU: Downloading webpage
[youtube] _M_9tkThiPU: Downloading tv client config
[youtube] _M_9tkThiPU: Downloading tv player API JSON
[youtube] _M_9tkThiPU: Downloading ios player API JSON
[youtube] _M_9tkThiPU: Downloading m3u8 information
[info] _M_9tkThiPU: Downloading 1 format(s): 251
[download] Destination: audio/audio.webm
[download] 100% of   19.13MiB in 00:00:00 at 28.02MiB/s  
[ExtractAudio] Destination: audio/audio.m4a
Deleting original file audio/audio.webm (pass -k to keep)
























Transcribing _M_9tkThiPU: 100%|██████████| 21/21 [04:27<00:00, 10.83s/it][A
Videos: 100%|██████████| 10/10 [1:55:55<00:00, 695.54s/it]

✓ Completed _M_9tkThiPU
✅ Done! Check pipeline_errors.log for any errors.





In [6]:
print("Hello World!")

Hello World!
