## Extract Automatic Transcripts from YouTube Videos

### Imports

In [1]:
import polars as pl
from youtube_transcript_api import YouTubeTranscriptApi

### Functions

In [2]:
def extract_text(transcript: list) -> str:
    """
        Join the caption text of every transcript segment into one string.

        Parameters
        ----------
        transcript : list
            Segments in playback order; each segment is a mapping that
            provides a 'text' entry. Any iterable of such mappings works,
            not only a list.

        Returns
        -------
        str
            The 'text' values joined by single spaces; an empty string
            when the transcript has no segments.

        Raises
        ------
        KeyError
            If a segment dictionary has no 'text' key.
    """

    # iterate over the segments directly instead of by index, so the input
    # does not need to support len() or integer subscripting
    return ' '.join(segment['text'] for segment in transcript)

### Get Transcripts

In [3]:
# load the table of video ids and preview its first five rows
df = pl.read_parquet(source='data/video-ids.parquet')
print(df.head(n=5))

shape: (5, 3)
┌─────────────┬──────────────────────┬─────────────────────────────────┐
│ video_id    ┆ datetime             ┆ title                           │
│ ---         ┆ ---                  ┆ ---                             │
│ str         ┆ str                  ┆ str                             │
╞═════════════╪══════════════════════╪═════════════════════════════════╡
│ qPN_XZcJf_s ┆ 2025-05-05T04:01:03Z ┆ Reinforcement Learning with Hu… │
│ DVGmsnxB2UQ ┆ 2025-04-14T04:00:27Z ┆ Reinforcement Learning with Ne… │
│ 9hbQieQh7-o ┆ 2025-04-07T04:00:17Z ┆ Reinforcement Learning with Ne… │
│ Z-T0iJEXiwM ┆ 2025-03-31T04:00:25Z ┆ Reinforcement Learning: Essent… │
│ _kstkMF-lQQ ┆ 2025-02-12T14:20:19Z ┆ StatQuest on DeepLearning.AI!!… │
└─────────────┴──────────────────────┴─────────────────────────────────┘


In [4]:
%%time
# Fetch the transcript of every video, collecting exactly one entry per row
# of `df` (in row order) so the list can later be attached as a column.
transcript_text_list = []

for video_id in df['video_id']:

    # try to extract captions
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        transcript_text = extract_text(transcript)
    # if not available set as n/a
    # `except Exception` rather than a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate and this long-running
    # loop can be interrupted from the notebook.
    # NOTE(review): every other error (including programming errors such as
    # an AttributeError from a changed library API) is still mapped to
    # "n/a"; consider narrowing to the library's own exception types.
    except Exception:
        transcript_text = "n/a"

    transcript_text_list.append(transcript_text)

CPU times: total: 10.2 s
Wall time: 4min 55s


In [5]:
# attach the collected transcript strings as a new "transcript" column,
# then preview the first rows of the extended table
df = df.with_columns(
    pl.Series(values=transcript_text_list, name="transcript")
)
print(df.head(n=5))

shape: (5, 4)
┌─────────────┬──────────────────────┬──────────────────────────────┬──────────────────────────────┐
│ video_id    ┆ datetime             ┆ title                        ┆ transcript                   │
│ ---         ┆ ---                  ┆ ---                          ┆ ---                          │
│ str         ┆ str                  ┆ str                          ┆ str                          │
╞═════════════╪══════════════════════╪══════════════════════════════╪══════════════════════════════╡
│ qPN_XZcJf_s ┆ 2025-05-05T04:01:03Z ┆ Reinforcement Learning with  ┆ If you tell me what you like │
│             ┆                      ┆ Hu…                          ┆ a…                           │
│ DVGmsnxB2UQ ┆ 2025-04-14T04:00:27Z ┆ Reinforcement Learning with  ┆ if you make a guess and you  │
│             ┆                      ┆ Ne…                          ┆ ma…                          │
│ 9hbQieQh7-o ┆ 2025-04-07T04:00:17Z ┆ Reinforcement Learning with  ┆ When yo

### Export Data 

In [6]:
# persist the table with transcripts: first as parquet, then as csv
df.write_parquet(file='data/video-transcripts.parquet')
df.write_csv(file='data/video-transcripts.csv')