<a href="https://colab.research.google.com/github/UdaraChamidu/Full-Stack-Data-Science-Project/blob/main/pull_video.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [100]:
pip install youtube-transcript-api==0.6.2

Collecting youtube-transcript-api==0.6.2
  Downloading youtube_transcript_api-0.6.2-py3-none-any.whl.metadata (15 kB)
Downloading youtube_transcript_api-0.6.2-py3-none-any.whl (24 kB)
Installing collected packages: youtube-transcript-api
  Attempting uninstall: youtube-transcript-api
    Found existing installation: youtube-transcript-api 1.2.2
    Uninstalling youtube-transcript-api-1.2.2:
      Successfully uninstalled youtube-transcript-api-1.2.2
Successfully installed youtube-transcript-api-0.6.2


In [1]:
import requests # api calls
import json # CONVERT TO JSON
import polars as pl # faster version of pandas
from google.colab import userdata # load secreats

from youtube_transcript_api import YouTubeTranscriptApi

In [2]:
import os
my_key = userdata.get('my_key')

In [3]:
def getVideoRecords(response: requests.models.Response) -> list:
    """
        Function to extract YouTube video data from GET request response
    """

    video_record_list = []

    for raw_item in json.loads(response.text)['items']:

        # only execute for youtube videos
        if raw_item['id']['kind'] != "youtube#video":
            continue

        video_record = {}
        video_record['video_id'] = raw_item['id']['videoId']
        video_record['datetime'] = raw_item['snippet']['publishedAt']
        video_record['title'] = raw_item['snippet']['title']
        #video_record['transcript'] = raw_item['snippet']['transcript']
        #video_record['description'] = raw_item['snippet']['description']

        video_record_list.append(video_record)

    return video_record_list

In [19]:
# define channel ID
channel_id = 'UC4XHOuACFrl25gfW78vS-GA'
# https://www.youtube.com/@SuprOrdinary  recomended by chat GPT
# define url for API

url = 'https://www.googleapis.com/youtube/v3/search'

# initialize page token
page_token = None

# intialize list to store video data
video_record_list = []

In [20]:
%%time
# extract video data across multiple search result pages
while page_token != 0:
    # define parameters for API call
    # max result is reduced to 2. because it exceed the daily limit.
    params = {"key": my_key, 'channelId': channel_id, 'part': ["snippet","id"], 'order': "date", 'maxResults':2, 'pageToken': page_token}
    # make get request
    response = requests.get(url, params=params)

    # Check if the request was successful and the response contains 'items'
    if response.status_code == 200 and 'items' in json.loads(response.text):
        # append video records to list
        video_record_list += getVideoRecords(response)

        try:
            # grab next page token
            page_token = json.loads(response.text)['nextPageToken']
        except KeyError:
            # if no next page token kill while loop
            page_token = 0
    else:
        # If the request was not successful or 'items' is missing, stop the loop
        print(f"API request failed with status code {response.status_code} or no items found.")
        print(response.text) # Print the response text for debugging
        page_token = 0

API request failed with status code 403 or no items found.
{
  "error": {
    "code": 403,
    "message": "The request cannot be completed because you have exceeded your \u003ca href=\"/youtube/v3/getting-started#quota\"\u003equota\u003c/a\u003e.",
    "errors": [
      {
        "message": "The request cannot be completed because you have exceeded your \u003ca href=\"/youtube/v3/getting-started#quota\"\u003equota\u003c/a\u003e.",
        "domain": "youtube.quota",
        "reason": "quotaExceeded"
      }
    ]
  }
}

CPU times: user 7.4 ms, sys: 78 µs, total: 7.48 ms
Wall time: 79.2 ms


In [8]:
# write data to file
os.makedirs('data')
pl.DataFrame(video_record_list).write_parquet('data/video-ids.parquet')
pl.DataFrame(video_record_list).write_csv('data/video-ids.csv')

In [7]:
video_record_list

[{'video_id': 'y3_0d-_rAgY',
  'datetime': '2025-08-31T11:01:35Z',
  'title': 'Fine-Tuning GPT-4o: Build Your Own Custom AI Model (Step-by-Step Guide)'},
 {'video_id': 'b96tn09Lz3I',
  'datetime': '2025-08-29T09:30:18Z',
  'title': 'Langflow Tutorial | Build RAG Apps &amp; Agentic AI Without Coding'},
 {'video_id': '9zsZZ2HbnOI',
  'datetime': '2025-08-27T02:15:01Z',
  'title': 'Build Your First AI Chatbot in 5 Minutes | Langflow No-Code Tutorial'},
 {'video_id': 'JgWD6Tin9eQ',
  'datetime': '2025-08-24T07:30:14Z',
  'title': 'Agentic AI vs Traditional AI'},
 {'video_id': 'GlLlggfoM5M',
  'datetime': '2025-08-22T10:30:10Z',
  'title': 'Stock Market Analysis with Multi-Agent AI Agents | Azure Agent Foundry End-to-End Project'},
 {'video_id': 'Nu14lrjDDTM',
  'datetime': '2025-08-20T02:15:02Z',
  'title': 'Azure Agentic AI SDK Tutorial: Create &amp; Manage Agents'},
 {'video_id': '-gaFEfFJYLc',
  'datetime': '2025-08-17T09:15:01Z',
  'title': 'AI Agent in Just 5 Minutes with Azure 🤯'},
 

# Get the Transcript

In [9]:
import polars as pl

In [10]:
def extract_text(transcript: list) -> str:
    """
        Function to extract text from transcript dictionary
    """

    text_list = [transcript[i]['text'] for i in range(len(transcript))]
    return ' '.join(text_list)

In [11]:
df = pl.read_parquet('data/video-ids.parquet')
print(df.head())

shape: (5, 3)
┌─────────────┬──────────────────────┬─────────────────────────────────┐
│ video_id    ┆ datetime             ┆ title                           │
│ ---         ┆ ---                  ┆ ---                             │
│ str         ┆ str                  ┆ str                             │
╞═════════════╪══════════════════════╪═════════════════════════════════╡
│ y3_0d-_rAgY ┆ 2025-08-31T11:01:35Z ┆ Fine-Tuning GPT-4o: Build Your… │
│ b96tn09Lz3I ┆ 2025-08-29T09:30:18Z ┆ Langflow Tutorial | Build RAG … │
│ 9zsZZ2HbnOI ┆ 2025-08-27T02:15:01Z ┆ Build Your First AI Chatbot in… │
│ JgWD6Tin9eQ ┆ 2025-08-24T07:30:14Z ┆ Agentic AI vs Traditional AI    │
│ GlLlggfoM5M ┆ 2025-08-22T10:30:10Z ┆ Stock Market Analysis with Mul… │
└─────────────┴──────────────────────┴─────────────────────────────────┘


In [13]:
%%time
transcript_text_list = []

for i in range(len(df)):

    # try to extract captions
    try:
        transcript = YouTubeTranscriptApi.get_transcript(df['video_id'][i])
        transcript_text = extract_text(transcript)
    # if not available set as n/a
    except:
        transcript_text = "n/a"

    transcript_text_list.append(transcript_text)

CPU times: user 896 ms, sys: 114 ms, total: 1.01 s
Wall time: 18.6 s


In [14]:
# add transcripts to dataframe
df = df.with_columns(pl.Series(name="transcript", values=transcript_text_list))
print(df.tail())

shape: (5, 4)
┌─────────────┬──────────────────────┬─────────────────────────────────┬────────────┐
│ video_id    ┆ datetime             ┆ title                           ┆ transcript │
│ ---         ┆ ---                  ┆ ---                             ┆ ---        │
│ str         ┆ str                  ┆ str                             ┆ str        │
╞═════════════╪══════════════════════╪═════════════════════════════════╪════════════╡
│ zA5SOU7qVGs ┆ 2025-07-12T05:30:27Z ┆ 03- AutoGen + Human-in-the-Loo… ┆ n/a        │
│ Edt6sgKFgi4 ┆ 2025-07-12T01:52:29Z ┆ 02- AutoGen Agent Conversation… ┆ n/a        │
│ tvrGwdV-lI4 ┆ 2025-07-11T19:37:29Z ┆ 01- Getting Started with AutoG… ┆ n/a        │
│ 1eVza12aN80 ┆ 2025-07-11T09:30:00Z ┆ Build a GenAI-Powered Text to … ┆ n/a        │
│ LUO6rOGnGHs ┆ 2025-06-21T08:49:04Z ┆ Agentic AI Is Replacing Enterp… ┆ n/a        │
└─────────────┴──────────────────────┴─────────────────────────────────┴────────────┘


In [15]:
df

video_id,datetime,title,transcript
str,str,str,str
"""y3_0d-_rAgY""","""2025-08-31T11:01:35Z""","""Fine-Tuning GPT-4o: Build Your…","""n/a"""
"""b96tn09Lz3I""","""2025-08-29T09:30:18Z""","""Langflow Tutorial | Build RAG …","""n/a"""
"""9zsZZ2HbnOI""","""2025-08-27T02:15:01Z""","""Build Your First AI Chatbot in…","""n/a"""
"""JgWD6Tin9eQ""","""2025-08-24T07:30:14Z""","""Agentic AI vs Traditional AI""","""n/a"""
"""GlLlggfoM5M""","""2025-08-22T10:30:10Z""","""Stock Market Analysis with Mul…","""n/a"""
…,…,…,…
"""zA5SOU7qVGs""","""2025-07-12T05:30:27Z""","""03- AutoGen + Human-in-the-Loo…","""n/a"""
"""Edt6sgKFgi4""","""2025-07-12T01:52:29Z""","""02- AutoGen Agent Conversation…","""n/a"""
"""tvrGwdV-lI4""","""2025-07-11T19:37:29Z""","""01- Getting Started with AutoG…","""n/a"""
"""1eVza12aN80""","""2025-07-11T09:30:00Z""","""Build a GenAI-Powered Text to …","""n/a"""


In [None]:
# write data to file
df.write_parquet('data/video-transcripts.parquet')
df.write_csv('data/video-transcripts.csv')