<a href="https://colab.research.google.com/github/UdaraChamidu/Full-Stack-Data-Science-Project/blob/main/youtube.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# install the client library that is imported below as youtube_transcript_api
!pip install youtube-transcript-api



In [16]:
import requests # api calls
import json # CONVERT TO JSON
import polars as pl # faster version of pandas
from google.colab import userdata # load secreats

from youtube_transcript_api import YouTubeTranscriptApi

# Extract Process

In [17]:
# read the API key from the Colab secret store (secret name: 'my_key')
my_key = userdata.get('my_key')

# define channel ID
channel_id = 'UCa9gErQ9AE5jT2DZLjXBIdA'

# define url for API
url = 'https://www.googleapis.com/youtube/v3/search'

# initialize page token
# search results come back one page at a time; the token tells the API which
# page to return next, and None is the value sent with the first request
page_token = None

# initialize list to store video data
video_record_list = []

In [18]:
def getVideoRecords(response: requests.models.Response) -> list:
    """
        Build one record per YouTube video found in a search GET response.

        The JSON body is read from response.text. Items whose id kind is not
        "youtube#video" are ignored. Each returned dict has the keys
        'video_id', 'datetime' and 'title'.
    """

    search_items = json.loads(response.text)['items']

    # keep videos only; other result kinds are filtered out before any field is read
    return [
        {
            'video_id': item['id']['videoId'],
            'datetime': item['snippet']['publishedAt'],
            'title': item['snippet']['title'],
        }
        for item in search_items
        if item['id']['kind'] == "youtube#video"
    ]

In [19]:
%%time
# extract video data across multiple search result pages
while page_token != 0:
    # define parameters for API call
    params = {"key": my_key, 'channelId': channel_id, 'part': ["snippet","id"], 'order': "date", 'maxResults':100, 'pageToken': page_token}

    # make get request
    response = requests.get(url, params=params)

    # append video records to list
    video_record_list += getVideoRecords(response)

    try:
        # grab next page token
        page_token = json.loads(response.text)['nextPageToken']
    except:
        # if no next page token kill while loop
        page_token = 0

CPU times: user 36.4 ms, sys: 4.47 ms, total: 40.8 ms
Wall time: 1.15 s


In [20]:
# gather the collected video records into one polars data frame

df = pl.DataFrame(data=video_record_list)
print(df.head(n=5))

shape: (5, 3)
┌─────────────┬──────────────────────┬─────────────────────────────────┐
│ video_id    ┆ datetime             ┆ title                           │
│ ---         ┆ ---                  ┆ ---                             │
│ str         ┆ str                  ┆ str                             │
╞═════════════╪══════════════════════╪═════════════════════════════════╡
│ QxLXhE1fxc4 ┆ 2025-08-31T08:00:23Z ┆ uv: The Fastest Way to Install… │
│ enBm0jLXLZ4 ┆ 2025-08-24T08:00:44Z ┆ GitHub for AI Engineers (begin… │
│ zKHSpwayPBU ┆ 2025-08-17T08:00:59Z ┆ Context Engineering Explained … │
│ hugQUr4VwRA ┆ 2025-08-10T08:01:00Z ┆ How to Solve Problems with AI … │
│ fAFJYbtTsC0 ┆ 2025-07-27T08:00:45Z ┆ Fine-tuning LLMs for Tool Use … │
└─────────────┴──────────────────────┴─────────────────────────────────┘


Extract the Transcript

In [21]:
def extract_text(transcript: list) -> str:
    """
        Join the 'text' field of every transcript entry into one string.

        transcript is a list of dicts, one per caption line; the pieces are
        concatenated in order, separated by single spaces.
    """

    # walk the caption lines directly instead of indexing by position
    return ' '.join(caption_line['text'] for caption_line in transcript)

In [22]:
%%time
transcript_text_list = []

for i in range(len(df)):

    # try to extract captions
    try:
        transcript = YouTubeTranscriptApi.get_transcript(df['video_id'][i])
        transcript_text = extract_text(transcript)
    # if not available set as n/a
    except:
        transcript_text = "n/a"

    transcript_text_list.append(transcript_text)

CPU times: user 165 µs, sys: 43 µs, total: 208 µs
Wall time: 214 µs


In [23]:
# attach the transcript strings as a new "transcript" column (same row order as df)
transcript_column = pl.Series("transcript", transcript_text_list)
df = df.with_columns(transcript_column)
print(df.head(n=5))

shape: (5, 4)
┌─────────────┬──────────────────────┬─────────────────────────────────┬────────────┐
│ video_id    ┆ datetime             ┆ title                           ┆ transcript │
│ ---         ┆ ---                  ┆ ---                             ┆ ---        │
│ str         ┆ str                  ┆ str                             ┆ str        │
╞═════════════╪══════════════════════╪═════════════════════════════════╪════════════╡
│ QxLXhE1fxc4 ┆ 2025-08-31T08:00:23Z ┆ uv: The Fastest Way to Install… ┆ n/a        │
│ enBm0jLXLZ4 ┆ 2025-08-24T08:00:44Z ┆ GitHub for AI Engineers (begin… ┆ n/a        │
│ zKHSpwayPBU ┆ 2025-08-17T08:00:59Z ┆ Context Engineering Explained … ┆ n/a        │
│ hugQUr4VwRA ┆ 2025-08-10T08:01:00Z ┆ How to Solve Problems with AI … ┆ n/a        │
│ fAFJYbtTsC0 ┆ 2025-07-27T08:00:45Z ┆ Fine-tuning LLMs for Tool Use … ┆ n/a        │
└─────────────┴──────────────────────┴─────────────────────────────────┴────────────┘


# Transform Process

check for duplicates

In [24]:
# frame dimensions, then distinct-value counts for whole rows and for each column
print("shape:", df.shape)
print("n unique rows:", df.n_unique())
for column_name in df.columns:
    column_values = df[column_name]
    print("n unique elements (" + column_name + "):", column_values.n_unique())

shape: (151, 4)
n unique rows: 151
n unique elements (video_id): 151
n unique elements (datetime): 151
n unique elements (title): 151
n unique elements (transcript): 1


Handling special characters

In [25]:
# The special-character replacements are specific to a particular channel's videos.
# In the general case we do not know which special characters will appear, so this step is skipped.

# Load Process

In [26]:
import os

# where the table is stored on disk
output_dir = 'data'
parquet_path = "data/video-transcripts.parquet"

# create the folder if it is missing, save the frame, then load it back from the file
os.makedirs(output_dir, exist_ok=True)
df.write_parquet(parquet_path)
df = pl.read_parquet(parquet_path)


Titles and Transcript

In [27]:
# total character counts over all rows, measured with len() on each cell value
title_char_total = sum(len(title) for title in df['title'])
transcript_char_total = sum(len(text) for text in df['transcript'])

print("Total number of title characters:", title_char_total)
print("Total number of transcript characters:", transcript_char_total)

Total number of title characters: 7908
Total number of transcript characters: 453
